From 3e4093db58e4f56a811c6ea69a4e9549be7d7d71 Mon Sep 17 00:00:00 2001
From: Michele Dolfi <dol@zurich.ibm.com>
Date: Fri, 25 Jul 2025 15:10:05 +0200
Subject: [PATCH] Parse OCR with etree.HTMLParser and add METS GBS format option to CLI

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
---
 docling/backend/mets_gbs_backend.py | 14 +++++---------
 docling/cli/main.py                 | 10 ++++++++++
 docling/datamodel/document.py       |  2 +-
 tests/test_backend_mets_gbs.py      | 20 ++++----------------
 4 files changed, 20 insertions(+), 26 deletions(-)

diff --git a/docling/backend/mets_gbs_backend.py b/docling/backend/mets_gbs_backend.py
index 3c06a872..4ed100b3 100644
--- a/docling/backend/mets_gbs_backend.py
+++ b/docling/backend/mets_gbs_backend.py
@@ -305,13 +305,13 @@ class MetsGbsDocumentBackend(PaginatedDocumentBackend):
         ocr_file = self._tar.extractfile(ocr_info.path)
         assert ocr_file is not None
         ocr_content = ocr_file.read()
-        ocr_root: etree._Element = etree.fromstring(ocr_content)
+        parser = etree.HTMLParser()
+        ocr_root: etree._Element = etree.fromstring(ocr_content, parser=parser)
 
         line_cells: List[TextCell] = []
         word_cells: List[TextCell] = []
 
-        ns = {"x": "http://www.w3.org/1999/xhtml"}
-        page_div = ocr_root.xpath("//x:div[@class='ocr_page']", namespaces=ns)
+        page_div = ocr_root.xpath("//div[@class='ocr_page']")
 
         size = Size(width=im.size[0], height=im.size[1])
         if page_div:
@@ -326,9 +326,7 @@ class MetsGbsDocumentBackend(PaginatedDocumentBackend):
         im = im.convert("RGB")
 
         # Extract all ocrx_word spans
-        for ix, word in enumerate(
-            ocr_root.xpath("//x:span[@class='ocrx_word']", namespaces=ns)
-        ):
+        for ix, word in enumerate(ocr_root.xpath("//span[@class='ocrx_word']")):
             text = "".join(word.itertext()).strip()
             title = word.attrib.get("title", "")
             rect = _extract_rect(title)
@@ -347,9 +345,7 @@ class MetsGbsDocumentBackend(PaginatedDocumentBackend):
 
         # Extract all ocr_line spans
         # line: etree._Element
-        for ix, line in enumerate(
-            ocr_root.xpath("//x:span[@class='ocr_line']", namespaces=ns)
-        ):
+        for ix, line in enumerate(ocr_root.xpath("//span[@class='ocr_line']")):
             text = "".join(line.itertext()).strip()
             title = line.attrib.get("title", "")
             rect = _extract_rect(title)
diff --git a/docling/cli/main.py b/docling/cli/main.py
index ae275ea9..8ed127a6 100644
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -26,6 +26,7 @@ from rich.console import Console
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
 from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
+from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
@@ -601,9 +602,18 @@ def convert(  # noqa: C901
                 backend=backend,  # pdf_backend
             )
 
+            # METS GBS options
+            mets_gbs_options = pipeline_options.model_copy()
+            mets_gbs_options.do_ocr = False
+            mets_gbs_format_option = PdfFormatOption(
+                pipeline_options=mets_gbs_options,
+                backend=MetsGbsDocumentBackend,
+            )
+
             format_options = {
                 InputFormat.PDF: pdf_format_option,
                 InputFormat.IMAGE: pdf_format_option,
+                InputFormat.XML_METS_GBS: mets_gbs_format_option,
             }
 
         elif pipeline == ProcessingPipeline.VLM:
diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py
index a9a3c9b1..b1ca0372 100644
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@@ -482,7 +482,7 @@ class _DocumentConversionInput(BaseModel):
                 if member.name.endswith(".xml"):
                     file = tar.extractfile(member)
                     if file is not None:
-                        content_str = file.read().decode()
+                        content_str = file.read().decode(errors="ignore")
                         if "http://www.loc.gov/METS/" in content_str:
                             return "application/mets+xml"
         return None
diff --git a/tests/test_backend_mets_gbs.py b/tests/test_backend_mets_gbs.py
index c8be4327..894579ec 100644
--- a/tests/test_backend_mets_gbs.py
+++ b/tests/test_backend_mets_gbs.py
@@ -9,7 +9,7 @@ from docling.datamodel.document import InputDocument
 
 @pytest.fixture
 def test_doc_path():
-    return Path("/Users/dol/Downloads/32044009881525.tar.gz")
+    return Path("/Users/dol/Downloads/32044009881525_select.tar.gz")
 
 
 def _get_backend(pdf_doc):
@@ -39,7 +39,7 @@ def test_process_pages(test_doc_path):
 
 def test_get_text_from_rect(test_doc_path):
     doc_backend: MetsGbsDocumentBackend = _get_backend(test_doc_path)
-    page_backend: MetsGbsPageBackend = doc_backend.load_page(9)
+    page_backend: MetsGbsPageBackend = doc_backend.load_page(0)
 
     # Get the title text of the DocLayNet paper
     textpiece = page_backend.get_text_in_rect(
@@ -56,7 +56,7 @@ def test_get_text_from_rect(test_doc_path):
 
 def test_crop_page_image(test_doc_path):
     doc_backend: MetsGbsDocumentBackend = _get_backend(test_doc_path)
-    page_backend: MetsGbsPageBackend = doc_backend.load_page(9)
+    page_backend: MetsGbsPageBackend = doc_backend.load_page(0)
 
     page_backend.get_page_image(
         scale=2, cropbox=BoundingBox(l=270, t=587, r=1385, b=1995)
@@ -68,22 +68,10 @@ def test_crop_page_image(test_doc_path):
     doc_backend.unload()
 
 
-def test_crop_page_image_jp2(test_doc_path):
-    doc_backend: MetsGbsDocumentBackend = _get_backend(test_doc_path)
-    page_backend: MetsGbsPageBackend = doc_backend.load_page(1)
-
-    page_backend.get_page_image(scale=2, cropbox=BoundingBox(l=160, t=29, r=732, b=173))
-    # im.show()
-
-    # Explicitly clean up resources
-    page_backend.unload()
-    doc_backend.unload()
-
-
 def test_num_pages(test_doc_path):
     doc_backend: MetsGbsDocumentBackend = _get_backend(test_doc_path)
     assert doc_backend.is_valid()
-    assert doc_backend.page_count() == 276
+    assert doc_backend.page_count() == 3
 
     # Explicitly clean up resources to prevent race conditions in CI
     doc_backend.unload()