Mirror of https://github.com/DS4SD/docling.git (synced 2025-07-27 04:24:45 +00:00)

Commit 3e4093db58 (parent 7c3f9b7ab1): use HTMLParser and add options from CLI

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
@@ -305,13 +305,13 @@ class MetsGbsDocumentBackend(PaginatedDocumentBackend):
 ocr_file = self._tar.extractfile(ocr_info.path)
 assert ocr_file is not None
 ocr_content = ocr_file.read()
-ocr_root: etree._Element = etree.fromstring(ocr_content)
+parser = etree.HTMLParser()
+ocr_root: etree._Element = etree.fromstring(ocr_content, parser=parser)

 line_cells: List[TextCell] = []
 word_cells: List[TextCell] = []

-ns = {"x": "http://www.w3.org/1999/xhtml"}
-page_div = ocr_root.xpath("//x:div[@class='ocr_page']", namespaces=ns)
+page_div = ocr_root.xpath("//div[@class='ocr_page']")

 size = Size(width=im.size[0], height=im.size[1])
 if page_div:
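The change above swaps lxml's XML parsing for its HTML parser when reading the hOCR inside the METS GBS tarball. Because the hOCR is served as XHTML, the XML parser places every element in the XHTML namespace and forces namespace-qualified XPath queries; HTMLParser ignores the namespace declaration and also tolerates imperfect markup. A minimal standalone sketch (not docling code) of that difference:

# Minimal sketch, assuming only that lxml is installed; the hOCR snippet is made up.
from lxml import etree

hocr = b"""<html xmlns="http://www.w3.org/1999/xhtml">
<body><div class="ocr_page" title="bbox 0 0 100 100"></div></body>
</html>"""

# XML parsing keeps the XHTML default namespace, so queries need a prefix map.
xml_root = etree.fromstring(hocr)
ns = {"x": "http://www.w3.org/1999/xhtml"}
assert xml_root.xpath("//x:div[@class='ocr_page']", namespaces=ns)

# HTMLParser drops the namespace declaration (and is forgiving of malformed
# markup common in OCR output), so plain tag names match directly.
html_root = etree.fromstring(hocr, parser=etree.HTMLParser())
assert html_root.xpath("//div[@class='ocr_page']")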
@@ -326,9 +326,7 @@ class MetsGbsDocumentBackend(PaginatedDocumentBackend):
 im = im.convert("RGB")

 # Extract all ocrx_word spans
-for ix, word in enumerate(
-    ocr_root.xpath("//x:span[@class='ocrx_word']", namespaces=ns)
-):
+for ix, word in enumerate(ocr_root.xpath("//span[@class='ocrx_word']")):
     text = "".join(word.itertext()).strip()
     title = word.attrib.get("title", "")
     rect = _extract_rect(title)
@@ -347,9 +345,7 @@ class MetsGbsDocumentBackend(PaginatedDocumentBackend):

 # Extract all ocr_line spans
 # line: etree._Element
-for ix, line in enumerate(
-    ocr_root.xpath("//x:span[@class='ocr_line']", namespaces=ns)
-):
+for ix, line in enumerate(ocr_root.xpath("//span[@class='ocr_line']")):
     text = "".join(line.itertext()).strip()
     title = line.attrib.get("title", "")
     rect = _extract_rect(title)
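For context, each ocrx_word / ocr_line span carries its geometry in the hOCR title attribute, which _extract_rect (not shown in this diff) parses. A hypothetical standalone parser, purely illustrative and not docling's implementation:

# Hypothetical helper, not docling's _extract_rect: shows the hOCR "title"
# convention the loops above rely on, e.g. title="bbox 270 587 1385 1995; x_wconf 96".
import re

def parse_hocr_bbox(title: str):
    match = re.search(r"bbox (\d+) (\d+) (\d+) (\d+)", title)
    if match is None:
        return None
    left, top, right, bottom = map(int, match.groups())
    return left, top, right, bottom

assert parse_hocr_bbox("bbox 270 587 1385 1995; x_wconf 96") == (270, 587, 1385, 1995)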
@@ -26,6 +26,7 @@ from rich.console import Console
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
 from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
+from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
@@ -601,9 +602,18 @@ def convert( # noqa: C901
     backend=backend, # pdf_backend
 )

+# METS GBS options
+mets_gbs_options = pipeline_options.model_copy()
+mets_gbs_options.do_ocr = False
+mets_gbs_format_option = PdfFormatOption(
+    pipeline_options=mets_gbs_options,
+    backend=MetsGbsDocumentBackend,
+)
+
 format_options = {
     InputFormat.PDF: pdf_format_option,
     InputFormat.IMAGE: pdf_format_option,
+    InputFormat.XML_METS_GBS: mets_gbs_format_option,
 }

 elif pipeline == ProcessingPipeline.VLM:
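The CLI change above reuses the PDF pipeline options for METS GBS input but turns OCR off, since these archives already carry hOCR text. A rough sketch of the equivalent programmatic wiring, assuming InputFormat.XML_METS_GBS and MetsGbsDocumentBackend exist as introduced in this commit (the remaining names follow docling's public converter API):

# Sketch of wiring the new backend outside the CLI; the input file name is an example.
from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend  # added in this commit
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

pipeline_options = PdfPipelineOptions()

# METS GBS archives ship pre-computed hOCR, so OCR is disabled for them (per the diff above).
mets_gbs_options = pipeline_options.model_copy()
mets_gbs_options.do_ocr = False

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
        InputFormat.XML_METS_GBS: PdfFormatOption(  # assumed to exist after this change
            pipeline_options=mets_gbs_options,
            backend=MetsGbsDocumentBackend,
        ),
    }
)
result = converter.convert("32044009881525_select.tar.gz")
print(result.document.export_to_markdown())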
@@ -482,7 +482,7 @@ class _DocumentConversionInput(BaseModel):
 if member.name.endswith(".xml"):
     file = tar.extractfile(member)
     if file is not None:
-        content_str = file.read().decode()
+        content_str = file.read().decode(errors="ignore")
         if "http://www.loc.gov/METS/" in content_str:
             return "application/mets+xml"
 return None
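The decode(errors="ignore") change makes format detection tolerant of XML members whose bytes are not clean UTF-8. A self-contained sketch of the same detection idea for a METS GBS tarball (for illustration, not the docling implementation):

import tarfile
from typing import Optional

def sniff_mets_gbs(path: str) -> Optional[str]:
    # Hypothetical standalone version of the check above.
    with tarfile.open(path, "r:*") as tar:
        for member in tar.getmembers():
            if member.name.endswith(".xml"):
                file = tar.extractfile(member)
                if file is not None:
                    # errors="ignore" keeps detection working even if the XML
                    # contains stray bytes that are not valid UTF-8.
                    content_str = file.read().decode(errors="ignore")
                    if "http://www.loc.gov/METS/" in content_str:
                        return "application/mets+xml"
    return None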
@@ -9,7 +9,7 @@ from docling.datamodel.document import InputDocument

 @pytest.fixture
 def test_doc_path():
-    return Path("/Users/dol/Downloads/32044009881525.tar.gz")
+    return Path("/Users/dol/Downloads/32044009881525_select.tar.gz")


 def _get_backend(pdf_doc):
@@ -39,7 +39,7 @@ def test_process_pages(test_doc_path):

 def test_get_text_from_rect(test_doc_path):
     doc_backend: MetsGbsDocumentBackend = _get_backend(test_doc_path)
-    page_backend: MetsGbsPageBackend = doc_backend.load_page(9)
+    page_backend: MetsGbsPageBackend = doc_backend.load_page(0)

     # Get the title text of the DocLayNet paper
     textpiece = page_backend.get_text_in_rect(
@@ -56,7 +56,7 @@ def test_get_text_from_rect(test_doc_path):

 def test_crop_page_image(test_doc_path):
     doc_backend: MetsGbsDocumentBackend = _get_backend(test_doc_path)
-    page_backend: MetsGbsPageBackend = doc_backend.load_page(9)
+    page_backend: MetsGbsPageBackend = doc_backend.load_page(0)

     page_backend.get_page_image(
         scale=2, cropbox=BoundingBox(l=270, t=587, r=1385, b=1995)
@@ -68,22 +68,10 @@ def test_crop_page_image(test_doc_path):
     doc_backend.unload()


-def test_crop_page_image_jp2(test_doc_path):
-    doc_backend: MetsGbsDocumentBackend = _get_backend(test_doc_path)
-    page_backend: MetsGbsPageBackend = doc_backend.load_page(1)
-
-    page_backend.get_page_image(scale=2, cropbox=BoundingBox(l=160, t=29, r=732, b=173))
-    # im.show()
-
-    # Explicitly clean up resources
-    page_backend.unload()
-    doc_backend.unload()
-
-
 def test_num_pages(test_doc_path):
     doc_backend: MetsGbsDocumentBackend = _get_backend(test_doc_path)
     assert doc_backend.is_valid()
-    assert doc_backend.page_count() == 276
+    assert doc_backend.page_count() == 3

     # Explicitly clean up resources to prevent race conditions in CI
     doc_backend.unload()