From 3e4093db58e4f56a811c6ea69a4e9549be7d7d71 Mon Sep 17 00:00:00 2001 From: Michele Dolfi Date: Fri, 25 Jul 2025 15:10:05 +0200 Subject: [PATCH] use HTMLParser and add options from CLI Signed-off-by: Michele Dolfi --- docling/backend/mets_gbs_backend.py | 14 +++++--------- docling/cli/main.py | 10 ++++++++++ docling/datamodel/document.py | 2 +- tests/test_backend_mets_gbs.py | 20 ++++---------------- 4 files changed, 20 insertions(+), 26 deletions(-) diff --git a/docling/backend/mets_gbs_backend.py b/docling/backend/mets_gbs_backend.py index 3c06a872..4ed100b3 100644 --- a/docling/backend/mets_gbs_backend.py +++ b/docling/backend/mets_gbs_backend.py @@ -305,13 +305,13 @@ class MetsGbsDocumentBackend(PaginatedDocumentBackend): ocr_file = self._tar.extractfile(ocr_info.path) assert ocr_file is not None ocr_content = ocr_file.read() - ocr_root: etree._Element = etree.fromstring(ocr_content) + parser = etree.HTMLParser() + ocr_root: etree._Element = etree.fromstring(ocr_content, parser=parser) line_cells: List[TextCell] = [] word_cells: List[TextCell] = [] - ns = {"x": "http://www.w3.org/1999/xhtml"} - page_div = ocr_root.xpath("//x:div[@class='ocr_page']", namespaces=ns) + page_div = ocr_root.xpath("//div[@class='ocr_page']") size = Size(width=im.size[0], height=im.size[1]) if page_div: @@ -326,9 +326,7 @@ class MetsGbsDocumentBackend(PaginatedDocumentBackend): im = im.convert("RGB") # Extract all ocrx_word spans - for ix, word in enumerate( - ocr_root.xpath("//x:span[@class='ocrx_word']", namespaces=ns) - ): + for ix, word in enumerate(ocr_root.xpath("//span[@class='ocrx_word']")): text = "".join(word.itertext()).strip() title = word.attrib.get("title", "") rect = _extract_rect(title) @@ -347,9 +345,7 @@ class MetsGbsDocumentBackend(PaginatedDocumentBackend): # Extract all ocr_line spans # line: etree._Element - for ix, line in enumerate( - ocr_root.xpath("//x:span[@class='ocr_line']", namespaces=ns) - ): + for ix, line in enumerate(ocr_root.xpath("//span[@class='ocr_line']")): text = "".join(line.itertext()).strip() title = line.attrib.get("title", "") rect = _extract_rect(title) diff --git a/docling/cli/main.py b/docling/cli/main.py index ae275ea9..8ed127a6 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -26,6 +26,7 @@ from rich.console import Console from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend +from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend from docling.backend.pdf_backend import PdfDocumentBackend from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions @@ -601,9 +602,18 @@ def convert( # noqa: C901 backend=backend, # pdf_backend ) + # METS GBS options + mets_gbs_options = pipeline_options.model_copy() + mets_gbs_options.do_ocr = False + mets_gbs_format_option = PdfFormatOption( + pipeline_options=mets_gbs_options, + backend=MetsGbsDocumentBackend, + ) + format_options = { InputFormat.PDF: pdf_format_option, InputFormat.IMAGE: pdf_format_option, + InputFormat.XML_METS_GBS: mets_gbs_format_option, } elif pipeline == ProcessingPipeline.VLM: diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index a9a3c9b1..b1ca0372 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -482,7 +482,7 @@ class _DocumentConversionInput(BaseModel): if member.name.endswith(".xml"): file = tar.extractfile(member) if file is not None: - content_str = file.read().decode() + content_str = file.read().decode(errors="ignore") if "http://www.loc.gov/METS/" in content_str: return "application/mets+xml" return None diff --git a/tests/test_backend_mets_gbs.py b/tests/test_backend_mets_gbs.py index c8be4327..894579ec 100644 --- a/tests/test_backend_mets_gbs.py +++ b/tests/test_backend_mets_gbs.py @@ -9,7 +9,7 @@ from docling.datamodel.document import InputDocument @pytest.fixture def test_doc_path(): - return Path("/Users/dol/Downloads/32044009881525.tar.gz") + return Path("/Users/dol/Downloads/32044009881525_select.tar.gz") def _get_backend(pdf_doc): @@ -39,7 +39,7 @@ def test_process_pages(test_doc_path): def test_get_text_from_rect(test_doc_path): doc_backend: MetsGbsDocumentBackend = _get_backend(test_doc_path) - page_backend: MetsGbsPageBackend = doc_backend.load_page(9) + page_backend: MetsGbsPageBackend = doc_backend.load_page(0) # Get the title text of the DocLayNet paper textpiece = page_backend.get_text_in_rect( @@ -56,7 +56,7 @@ def test_get_text_from_rect(test_doc_path): def test_crop_page_image(test_doc_path): doc_backend: MetsGbsDocumentBackend = _get_backend(test_doc_path) - page_backend: MetsGbsPageBackend = doc_backend.load_page(9) + page_backend: MetsGbsPageBackend = doc_backend.load_page(0) page_backend.get_page_image( scale=2, cropbox=BoundingBox(l=270, t=587, r=1385, b=1995) @@ -68,22 +68,10 @@ def test_crop_page_image(test_doc_path): doc_backend.unload() -def test_crop_page_image_jp2(test_doc_path): - doc_backend: MetsGbsDocumentBackend = _get_backend(test_doc_path) - page_backend: MetsGbsPageBackend = doc_backend.load_page(1) - - page_backend.get_page_image(scale=2, cropbox=BoundingBox(l=160, t=29, r=732, b=173)) - # im.show() - - # Explicitly clean up resources - page_backend.unload() - doc_backend.unload() - - def test_num_pages(test_doc_path): doc_backend: MetsGbsDocumentBackend = _get_backend(test_doc_path) assert doc_backend.is_valid() - assert doc_backend.page_count() == 276 + assert doc_backend.page_count() == 3 # Explicitly clean up resources to prevent race conditions in CI doc_backend.unload()