From 7bc1c1ac3d6aa17506e26ff79dd2f0028f02f461 Mon Sep 17 00:00:00 2001 From: Michele Dolfi Date: Thu, 24 Jul 2025 19:18:29 +0200 Subject: [PATCH 1/8] add backend for METS with Google Books profile Signed-off-by: Michele Dolfi --- docling/backend/mets_gbs_backend.py | 387 ++++++++++++++++++++++++++++ docling/datamodel/base_models.py | 3 + docling/datamodel/document.py | 47 +++- docling/document_converter.py | 4 + docling/pipeline/base_pipeline.py | 11 +- tests/test_backend_mets_gbs.py | 89 +++++++ 6 files changed, 528 insertions(+), 13 deletions(-) create mode 100644 docling/backend/mets_gbs_backend.py create mode 100644 tests/test_backend_mets_gbs.py diff --git a/docling/backend/mets_gbs_backend.py b/docling/backend/mets_gbs_backend.py new file mode 100644 index 00000000..f2a7d2b5 --- /dev/null +++ b/docling/backend/mets_gbs_backend.py @@ -0,0 +1,387 @@ +"""Backend for GBS Google Books schema.""" + +import logging +import tarfile +from collections.abc import Iterable +from dataclasses import dataclass +from enum import Enum +from io import BytesIO +from pathlib import Path +from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple, Union + +from docling_core.types.doc import BoundingBox, CoordOrigin, Size +from docling_core.types.doc.page import ( + BoundingRectangle, + PdfPageBoundaryType, + PdfPageGeometry, + SegmentedPdfPage, + TextCell, +) +from lxml import etree +from PIL import Image +from PIL.Image import Image as PILImage + +from docling.backend.abstract_backend import PaginatedDocumentBackend +from docling.backend.pdf_backend import PdfPageBackend +from docling.datamodel.base_models import InputFormat + +if TYPE_CHECKING: + from docling.datamodel.document import InputDocument + +_log = logging.getLogger(__name__) + + +def _get_pdf_page_geometry( + size: Size, +) -> PdfPageGeometry: + boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX + + bbox_tuple = (0, 0, size.width, size.height) + bbox = BoundingBox.from_tuple(bbox_tuple, CoordOrigin.TOPLEFT) + + return PdfPageGeometry( + angle=0.0, + rect=BoundingRectangle.from_bounding_box(bbox), + boundary_type=boundary_type, + art_bbox=bbox, + bleed_bbox=bbox, + crop_bbox=bbox, + media_bbox=bbox, + trim_bbox=bbox, + ) + + +class MetsGbsPageBackend(PdfPageBackend): + def __init__(self, parsed_page: SegmentedPdfPage, page_im: PILImage): + self._im = page_im + self._dpage = parsed_page + self.valid = parsed_page is not None + + def is_valid(self) -> bool: + return self.valid + + def get_text_in_rect(self, bbox: BoundingBox) -> str: + # Find intersecting cells on the page + text_piece = "" + page_size = self.get_size() + + scale = ( + 1 # FIX - Replace with param in get_text_in_rect across backends (optional) + ) + + for i, cell in enumerate(self._dpage.textline_cells): + cell_bbox = ( + cell.rect.to_bounding_box() + .to_top_left_origin(page_height=page_size.height) + .scaled(scale) + ) + + overlap_frac = cell_bbox.intersection_over_self(bbox) + + if overlap_frac > 0.5: + if len(text_piece) > 0: + text_piece += " " + text_piece += cell.text + + return text_piece + + def get_segmented_page(self) -> Optional[SegmentedPdfPage]: + return self._dpage + + def get_text_cells(self) -> Iterable[TextCell]: + return self._dpage.textline_cells + + def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]: + AREA_THRESHOLD = 0 # 32 * 32 + + images = self._dpage.bitmap_resources + + for img in images: + cropbox = img.rect.to_bounding_box().to_top_left_origin( + self.get_size().height + ) + + if cropbox.area() > 
AREA_THRESHOLD: + cropbox = cropbox.scaled(scale=scale) + + yield cropbox + + def get_page_image( + self, scale: float = 1, cropbox: Optional[BoundingBox] = None + ) -> Image.Image: + page_size = self.get_size() + assert ( + page_size.width == self._im.size[0] and page_size.height == self._im.size[1] + ) + + if not cropbox: + cropbox = BoundingBox( + l=0, + r=page_size.width, + t=0, + b=page_size.height, + coord_origin=CoordOrigin.TOPLEFT, + ) + + image = self._im.resize( + size=(round(page_size.width * scale), round(page_size.height * scale)) + ).crop(cropbox.scaled(scale=scale).as_tuple()) + return image + + def get_size(self) -> Size: + return Size( + width=self._dpage.dimension.width, height=self._dpage.dimension.height + ) + + def unload(self): + self._ppage = None + self._dpage = None + + +class _UseType(str, Enum): + IMAGE = "image" + OCR = "OCR" + COORD_OCR = "coordOCR" + + +@dataclass +class _FileInfo: + file_id: str + mimetype: str + path: str + use: _UseType + + +@dataclass +class _PageFiles: + image: Optional[_FileInfo] = None + ocr: Optional[_FileInfo] = None + coordOCR: Optional[_FileInfo] = None + + +def _extract_rect(title_str: str) -> Optional[BoundingRectangle]: + """ + Extracts bbox from title string like 'bbox 279 177 306 214;x_wconf 97' + """ + parts = title_str.split(";") + for part in parts: + part = part.strip() + if part.startswith("bbox "): + try: + coords = part.split()[1:] + rect = BoundingRectangle.from_bounding_box( + bbox=BoundingBox.from_tuple( + tuple(map(int, coords)), origin=CoordOrigin.TOPLEFT + ) + ) + return rect + except Exception: + return None + return None + + +def _extract_confidence(title_str) -> float: + """Extracts x_wconf (OCR confidence) value from title string.""" + for part in title_str.split(";"): + part = part.strip() + if part.startswith("x_wconf"): + try: + return float(part.split()[1]) / 100.0 + except Exception: + return 1 + return 1 + + +class MetsGbsDocumentBackend(PaginatedDocumentBackend): + def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): + super().__init__(in_doc, path_or_stream) + + self._tar: tarfile.TarFile = ( + tarfile.open(name=self.path_or_stream, mode="r:gz") + if isinstance(self.path_or_stream, Path) + else tarfile.open(fileobj=self.path_or_stream, mode="r:gz") + ) + self.root_mets: Optional[etree._Element] = None + self.page_map: Dict[int, _PageFiles] = {} + + for member in self._tar.getmembers(): + if member.name.endswith(".xml"): + file = self._tar.extractfile(member) + if file is not None: + content = file.read() + self.root_mets = self._validate_mets_xml(content) + if self.root_mets is not None: + break + + if self.root_mets is None: + raise RuntimeError( + f"METS GBS backend could not load document {self.document_hash}." 
+ ) + + ns = { + "mets": "http://www.loc.gov/METS/", + "xlink": "http://www.w3.org/1999/xlink", + "xsi": "http://www.w3.org/2001/XMLSchema-instance", + "gbs": "http://books.google.com/gbs", + "premis": "info:lc/xmlns/premis-v2", + "marc": "http://www.loc.gov/MARC21/slim", + } + + file_info_by_id: Dict[str, _FileInfo] = {} + + for filegrp in self.root_mets.xpath(".//mets:fileGrp", namespaces=ns): + use_raw = filegrp.get("USE") + try: + use = _UseType(use_raw) + except ValueError: + continue # Ignore unknown USE types + + for file_elem in filegrp.xpath("./mets:file", namespaces=ns): + file_id = file_elem.get("ID") + mimetype = file_elem.get("MIMETYPE") + flocat_elem = file_elem.find("mets:FLocat", namespaces=ns) + href = ( + flocat_elem.get("{http://www.w3.org/1999/xlink}href") + if flocat_elem is not None + else None + ) + if href is None: + continue + + file_info_by_id[file_id] = _FileInfo( + file_id=file_id, mimetype=mimetype, path=href, use=use + ) + + USE_TO_ATTR = { + _UseType.IMAGE: "image", + _UseType.OCR: "ocr", + _UseType.COORD_OCR: "coordOCR", + } + + for div in self.root_mets.xpath('.//mets:div[@TYPE="page"]', namespaces=ns): + order_str = div.get("ORDER") + if not order_str: + continue + try: + page_no = int(order_str) - 1 # make 0-index pages + except ValueError: + continue + + page_files = _PageFiles() + + for fptr in div.xpath("./mets:fptr", namespaces=ns): + file_id = fptr.get("FILEID") + file_info = file_info_by_id.get(file_id) + + if file_info: + attr = USE_TO_ATTR.get(file_info.use) + if attr: + setattr(page_files, attr, file_info) + + self.page_map[page_no] = page_files + + def _validate_mets_xml(self, xml_string) -> Optional[etree._Element]: + root: etree._Element = etree.fromstring(xml_string) + if ( + root.tag == "{http://www.loc.gov/METS/}mets" + and root.get("PROFILE") == "gbs" + ): + return root + + _log.warning(f"The root element is not with PROFILE='gbs': {root}") + return None + + def _parse_page(self, page_no: int) -> Tuple[SegmentedPdfPage, PILImage]: + # TODO: use better fallbacks... 
+ image_info = self.page_map[page_no].image + assert image_info is not None + ocr_info = self.page_map[page_no].coordOCR + assert ocr_info is not None + + image_file = self._tar.extractfile(image_info.path) + assert image_file is not None + buf = BytesIO(image_file.read()) + im: PILImage = Image.open(buf) + ocr_file = self._tar.extractfile(ocr_info.path) + assert ocr_file is not None + ocr_content = ocr_file.read() + ocr_root: etree._Element = etree.fromstring(ocr_content) + + line_cells: List[TextCell] = [] + word_cells: List[TextCell] = [] + + ns = {"x": "http://www.w3.org/1999/xhtml"} + page_div = ocr_root.xpath("//x:div[@class='ocr_page']", namespaces=ns) + + size = Size(width=im.size[0], height=im.size[1]) + if page_div: + title = page_div[0].attrib.get("title", "") + rect = _extract_rect(title) + if rect: + size = Size(width=rect.width, height=rect.height) + else: + _log.error(f"Could not find ocr_page for page {page_no}") + + im = im.resize(size=(round(size.width), round(size.height))) + im = im.convert("RGB") + + # Extract all ocrx_word spans + for word in ocr_root.xpath("//x:span[@class='ocrx_word']", namespaces=ns): + text = "".join(word.itertext()).strip() + title = word.attrib.get("title", "") + rect = _extract_rect(title) + conf = _extract_confidence(title) + if rect: + word_cells.append( + TextCell( + text=text, orig=text, rect=rect, from_ocr=True, confidence=conf + ) + ) + + # Extract all ocr_line spans + # line: etree._Element + for line in ocr_root.xpath("//x:span[@class='ocr_line']", namespaces=ns): + text = "".join(line.itertext()).strip() + title = line.attrib.get("title", "") + rect = _extract_rect(title) + conf = _extract_confidence(title) + if rect: + line_cells.append( + TextCell( + text=text, orig=text, rect=rect, from_ocr=True, confidence=conf + ) + ) + + page = SegmentedPdfPage( + dimension=_get_pdf_page_geometry(size), + textline_cells=line_cells, + char_cells=[], + word_cells=word_cells, + has_textlines=True, + has_words=True, + has_chars=False, + ) + return page, im + + def page_count(self) -> int: + return len(self.page_map) + + def load_page(self, page_no: int) -> MetsGbsPageBackend: + # TODO: is this thread-safe? 
+ page, im = self._parse_page(page_no) + return MetsGbsPageBackend(parsed_page=page, page_im=im) + + def is_valid(self) -> bool: + return self.root_mets is not None and self.page_count() > 0 + + @classmethod + def supported_formats(cls) -> Set[InputFormat]: + return {InputFormat.XML_METS_GBS} + + @classmethod + def supports_pagination(cls) -> bool: + return True + + def unload(self): + super().unload() + self._tar.close() diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py index c753ac60..6825e125 100644 --- a/docling/datamodel/base_models.py +++ b/docling/datamodel/base_models.py @@ -56,6 +56,7 @@ class InputFormat(str, Enum): XLSX = "xlsx" XML_USPTO = "xml_uspto" XML_JATS = "xml_jats" + XML_METS_GBS = "xml_mets_gbs" JSON_DOCLING = "json_docling" AUDIO = "audio" @@ -81,6 +82,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = { InputFormat.CSV: ["csv"], InputFormat.XLSX: ["xlsx", "xlsm"], InputFormat.XML_USPTO: ["xml", "txt"], + InputFormat.XML_METS_GBS: ["tar.gz"], InputFormat.JSON_DOCLING: ["json"], InputFormat.AUDIO: ["wav", "mp3"], } @@ -113,6 +115,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = { "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" ], InputFormat.XML_USPTO: ["application/xml", "text/plain"], + InputFormat.XML_METS_GBS: ["application/mets+xml"], InputFormat.JSON_DOCLING: ["application/json"], InputFormat.AUDIO: ["audio/x-wav", "audio/mpeg", "audio/wav", "audio/mp3"], } diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index 9f5cf82c..a9a3c9b1 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -1,6 +1,7 @@ import csv import logging import re +import tarfile from collections.abc import Iterable from enum import Enum from io import BytesIO @@ -314,21 +315,25 @@ class _DocumentConversionInput(BaseModel): elif objname.endswith(".pptx"): mime = "application/vnd.openxmlformats-officedocument.presentationml.presentation" + if mime is not None and mime.lower() == "application/gzip": + if detected_mime := _DocumentConversionInput._detect_mets_gbs(obj): + mime = detected_mime + mime = mime or _DocumentConversionInput._detect_html_xhtml(content) mime = mime or _DocumentConversionInput._detect_csv(content) mime = mime or "text/plain" formats = MimeTypeToFormat.get(mime, []) _log.info(f"detected formats: {formats}") - if formats: - if len(formats) == 1 and mime not in ("text/plain"): - return formats[0] - else: # ambiguity in formats - return _DocumentConversionInput._guess_from_content( - content, mime, formats - ) - else: - return None + input_format: Optional[InputFormat] = None + if len(formats) == 1: + input_format = formats[0] + + if content: + input_format = _DocumentConversionInput._guess_from_content( + content, mime, formats + ) + return input_format @staticmethod def _guess_from_content( @@ -337,6 +342,9 @@ class _DocumentConversionInput(BaseModel): """Guess the input format of a document by checking part of its content.""" input_format: Optional[InputFormat] = None + if len(formats) == 1: + input_format = formats[0] + if mime == "application/xml": content_str = content.decode("utf-8") match_doctype = re.search(r"]+>", content_str) @@ -457,3 +465,24 @@ class _DocumentConversionInput(BaseModel): return None return None + + @staticmethod + def _detect_mets_gbs( + obj: Union[Path, DocumentStream], + ) -> Optional[Literal["application/mets+xml"]]: + content = obj if isinstance(obj, Path) else obj.stream + tar: tarfile.TarFile + member: tarfile.TarInfo + with 
tarfile.open( + name=content if isinstance(content, Path) else None, + fileobj=content if isinstance(content, BytesIO) else None, + mode="r:gz", + ) as tar: + for member in tar.getmembers(): + if member.name.endswith(".xml"): + file = tar.extractfile(member) + if file is not None: + content_str = file.read().decode() + if "http://www.loc.gov/METS/" in content_str: + return "application/mets+xml" + return None diff --git a/docling/document_converter.py b/docling/document_converter.py index f3bcb89e..fea14f38 100644 --- a/docling/document_converter.py +++ b/docling/document_converter.py @@ -17,6 +17,7 @@ from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBacke from docling.backend.html_backend import HTMLDocumentBackend from docling.backend.json.docling_json_backend import DoclingJSONBackend from docling.backend.md_backend import MarkdownDocumentBackend +from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend from docling.backend.msexcel_backend import MsExcelDocumentBackend from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend from docling.backend.msword_backend import MsWordDocumentBackend @@ -156,6 +157,9 @@ def _get_default_option(format: InputFormat) -> FormatOption: InputFormat.XML_JATS: FormatOption( pipeline_cls=SimplePipeline, backend=JatsDocumentBackend ), + InputFormat.XML_METS_GBS: FormatOption( + pipeline_cls=StandardPdfPipeline, backend=MetsGbsDocumentBackend + ), InputFormat.IMAGE: FormatOption( pipeline_cls=StandardPdfPipeline, backend=DoclingParseV4DocumentBackend ), diff --git a/docling/pipeline/base_pipeline.py b/docling/pipeline/base_pipeline.py index 6944a355..5c289ad9 100644 --- a/docling/pipeline/base_pipeline.py +++ b/docling/pipeline/base_pipeline.py @@ -8,7 +8,10 @@ from typing import Any, Callable, List from docling_core.types.doc import NodeItem -from docling.backend.abstract_backend import AbstractDocumentBackend +from docling.backend.abstract_backend import ( + AbstractDocumentBackend, + PaginatedDocumentBackend, +) from docling.backend.pdf_backend import PdfDocumentBackend from docling.datamodel.base_models import ( ConversionStatus, @@ -126,10 +129,10 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name. yield from page_batch def _build_document(self, conv_res: ConversionResult) -> ConversionResult: - if not isinstance(conv_res.input._backend, PdfDocumentBackend): + if not isinstance(conv_res.input._backend, PaginatedDocumentBackend): raise RuntimeError( - f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a PDF backend. " - f"Can not convert this with a PDF pipeline. " + f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a paginated backend. " + f"Can not convert this with a paginated PDF pipeline. " f"Please check your format configuration on DocumentConverter." 
) # conv_res.status = ConversionStatus.FAILURE diff --git a/tests/test_backend_mets_gbs.py b/tests/test_backend_mets_gbs.py new file mode 100644 index 00000000..c8be4327 --- /dev/null +++ b/tests/test_backend_mets_gbs.py @@ -0,0 +1,89 @@ +from pathlib import Path + +import pytest + +from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend, MetsGbsPageBackend +from docling.datamodel.base_models import BoundingBox, InputFormat +from docling.datamodel.document import InputDocument + + +@pytest.fixture +def test_doc_path(): + return Path("/Users/dol/Downloads/32044009881525.tar.gz") + + +def _get_backend(pdf_doc): + in_doc = InputDocument( + path_or_stream=pdf_doc, + format=InputFormat.PDF, + backend=MetsGbsDocumentBackend, + ) + + doc_backend = in_doc._backend + return doc_backend + + +def test_process_pages(test_doc_path): + doc_backend: MetsGbsDocumentBackend = _get_backend(test_doc_path) + + for page_index in range(doc_backend.page_count()): + page_backend: MetsGbsPageBackend = doc_backend.load_page(page_index) + list(page_backend.get_text_cells()) + + # Clean up page backend after each iteration + page_backend.unload() + + # Explicitly clean up document backend to prevent race conditions in CI + doc_backend.unload() + + +def test_get_text_from_rect(test_doc_path): + doc_backend: MetsGbsDocumentBackend = _get_backend(test_doc_path) + page_backend: MetsGbsPageBackend = doc_backend.load_page(9) + + # Get the title text of the DocLayNet paper + textpiece = page_backend.get_text_in_rect( + bbox=BoundingBox(l=275, t=263, r=1388, b=311) + ) + ref = "recently become prevalent that he who speaks" + + assert textpiece.strip() == ref + + # Explicitly clean up resources + page_backend.unload() + doc_backend.unload() + + +def test_crop_page_image(test_doc_path): + doc_backend: MetsGbsDocumentBackend = _get_backend(test_doc_path) + page_backend: MetsGbsPageBackend = doc_backend.load_page(9) + + page_backend.get_page_image( + scale=2, cropbox=BoundingBox(l=270, t=587, r=1385, b=1995) + ) + # im.show() + + # Explicitly clean up resources + page_backend.unload() + doc_backend.unload() + + +def test_crop_page_image_jp2(test_doc_path): + doc_backend: MetsGbsDocumentBackend = _get_backend(test_doc_path) + page_backend: MetsGbsPageBackend = doc_backend.load_page(1) + + page_backend.get_page_image(scale=2, cropbox=BoundingBox(l=160, t=29, r=732, b=173)) + # im.show() + + # Explicitly clean up resources + page_backend.unload() + doc_backend.unload() + + +def test_num_pages(test_doc_path): + doc_backend: MetsGbsDocumentBackend = _get_backend(test_doc_path) + assert doc_backend.is_valid() + assert doc_backend.page_count() == 276 + + # Explicitly clean up resources to prevent race conditions in CI + doc_backend.unload() From 7c3f9b7ab129b415e10aaac981affe68112f5737 Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Fri, 25 Jul 2025 11:55:48 +0200 Subject: [PATCH 2/8] Fixes for cell indexing Signed-off-by: Christoph Auer --- docling/backend/mets_gbs_backend.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/docling/backend/mets_gbs_backend.py b/docling/backend/mets_gbs_backend.py index f2a7d2b5..3c06a872 100644 --- a/docling/backend/mets_gbs_backend.py +++ b/docling/backend/mets_gbs_backend.py @@ -326,7 +326,9 @@ class MetsGbsDocumentBackend(PaginatedDocumentBackend): im = im.convert("RGB") # Extract all ocrx_word spans - for word in ocr_root.xpath("//x:span[@class='ocrx_word']", namespaces=ns): + for ix, word in enumerate( + 
ocr_root.xpath("//x:span[@class='ocrx_word']", namespaces=ns) + ): text = "".join(word.itertext()).strip() title = word.attrib.get("title", "") rect = _extract_rect(title) @@ -334,13 +336,20 @@ class MetsGbsDocumentBackend(PaginatedDocumentBackend): if rect: word_cells.append( TextCell( - text=text, orig=text, rect=rect, from_ocr=True, confidence=conf + index=ix, + text=text, + orig=text, + rect=rect, + from_ocr=True, + confidence=conf, ) ) # Extract all ocr_line spans # line: etree._Element - for line in ocr_root.xpath("//x:span[@class='ocr_line']", namespaces=ns): + for ix, line in enumerate( + ocr_root.xpath("//x:span[@class='ocr_line']", namespaces=ns) + ): text = "".join(line.itertext()).strip() title = line.attrib.get("title", "") rect = _extract_rect(title) @@ -348,7 +357,12 @@ class MetsGbsDocumentBackend(PaginatedDocumentBackend): if rect: line_cells.append( TextCell( - text=text, orig=text, rect=rect, from_ocr=True, confidence=conf + index=ix, + text=text, + orig=text, + rect=rect, + from_ocr=True, + confidence=conf, ) ) From 3e4093db58e4f56a811c6ea69a4e9549be7d7d71 Mon Sep 17 00:00:00 2001 From: Michele Dolfi Date: Fri, 25 Jul 2025 15:10:05 +0200 Subject: [PATCH 3/8] use HTMLParser and add options from CLI Signed-off-by: Michele Dolfi --- docling/backend/mets_gbs_backend.py | 14 +++++--------- docling/cli/main.py | 10 ++++++++++ docling/datamodel/document.py | 2 +- tests/test_backend_mets_gbs.py | 20 ++++---------------- 4 files changed, 20 insertions(+), 26 deletions(-) diff --git a/docling/backend/mets_gbs_backend.py b/docling/backend/mets_gbs_backend.py index 3c06a872..4ed100b3 100644 --- a/docling/backend/mets_gbs_backend.py +++ b/docling/backend/mets_gbs_backend.py @@ -305,13 +305,13 @@ class MetsGbsDocumentBackend(PaginatedDocumentBackend): ocr_file = self._tar.extractfile(ocr_info.path) assert ocr_file is not None ocr_content = ocr_file.read() - ocr_root: etree._Element = etree.fromstring(ocr_content) + parser = etree.HTMLParser() + ocr_root: etree._Element = etree.fromstring(ocr_content, parser=parser) line_cells: List[TextCell] = [] word_cells: List[TextCell] = [] - ns = {"x": "http://www.w3.org/1999/xhtml"} - page_div = ocr_root.xpath("//x:div[@class='ocr_page']", namespaces=ns) + page_div = ocr_root.xpath("//div[@class='ocr_page']") size = Size(width=im.size[0], height=im.size[1]) if page_div: @@ -326,9 +326,7 @@ class MetsGbsDocumentBackend(PaginatedDocumentBackend): im = im.convert("RGB") # Extract all ocrx_word spans - for ix, word in enumerate( - ocr_root.xpath("//x:span[@class='ocrx_word']", namespaces=ns) - ): + for ix, word in enumerate(ocr_root.xpath("//span[@class='ocrx_word']")): text = "".join(word.itertext()).strip() title = word.attrib.get("title", "") rect = _extract_rect(title) @@ -347,9 +345,7 @@ class MetsGbsDocumentBackend(PaginatedDocumentBackend): # Extract all ocr_line spans # line: etree._Element - for ix, line in enumerate( - ocr_root.xpath("//x:span[@class='ocr_line']", namespaces=ns) - ): + for ix, line in enumerate(ocr_root.xpath("//span[@class='ocr_line']")): text = "".join(line.itertext()).strip() title = line.attrib.get("title", "") rect = _extract_rect(title) diff --git a/docling/cli/main.py b/docling/cli/main.py index ae275ea9..8ed127a6 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -26,6 +26,7 @@ from rich.console import Console from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend from 
docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend +from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend from docling.backend.pdf_backend import PdfDocumentBackend from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions @@ -601,9 +602,18 @@ def convert( # noqa: C901 backend=backend, # pdf_backend ) + # METS GBS options + mets_gbs_options = pipeline_options.model_copy() + mets_gbs_options.do_ocr = False + mets_gbs_format_option = PdfFormatOption( + pipeline_options=mets_gbs_options, + backend=MetsGbsDocumentBackend, + ) + format_options = { InputFormat.PDF: pdf_format_option, InputFormat.IMAGE: pdf_format_option, + InputFormat.XML_METS_GBS: mets_gbs_format_option, } elif pipeline == ProcessingPipeline.VLM: diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index a9a3c9b1..b1ca0372 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -482,7 +482,7 @@ class _DocumentConversionInput(BaseModel): if member.name.endswith(".xml"): file = tar.extractfile(member) if file is not None: - content_str = file.read().decode() + content_str = file.read().decode(errors="ignore") if "http://www.loc.gov/METS/" in content_str: return "application/mets+xml" return None diff --git a/tests/test_backend_mets_gbs.py b/tests/test_backend_mets_gbs.py index c8be4327..894579ec 100644 --- a/tests/test_backend_mets_gbs.py +++ b/tests/test_backend_mets_gbs.py @@ -9,7 +9,7 @@ from docling.datamodel.document import InputDocument @pytest.fixture def test_doc_path(): - return Path("/Users/dol/Downloads/32044009881525.tar.gz") + return Path("/Users/dol/Downloads/32044009881525_select.tar.gz") def _get_backend(pdf_doc): @@ -39,7 +39,7 @@ def test_process_pages(test_doc_path): def test_get_text_from_rect(test_doc_path): doc_backend: MetsGbsDocumentBackend = _get_backend(test_doc_path) - page_backend: MetsGbsPageBackend = doc_backend.load_page(9) + page_backend: MetsGbsPageBackend = doc_backend.load_page(0) # Get the title text of the DocLayNet paper textpiece = page_backend.get_text_in_rect( @@ -56,7 +56,7 @@ def test_get_text_from_rect(test_doc_path): def test_crop_page_image(test_doc_path): doc_backend: MetsGbsDocumentBackend = _get_backend(test_doc_path) - page_backend: MetsGbsPageBackend = doc_backend.load_page(9) + page_backend: MetsGbsPageBackend = doc_backend.load_page(0) page_backend.get_page_image( scale=2, cropbox=BoundingBox(l=270, t=587, r=1385, b=1995) @@ -68,22 +68,10 @@ def test_crop_page_image(test_doc_path): doc_backend.unload() -def test_crop_page_image_jp2(test_doc_path): - doc_backend: MetsGbsDocumentBackend = _get_backend(test_doc_path) - page_backend: MetsGbsPageBackend = doc_backend.load_page(1) - - page_backend.get_page_image(scale=2, cropbox=BoundingBox(l=160, t=29, r=732, b=173)) - # im.show() - - # Explicitly clean up resources - page_backend.unload() - doc_backend.unload() - - def test_num_pages(test_doc_path): doc_backend: MetsGbsDocumentBackend = _get_backend(test_doc_path) assert doc_backend.is_valid() - assert doc_backend.page_count() == 276 + assert doc_backend.page_count() == 3 # Explicitly clean up resources to prevent race conditions in CI doc_backend.unload() From bbb735d2de2f32eef3324565471f0b43955a703d Mon Sep 17 00:00:00 2001 From: Michele Dolfi Date: Fri, 25 Jul 2025 15:13:10 +0200 Subject: [PATCH 4/8] fix typing and unloading Signed-off-by: Michele Dolfi --- docling/backend/mets_gbs_backend.py 
| 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/docling/backend/mets_gbs_backend.py b/docling/backend/mets_gbs_backend.py index 4ed100b3..29e216a2 100644 --- a/docling/backend/mets_gbs_backend.py +++ b/docling/backend/mets_gbs_backend.py @@ -133,9 +133,11 @@ class MetsGbsPageBackend(PdfPageBackend): width=self._dpage.dimension.width, height=self._dpage.dimension.height ) - def unload(self): - self._ppage = None - self._dpage = None + def unload(self) -> None: + if hasattr(self, "_im"): + delattr(self, "_im") + if hasattr(self, "_dpage"): + delattr(self, "_dpage") class _UseType(str, Enum): @@ -392,6 +394,6 @@ class MetsGbsDocumentBackend(PaginatedDocumentBackend): def supports_pagination(cls) -> bool: return True - def unload(self): + def unload(self) -> None: super().unload() self._tar.close() From 79c59cb2b0a8bc25d02003bddbbe3c8955b43b6f Mon Sep 17 00:00:00 2001 From: Michele Dolfi Date: Fri, 25 Jul 2025 15:15:05 +0200 Subject: [PATCH 5/8] restore guess format Signed-off-by: Michele Dolfi --- docling/datamodel/document.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index b1ca0372..b9832346 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -325,15 +325,15 @@ class _DocumentConversionInput(BaseModel): formats = MimeTypeToFormat.get(mime, []) _log.info(f"detected formats: {formats}") - input_format: Optional[InputFormat] = None - if len(formats) == 1: - input_format = formats[0] - - if content: - input_format = _DocumentConversionInput._guess_from_content( - content, mime, formats - ) - return input_format + if formats: + if len(formats) == 1 and mime not in ("text/plain"): + return formats[0] + else: # ambiguity in formats + return _DocumentConversionInput._guess_from_content( + content, mime, formats + ) + else: + return None @staticmethod def _guess_from_content( @@ -342,9 +342,6 @@ class _DocumentConversionInput(BaseModel): """Guess the input format of a document by checking part of its content.""" input_format: Optional[InputFormat] = None - if len(formats) == 1: - input_format = formats[0] - if mime == "application/xml": content_str = content.decode("utf-8") match_doctype = re.search(r"]+>", content_str) From 46b904e059231105bf6cfdedc82b3a940d8eb186 Mon Sep 17 00:00:00 2001 From: Michele Dolfi Date: Fri, 25 Jul 2025 15:16:41 +0200 Subject: [PATCH 6/8] rename inputformat Signed-off-by: Michele Dolfi --- docling/backend/mets_gbs_backend.py | 2 +- docling/cli/main.py | 2 +- docling/datamodel/base_models.py | 6 +++--- docling/document_converter.py | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docling/backend/mets_gbs_backend.py b/docling/backend/mets_gbs_backend.py index 29e216a2..b3ca2fb2 100644 --- a/docling/backend/mets_gbs_backend.py +++ b/docling/backend/mets_gbs_backend.py @@ -388,7 +388,7 @@ class MetsGbsDocumentBackend(PaginatedDocumentBackend): @classmethod def supported_formats(cls) -> Set[InputFormat]: - return {InputFormat.XML_METS_GBS} + return {InputFormat.METS_GBS} @classmethod def supports_pagination(cls) -> bool: diff --git a/docling/cli/main.py b/docling/cli/main.py index 8ed127a6..db9dfc00 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -613,7 +613,7 @@ def convert( # noqa: C901 format_options = { InputFormat.PDF: pdf_format_option, InputFormat.IMAGE: pdf_format_option, - InputFormat.XML_METS_GBS: mets_gbs_format_option, + InputFormat.METS_GBS: mets_gbs_format_option, 
} elif pipeline == ProcessingPipeline.VLM: diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py index 6825e125..8edefe38 100644 --- a/docling/datamodel/base_models.py +++ b/docling/datamodel/base_models.py @@ -56,7 +56,7 @@ class InputFormat(str, Enum): XLSX = "xlsx" XML_USPTO = "xml_uspto" XML_JATS = "xml_jats" - XML_METS_GBS = "xml_mets_gbs" + METS_GBS = "xml_mets_gbs" JSON_DOCLING = "json_docling" AUDIO = "audio" @@ -82,7 +82,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = { InputFormat.CSV: ["csv"], InputFormat.XLSX: ["xlsx", "xlsm"], InputFormat.XML_USPTO: ["xml", "txt"], - InputFormat.XML_METS_GBS: ["tar.gz"], + InputFormat.METS_GBS: ["tar.gz"], InputFormat.JSON_DOCLING: ["json"], InputFormat.AUDIO: ["wav", "mp3"], } @@ -115,7 +115,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = { "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" ], InputFormat.XML_USPTO: ["application/xml", "text/plain"], - InputFormat.XML_METS_GBS: ["application/mets+xml"], + InputFormat.METS_GBS: ["application/mets+xml"], InputFormat.JSON_DOCLING: ["application/json"], InputFormat.AUDIO: ["audio/x-wav", "audio/mpeg", "audio/wav", "audio/mp3"], } diff --git a/docling/document_converter.py b/docling/document_converter.py index fea14f38..855a5caa 100644 --- a/docling/document_converter.py +++ b/docling/document_converter.py @@ -157,7 +157,7 @@ def _get_default_option(format: InputFormat) -> FormatOption: InputFormat.XML_JATS: FormatOption( pipeline_cls=SimplePipeline, backend=JatsDocumentBackend ), - InputFormat.XML_METS_GBS: FormatOption( + InputFormat.METS_GBS: FormatOption( pipeline_cls=StandardPdfPipeline, backend=MetsGbsDocumentBackend ), InputFormat.IMAGE: FormatOption( From 9da610e95b8f47ec516ea224226792761366fb21 Mon Sep 17 00:00:00 2001 From: Michele Dolfi Date: Fri, 25 Jul 2025 15:19:53 +0200 Subject: [PATCH 7/8] use PdfDocumentBackend Signed-off-by: Michele Dolfi --- docling/backend/mets_gbs_backend.py | 4 ++-- docling/backend/pdf_backend.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docling/backend/mets_gbs_backend.py b/docling/backend/mets_gbs_backend.py index b3ca2fb2..09ac84ae 100644 --- a/docling/backend/mets_gbs_backend.py +++ b/docling/backend/mets_gbs_backend.py @@ -22,7 +22,7 @@ from PIL import Image from PIL.Image import Image as PILImage from docling.backend.abstract_backend import PaginatedDocumentBackend -from docling.backend.pdf_backend import PdfPageBackend +from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend from docling.datamodel.base_models import InputFormat if TYPE_CHECKING: @@ -194,7 +194,7 @@ def _extract_confidence(title_str) -> float: return 1 -class MetsGbsDocumentBackend(PaginatedDocumentBackend): +class MetsGbsDocumentBackend(PdfDocumentBackend): def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): super().__init__(in_doc, path_or_stream) diff --git a/docling/backend/pdf_backend.py b/docling/backend/pdf_backend.py index 1b0d612e..e14b147a 100644 --- a/docling/backend/pdf_backend.py +++ b/docling/backend/pdf_backend.py @@ -84,9 +84,9 @@ class PdfDocumentBackend(PaginatedDocumentBackend): buf.seek(0) self.path_or_stream = buf - else: + elif self.input_format not in self.supported_formats(): raise RuntimeError( - f"Incompatible file format {self.input_format} was passed to a PdfDocumentBackend." + f"Incompatible file format {self.input_format} was passed to a PdfDocumentBackend. Valid format are {','.join(self.supported_formats())}." 
) @abstractmethod @@ -99,7 +99,7 @@ class PdfDocumentBackend(PaginatedDocumentBackend): @classmethod def supported_formats(cls) -> Set[InputFormat]: - return {InputFormat.PDF} + return {InputFormat.PDF, InputFormat.IMAGE} @classmethod def supports_pagination(cls) -> bool: From 5f5a3cd9148561cdf3b2b1048fbc6d24b261b728 Mon Sep 17 00:00:00 2001 From: Michele Dolfi Date: Fri, 25 Jul 2025 15:36:09 +0200 Subject: [PATCH 8/8] use test file from test folder (still missing) Signed-off-by: Michele Dolfi --- tests/test_backend_mets_gbs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_backend_mets_gbs.py b/tests/test_backend_mets_gbs.py index 894579ec..7003e957 100644 --- a/tests/test_backend_mets_gbs.py +++ b/tests/test_backend_mets_gbs.py @@ -9,7 +9,7 @@ from docling.datamodel.document import InputDocument @pytest.fixture def test_doc_path(): - return Path("/Users/dol/Downloads/32044009881525_select.tar.gz") + return Path("tests/data/mets_gbs/32044009881525_select.tar.gz") def _get_backend(pdf_doc):
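
A minimal usage sketch for the new backend, assuming the full series is applied. It mirrors the CLI wiring from patch 3/8 (OCR disabled, since text cells come from the bundled hOCR files) and reuses the fixture path introduced in patch 8/8, which that patch notes is still missing from the repository. PdfPipelineOptions, DocumentConverter and export_to_markdown are pre-existing docling APIs not modified by this series; passing format_options explicitly is optional, because patches 1/8 and 6/8 already register a default FormatOption (StandardPdfPipeline + MetsGbsDocumentBackend) for InputFormat.METS_GBS.

    from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend
    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import PdfPipelineOptions
    from docling.document_converter import DocumentConverter, PdfFormatOption

    # Text is taken from the hOCR files inside the GBS tarball, so OCR is not needed.
    pipeline_options = PdfPipelineOptions(do_ocr=False)

    converter = DocumentConverter(
        format_options={
            InputFormat.METS_GBS: PdfFormatOption(
                pipeline_options=pipeline_options,
                backend=MetsGbsDocumentBackend,
            )
        }
    )

    # Fixture path from patch 8/8; any .tar.gz with per-page image + coordOCR entries should work.
    result = converter.convert("tests/data/mets_gbs/32044009881525_select.tar.gz")
    print(result.document.export_to_markdown())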