From 7bc1c1ac3d6aa17506e26ff79dd2f0028f02f461 Mon Sep 17 00:00:00 2001 From: Michele Dolfi Date: Thu, 24 Jul 2025 19:18:29 +0200 Subject: [PATCH 1/8] add backend for METS with Google Books profile Signed-off-by: Michele Dolfi --- docling/backend/mets_gbs_backend.py | 387 ++++++++++++++++++++++++++++ docling/datamodel/base_models.py | 3 + docling/datamodel/document.py | 47 +++- docling/document_converter.py | 4 + docling/pipeline/base_pipeline.py | 11 +- tests/test_backend_mets_gbs.py | 89 +++++++ 6 files changed, 528 insertions(+), 13 deletions(-) create mode 100644 docling/backend/mets_gbs_backend.py create mode 100644 tests/test_backend_mets_gbs.py diff --git a/docling/backend/mets_gbs_backend.py b/docling/backend/mets_gbs_backend.py new file mode 100644 index 00000000..f2a7d2b5 --- /dev/null +++ b/docling/backend/mets_gbs_backend.py @@ -0,0 +1,387 @@ +"""Backend for GBS Google Books schema.""" + +import logging +import tarfile +from collections.abc import Iterable +from dataclasses import dataclass +from enum import Enum +from io import BytesIO +from pathlib import Path +from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple, Union + +from docling_core.types.doc import BoundingBox, CoordOrigin, Size +from docling_core.types.doc.page import ( + BoundingRectangle, + PdfPageBoundaryType, + PdfPageGeometry, + SegmentedPdfPage, + TextCell, +) +from lxml import etree +from PIL import Image +from PIL.Image import Image as PILImage + +from docling.backend.abstract_backend import PaginatedDocumentBackend +from docling.backend.pdf_backend import PdfPageBackend +from docling.datamodel.base_models import InputFormat + +if TYPE_CHECKING: + from docling.datamodel.document import InputDocument + +_log = logging.getLogger(__name__) + + +def _get_pdf_page_geometry( + size: Size, +) -> PdfPageGeometry: + boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX + + bbox_tuple = (0, 0, size.width, size.height) + bbox = BoundingBox.from_tuple(bbox_tuple, CoordOrigin.TOPLEFT) + + return PdfPageGeometry( + angle=0.0, + rect=BoundingRectangle.from_bounding_box(bbox), + boundary_type=boundary_type, + art_bbox=bbox, + bleed_bbox=bbox, + crop_bbox=bbox, + media_bbox=bbox, + trim_bbox=bbox, + ) + + +class MetsGbsPageBackend(PdfPageBackend): + def __init__(self, parsed_page: SegmentedPdfPage, page_im: PILImage): + self._im = page_im + self._dpage = parsed_page + self.valid = parsed_page is not None + + def is_valid(self) -> bool: + return self.valid + + def get_text_in_rect(self, bbox: BoundingBox) -> str: + # Find intersecting cells on the page + text_piece = "" + page_size = self.get_size() + + scale = ( + 1 # FIX - Replace with param in get_text_in_rect across backends (optional) + ) + + for i, cell in enumerate(self._dpage.textline_cells): + cell_bbox = ( + cell.rect.to_bounding_box() + .to_top_left_origin(page_height=page_size.height) + .scaled(scale) + ) + + overlap_frac = cell_bbox.intersection_over_self(bbox) + + if overlap_frac > 0.5: + if len(text_piece) > 0: + text_piece += " " + text_piece += cell.text + + return text_piece + + def get_segmented_page(self) -> Optional[SegmentedPdfPage]: + return self._dpage + + def get_text_cells(self) -> Iterable[TextCell]: + return self._dpage.textline_cells + + def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]: + AREA_THRESHOLD = 0 # 32 * 32 + + images = self._dpage.bitmap_resources + + for img in images: + cropbox = img.rect.to_bounding_box().to_top_left_origin( + self.get_size().height + ) + + if cropbox.area() > 
AREA_THRESHOLD: + cropbox = cropbox.scaled(scale=scale) + + yield cropbox + + def get_page_image( + self, scale: float = 1, cropbox: Optional[BoundingBox] = None + ) -> Image.Image: + page_size = self.get_size() + assert ( + page_size.width == self._im.size[0] and page_size.height == self._im.size[1] + ) + + if not cropbox: + cropbox = BoundingBox( + l=0, + r=page_size.width, + t=0, + b=page_size.height, + coord_origin=CoordOrigin.TOPLEFT, + ) + + image = self._im.resize( + size=(round(page_size.width * scale), round(page_size.height * scale)) + ).crop(cropbox.scaled(scale=scale).as_tuple()) + return image + + def get_size(self) -> Size: + return Size( + width=self._dpage.dimension.width, height=self._dpage.dimension.height + ) + + def unload(self): + self._ppage = None + self._dpage = None + + +class _UseType(str, Enum): + IMAGE = "image" + OCR = "OCR" + COORD_OCR = "coordOCR" + + +@dataclass +class _FileInfo: + file_id: str + mimetype: str + path: str + use: _UseType + + +@dataclass +class _PageFiles: + image: Optional[_FileInfo] = None + ocr: Optional[_FileInfo] = None + coordOCR: Optional[_FileInfo] = None + + +def _extract_rect(title_str: str) -> Optional[BoundingRectangle]: + """ + Extracts bbox from title string like 'bbox 279 177 306 214;x_wconf 97' + """ + parts = title_str.split(";") + for part in parts: + part = part.strip() + if part.startswith("bbox "): + try: + coords = part.split()[1:] + rect = BoundingRectangle.from_bounding_box( + bbox=BoundingBox.from_tuple( + tuple(map(int, coords)), origin=CoordOrigin.TOPLEFT + ) + ) + return rect + except Exception: + return None + return None + + +def _extract_confidence(title_str) -> float: + """Extracts x_wconf (OCR confidence) value from title string.""" + for part in title_str.split(";"): + part = part.strip() + if part.startswith("x_wconf"): + try: + return float(part.split()[1]) / 100.0 + except Exception: + return 1 + return 1 + + +class MetsGbsDocumentBackend(PaginatedDocumentBackend): + def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): + super().__init__(in_doc, path_or_stream) + + self._tar: tarfile.TarFile = ( + tarfile.open(name=self.path_or_stream, mode="r:gz") + if isinstance(self.path_or_stream, Path) + else tarfile.open(fileobj=self.path_or_stream, mode="r:gz") + ) + self.root_mets: Optional[etree._Element] = None + self.page_map: Dict[int, _PageFiles] = {} + + for member in self._tar.getmembers(): + if member.name.endswith(".xml"): + file = self._tar.extractfile(member) + if file is not None: + content = file.read() + self.root_mets = self._validate_mets_xml(content) + if self.root_mets is not None: + break + + if self.root_mets is None: + raise RuntimeError( + f"METS GBS backend could not load document {self.document_hash}." 
+ ) + + ns = { + "mets": "http://www.loc.gov/METS/", + "xlink": "http://www.w3.org/1999/xlink", + "xsi": "http://www.w3.org/2001/XMLSchema-instance", + "gbs": "http://books.google.com/gbs", + "premis": "info:lc/xmlns/premis-v2", + "marc": "http://www.loc.gov/MARC21/slim", + } + + file_info_by_id: Dict[str, _FileInfo] = {} + + for filegrp in self.root_mets.xpath(".//mets:fileGrp", namespaces=ns): + use_raw = filegrp.get("USE") + try: + use = _UseType(use_raw) + except ValueError: + continue # Ignore unknown USE types + + for file_elem in filegrp.xpath("./mets:file", namespaces=ns): + file_id = file_elem.get("ID") + mimetype = file_elem.get("MIMETYPE") + flocat_elem = file_elem.find("mets:FLocat", namespaces=ns) + href = ( + flocat_elem.get("{http://www.w3.org/1999/xlink}href") + if flocat_elem is not None + else None + ) + if href is None: + continue + + file_info_by_id[file_id] = _FileInfo( + file_id=file_id, mimetype=mimetype, path=href, use=use + ) + + USE_TO_ATTR = { + _UseType.IMAGE: "image", + _UseType.OCR: "ocr", + _UseType.COORD_OCR: "coordOCR", + } + + for div in self.root_mets.xpath('.//mets:div[@TYPE="page"]', namespaces=ns): + order_str = div.get("ORDER") + if not order_str: + continue + try: + page_no = int(order_str) - 1 # make 0-index pages + except ValueError: + continue + + page_files = _PageFiles() + + for fptr in div.xpath("./mets:fptr", namespaces=ns): + file_id = fptr.get("FILEID") + file_info = file_info_by_id.get(file_id) + + if file_info: + attr = USE_TO_ATTR.get(file_info.use) + if attr: + setattr(page_files, attr, file_info) + + self.page_map[page_no] = page_files + + def _validate_mets_xml(self, xml_string) -> Optional[etree._Element]: + root: etree._Element = etree.fromstring(xml_string) + if ( + root.tag == "{http://www.loc.gov/METS/}mets" + and root.get("PROFILE") == "gbs" + ): + return root + + _log.warning(f"The root element is not with PROFILE='gbs': {root}") + return None + + def _parse_page(self, page_no: int) -> Tuple[SegmentedPdfPage, PILImage]: + # TODO: use better fallbacks... 
+ image_info = self.page_map[page_no].image + assert image_info is not None + ocr_info = self.page_map[page_no].coordOCR + assert ocr_info is not None + + image_file = self._tar.extractfile(image_info.path) + assert image_file is not None + buf = BytesIO(image_file.read()) + im: PILImage = Image.open(buf) + ocr_file = self._tar.extractfile(ocr_info.path) + assert ocr_file is not None + ocr_content = ocr_file.read() + ocr_root: etree._Element = etree.fromstring(ocr_content) + + line_cells: List[TextCell] = [] + word_cells: List[TextCell] = [] + + ns = {"x": "http://www.w3.org/1999/xhtml"} + page_div = ocr_root.xpath("//x:div[@class='ocr_page']", namespaces=ns) + + size = Size(width=im.size[0], height=im.size[1]) + if page_div: + title = page_div[0].attrib.get("title", "") + rect = _extract_rect(title) + if rect: + size = Size(width=rect.width, height=rect.height) + else: + _log.error(f"Could not find ocr_page for page {page_no}") + + im = im.resize(size=(round(size.width), round(size.height))) + im = im.convert("RGB") + + # Extract all ocrx_word spans + for word in ocr_root.xpath("//x:span[@class='ocrx_word']", namespaces=ns): + text = "".join(word.itertext()).strip() + title = word.attrib.get("title", "") + rect = _extract_rect(title) + conf = _extract_confidence(title) + if rect: + word_cells.append( + TextCell( + text=text, orig=text, rect=rect, from_ocr=True, confidence=conf + ) + ) + + # Extract all ocr_line spans + # line: etree._Element + for line in ocr_root.xpath("//x:span[@class='ocr_line']", namespaces=ns): + text = "".join(line.itertext()).strip() + title = line.attrib.get("title", "") + rect = _extract_rect(title) + conf = _extract_confidence(title) + if rect: + line_cells.append( + TextCell( + text=text, orig=text, rect=rect, from_ocr=True, confidence=conf + ) + ) + + page = SegmentedPdfPage( + dimension=_get_pdf_page_geometry(size), + textline_cells=line_cells, + char_cells=[], + word_cells=word_cells, + has_textlines=True, + has_words=True, + has_chars=False, + ) + return page, im + + def page_count(self) -> int: + return len(self.page_map) + + def load_page(self, page_no: int) -> MetsGbsPageBackend: + # TODO: is this thread-safe? 
+ page, im = self._parse_page(page_no) + return MetsGbsPageBackend(parsed_page=page, page_im=im) + + def is_valid(self) -> bool: + return self.root_mets is not None and self.page_count() > 0 + + @classmethod + def supported_formats(cls) -> Set[InputFormat]: + return {InputFormat.XML_METS_GBS} + + @classmethod + def supports_pagination(cls) -> bool: + return True + + def unload(self): + super().unload() + self._tar.close() diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py index c753ac60..6825e125 100644 --- a/docling/datamodel/base_models.py +++ b/docling/datamodel/base_models.py @@ -56,6 +56,7 @@ class InputFormat(str, Enum): XLSX = "xlsx" XML_USPTO = "xml_uspto" XML_JATS = "xml_jats" + XML_METS_GBS = "xml_mets_gbs" JSON_DOCLING = "json_docling" AUDIO = "audio" @@ -81,6 +82,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = { InputFormat.CSV: ["csv"], InputFormat.XLSX: ["xlsx", "xlsm"], InputFormat.XML_USPTO: ["xml", "txt"], + InputFormat.XML_METS_GBS: ["tar.gz"], InputFormat.JSON_DOCLING: ["json"], InputFormat.AUDIO: ["wav", "mp3"], } @@ -113,6 +115,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = { "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" ], InputFormat.XML_USPTO: ["application/xml", "text/plain"], + InputFormat.XML_METS_GBS: ["application/mets+xml"], InputFormat.JSON_DOCLING: ["application/json"], InputFormat.AUDIO: ["audio/x-wav", "audio/mpeg", "audio/wav", "audio/mp3"], } diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index 9f5cf82c..a9a3c9b1 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -1,6 +1,7 @@ import csv import logging import re +import tarfile from collections.abc import Iterable from enum import Enum from io import BytesIO @@ -314,21 +315,25 @@ class _DocumentConversionInput(BaseModel): elif objname.endswith(".pptx"): mime = "application/vnd.openxmlformats-officedocument.presentationml.presentation" + if mime is not None and mime.lower() == "application/gzip": + if detected_mime := _DocumentConversionInput._detect_mets_gbs(obj): + mime = detected_mime + mime = mime or _DocumentConversionInput._detect_html_xhtml(content) mime = mime or _DocumentConversionInput._detect_csv(content) mime = mime or "text/plain" formats = MimeTypeToFormat.get(mime, []) _log.info(f"detected formats: {formats}") - if formats: - if len(formats) == 1 and mime not in ("text/plain"): - return formats[0] - else: # ambiguity in formats - return _DocumentConversionInput._guess_from_content( - content, mime, formats - ) - else: - return None + input_format: Optional[InputFormat] = None + if len(formats) == 1: + input_format = formats[0] + + if content: + input_format = _DocumentConversionInput._guess_from_content( + content, mime, formats + ) + return input_format @staticmethod def _guess_from_content( @@ -337,6 +342,9 @@ class _DocumentConversionInput(BaseModel): """Guess the input format of a document by checking part of its content.""" input_format: Optional[InputFormat] = None + if len(formats) == 1: + input_format = formats[0] + if mime == "application/xml": content_str = content.decode("utf-8") match_doctype = re.search(r"]+>", content_str) @@ -457,3 +465,24 @@ class _DocumentConversionInput(BaseModel): return None return None + + @staticmethod + def _detect_mets_gbs( + obj: Union[Path, DocumentStream], + ) -> Optional[Literal["application/mets+xml"]]: + content = obj if isinstance(obj, Path) else obj.stream + tar: tarfile.TarFile + member: tarfile.TarInfo + with 
tarfile.open( + name=content if isinstance(content, Path) else None, + fileobj=content if isinstance(content, BytesIO) else None, + mode="r:gz", + ) as tar: + for member in tar.getmembers(): + if member.name.endswith(".xml"): + file = tar.extractfile(member) + if file is not None: + content_str = file.read().decode() + if "http://www.loc.gov/METS/" in content_str: + return "application/mets+xml" + return None diff --git a/docling/document_converter.py b/docling/document_converter.py index f3bcb89e..fea14f38 100644 --- a/docling/document_converter.py +++ b/docling/document_converter.py @@ -17,6 +17,7 @@ from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBacke from docling.backend.html_backend import HTMLDocumentBackend from docling.backend.json.docling_json_backend import DoclingJSONBackend from docling.backend.md_backend import MarkdownDocumentBackend +from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend from docling.backend.msexcel_backend import MsExcelDocumentBackend from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend from docling.backend.msword_backend import MsWordDocumentBackend @@ -156,6 +157,9 @@ def _get_default_option(format: InputFormat) -> FormatOption: InputFormat.XML_JATS: FormatOption( pipeline_cls=SimplePipeline, backend=JatsDocumentBackend ), + InputFormat.XML_METS_GBS: FormatOption( + pipeline_cls=StandardPdfPipeline, backend=MetsGbsDocumentBackend + ), InputFormat.IMAGE: FormatOption( pipeline_cls=StandardPdfPipeline, backend=DoclingParseV4DocumentBackend ), diff --git a/docling/pipeline/base_pipeline.py b/docling/pipeline/base_pipeline.py index 6944a355..5c289ad9 100644 --- a/docling/pipeline/base_pipeline.py +++ b/docling/pipeline/base_pipeline.py @@ -8,7 +8,10 @@ from typing import Any, Callable, List from docling_core.types.doc import NodeItem -from docling.backend.abstract_backend import AbstractDocumentBackend +from docling.backend.abstract_backend import ( + AbstractDocumentBackend, + PaginatedDocumentBackend, +) from docling.backend.pdf_backend import PdfDocumentBackend from docling.datamodel.base_models import ( ConversionStatus, @@ -126,10 +129,10 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name. yield from page_batch def _build_document(self, conv_res: ConversionResult) -> ConversionResult: - if not isinstance(conv_res.input._backend, PdfDocumentBackend): + if not isinstance(conv_res.input._backend, PaginatedDocumentBackend): raise RuntimeError( - f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a PDF backend. " - f"Can not convert this with a PDF pipeline. " + f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a paginated backend. " + f"Can not convert this with a paginated PDF pipeline. " f"Please check your format configuration on DocumentConverter." 
) # conv_res.status = ConversionStatus.FAILURE diff --git a/tests/test_backend_mets_gbs.py b/tests/test_backend_mets_gbs.py new file mode 100644 index 00000000..c8be4327 --- /dev/null +++ b/tests/test_backend_mets_gbs.py @@ -0,0 +1,89 @@ +from pathlib import Path + +import pytest + +from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend, MetsGbsPageBackend +from docling.datamodel.base_models import BoundingBox, InputFormat +from docling.datamodel.document import InputDocument + + +@pytest.fixture +def test_doc_path(): + return Path("/Users/dol/Downloads/32044009881525.tar.gz") + + +def _get_backend(pdf_doc): + in_doc = InputDocument( + path_or_stream=pdf_doc, + format=InputFormat.PDF, + backend=MetsGbsDocumentBackend, + ) + + doc_backend = in_doc._backend + return doc_backend + + +def test_process_pages(test_doc_path): + doc_backend: MetsGbsDocumentBackend = _get_backend(test_doc_path) + + for page_index in range(doc_backend.page_count()): + page_backend: MetsGbsPageBackend = doc_backend.load_page(page_index) + list(page_backend.get_text_cells()) + + # Clean up page backend after each iteration + page_backend.unload() + + # Explicitly clean up document backend to prevent race conditions in CI + doc_backend.unload() + + +def test_get_text_from_rect(test_doc_path): + doc_backend: MetsGbsDocumentBackend = _get_backend(test_doc_path) + page_backend: MetsGbsPageBackend = doc_backend.load_page(9) + + # Get the title text of the DocLayNet paper + textpiece = page_backend.get_text_in_rect( + bbox=BoundingBox(l=275, t=263, r=1388, b=311) + ) + ref = "recently become prevalent that he who speaks" + + assert textpiece.strip() == ref + + # Explicitly clean up resources + page_backend.unload() + doc_backend.unload() + + +def test_crop_page_image(test_doc_path): + doc_backend: MetsGbsDocumentBackend = _get_backend(test_doc_path) + page_backend: MetsGbsPageBackend = doc_backend.load_page(9) + + page_backend.get_page_image( + scale=2, cropbox=BoundingBox(l=270, t=587, r=1385, b=1995) + ) + # im.show() + + # Explicitly clean up resources + page_backend.unload() + doc_backend.unload() + + +def test_crop_page_image_jp2(test_doc_path): + doc_backend: MetsGbsDocumentBackend = _get_backend(test_doc_path) + page_backend: MetsGbsPageBackend = doc_backend.load_page(1) + + page_backend.get_page_image(scale=2, cropbox=BoundingBox(l=160, t=29, r=732, b=173)) + # im.show() + + # Explicitly clean up resources + page_backend.unload() + doc_backend.unload() + + +def test_num_pages(test_doc_path): + doc_backend: MetsGbsDocumentBackend = _get_backend(test_doc_path) + assert doc_backend.is_valid() + assert doc_backend.page_count() == 276 + + # Explicitly clean up resources to prevent race conditions in CI + doc_backend.unload() From 7c3f9b7ab129b415e10aaac981affe68112f5737 Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Fri, 25 Jul 2025 11:55:48 +0200 Subject: [PATCH 2/8] Fixes for cell indexing Signed-off-by: Christoph Auer --- docling/backend/mets_gbs_backend.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/docling/backend/mets_gbs_backend.py b/docling/backend/mets_gbs_backend.py index f2a7d2b5..3c06a872 100644 --- a/docling/backend/mets_gbs_backend.py +++ b/docling/backend/mets_gbs_backend.py @@ -326,7 +326,9 @@ class MetsGbsDocumentBackend(PaginatedDocumentBackend): im = im.convert("RGB") # Extract all ocrx_word spans - for word in ocr_root.xpath("//x:span[@class='ocrx_word']", namespaces=ns): + for ix, word in enumerate( + 
ocr_root.xpath("//x:span[@class='ocrx_word']", namespaces=ns) + ): text = "".join(word.itertext()).strip() title = word.attrib.get("title", "") rect = _extract_rect(title) @@ -334,13 +336,20 @@ class MetsGbsDocumentBackend(PaginatedDocumentBackend): if rect: word_cells.append( TextCell( - text=text, orig=text, rect=rect, from_ocr=True, confidence=conf + index=ix, + text=text, + orig=text, + rect=rect, + from_ocr=True, + confidence=conf, ) ) # Extract all ocr_line spans # line: etree._Element - for line in ocr_root.xpath("//x:span[@class='ocr_line']", namespaces=ns): + for ix, line in enumerate( + ocr_root.xpath("//x:span[@class='ocr_line']", namespaces=ns) + ): text = "".join(line.itertext()).strip() title = line.attrib.get("title", "") rect = _extract_rect(title) @@ -348,7 +357,12 @@ class MetsGbsDocumentBackend(PaginatedDocumentBackend): if rect: line_cells.append( TextCell( - text=text, orig=text, rect=rect, from_ocr=True, confidence=conf + index=ix, + text=text, + orig=text, + rect=rect, + from_ocr=True, + confidence=conf, ) ) From 3e4093db58e4f56a811c6ea69a4e9549be7d7d71 Mon Sep 17 00:00:00 2001 From: Michele Dolfi Date: Fri, 25 Jul 2025 15:10:05 +0200 Subject: [PATCH 3/8] use HTMLParser and add options from CLI Signed-off-by: Michele Dolfi --- docling/backend/mets_gbs_backend.py | 14 +++++--------- docling/cli/main.py | 10 ++++++++++ docling/datamodel/document.py | 2 +- tests/test_backend_mets_gbs.py | 20 ++++---------------- 4 files changed, 20 insertions(+), 26 deletions(-) diff --git a/docling/backend/mets_gbs_backend.py b/docling/backend/mets_gbs_backend.py index 3c06a872..4ed100b3 100644 --- a/docling/backend/mets_gbs_backend.py +++ b/docling/backend/mets_gbs_backend.py @@ -305,13 +305,13 @@ class MetsGbsDocumentBackend(PaginatedDocumentBackend): ocr_file = self._tar.extractfile(ocr_info.path) assert ocr_file is not None ocr_content = ocr_file.read() - ocr_root: etree._Element = etree.fromstring(ocr_content) + parser = etree.HTMLParser() + ocr_root: etree._Element = etree.fromstring(ocr_content, parser=parser) line_cells: List[TextCell] = [] word_cells: List[TextCell] = [] - ns = {"x": "http://www.w3.org/1999/xhtml"} - page_div = ocr_root.xpath("//x:div[@class='ocr_page']", namespaces=ns) + page_div = ocr_root.xpath("//div[@class='ocr_page']") size = Size(width=im.size[0], height=im.size[1]) if page_div: @@ -326,9 +326,7 @@ class MetsGbsDocumentBackend(PaginatedDocumentBackend): im = im.convert("RGB") # Extract all ocrx_word spans - for ix, word in enumerate( - ocr_root.xpath("//x:span[@class='ocrx_word']", namespaces=ns) - ): + for ix, word in enumerate(ocr_root.xpath("//span[@class='ocrx_word']")): text = "".join(word.itertext()).strip() title = word.attrib.get("title", "") rect = _extract_rect(title) @@ -347,9 +345,7 @@ class MetsGbsDocumentBackend(PaginatedDocumentBackend): # Extract all ocr_line spans # line: etree._Element - for ix, line in enumerate( - ocr_root.xpath("//x:span[@class='ocr_line']", namespaces=ns) - ): + for ix, line in enumerate(ocr_root.xpath("//span[@class='ocr_line']")): text = "".join(line.itertext()).strip() title = line.attrib.get("title", "") rect = _extract_rect(title) diff --git a/docling/cli/main.py b/docling/cli/main.py index ae275ea9..8ed127a6 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -26,6 +26,7 @@ from rich.console import Console from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend from 
docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend +from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend from docling.backend.pdf_backend import PdfDocumentBackend from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions @@ -601,9 +602,18 @@ def convert( # noqa: C901 backend=backend, # pdf_backend ) + # METS GBS options + mets_gbs_options = pipeline_options.model_copy() + mets_gbs_options.do_ocr = False + mets_gbs_format_option = PdfFormatOption( + pipeline_options=mets_gbs_options, + backend=MetsGbsDocumentBackend, + ) + format_options = { InputFormat.PDF: pdf_format_option, InputFormat.IMAGE: pdf_format_option, + InputFormat.XML_METS_GBS: mets_gbs_format_option, } elif pipeline == ProcessingPipeline.VLM: diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index a9a3c9b1..b1ca0372 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -482,7 +482,7 @@ class _DocumentConversionInput(BaseModel): if member.name.endswith(".xml"): file = tar.extractfile(member) if file is not None: - content_str = file.read().decode() + content_str = file.read().decode(errors="ignore") if "http://www.loc.gov/METS/" in content_str: return "application/mets+xml" return None diff --git a/tests/test_backend_mets_gbs.py b/tests/test_backend_mets_gbs.py index c8be4327..894579ec 100644 --- a/tests/test_backend_mets_gbs.py +++ b/tests/test_backend_mets_gbs.py @@ -9,7 +9,7 @@ from docling.datamodel.document import InputDocument @pytest.fixture def test_doc_path(): - return Path("/Users/dol/Downloads/32044009881525.tar.gz") + return Path("/Users/dol/Downloads/32044009881525_select.tar.gz") def _get_backend(pdf_doc): @@ -39,7 +39,7 @@ def test_process_pages(test_doc_path): def test_get_text_from_rect(test_doc_path): doc_backend: MetsGbsDocumentBackend = _get_backend(test_doc_path) - page_backend: MetsGbsPageBackend = doc_backend.load_page(9) + page_backend: MetsGbsPageBackend = doc_backend.load_page(0) # Get the title text of the DocLayNet paper textpiece = page_backend.get_text_in_rect( @@ -56,7 +56,7 @@ def test_get_text_from_rect(test_doc_path): def test_crop_page_image(test_doc_path): doc_backend: MetsGbsDocumentBackend = _get_backend(test_doc_path) - page_backend: MetsGbsPageBackend = doc_backend.load_page(9) + page_backend: MetsGbsPageBackend = doc_backend.load_page(0) page_backend.get_page_image( scale=2, cropbox=BoundingBox(l=270, t=587, r=1385, b=1995) @@ -68,22 +68,10 @@ def test_crop_page_image(test_doc_path): doc_backend.unload() -def test_crop_page_image_jp2(test_doc_path): - doc_backend: MetsGbsDocumentBackend = _get_backend(test_doc_path) - page_backend: MetsGbsPageBackend = doc_backend.load_page(1) - - page_backend.get_page_image(scale=2, cropbox=BoundingBox(l=160, t=29, r=732, b=173)) - # im.show() - - # Explicitly clean up resources - page_backend.unload() - doc_backend.unload() - - def test_num_pages(test_doc_path): doc_backend: MetsGbsDocumentBackend = _get_backend(test_doc_path) assert doc_backend.is_valid() - assert doc_backend.page_count() == 276 + assert doc_backend.page_count() == 3 # Explicitly clean up resources to prevent race conditions in CI doc_backend.unload() From bbb735d2de2f32eef3324565471f0b43955a703d Mon Sep 17 00:00:00 2001 From: Michele Dolfi Date: Fri, 25 Jul 2025 15:13:10 +0200 Subject: [PATCH 4/8] fix typing and unloading Signed-off-by: Michele Dolfi --- docling/backend/mets_gbs_backend.py 
| 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/docling/backend/mets_gbs_backend.py b/docling/backend/mets_gbs_backend.py index 4ed100b3..29e216a2 100644 --- a/docling/backend/mets_gbs_backend.py +++ b/docling/backend/mets_gbs_backend.py @@ -133,9 +133,11 @@ class MetsGbsPageBackend(PdfPageBackend): width=self._dpage.dimension.width, height=self._dpage.dimension.height ) - def unload(self): - self._ppage = None - self._dpage = None + def unload(self) -> None: + if hasattr(self, "_im"): + delattr(self, "_im") + if hasattr(self, "_dpage"): + delattr(self, "_dpage") class _UseType(str, Enum): @@ -392,6 +394,6 @@ class MetsGbsDocumentBackend(PaginatedDocumentBackend): def supports_pagination(cls) -> bool: return True - def unload(self): + def unload(self) -> None: super().unload() self._tar.close() From 79c59cb2b0a8bc25d02003bddbbe3c8955b43b6f Mon Sep 17 00:00:00 2001 From: Michele Dolfi Date: Fri, 25 Jul 2025 15:15:05 +0200 Subject: [PATCH 5/8] restore guess format Signed-off-by: Michele Dolfi --- docling/datamodel/document.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index b1ca0372..b9832346 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -325,15 +325,15 @@ class _DocumentConversionInput(BaseModel): formats = MimeTypeToFormat.get(mime, []) _log.info(f"detected formats: {formats}") - input_format: Optional[InputFormat] = None - if len(formats) == 1: - input_format = formats[0] - - if content: - input_format = _DocumentConversionInput._guess_from_content( - content, mime, formats - ) - return input_format + if formats: + if len(formats) == 1 and mime not in ("text/plain"): + return formats[0] + else: # ambiguity in formats + return _DocumentConversionInput._guess_from_content( + content, mime, formats + ) + else: + return None @staticmethod def _guess_from_content( @@ -342,9 +342,6 @@ class _DocumentConversionInput(BaseModel): """Guess the input format of a document by checking part of its content.""" input_format: Optional[InputFormat] = None - if len(formats) == 1: - input_format = formats[0] - if mime == "application/xml": content_str = content.decode("utf-8") match_doctype = re.search(r"]+>", content_str) From 46b904e059231105bf6cfdedc82b3a940d8eb186 Mon Sep 17 00:00:00 2001 From: Michele Dolfi Date: Fri, 25 Jul 2025 15:16:41 +0200 Subject: [PATCH 6/8] rename inputformat Signed-off-by: Michele Dolfi --- docling/backend/mets_gbs_backend.py | 2 +- docling/cli/main.py | 2 +- docling/datamodel/base_models.py | 6 +++--- docling/document_converter.py | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docling/backend/mets_gbs_backend.py b/docling/backend/mets_gbs_backend.py index 29e216a2..b3ca2fb2 100644 --- a/docling/backend/mets_gbs_backend.py +++ b/docling/backend/mets_gbs_backend.py @@ -388,7 +388,7 @@ class MetsGbsDocumentBackend(PaginatedDocumentBackend): @classmethod def supported_formats(cls) -> Set[InputFormat]: - return {InputFormat.XML_METS_GBS} + return {InputFormat.METS_GBS} @classmethod def supports_pagination(cls) -> bool: diff --git a/docling/cli/main.py b/docling/cli/main.py index 8ed127a6..db9dfc00 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -613,7 +613,7 @@ def convert( # noqa: C901 format_options = { InputFormat.PDF: pdf_format_option, InputFormat.IMAGE: pdf_format_option, - InputFormat.XML_METS_GBS: mets_gbs_format_option, + InputFormat.METS_GBS: mets_gbs_format_option, 
} elif pipeline == ProcessingPipeline.VLM: diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py index 6825e125..8edefe38 100644 --- a/docling/datamodel/base_models.py +++ b/docling/datamodel/base_models.py @@ -56,7 +56,7 @@ class InputFormat(str, Enum): XLSX = "xlsx" XML_USPTO = "xml_uspto" XML_JATS = "xml_jats" - XML_METS_GBS = "xml_mets_gbs" + METS_GBS = "xml_mets_gbs" JSON_DOCLING = "json_docling" AUDIO = "audio" @@ -82,7 +82,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = { InputFormat.CSV: ["csv"], InputFormat.XLSX: ["xlsx", "xlsm"], InputFormat.XML_USPTO: ["xml", "txt"], - InputFormat.XML_METS_GBS: ["tar.gz"], + InputFormat.METS_GBS: ["tar.gz"], InputFormat.JSON_DOCLING: ["json"], InputFormat.AUDIO: ["wav", "mp3"], } @@ -115,7 +115,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = { "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" ], InputFormat.XML_USPTO: ["application/xml", "text/plain"], - InputFormat.XML_METS_GBS: ["application/mets+xml"], + InputFormat.METS_GBS: ["application/mets+xml"], InputFormat.JSON_DOCLING: ["application/json"], InputFormat.AUDIO: ["audio/x-wav", "audio/mpeg", "audio/wav", "audio/mp3"], } diff --git a/docling/document_converter.py b/docling/document_converter.py index fea14f38..855a5caa 100644 --- a/docling/document_converter.py +++ b/docling/document_converter.py @@ -157,7 +157,7 @@ def _get_default_option(format: InputFormat) -> FormatOption: InputFormat.XML_JATS: FormatOption( pipeline_cls=SimplePipeline, backend=JatsDocumentBackend ), - InputFormat.XML_METS_GBS: FormatOption( + InputFormat.METS_GBS: FormatOption( pipeline_cls=StandardPdfPipeline, backend=MetsGbsDocumentBackend ), InputFormat.IMAGE: FormatOption( From 9da610e95b8f47ec516ea224226792761366fb21 Mon Sep 17 00:00:00 2001 From: Michele Dolfi Date: Fri, 25 Jul 2025 15:19:53 +0200 Subject: [PATCH 7/8] use PdfDocumentBackend Signed-off-by: Michele Dolfi --- docling/backend/mets_gbs_backend.py | 4 ++-- docling/backend/pdf_backend.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docling/backend/mets_gbs_backend.py b/docling/backend/mets_gbs_backend.py index b3ca2fb2..09ac84ae 100644 --- a/docling/backend/mets_gbs_backend.py +++ b/docling/backend/mets_gbs_backend.py @@ -22,7 +22,7 @@ from PIL import Image from PIL.Image import Image as PILImage from docling.backend.abstract_backend import PaginatedDocumentBackend -from docling.backend.pdf_backend import PdfPageBackend +from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend from docling.datamodel.base_models import InputFormat if TYPE_CHECKING: @@ -194,7 +194,7 @@ def _extract_confidence(title_str) -> float: return 1 -class MetsGbsDocumentBackend(PaginatedDocumentBackend): +class MetsGbsDocumentBackend(PdfDocumentBackend): def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): super().__init__(in_doc, path_or_stream) diff --git a/docling/backend/pdf_backend.py b/docling/backend/pdf_backend.py index 1b0d612e..e14b147a 100644 --- a/docling/backend/pdf_backend.py +++ b/docling/backend/pdf_backend.py @@ -84,9 +84,9 @@ class PdfDocumentBackend(PaginatedDocumentBackend): buf.seek(0) self.path_or_stream = buf - else: + elif self.input_format not in self.supported_formats(): raise RuntimeError( - f"Incompatible file format {self.input_format} was passed to a PdfDocumentBackend." + f"Incompatible file format {self.input_format} was passed to a PdfDocumentBackend. Valid format are {','.join(self.supported_formats())}." 
) @abstractmethod @@ -99,7 +99,7 @@ class PdfDocumentBackend(PaginatedDocumentBackend): @classmethod def supported_formats(cls) -> Set[InputFormat]: - return {InputFormat.PDF} + return {InputFormat.PDF, InputFormat.IMAGE} @classmethod def supports_pagination(cls) -> bool: From 5f5a3cd9148561cdf3b2b1048fbc6d24b261b728 Mon Sep 17 00:00:00 2001 From: Michele Dolfi Date: Fri, 25 Jul 2025 15:36:09 +0200 Subject: [PATCH 8/8] use test file from test folder (still missing) Signed-off-by: Michele Dolfi --- tests/test_backend_mets_gbs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_backend_mets_gbs.py b/tests/test_backend_mets_gbs.py index 894579ec..7003e957 100644 --- a/tests/test_backend_mets_gbs.py +++ b/tests/test_backend_mets_gbs.py @@ -9,7 +9,7 @@ from docling.datamodel.document import InputDocument @pytest.fixture def test_doc_path(): - return Path("/Users/dol/Downloads/32044009881525_select.tar.gz") + return Path("tests/data/mets_gbs/32044009881525_select.tar.gz") def _get_backend(pdf_doc):
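
A minimal usage sketch for the new backend, assuming the full series is applied. It mirrors the CLI wiring from patch 3/8 (OCR disabled, since text cells come from the bundled hOCR files) and reuses the fixture path introduced in patch 8/8, which that patch notes is still missing from the repository. PdfPipelineOptions, DocumentConverter and export_to_markdown are pre-existing docling APIs not modified by this series; passing format_options explicitly is optional, because patches 1/8 and 6/8 already register a default FormatOption (StandardPdfPipeline + MetsGbsDocumentBackend) for InputFormat.METS_GBS.

    from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend
    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import PdfPipelineOptions
    from docling.document_converter import DocumentConverter, PdfFormatOption

    # Text is taken from the hOCR files inside the GBS tarball, so OCR is not needed.
    pipeline_options = PdfPipelineOptions(do_ocr=False)

    converter = DocumentConverter(
        format_options={
            InputFormat.METS_GBS: PdfFormatOption(
                pipeline_options=pipeline_options,
                backend=MetsGbsDocumentBackend,
            )
        }
    )

    # Fixture path from patch 8/8; any .tar.gz with per-page image + coordOCR entries should work.
    result = converter.convert("tests/data/mets_gbs/32044009881525_select.tar.gz")
    print(result.document.export_to_markdown())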