Add DoclingParseV3 backend implementation

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2025-07-27 04:24:45 +00:00 · 2025-02-06 20:29:44 +01:00 · 2025-02-06 20:29:44 +01:00 · 3f0e98b1ad
commit 3f0e98b1ad
parent ed74fe2ec0
3 changed files with 288 additions and 8 deletions
--- a/docling/backend/docling_parse_v3_backend.py
+++ b/docling/backend/docling_parse_v3_backend.py
@ -0,0 +1,198 @@
+import logging
+import random
+from io import BytesIO
+from pathlib import Path
+from typing import TYPE_CHECKING, Iterable, List, Optional, Union
+
+import pypdfium2 as pdfium
+from docling_core.types.doc import BoundingBox, CoordOrigin
+from docling_parse.document import PageBoundaryType, ParsedPdfPage
+from docling_parse.pdf_parser import DoclingPdfParser, PdfDocument
+from docling_parse.pdf_parsers import pdf_parser_v2
+from PIL import Image, ImageDraw
+from pypdfium2 import PdfPage
+
+from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
+from docling.datamodel.base_models import Cell, Size
+
+if TYPE_CHECKING:
+    from docling.datamodel.document import InputDocument
+
+_log = logging.getLogger(__name__)
+
+
+class DoclingParseV3PageBackend(PdfPageBackend):
+    def __init__(self, parsed_page: ParsedPdfPage, page_obj: PdfPage):
+        self._ppage = page_obj
+        self._dpage = parsed_page
+        self.valid = parsed_page is not None
+
+    def is_valid(self) -> bool:
+        return self.valid
+
+    def get_text_in_rect(self, bbox: BoundingBox) -> str:
+        # Find intersecting cells on the page
+        text_piece = ""
+        page_size = self.get_size()
+
+        scale = (
+            1  # FIX - Replace with param in get_text_in_rect across backends (optional)
+        )
+
+        for i, cell in enumerate(self._dpage.sanitized.cells):
+            cell_bbox = (
+                cell.rect.to_bounding_box()
+                .to_top_left_origin(page_height=page_size.height)
+                .scaled(scale)
+            )
+
+            overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
+
+            if overlap_frac > 0.5:
+                if len(text_piece) > 0:
+                    text_piece += " "
+                text_piece += cell.text
+
+        return text_piece
+
+    def get_text_cells(self) -> Iterable[Cell]:
+        cells: List[Cell] = []
+        cell_counter = 0
+
+        page_size = self.get_size()
+
+        for i, cell in enumerate(self._dpage.sanitized.cells):
+            cell_bbox = cell.rect.to_bounding_box()
+
+            if cell_bbox.r < cell_bbox.l:
+                cell_bbox.r, cell_bbox.l = cell_bbox.l, cell_bbox.r
+            if cell_bbox.b > cell_bbox.t:
+                cell_bbox.b, cell_bbox.t = cell_bbox.t, cell_bbox.b
+
+            text_piece = cell.text
+            cells.append(
+                Cell(
+                    id=cell_counter,
+                    text=text_piece,
+                    bbox=cell_bbox.to_top_left_origin(page_size.height),
+                )
+            )
+            cell_counter += 1
+
+        def draw_clusters_and_cells():
+            image = (
+                self.get_page_image()
+            )  # make new image to avoid drawing on the saved ones
+            draw = ImageDraw.Draw(image)
+            for c in cells:
+                x0, y0, x1, y1 = c.bbox.as_tuple()
+                cell_color = (
+                    random.randint(30, 140),
+                    random.randint(30, 140),
+                    random.randint(30, 140),
+                )
+                draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
+            image.show()
+
+        # draw_clusters_and_cells()
+
+        return cells
+
+    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
+        AREA_THRESHOLD = 0  # 32 * 32
+
+        images = self._dpage.sanitized.bitmap_resources
+
+        for img in images:
+            cropbox = img.rect.to_bounding_box().to_top_left_origin(
+                self.get_size().height
+            )
+
+            if cropbox.area() > AREA_THRESHOLD:
+                cropbox = cropbox.scaled(scale=scale)
+
+                yield cropbox
+
+    def get_page_image(
+        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
+    ) -> Image.Image:
+
+        page_size = self.get_size()
+
+        if not cropbox:
+            cropbox = BoundingBox(
+                l=0,
+                r=page_size.width,
+                t=0,
+                b=page_size.height,
+                coord_origin=CoordOrigin.TOPLEFT,
+            )
+            padbox = BoundingBox(
+                l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
+            )
+        else:
+            padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
+            padbox.r = page_size.width - padbox.r
+            padbox.t = page_size.height - padbox.t
+
+        image = (
+            self._ppage.render(
+                scale=scale * 1.5,
+                rotation=0,  # no additional rotation
+                crop=padbox.as_tuple(),
+            )
+            .to_pil()
+            .resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
+        )  # We resize the image from 1.5x the given scale to make it sharper.
+
+        return image
+
+    def get_size(self) -> Size:
+        return Size(
+            width=self._dpage.sanitized.dimension.width,
+            height=self._dpage.sanitized.dimension.height,
+        )
+
+    def unload(self):
+        self._ppage = None
+        self._dpage = None
+
+
+class DoclingParseV3DocumentBackend(PdfDocumentBackend):
+    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
+        super().__init__(in_doc, path_or_stream)
+
+        self._pdoc = pdfium.PdfDocument(self.path_or_stream)
+        self.parser = DoclingPdfParser(loglevel="fatal")
+        self.dp_doc: PdfDocument = self.parser.load(path_or_stream=path_or_stream)
+        success = self.dp_doc is not None
+
+        if not success:
+            raise RuntimeError(
+                f"docling-parse v2 could not load document {self.document_hash}."
+            )
+
+    def page_count(self) -> int:
+        # return len(self._pdoc)  # To be replaced with docling-parse API
+
+        len_1 = len(self._pdoc)
+        len_2 = self.dp_doc.number_of_pages()
+
+        if len_1 != len_2:
+            _log.error(f"Inconsistent number of pages: {len_1}!={len_2}")
+
+        return len_2
+
+    def load_page(self, page_no: int) -> DoclingParseV3PageBackend:
+        return DoclingParseV3PageBackend(
+            self.dp_doc.get_page(page_no + 1), self._pdoc[page_no]
+        )
+
+    def is_valid(self) -> bool:
+        return self.page_count() > 0
+
+    def unload(self):
+        super().unload()
+        self.dp_doc.unload()
+        self._pdoc.close()
+        self._pdoc = None
--- a/docs/examples/batch_convert.py
+++ b/docs/examples/batch_convert.py
@ -6,10 +6,11 @@ from typing import Iterable

 import yaml

-from docling.datamodel.base_models import ConversionStatus
+from docling.backend.docling_parse_v3_backend import DoclingParseV3DocumentBackend
+from docling.datamodel.base_models import ConversionStatus, InputFormat
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.settings import settings
-from docling.document_converter import DocumentConverter
+from docling.document_converter import DocumentConverter, PdfFormatOption

 _log = logging.getLogger(__name__)

@ -103,10 +104,11 @@ def main():
    logging.basicConfig(level=logging.INFO)

    input_doc_paths = [
-        Path("./tests/data/2206.01062.pdf"),
-        Path("./tests/data/2203.01017v2.pdf"),
-        Path("./tests/data/2305.03393v1.pdf"),
-        Path("./tests/data/redp5110_sampled.pdf"),
+        Path("tests/data/redp5110_sampled.pdf"),
+        # Path("./tests/data/2206.01062.pdf"),
+        # Path("./tests/data/2203.01017v2.pdf"),
+        # Path("./tests/data/2305.03393v1.pdf"),
+        # Path("./tests/data/redp5110_sampled.pdf"),
    ]

    # buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read())
@ -119,13 +121,17 @@ def main():
    # settings.debug.visualize_tables = True
    # settings.debug.visualize_cells = True

-    doc_converter = DocumentConverter()
+    doc_converter = DocumentConverter(
+        format_options={
+            InputFormat.PDF: PdfFormatOption(backend=DoclingParseV3DocumentBackend)
+        }
+    )

    start_time = time.time()

    conv_results = doc_converter.convert_all(
        input_doc_paths,
-        raises_on_error=False,  # to let conversion run through all and examine results at the end
+        raises_on_error=True,  # to let conversion run through all and examine results at the end
    )
    success_count, partial_success_count, failure_count = export_documents(
        conv_results, output_dir=Path("scratch")
--- a/tests/test_backend_docling_parse_v3.py
+++ b/tests/test_backend_docling_parse_v3.py
@ -0,0 +1,76 @@
+from pathlib import Path
+
+import pytest
+
+from docling.backend.docling_parse_v3_backend import (
+    DoclingParseV3DocumentBackend,
+    DoclingParseV3PageBackend,
+)
+from docling.datamodel.base_models import BoundingBox, InputFormat
+from docling.datamodel.document import InputDocument
+
+
+@pytest.fixture
+def test_doc_path():
+    return Path("./tests/data/2206.01062.pdf")
+
+
+def _get_backend(pdf_doc):
+    in_doc = InputDocument(
+        path_or_stream=pdf_doc,
+        format=InputFormat.PDF,
+        backend=DoclingParseV3DocumentBackend,
+    )
+
+    doc_backend = in_doc._backend
+    return doc_backend
+
+
+def test_text_cell_counts():
+    pdf_doc = Path("./tests/data/redp5110_sampled.pdf")
+
+    doc_backend = _get_backend(pdf_doc)
+
+    for page_index in range(0, doc_backend.page_count()):
+        last_cell_count = None
+        for i in range(10):
+            page_backend: DoclingParseV3PageBackend = doc_backend.load_page(0)
+            cells = list(page_backend.get_text_cells())
+
+            if last_cell_count is None:
+                last_cell_count = len(cells)
+
+            if len(cells) != last_cell_count:
+                assert (
+                    False
+                ), "Loading page multiple times yielded non-identical text cell counts"
+            last_cell_count = len(cells)
+
+
+def test_get_text_from_rect(test_doc_path):
+    doc_backend = _get_backend(test_doc_path)
+    page_backend: DoclingParseV3PageBackend = doc_backend.load_page(0)
+
+    # Get the title text of the DocLayNet paper
+    textpiece = page_backend.get_text_in_rect(
+        bbox=BoundingBox(l=102, t=77, r=511, b=124)
+    )
+    ref = "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis"
+
+    assert textpiece.strip() == ref
+
+
+def test_crop_page_image(test_doc_path):
+    doc_backend = _get_backend(test_doc_path)
+    page_backend: DoclingParseV3PageBackend = doc_backend.load_page(0)
+
+    # Crop out "Figure 1" from the DocLayNet paper
+    im = page_backend.get_page_image(
+        scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
+    )
+    # im.show()
+
+
+def test_num_pages(test_doc_path):
+    doc_backend = _get_backend(test_doc_path)
+    doc_backend.page_count() == 9