mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
Add back DoclingParse v1 backend, pipeline options
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
8a45a2cafa
commit
9ebd7108f2
234
docling/backend/docling_parse_backend.py
Normal file
234
docling/backend/docling_parse_backend.py
Normal file
@ -0,0 +1,234 @@
|
|||||||
|
import logging
|
||||||
|
import random
|
||||||
|
from io import BytesIO
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Iterable, List, Optional, Union
|
||||||
|
|
||||||
|
import pypdfium2 as pdfium
|
||||||
|
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
|
||||||
|
from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
|
||||||
|
from docling_parse.pdf_parsers import pdf_parser_v1
|
||||||
|
from PIL import Image, ImageDraw
|
||||||
|
from pypdfium2 import PdfPage
|
||||||
|
|
||||||
|
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
||||||
|
from docling.datamodel.document import InputDocument
|
||||||
|
|
||||||
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class DoclingParsePageBackend(PdfPageBackend):
    """Page backend using docling-parse v1 for text extraction and pypdfium2
    for rendering and page geometry."""

    def __init__(
        self, parser: pdf_parser_v1, document_hash: str, page_no: int, page_obj: PdfPage
    ):
        """Parse a single page.

        Args:
            parser: docling-parse v1 parser that has already loaded the document.
            document_hash: key under which the document is registered in the parser.
            page_no: zero-based page index.
            page_obj: pypdfium2 page handle, used for rendering and page size.
        """
        self._ppage = page_obj
        parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)

        self.valid = "pages" in parsed_page
        if self.valid:
            self._dpage = parsed_page["pages"][0]
        else:
            # FIX: always define _dpage so accessors on an invalid page fail
            # soft instead of raising AttributeError.
            self._dpage = None
            _log.info(
                f"An error occurred when loading page {page_no} of document {document_hash}."
            )

    def is_valid(self) -> bool:
        """Return True if docling-parse produced usable data for this page."""
        return self.valid

    def get_text_in_rect(self, bbox: BoundingBox) -> str:
        """Concatenate, space-separated, the text of all cells overlapping
        *bbox* by more than 50% of the cell's own area."""
        if not self.valid:
            return ""
        # Find intersecting cells on the page
        text_piece = ""
        page_size = self.get_size()
        # Parser coordinates are rescaled to the pypdfium2 page size below.
        parser_width = self._dpage["width"]
        parser_height = self._dpage["height"]

        scale = (
            1  # FIX - Replace with param in get_text_in_rect across backends (optional)
        )

        for i in range(len(self._dpage["cells"])):
            rect = self._dpage["cells"][i]["box"]["device"]
            x0, y0, x1, y1 = rect
            cell_bbox = BoundingBox(
                l=x0 * scale * page_size.width / parser_width,
                b=y0 * scale * page_size.height / parser_height,
                r=x1 * scale * page_size.width / parser_width,
                t=y1 * scale * page_size.height / parser_height,
                coord_origin=CoordOrigin.BOTTOMLEFT,
            ).to_top_left_origin(page_height=page_size.height * scale)

            overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()

            if overlap_frac > 0.5:
                if len(text_piece) > 0:
                    text_piece += " "
                text_piece += self._dpage["cells"][i]["content"]["rnormalized"]

        return text_piece

    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
        # The v1 parser does not provide a segmented-page representation.
        return None

    def get_text_cells(self) -> Iterable[TextCell]:
        """Return all text cells of the page, rescaled from parser coordinates
        to the pypdfium2 page size, with top-left origin."""
        cells: List[TextCell] = []
        cell_counter = 0

        if not self.valid:
            return cells

        page_size = self.get_size()

        parser_width = self._dpage["width"]
        parser_height = self._dpage["height"]

        for i in range(len(self._dpage["cells"])):
            rect = self._dpage["cells"][i]["box"]["device"]
            x0, y0, x1, y1 = rect

            # Normalize so that (x0, y0) is always the bottom-left corner.
            if x1 < x0:
                x0, x1 = x1, x0
            if y1 < y0:
                y0, y1 = y1, y0

            text_piece = self._dpage["cells"][i]["content"]["rnormalized"]
            cells.append(
                TextCell(
                    index=cell_counter,
                    text=text_piece,
                    orig=text_piece,
                    rect=BoundingRectangle.from_bounding_box(
                        BoundingBox(
                            l=x0 * page_size.width / parser_width,
                            b=y0 * page_size.height / parser_height,
                            r=x1 * page_size.width / parser_width,
                            t=y1 * page_size.height / parser_height,
                            coord_origin=CoordOrigin.BOTTOMLEFT,
                        )
                    ),
                ).to_top_left_origin(page_size.height),
            )

            cell_counter += 1

        def draw_clusters_and_cells():
            # Debug helper: render the page and outline each cell in a random color.
            image = (
                self.get_page_image()
            )  # make new image to avoid drawing on the saved ones
            draw = ImageDraw.Draw(image)
            for c in cells:
                x0, y0, x1, y1 = c.rect.to_bounding_box().as_tuple()
                cell_color = (
                    random.randint(30, 140),
                    random.randint(30, 140),
                    random.randint(30, 140),
                )
                draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
            image.show()

        # draw_clusters_and_cells()

        return cells

    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
        """Yield the bounding boxes of embedded bitmap images, scaled by *scale*."""
        # FIX: guard against invalid pages, where _dpage is None; the original
        # would raise AttributeError here instead of yielding nothing.
        if not self.valid:
            return
        AREA_THRESHOLD = 0  # 32 * 32

        for i in range(len(self._dpage["images"])):
            bitmap = self._dpage["images"][i]
            cropbox = BoundingBox.from_tuple(
                bitmap["box"], origin=CoordOrigin.BOTTOMLEFT
            ).to_top_left_origin(self.get_size().height)

            if cropbox.area() > AREA_THRESHOLD:
                cropbox = cropbox.scaled(scale=scale)

                yield cropbox

    def get_page_image(
        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
    ) -> Image.Image:
        """Render the page (or the *cropbox* region of it) at *scale* as a PIL image."""
        page_size = self.get_size()

        if not cropbox:
            cropbox = BoundingBox(
                l=0,
                r=page_size.width,
                t=0,
                b=page_size.height,
                coord_origin=CoordOrigin.TOPLEFT,
            )
            padbox = BoundingBox(
                l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
            )
        else:
            # pdfium expects the crop as padding from each side, bottom-left origin.
            padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
            padbox.r = page_size.width - padbox.r
            padbox.t = page_size.height - padbox.t

        image = (
            self._ppage.render(
                scale=scale * 1.5,
                rotation=0,  # no additional rotation
                crop=padbox.as_tuple(),
            )
            .to_pil()
            .resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
        )  # We resize the image from 1.5x the given scale to make it sharper.

        return image

    def get_size(self) -> Size:
        """Return the page size in points, as reported by pypdfium2."""
        return Size(width=self._ppage.get_width(), height=self._ppage.get_height())

    def unload(self):
        """Release the page resources held by this backend."""
        self._ppage = None
        self._dpage = None
|
class DoclingParseDocumentBackend(PdfDocumentBackend):
    """Document backend combining docling-parse v1 (text extraction) with
    pypdfium2 (rendering and page geometry)."""

    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        """Open the document in both pdfium and the docling-parse v1 parser.

        Raises:
            RuntimeError: if docling-parse cannot load the document.
        """
        super().__init__(in_doc, path_or_stream)

        self._pdoc = pdfium.PdfDocument(self.path_or_stream)
        self.parser = pdf_parser_v1()

        # Register the document with the parser under its hash; the loader
        # depends on whether we were handed an in-memory stream or a path.
        if isinstance(self.path_or_stream, BytesIO):
            success = self.parser.load_document_from_bytesio(
                self.document_hash, self.path_or_stream
            )
        elif isinstance(self.path_or_stream, Path):
            success = self.parser.load_document(
                self.document_hash, str(self.path_or_stream)
            )
        else:
            success = False

        if not success:
            raise RuntimeError(
                f"docling-parse could not load document with hash {self.document_hash}."
            )

    def page_count(self) -> int:
        """Number of pages in the document."""
        return len(self._pdoc)  # To be replaced with docling-parse API

    def load_page(self, page_no: int) -> DoclingParsePageBackend:
        """Build a page backend for the zero-based page index *page_no*."""
        pdfium_page = self._pdoc[page_no]
        return DoclingParsePageBackend(
            self.parser, self.document_hash, page_no, pdfium_page
        )

    def is_valid(self) -> bool:
        """A document is considered valid when it has at least one page."""
        return self.page_count() > 0

    def unload(self):
        """Release parser and pdfium resources."""
        super().unload()
        self.parser.unload_document(self.document_hash)
        self._pdoc.close()
        self._pdoc = None
@ -65,9 +65,9 @@ class DoclingParseV3PageBackend(PdfPageBackend):
|
|||||||
for cell in self._dpage.textline_cells:
|
for cell in self._dpage.textline_cells:
|
||||||
rect = cell.rect
|
rect = cell.rect
|
||||||
|
|
||||||
if rect.r_x2 < rect.r_x0:
|
# if rect.r_x2 < rect.r_x0:
|
||||||
rect.r_x0, rect.r_x2 = rect.r_x2, rect.r_x0
|
# rect.r_x0, rect.r_x2 = rect.r_x2, rect.r_x0
|
||||||
rect.r_y3, rect.r_y1 = rect.r_y1, rect.r_y3
|
# rect.r_y3, rect.r_y1 = rect.r_y1, rect.r_y3
|
||||||
|
|
||||||
# rect.r_x2, rect.r_x3 = rect.r_x3, rect.r_x2
|
# rect.r_x2, rect.r_x3 = rect.r_x3, rect.r_x2
|
||||||
|
|
||||||
|
@ -412,7 +412,9 @@ def convert(
|
|||||||
if artifacts_path is not None:
|
if artifacts_path is not None:
|
||||||
pipeline_options.artifacts_path = artifacts_path
|
pipeline_options.artifacts_path = artifacts_path
|
||||||
|
|
||||||
if pdf_backend == PdfBackend.DLPARSE_V2:
|
if pdf_backend == PdfBackend.DLPARSE_V1:
|
||||||
|
backend = DoclingParseV2DocumentBackend
|
||||||
|
elif pdf_backend == PdfBackend.DLPARSE_V2:
|
||||||
backend = DoclingParseV2DocumentBackend
|
backend = DoclingParseV2DocumentBackend
|
||||||
elif pdf_backend == PdfBackend.DLPARSE_V3:
|
elif pdf_backend == PdfBackend.DLPARSE_V3:
|
||||||
backend = DoclingParseV3DocumentBackend # type: ignore
|
backend = DoclingParseV3DocumentBackend # type: ignore
|
||||||
|
@ -299,6 +299,7 @@ class PdfBackend(str, Enum):
|
|||||||
"""Enum of valid PDF backends."""
|
"""Enum of valid PDF backends."""
|
||||||
|
|
||||||
PYPDFIUM2 = "pypdfium2"
|
PYPDFIUM2 = "pypdfium2"
|
||||||
|
DLPARSE_V1 = "dlparse_v1"
|
||||||
DLPARSE_V2 = "dlparse_v2"
|
DLPARSE_V2 = "dlparse_v2"
|
||||||
DLPARSE_V3 = "dlparse_v3"
|
DLPARSE_V3 = "dlparse_v3"
|
||||||
|
|
||||||
@ -381,3 +382,5 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
|
|||||||
"before conversion and then use the `TableItem.get_image` function."
|
"before conversion and then use the `TableItem.get_image` function."
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
generate_parsed_pages: bool = False
|
||||||
|
@ -13,6 +13,7 @@ from docling.utils.profiling import TimeRecorder
|
|||||||
|
|
||||||
class PagePreprocessingOptions(BaseModel):
|
class PagePreprocessingOptions(BaseModel):
|
||||||
images_scale: Optional[float]
|
images_scale: Optional[float]
|
||||||
|
create_parsed_page: bool
|
||||||
|
|
||||||
|
|
||||||
class PagePreprocessingModel(BasePageModel):
|
class PagePreprocessingModel(BasePageModel):
|
||||||
@ -54,7 +55,9 @@ class PagePreprocessingModel(BasePageModel):
|
|||||||
assert page._backend is not None
|
assert page._backend is not None
|
||||||
|
|
||||||
page.cells = list(page._backend.get_text_cells())
|
page.cells = list(page._backend.get_text_cells())
|
||||||
page.parsed_page = page._backend.get_segmented_page()
|
|
||||||
|
if self.options.create_parsed_page:
|
||||||
|
page.parsed_page = page._backend.get_segmented_page()
|
||||||
|
|
||||||
# DEBUG code:
|
# DEBUG code:
|
||||||
def draw_text_boxes(image, cells, show: bool = False):
|
def draw_text_boxes(image, cells, show: bool = False):
|
||||||
|
@ -87,7 +87,8 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|||||||
# Pre-processing
|
# Pre-processing
|
||||||
PagePreprocessingModel(
|
PagePreprocessingModel(
|
||||||
options=PagePreprocessingOptions(
|
options=PagePreprocessingOptions(
|
||||||
images_scale=pipeline_options.images_scale
|
images_scale=pipeline_options.images_scale,
|
||||||
|
create_parsed_page=pipeline_options.generate_parsed_pages,
|
||||||
)
|
)
|
||||||
),
|
),
|
||||||
# OCR
|
# OCR
|
||||||
|
2
poetry.lock
generated
2
poetry.lock
generated
@ -898,7 +898,7 @@ chunking = ["semchunk (>=2.2.0,<3.0.0)", "transformers (>=4.34.0,<5.0.0)"]
|
|||||||
type = "git"
|
type = "git"
|
||||||
url = "https://github.com/DS4SD/docling-core"
|
url = "https://github.com/DS4SD/docling-core"
|
||||||
reference = "cau/docling-parse-types"
|
reference = "cau/docling-parse-types"
|
||||||
resolved_reference = "a2f1fccf80324e74c1ed66574bfa2bc02163e2ae"
|
resolved_reference = "7cb80880a4781e781cf797d42bda34498cf81184"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "docling-ibm-models"
|
name = "docling-ibm-models"
|
||||||
|
77
tests/test_backend_docling_parse.py
Normal file
77
tests/test_backend_docling_parse.py
Normal file
@ -0,0 +1,77 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from docling_core.types.doc import BoundingBox
|
||||||
|
|
||||||
|
from docling.backend.docling_parse_backend import (
|
||||||
|
DoclingParseDocumentBackend,
|
||||||
|
DoclingParsePageBackend,
|
||||||
|
)
|
||||||
|
from docling.datamodel.base_models import InputFormat
|
||||||
|
from docling.datamodel.document import InputDocument
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def test_doc_path():
    # Sample document used by most tests below: the DocLayNet paper (arXiv 2206.01062).
    return Path("./tests/data/pdf/2206.01062.pdf")
||||||
|
|
||||||
|
|
||||||
|
def _get_backend(pdf_doc):
    """Build a DoclingParseDocumentBackend for the given PDF path."""
    in_doc = InputDocument(
        path_or_stream=pdf_doc,
        format=InputFormat.PDF,
        backend=DoclingParseDocumentBackend,
    )
    return in_doc._backend
||||||
|
|
||||||
|
|
||||||
|
def test_text_cell_counts():
    """Loading the same page repeatedly must yield an identical cell count."""
    pdf_doc = Path("./tests/data/pdf/redp5110_sampled.pdf")

    doc_backend = _get_backend(pdf_doc)

    for page_index in range(doc_backend.page_count()):
        last_cell_count = None
        for _ in range(10):
            # FIX: load the page under test, not always page 0 — otherwise the
            # outer loop over page_index exercises nothing new.
            page_backend: DoclingParsePageBackend = doc_backend.load_page(page_index)
            cells = list(page_backend.get_text_cells())

            if last_cell_count is None:
                last_cell_count = len(cells)

            # FIX: direct assertion instead of `if ...: assert False`.
            assert (
                len(cells) == last_cell_count
            ), "Loading page multiple times yielded non-identical text cell counts"
            last_cell_count = len(cells)
||||||
|
|
||||||
|
|
||||||
|
def test_get_text_from_rect(test_doc_path):
    """get_text_in_rect should recover the DocLayNet paper title from page 0."""
    doc_backend = _get_backend(test_doc_path)
    page_backend: DoclingParsePageBackend = doc_backend.load_page(0)

    # Get the title text of the DocLayNet paper
    title_bbox = BoundingBox(l=102, t=77, r=511, b=124)
    textpiece = page_backend.get_text_in_rect(bbox=title_bbox)
    expected = (
        "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis"
    )

    assert textpiece.strip() == expected
||||||
|
|
||||||
|
|
||||||
|
def test_crop_page_image(test_doc_path):
    """Cropping a region at 2x scale must yield an image of the matching pixel size."""
    doc_backend = _get_backend(test_doc_path)
    page_backend: DoclingParsePageBackend = doc_backend.load_page(0)

    # Crop out "Figure 1" from the DocLayNet paper
    cropbox = BoundingBox(l=317, t=246, r=574, b=527)
    im = page_backend.get_page_image(scale=2, cropbox=cropbox)

    # FIX: the test previously asserted nothing; get_page_image resizes its
    # output to round(cropbox dimensions * scale), so pin that contract.
    assert im.size == (round(cropbox.width * 2), round(cropbox.height * 2))
    # im.show()
||||||
|
|
||||||
|
|
||||||
|
def test_num_pages(test_doc_path):
    """The sample document has exactly 9 pages."""
    doc_backend = _get_backend(test_doc_path)
    # FIX: the comparison result was silently discarded (missing `assert`),
    # so the test could never fail.
    assert doc_backend.page_count() == 9
|
Loading…
Reference in New Issue
Block a user