From 9752e824fb8e7eda990fa2633045f310adff98c8 Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Wed, 11 Jun 2025 14:19:55 +0200 Subject: [PATCH] Correctly compute PDF boxes from pypdfium2 Signed-off-by: Christoph Auer --- docling/backend/docling_parse_backend.py | 25 +----- docling/backend/docling_parse_v2_backend.py | 24 +----- docling/backend/docling_parse_v4_backend.py | 40 ++++----- docling/backend/pypdfium2_backend.py | 92 ++++++++++++++++----- docling/models/base_ocr_model.py | 2 +- 5 files changed, 101 insertions(+), 82 deletions(-) diff --git a/docling/backend/docling_parse_backend.py b/docling/backend/docling_parse_backend.py index 056f61b0..902bb0d5 100644 --- a/docling/backend/docling_parse_backend.py +++ b/docling/backend/docling_parse_backend.py @@ -9,8 +9,6 @@ import pypdfium2 as pdfium from docling_core.types.doc import BoundingBox, CoordOrigin, Size from docling_core.types.doc.page import ( BoundingRectangle, - PdfPageBoundaryType, - PdfPageGeometry, SegmentedPdfPage, TextCell, ) @@ -19,6 +17,7 @@ from PIL import Image, ImageDraw from pypdfium2 import PdfPage from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend +from docling.backend.pypdfium2_backend import get_pdf_page_geometry from docling.datamodel.document import InputDocument _log = logging.getLogger(__name__) @@ -124,28 +123,10 @@ class DoclingParsePageBackend(PdfPageBackend): if not self.valid: return None - page_size = self.get_size() text_cells = self._compute_text_cells() - # Create page geometry - crop_bbox = BoundingBox( - l=0, - r=page_size.width, - t=0, - b=page_size.height, - coord_origin=CoordOrigin.TOPLEFT, - ).to_bottom_left_origin(page_size.height) - - dimension = PdfPageGeometry( - angle=0.0, - rect=BoundingRectangle.from_bounding_box(crop_bbox), - boundary_type=PdfPageBoundaryType.CROP_BOX, - art_bbox=crop_bbox, - bleed_bbox=crop_bbox, - crop_bbox=crop_bbox, - media_bbox=crop_bbox, - trim_bbox=crop_bbox, - ) + # Get the PDF page geometry from pypdfium2 +
dimension = get_pdf_page_geometry(self._ppage) # Create SegmentedPdfPage return SegmentedPdfPage( diff --git a/docling/backend/docling_parse_v2_backend.py b/docling/backend/docling_parse_v2_backend.py index e20ce1e0..99a2bf3f 100644 --- a/docling/backend/docling_parse_v2_backend.py +++ b/docling/backend/docling_parse_v2_backend.py @@ -19,6 +19,7 @@ from PIL import Image, ImageDraw from pypdfium2 import PdfPage from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend +from docling.backend.pypdfium2_backend import get_pdf_page_geometry from docling.datamodel.base_models import Size from docling.utils.locks import pypdfium2_lock @@ -139,28 +140,11 @@ class DoclingParseV2PageBackend(PdfPageBackend): if not self.valid: return None - page_size = self.get_size() text_cells = self._compute_text_cells() - # Create page geometry - crop_bbox = BoundingBox( - l=0, - r=page_size.width, - t=0, - b=page_size.height, - coord_origin=CoordOrigin.TOPLEFT, - ).to_bottom_left_origin(page_size.height) - - dimension = PdfPageGeometry( - angle=0.0, - rect=BoundingRectangle.from_bounding_box(crop_bbox), - boundary_type=PdfPageBoundaryType.CROP_BOX, - art_bbox=crop_bbox, - bleed_bbox=crop_bbox, - crop_bbox=crop_bbox, - media_bbox=crop_bbox, - trim_bbox=crop_bbox, - ) + # Get the PDF page geometry from pypdfium2 + with pypdfium2_lock: + dimension = get_pdf_page_geometry(self._ppage) # Create SegmentedPdfPage return SegmentedPdfPage( diff --git a/docling/backend/docling_parse_v4_backend.py b/docling/backend/docling_parse_v4_backend.py index a95aceb9..cac07f80 100644 --- a/docling/backend/docling_parse_v4_backend.py +++ b/docling/backend/docling_parse_v4_backend.py @@ -59,20 +59,6 @@ class DoclingParseV4PageBackend(PdfPageBackend): return self._dpage def get_text_cells(self) -> Iterable[TextCell]: - page_size = self.get_size() - - [tc.to_top_left_origin(page_size.height) for tc in self._dpage.textline_cells] - - # for cell in self._dpage.textline_cells: - # rect = cell.rect - 
# - # assert ( - # rect.to_bounding_box().l <= rect.to_bounding_box().r - # ), f"left is > right on bounding box {rect.to_bounding_box()} of rect {rect}" - # assert ( - # rect.to_bounding_box().t <= rect.to_bounding_box().b - # ), f"top is > bottom on bounding box {rect.to_bounding_box()} of rect {rect}" - return self._dpage.textline_cells def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]: @@ -171,12 +157,28 @@ class DoclingParseV4DocumentBackend(PdfDocumentBackend): self, page_no: int, create_words: bool = True, create_textlines: bool = True ) -> DoclingParseV4PageBackend: with pypdfium2_lock: + seg_page = self.dp_doc.get_page( + page_no + 1, + create_words=create_words, + create_textlines=create_textlines, + ) + + # In Docling, all TextCell instances are expected with top-left origin. + [ + tc.to_top_left_origin(seg_page.dimension.height) + for tc in seg_page.textline_cells + ] + [ + tc.to_top_left_origin(seg_page.dimension.height) + for tc in seg_page.char_cells + ] + [ + tc.to_top_left_origin(seg_page.dimension.height) + for tc in seg_page.word_cells + ] + return DoclingParseV4PageBackend( - self.dp_doc.get_page( - page_no + 1, - create_words=create_words, - create_textlines=create_textlines, - ), + seg_page, self._pdoc[page_no], ) diff --git a/docling/backend/pypdfium2_backend.py b/docling/backend/pypdfium2_backend.py index 569c0383..676e8662 100644 --- a/docling/backend/pypdfium2_backend.py +++ b/docling/backend/pypdfium2_backend.py @@ -22,6 +22,75 @@ from pypdfium2._helpers.misc import PdfiumError from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend from docling.utils.locks import pypdfium2_lock + +def get_pdf_page_geometry( + ppage: pdfium.PdfPage, + angle: float = 0.0, + boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX, +) -> PdfPageGeometry: + """ + Create PdfPageGeometry from a pypdfium2 PdfPage object. 
+ + Args: + ppage: pypdfium2 PdfPage object + angle: Page rotation angle in degrees (default: 0.0) + boundary_type: The boundary type for the page (default: CROP_BOX) + + Returns: + PdfPageGeometry with all the different bounding boxes properly set + """ + # Get the main bounding box (intersection of crop_box and media_box) + bbox_tuple = ppage.get_bbox() + bbox = BoundingBox.from_tuple(bbox_tuple, CoordOrigin.BOTTOMLEFT) + + # Get all the different page boxes from pypdfium2 + media_box_tuple = ppage.get_mediabox() + crop_box_tuple = ppage.get_cropbox() + art_box_tuple = ppage.get_artbox() + bleed_box_tuple = ppage.get_bleedbox() + trim_box_tuple = ppage.get_trimbox() + + # Convert to BoundingBox objects using existing from_tuple method + # pypdfium2 returns (x0, y0, x1, y1) in PDF coordinate system (bottom-left origin) + # Use bbox as fallback when specific box types are not defined + media_bbox = ( + BoundingBox.from_tuple(media_box_tuple, CoordOrigin.BOTTOMLEFT) + if media_box_tuple + else bbox + ) + crop_bbox = ( + BoundingBox.from_tuple(crop_box_tuple, CoordOrigin.BOTTOMLEFT) + if crop_box_tuple + else bbox + ) + art_bbox = ( + BoundingBox.from_tuple(art_box_tuple, CoordOrigin.BOTTOMLEFT) + if art_box_tuple + else bbox + ) + bleed_bbox = ( + BoundingBox.from_tuple(bleed_box_tuple, CoordOrigin.BOTTOMLEFT) + if bleed_box_tuple + else bbox + ) + trim_bbox = ( + BoundingBox.from_tuple(trim_box_tuple, CoordOrigin.BOTTOMLEFT) + if trim_box_tuple + else bbox + ) + + return PdfPageGeometry( + angle=angle, + rect=BoundingRectangle.from_bounding_box(bbox), + boundary_type=boundary_type, + art_bbox=art_bbox, + bleed_bbox=bleed_bbox, + crop_bbox=crop_bbox, + media_bbox=media_bbox, + trim_bbox=trim_bbox, + ) + + if TYPE_CHECKING: from docling.datamodel.document import InputDocument @@ -213,28 +282,11 @@ class PyPdfiumPageBackend(PdfPageBackend): if not self.valid: return None - page_size = self.get_size() text_cells = self._compute_text_cells() - # Create page geometry - 
crop_bbox = BoundingBox( - l=0, - r=page_size.width, - t=0, - b=page_size.height, - coord_origin=CoordOrigin.TOPLEFT, - ).to_bottom_left_origin(page_size.height) - - dimension = PdfPageGeometry( - angle=0.0, - rect=BoundingRectangle.from_bounding_box(crop_bbox), - boundary_type=PdfPageBoundaryType.CROP_BOX, - art_bbox=crop_bbox, - bleed_bbox=crop_bbox, - crop_bbox=crop_bbox, - media_bbox=crop_bbox, - trim_bbox=crop_bbox, - ) + # Get the PDF page geometry from pypdfium2 + with pypdfium2_lock: + dimension = get_pdf_page_geometry(self._ppage) # Create SegmentedPdfPage return SegmentedPdfPage( diff --git a/docling/models/base_ocr_model.py b/docling/models/base_ocr_model.py index 98fcbbe7..be60e79d 100644 --- a/docling/models/base_ocr_model.py +++ b/docling/models/base_ocr_model.py @@ -145,7 +145,7 @@ class BaseOcrModel(BasePageModel, BaseModelWithOptions): # Update parsed_page.textline_cells directly page.parsed_page.textline_cells = final_cells - page.parsed_page.has_lines = bool(final_cells) + page.parsed_page.has_lines = len(final_cells) > 0 def _combine_cells(self, existing_cells, ocr_cells): """Combine existing and OCR cells with filtering and re-indexing."""