mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
Correctly compute PDF boxes from pymupdf
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
06b408fa41
commit
9752e824fb
@ -9,8 +9,6 @@ import pypdfium2 as pdfium
|
|||||||
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
|
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
|
||||||
from docling_core.types.doc.page import (
|
from docling_core.types.doc.page import (
|
||||||
BoundingRectangle,
|
BoundingRectangle,
|
||||||
PdfPageBoundaryType,
|
|
||||||
PdfPageGeometry,
|
|
||||||
SegmentedPdfPage,
|
SegmentedPdfPage,
|
||||||
TextCell,
|
TextCell,
|
||||||
)
|
)
|
||||||
@ -19,6 +17,7 @@ from PIL import Image, ImageDraw
|
|||||||
from pypdfium2 import PdfPage
|
from pypdfium2 import PdfPage
|
||||||
|
|
||||||
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
||||||
|
from docling.backend.pypdfium2_backend import get_pdf_page_geometry
|
||||||
from docling.datamodel.document import InputDocument
|
from docling.datamodel.document import InputDocument
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
@ -124,28 +123,10 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|||||||
if not self.valid:
|
if not self.valid:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
page_size = self.get_size()
|
|
||||||
text_cells = self._compute_text_cells()
|
text_cells = self._compute_text_cells()
|
||||||
|
|
||||||
# Create page geometry
|
# Get the PDF page geometry from pypdfium2
|
||||||
crop_bbox = BoundingBox(
|
dimension = get_pdf_page_geometry(self._ppage)
|
||||||
l=0,
|
|
||||||
r=page_size.width,
|
|
||||||
t=0,
|
|
||||||
b=page_size.height,
|
|
||||||
coord_origin=CoordOrigin.TOPLEFT,
|
|
||||||
).to_bottom_left_origin(page_size.height)
|
|
||||||
|
|
||||||
dimension = PdfPageGeometry(
|
|
||||||
angle=0.0,
|
|
||||||
rect=BoundingRectangle.from_bounding_box(crop_bbox),
|
|
||||||
boundary_type=PdfPageBoundaryType.CROP_BOX,
|
|
||||||
art_bbox=crop_bbox,
|
|
||||||
bleed_bbox=crop_bbox,
|
|
||||||
crop_bbox=crop_bbox,
|
|
||||||
media_bbox=crop_bbox,
|
|
||||||
trim_bbox=crop_bbox,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Create SegmentedPdfPage
|
# Create SegmentedPdfPage
|
||||||
return SegmentedPdfPage(
|
return SegmentedPdfPage(
|
||||||
|
@ -19,6 +19,7 @@ from PIL import Image, ImageDraw
|
|||||||
from pypdfium2 import PdfPage
|
from pypdfium2 import PdfPage
|
||||||
|
|
||||||
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
||||||
|
from docling.backend.pypdfium2_backend import get_pdf_page_geometry
|
||||||
from docling.datamodel.base_models import Size
|
from docling.datamodel.base_models import Size
|
||||||
from docling.utils.locks import pypdfium2_lock
|
from docling.utils.locks import pypdfium2_lock
|
||||||
|
|
||||||
@ -139,28 +140,11 @@ class DoclingParseV2PageBackend(PdfPageBackend):
|
|||||||
if not self.valid:
|
if not self.valid:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
page_size = self.get_size()
|
|
||||||
text_cells = self._compute_text_cells()
|
text_cells = self._compute_text_cells()
|
||||||
|
|
||||||
# Create page geometry
|
# Get the PDF page geometry from pypdfium2
|
||||||
crop_bbox = BoundingBox(
|
with pypdfium2_lock:
|
||||||
l=0,
|
dimension = get_pdf_page_geometry(self._ppage)
|
||||||
r=page_size.width,
|
|
||||||
t=0,
|
|
||||||
b=page_size.height,
|
|
||||||
coord_origin=CoordOrigin.TOPLEFT,
|
|
||||||
).to_bottom_left_origin(page_size.height)
|
|
||||||
|
|
||||||
dimension = PdfPageGeometry(
|
|
||||||
angle=0.0,
|
|
||||||
rect=BoundingRectangle.from_bounding_box(crop_bbox),
|
|
||||||
boundary_type=PdfPageBoundaryType.CROP_BOX,
|
|
||||||
art_bbox=crop_bbox,
|
|
||||||
bleed_bbox=crop_bbox,
|
|
||||||
crop_bbox=crop_bbox,
|
|
||||||
media_bbox=crop_bbox,
|
|
||||||
trim_bbox=crop_bbox,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Create SegmentedPdfPage
|
# Create SegmentedPdfPage
|
||||||
return SegmentedPdfPage(
|
return SegmentedPdfPage(
|
||||||
|
@ -59,20 +59,6 @@ class DoclingParseV4PageBackend(PdfPageBackend):
|
|||||||
return self._dpage
|
return self._dpage
|
||||||
|
|
||||||
def get_text_cells(self) -> Iterable[TextCell]:
|
def get_text_cells(self) -> Iterable[TextCell]:
|
||||||
page_size = self.get_size()
|
|
||||||
|
|
||||||
[tc.to_top_left_origin(page_size.height) for tc in self._dpage.textline_cells]
|
|
||||||
|
|
||||||
# for cell in self._dpage.textline_cells:
|
|
||||||
# rect = cell.rect
|
|
||||||
#
|
|
||||||
# assert (
|
|
||||||
# rect.to_bounding_box().l <= rect.to_bounding_box().r
|
|
||||||
# ), f"left is > right on bounding box {rect.to_bounding_box()} of rect {rect}"
|
|
||||||
# assert (
|
|
||||||
# rect.to_bounding_box().t <= rect.to_bounding_box().b
|
|
||||||
# ), f"top is > bottom on bounding box {rect.to_bounding_box()} of rect {rect}"
|
|
||||||
|
|
||||||
return self._dpage.textline_cells
|
return self._dpage.textline_cells
|
||||||
|
|
||||||
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
||||||
@ -171,12 +157,28 @@ class DoclingParseV4DocumentBackend(PdfDocumentBackend):
|
|||||||
self, page_no: int, create_words: bool = True, create_textlines: bool = True
|
self, page_no: int, create_words: bool = True, create_textlines: bool = True
|
||||||
) -> DoclingParseV4PageBackend:
|
) -> DoclingParseV4PageBackend:
|
||||||
with pypdfium2_lock:
|
with pypdfium2_lock:
|
||||||
|
seg_page = self.dp_doc.get_page(
|
||||||
|
page_no + 1,
|
||||||
|
create_words=create_words,
|
||||||
|
create_textlines=create_textlines,
|
||||||
|
)
|
||||||
|
|
||||||
|
# In Docling, all TextCell instances are expected with top-left origin.
|
||||||
|
[
|
||||||
|
tc.to_top_left_origin(seg_page.dimension.height)
|
||||||
|
for tc in seg_page.textline_cells
|
||||||
|
]
|
||||||
|
[
|
||||||
|
tc.to_top_left_origin(seg_page.dimension.height)
|
||||||
|
for tc in seg_page.char_cells
|
||||||
|
]
|
||||||
|
[
|
||||||
|
tc.to_top_left_origin(seg_page.dimension.height)
|
||||||
|
for tc in seg_page.word_cells
|
||||||
|
]
|
||||||
|
|
||||||
return DoclingParseV4PageBackend(
|
return DoclingParseV4PageBackend(
|
||||||
self.dp_doc.get_page(
|
seg_page,
|
||||||
page_no + 1,
|
|
||||||
create_words=create_words,
|
|
||||||
create_textlines=create_textlines,
|
|
||||||
),
|
|
||||||
self._pdoc[page_no],
|
self._pdoc[page_no],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -22,6 +22,75 @@ from pypdfium2._helpers.misc import PdfiumError
|
|||||||
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
||||||
from docling.utils.locks import pypdfium2_lock
|
from docling.utils.locks import pypdfium2_lock
|
||||||
|
|
||||||
|
|
||||||
|
def get_pdf_page_geometry(
    ppage: pdfium.PdfPage,
    angle: float = 0.0,
    boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX,
) -> PdfPageGeometry:
    """
    Create PdfPageGeometry from a pypdfium2 PdfPage object.

    Args:
        ppage: pypdfium2 PdfPage object
        angle: Page rotation angle in degrees (default: 0.0)
        boundary_type: The boundary type for the page (default: CROP_BOX)

    Returns:
        PdfPageGeometry with all the different bounding boxes properly set.
        Any page box that pypdfium2 does not report falls back to the main
        bounding box of the page.
    """
    # Main bounding box (intersection of crop_box and media_box). It also
    # serves as the fallback for every page box that is not defined.
    # pypdfium2 returns (x0, y0, x1, y1) in the PDF coordinate system
    # (bottom-left origin).
    bbox = BoundingBox.from_tuple(ppage.get_bbox(), CoordOrigin.BOTTOMLEFT)

    def _box_or_fallback(box_tuple) -> BoundingBox:
        """Convert a pypdfium2 box tuple, or return ``bbox`` if it is unset."""
        if not box_tuple:
            return bbox
        return BoundingBox.from_tuple(box_tuple, CoordOrigin.BOTTOMLEFT)

    # Query the individual page boxes in a fixed order: media, crop, art,
    # bleed, trim.
    media_bbox = _box_or_fallback(ppage.get_mediabox())
    crop_bbox = _box_or_fallback(ppage.get_cropbox())
    art_bbox = _box_or_fallback(ppage.get_artbox())
    bleed_bbox = _box_or_fallback(ppage.get_bleedbox())
    trim_bbox = _box_or_fallback(ppage.get_trimbox())

    return PdfPageGeometry(
        angle=angle,
        rect=BoundingRectangle.from_bounding_box(bbox),
        boundary_type=boundary_type,
        art_bbox=art_bbox,
        bleed_bbox=bleed_bbox,
        crop_bbox=crop_bbox,
        media_bbox=media_bbox,
        trim_bbox=trim_bbox,
    )
|
||||||
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from docling.datamodel.document import InputDocument
|
from docling.datamodel.document import InputDocument
|
||||||
|
|
||||||
@ -213,28 +282,11 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|||||||
if not self.valid:
|
if not self.valid:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
page_size = self.get_size()
|
|
||||||
text_cells = self._compute_text_cells()
|
text_cells = self._compute_text_cells()
|
||||||
|
|
||||||
# Create page geometry
|
# Get the PDF page geometry from pypdfium2
|
||||||
crop_bbox = BoundingBox(
|
with pypdfium2_lock:
|
||||||
l=0,
|
dimension = get_pdf_page_geometry(self._ppage)
|
||||||
r=page_size.width,
|
|
||||||
t=0,
|
|
||||||
b=page_size.height,
|
|
||||||
coord_origin=CoordOrigin.TOPLEFT,
|
|
||||||
).to_bottom_left_origin(page_size.height)
|
|
||||||
|
|
||||||
dimension = PdfPageGeometry(
|
|
||||||
angle=0.0,
|
|
||||||
rect=BoundingRectangle.from_bounding_box(crop_bbox),
|
|
||||||
boundary_type=PdfPageBoundaryType.CROP_BOX,
|
|
||||||
art_bbox=crop_bbox,
|
|
||||||
bleed_bbox=crop_bbox,
|
|
||||||
crop_bbox=crop_bbox,
|
|
||||||
media_bbox=crop_bbox,
|
|
||||||
trim_bbox=crop_bbox,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Create SegmentedPdfPage
|
# Create SegmentedPdfPage
|
||||||
return SegmentedPdfPage(
|
return SegmentedPdfPage(
|
||||||
|
@ -145,7 +145,7 @@ class BaseOcrModel(BasePageModel, BaseModelWithOptions):
|
|||||||
|
|
||||||
# Update parsed_page.textline_cells directly
|
# Update parsed_page.textline_cells directly
|
||||||
page.parsed_page.textline_cells = final_cells
|
page.parsed_page.textline_cells = final_cells
|
||||||
page.parsed_page.has_lines = bool(final_cells)
|
page.parsed_page.has_lines = len(final_cells) > 0
|
||||||
|
|
||||||
def _combine_cells(self, existing_cells, ocr_cells):
|
def _combine_cells(self, existing_cells, ocr_cells):
|
||||||
"""Combine existing and OCR cells with filtering and re-indexing."""
|
"""Combine existing and OCR cells with filtering and re-indexing."""
|
||||||
|
Loading…
Reference in New Issue
Block a user