mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
Remove `with pypdfium2_lock` from caller sites (the lock is now acquired inside `get_pdf_page_geometry`)
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
1c39dc93ab
commit
5ec6de3ae4
@ -143,8 +143,7 @@ class DoclingParseV2PageBackend(PdfPageBackend):
|
|||||||
text_cells = self._compute_text_cells()
|
text_cells = self._compute_text_cells()
|
||||||
|
|
||||||
# Get the PDF page geometry from pypdfium2
|
# Get the PDF page geometry from pypdfium2
|
||||||
with pypdfium2_lock:
|
dimension = get_pdf_page_geometry(self._ppage)
|
||||||
dimension = get_pdf_page_geometry(self._ppage)
|
|
||||||
|
|
||||||
# Create SegmentedPdfPage
|
# Create SegmentedPdfPage
|
||||||
return SegmentedPdfPage(
|
return SegmentedPdfPage(
|
||||||
@ -152,7 +151,7 @@ class DoclingParseV2PageBackend(PdfPageBackend):
|
|||||||
textline_cells=text_cells,
|
textline_cells=text_cells,
|
||||||
char_cells=[],
|
char_cells=[],
|
||||||
word_cells=[],
|
word_cells=[],
|
||||||
has_lines=len(text_cells) > 0,
|
has_textlines=len(text_cells) > 0,
|
||||||
has_words=False,
|
has_words=False,
|
||||||
has_chars=False,
|
has_chars=False,
|
||||||
)
|
)
|
||||||
|
@ -39,56 +39,57 @@ def get_pdf_page_geometry(
|
|||||||
Returns:
|
Returns:
|
||||||
PdfPageGeometry with all the different bounding boxes properly set
|
PdfPageGeometry with all the different bounding boxes properly set
|
||||||
"""
|
"""
|
||||||
# Get the main bounding box (intersection of crop_box and media_box)
|
with pypdfium2_lock:
|
||||||
bbox_tuple = ppage.get_bbox()
|
# Get the main bounding box (intersection of crop_box and media_box)
|
||||||
bbox = BoundingBox.from_tuple(bbox_tuple, CoordOrigin.BOTTOMLEFT)
|
bbox_tuple = ppage.get_bbox()
|
||||||
|
bbox = BoundingBox.from_tuple(bbox_tuple, CoordOrigin.BOTTOMLEFT)
|
||||||
|
|
||||||
# Get all the different page boxes from pypdfium2
|
# Get all the different page boxes from pypdfium2
|
||||||
media_box_tuple = ppage.get_mediabox()
|
media_box_tuple = ppage.get_mediabox()
|
||||||
crop_box_tuple = ppage.get_cropbox()
|
crop_box_tuple = ppage.get_cropbox()
|
||||||
art_box_tuple = ppage.get_artbox()
|
art_box_tuple = ppage.get_artbox()
|
||||||
bleed_box_tuple = ppage.get_bleedbox()
|
bleed_box_tuple = ppage.get_bleedbox()
|
||||||
trim_box_tuple = ppage.get_trimbox()
|
trim_box_tuple = ppage.get_trimbox()
|
||||||
|
|
||||||
# Convert to BoundingBox objects using existing from_tuple method
|
# Convert to BoundingBox objects using existing from_tuple method
|
||||||
# pypdfium2 returns (x0, y0, x1, y1) in PDF coordinate system (bottom-left origin)
|
# pypdfium2 returns (x0, y0, x1, y1) in PDF coordinate system (bottom-left origin)
|
||||||
# Use bbox as fallback when specific box types are not defined
|
# Use bbox as fallback when specific box types are not defined
|
||||||
media_bbox = (
|
media_bbox = (
|
||||||
BoundingBox.from_tuple(media_box_tuple, CoordOrigin.BOTTOMLEFT)
|
BoundingBox.from_tuple(media_box_tuple, CoordOrigin.BOTTOMLEFT)
|
||||||
if media_box_tuple
|
if media_box_tuple
|
||||||
else bbox
|
else bbox
|
||||||
)
|
)
|
||||||
crop_bbox = (
|
crop_bbox = (
|
||||||
BoundingBox.from_tuple(crop_box_tuple, CoordOrigin.BOTTOMLEFT)
|
BoundingBox.from_tuple(crop_box_tuple, CoordOrigin.BOTTOMLEFT)
|
||||||
if crop_box_tuple
|
if crop_box_tuple
|
||||||
else bbox
|
else bbox
|
||||||
)
|
)
|
||||||
art_bbox = (
|
art_bbox = (
|
||||||
BoundingBox.from_tuple(art_box_tuple, CoordOrigin.BOTTOMLEFT)
|
BoundingBox.from_tuple(art_box_tuple, CoordOrigin.BOTTOMLEFT)
|
||||||
if art_box_tuple
|
if art_box_tuple
|
||||||
else bbox
|
else bbox
|
||||||
)
|
)
|
||||||
bleed_bbox = (
|
bleed_bbox = (
|
||||||
BoundingBox.from_tuple(bleed_box_tuple, CoordOrigin.BOTTOMLEFT)
|
BoundingBox.from_tuple(bleed_box_tuple, CoordOrigin.BOTTOMLEFT)
|
||||||
if bleed_box_tuple
|
if bleed_box_tuple
|
||||||
else bbox
|
else bbox
|
||||||
)
|
)
|
||||||
trim_bbox = (
|
trim_bbox = (
|
||||||
BoundingBox.from_tuple(trim_box_tuple, CoordOrigin.BOTTOMLEFT)
|
BoundingBox.from_tuple(trim_box_tuple, CoordOrigin.BOTTOMLEFT)
|
||||||
if trim_box_tuple
|
if trim_box_tuple
|
||||||
else bbox
|
else bbox
|
||||||
)
|
)
|
||||||
|
|
||||||
return PdfPageGeometry(
|
return PdfPageGeometry(
|
||||||
angle=angle,
|
angle=angle,
|
||||||
rect=BoundingRectangle.from_bounding_box(bbox),
|
rect=BoundingRectangle.from_bounding_box(bbox),
|
||||||
boundary_type=boundary_type,
|
boundary_type=boundary_type,
|
||||||
art_bbox=art_bbox,
|
art_bbox=art_bbox,
|
||||||
bleed_bbox=bleed_bbox,
|
bleed_bbox=bleed_bbox,
|
||||||
crop_bbox=crop_bbox,
|
crop_bbox=crop_bbox,
|
||||||
media_bbox=media_bbox,
|
media_bbox=media_bbox,
|
||||||
trim_bbox=trim_bbox,
|
trim_bbox=trim_bbox,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
@ -285,8 +286,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|||||||
text_cells = self._compute_text_cells()
|
text_cells = self._compute_text_cells()
|
||||||
|
|
||||||
# Get the PDF page geometry from pypdfium2
|
# Get the PDF page geometry from pypdfium2
|
||||||
with pypdfium2_lock:
|
dimension = get_pdf_page_geometry(self._ppage)
|
||||||
dimension = get_pdf_page_geometry(self._ppage)
|
|
||||||
|
|
||||||
# Create SegmentedPdfPage
|
# Create SegmentedPdfPage
|
||||||
return SegmentedPdfPage(
|
return SegmentedPdfPage(
|
||||||
@ -294,7 +294,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|||||||
textline_cells=text_cells,
|
textline_cells=text_cells,
|
||||||
char_cells=[],
|
char_cells=[],
|
||||||
word_cells=[],
|
word_cells=[],
|
||||||
has_lines=len(text_cells) > 0,
|
has_textlines=len(text_cells) > 0,
|
||||||
has_words=False,
|
has_words=False,
|
||||||
has_chars=False,
|
has_chars=False,
|
||||||
)
|
)
|
||||||
|
Loading…
Reference in New Issue
Block a user