Remove `with pypdfium2_lock` from caller sites

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2025-06-13 16:11:34 +02:00
parent 1c39dc93ab
commit 5ec6de3ae4
2 changed files with 52 additions and 53 deletions

View File

@@ -143,8 +143,7 @@ class DoclingParseV2PageBackend(PdfPageBackend):
text_cells = self._compute_text_cells() text_cells = self._compute_text_cells()
# Get the PDF page geometry from pypdfium2 # Get the PDF page geometry from pypdfium2
with pypdfium2_lock: dimension = get_pdf_page_geometry(self._ppage)
dimension = get_pdf_page_geometry(self._ppage)
# Create SegmentedPdfPage # Create SegmentedPdfPage
return SegmentedPdfPage( return SegmentedPdfPage(
@@ -152,7 +151,7 @@ class DoclingParseV2PageBackend(PdfPageBackend):
textline_cells=text_cells, textline_cells=text_cells,
char_cells=[], char_cells=[],
word_cells=[], word_cells=[],
has_lines=len(text_cells) > 0, has_textlines=len(text_cells) > 0,
has_words=False, has_words=False,
has_chars=False, has_chars=False,
) )

View File

@@ -39,56 +39,57 @@ def get_pdf_page_geometry(
Returns: Returns:
PdfPageGeometry with all the different bounding boxes properly set PdfPageGeometry with all the different bounding boxes properly set
""" """
# Get the main bounding box (intersection of crop_box and media_box) with pypdfium2_lock:
bbox_tuple = ppage.get_bbox() # Get the main bounding box (intersection of crop_box and media_box)
bbox = BoundingBox.from_tuple(bbox_tuple, CoordOrigin.BOTTOMLEFT) bbox_tuple = ppage.get_bbox()
bbox = BoundingBox.from_tuple(bbox_tuple, CoordOrigin.BOTTOMLEFT)
# Get all the different page boxes from pypdfium2 # Get all the different page boxes from pypdfium2
media_box_tuple = ppage.get_mediabox() media_box_tuple = ppage.get_mediabox()
crop_box_tuple = ppage.get_cropbox() crop_box_tuple = ppage.get_cropbox()
art_box_tuple = ppage.get_artbox() art_box_tuple = ppage.get_artbox()
bleed_box_tuple = ppage.get_bleedbox() bleed_box_tuple = ppage.get_bleedbox()
trim_box_tuple = ppage.get_trimbox() trim_box_tuple = ppage.get_trimbox()
# Convert to BoundingBox objects using existing from_tuple method # Convert to BoundingBox objects using existing from_tuple method
# pypdfium2 returns (x0, y0, x1, y1) in PDF coordinate system (bottom-left origin) # pypdfium2 returns (x0, y0, x1, y1) in PDF coordinate system (bottom-left origin)
# Use bbox as fallback when specific box types are not defined # Use bbox as fallback when specific box types are not defined
media_bbox = ( media_bbox = (
BoundingBox.from_tuple(media_box_tuple, CoordOrigin.BOTTOMLEFT) BoundingBox.from_tuple(media_box_tuple, CoordOrigin.BOTTOMLEFT)
if media_box_tuple if media_box_tuple
else bbox else bbox
) )
crop_bbox = ( crop_bbox = (
BoundingBox.from_tuple(crop_box_tuple, CoordOrigin.BOTTOMLEFT) BoundingBox.from_tuple(crop_box_tuple, CoordOrigin.BOTTOMLEFT)
if crop_box_tuple if crop_box_tuple
else bbox else bbox
) )
art_bbox = ( art_bbox = (
BoundingBox.from_tuple(art_box_tuple, CoordOrigin.BOTTOMLEFT) BoundingBox.from_tuple(art_box_tuple, CoordOrigin.BOTTOMLEFT)
if art_box_tuple if art_box_tuple
else bbox else bbox
) )
bleed_bbox = ( bleed_bbox = (
BoundingBox.from_tuple(bleed_box_tuple, CoordOrigin.BOTTOMLEFT) BoundingBox.from_tuple(bleed_box_tuple, CoordOrigin.BOTTOMLEFT)
if bleed_box_tuple if bleed_box_tuple
else bbox else bbox
) )
trim_bbox = ( trim_bbox = (
BoundingBox.from_tuple(trim_box_tuple, CoordOrigin.BOTTOMLEFT) BoundingBox.from_tuple(trim_box_tuple, CoordOrigin.BOTTOMLEFT)
if trim_box_tuple if trim_box_tuple
else bbox else bbox
) )
return PdfPageGeometry( return PdfPageGeometry(
angle=angle, angle=angle,
rect=BoundingRectangle.from_bounding_box(bbox), rect=BoundingRectangle.from_bounding_box(bbox),
boundary_type=boundary_type, boundary_type=boundary_type,
art_bbox=art_bbox, art_bbox=art_bbox,
bleed_bbox=bleed_bbox, bleed_bbox=bleed_bbox,
crop_bbox=crop_bbox, crop_bbox=crop_bbox,
media_bbox=media_bbox, media_bbox=media_bbox,
trim_bbox=trim_bbox, trim_bbox=trim_bbox,
) )
if TYPE_CHECKING: if TYPE_CHECKING:
@@ -285,8 +286,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
text_cells = self._compute_text_cells() text_cells = self._compute_text_cells()
# Get the PDF page geometry from pypdfium2 # Get the PDF page geometry from pypdfium2
with pypdfium2_lock: dimension = get_pdf_page_geometry(self._ppage)
dimension = get_pdf_page_geometry(self._ppage)
# Create SegmentedPdfPage # Create SegmentedPdfPage
return SegmentedPdfPage( return SegmentedPdfPage(
@@ -294,7 +294,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
textline_cells=text_cells, textline_cells=text_cells,
char_cells=[], char_cells=[],
word_cells=[], word_cells=[],
has_lines=len(text_cells) > 0, has_textlines=len(text_cells) > 0,
has_words=False, has_words=False,
has_chars=False, has_chars=False,
) )