fix(pdf): threadsafe for pypdfium2 backend (#2527)

* add threadsafe test

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* test backend

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* test threaded pipeline

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add test_pypdfium_threaded_pipeline

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add more threadsafe blocks

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* fix threadsafe in pypdfium backend

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* remove unnecessary tests

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* restore clean test

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi
2025-10-30 17:58:39 +01:00
committed by GitHub
parent d27fe92e01
commit a51275d080
2 changed files with 26 additions and 4 deletions

View File

@@ -229,10 +229,10 @@ class PyPdfiumPageBackend(PdfPageBackend):
b=max(cell.rect.to_bounding_box().b for cell in group),
)
assert self._ppage is not None
self.text_page = self._ppage.get_textpage()
assert self.text_page is not None
bbox = merged_bbox.to_bottom_left_origin(page_size.height)
merged_text = self.text_page.get_text_bounded(*bbox.as_tuple())
with pypdfium2_lock:
merged_text = self.text_page.get_text_bounded(*bbox.as_tuple())
return TextCell(
index=group[0].index,
@@ -255,9 +255,9 @@ class PyPdfiumPageBackend(PdfPageBackend):
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
AREA_THRESHOLD = 0 # 32 * 32
page_size = self.get_size()
rotation = self._ppage.get_rotation()
with pypdfium2_lock:
rotation = self._ppage.get_rotation()
for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
pos = obj.get_pos()
if rotation == 90: