From 346a49c28375168747a5ead903981ed429dba04d Mon Sep 17 00:00:00 2001 From: Michele Dolfi Date: Sun, 2 Mar 2025 09:58:06 -0500 Subject: [PATCH] fix deadlock in pypdfium2 backend Signed-off-by: Michele Dolfi --- docling/backend/pypdfium2_backend.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/docling/backend/pypdfium2_backend.py b/docling/backend/pypdfium2_backend.py index b19a95b8..b585e2d5 100644 --- a/docling/backend/pypdfium2_backend.py +++ b/docling/backend/pypdfium2_backend.py @@ -42,12 +42,13 @@ class PyPdfiumPageBackend(PdfPageBackend): def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]: AREA_THRESHOLD = 0 # 32 * 32 + page_size = self.get_size() with pypdfium2_lock: for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]): pos = obj.get_pos() cropbox = BoundingBox.from_tuple( pos, origin=CoordOrigin.BOTTOMLEFT - ).to_top_left_origin(page_height=self.get_size().height) + ).to_top_left_origin(page_height=page_size.height) if cropbox.area() > AREA_THRESHOLD: cropbox = cropbox.scaled(scale=scale) @@ -59,9 +60,10 @@ class PyPdfiumPageBackend(PdfPageBackend): if not self.text_page: self.text_page = self._ppage.get_textpage() - if bbox.coord_origin != CoordOrigin.BOTTOMLEFT: - bbox = bbox.to_bottom_left_origin(self.get_size().height) + if bbox.coord_origin != CoordOrigin.BOTTOMLEFT: + bbox = bbox.to_bottom_left_origin(self.get_size().height) + with pypdfium2_lock: text_piece = self.text_page.get_text_bounded(*bbox.as_tuple()) return text_piece @@ -71,11 +73,12 @@ class PyPdfiumPageBackend(PdfPageBackend): if not self.text_page: self.text_page = self._ppage.get_textpage() - cells = [] - cell_counter = 0 + cells = [] + cell_counter = 0 - page_size = self.get_size() + page_size = self.get_size() + with pypdfium2_lock: for i in range(self.text_page.count_rects()): rect = self.text_page.get_rect(i) text_piece = self.text_page.get_text_bounded(*rect)