fix(pdf): threadsafe for pypdfium2 backend (#2527)

* add threadsafe test

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* test backend

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* test threaded pipeline

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add test_pypdfium_threaded_pipeline

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add more threadsafe blocks

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* fix threadsafe in pypdfium backend

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* remove unnecessary tests

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* restore clean test

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi
2025-10-30 17:58:39 +01:00
committed by GitHub
parent d27fe92e01
commit a51275d080
2 changed files with 26 additions and 4 deletions

View File

@@ -229,9 +229,9 @@ class PyPdfiumPageBackend(PdfPageBackend):
b=max(cell.rect.to_bounding_box().b for cell in group), b=max(cell.rect.to_bounding_box().b for cell in group),
) )
assert self._ppage is not None assert self.text_page is not None
self.text_page = self._ppage.get_textpage()
bbox = merged_bbox.to_bottom_left_origin(page_size.height) bbox = merged_bbox.to_bottom_left_origin(page_size.height)
with pypdfium2_lock:
merged_text = self.text_page.get_text_bounded(*bbox.as_tuple()) merged_text = self.text_page.get_text_bounded(*bbox.as_tuple())
return TextCell( return TextCell(
@@ -255,9 +255,9 @@ class PyPdfiumPageBackend(PdfPageBackend):
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]: def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
AREA_THRESHOLD = 0 # 32 * 32 AREA_THRESHOLD = 0 # 32 * 32
page_size = self.get_size() page_size = self.get_size()
rotation = self._ppage.get_rotation()
with pypdfium2_lock: with pypdfium2_lock:
rotation = self._ppage.get_rotation()
for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]): for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
pos = obj.get_pos() pos = obj.get_pos()
if rotation == 90: if rotation == 90:

View File

@@ -5,6 +5,7 @@ from typing import List
import pytest import pytest
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import ConversionStatus, InputFormat from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import ( from docling.datamodel.pipeline_options import (
@@ -171,6 +172,27 @@ def test_pipeline_comparison():
assert len(sync_doc.texts) == len(threaded_doc.texts) assert len(sync_doc.texts) == len(threaded_doc.texts)
def test_pypdfium_threaded_pipeline():
    """Run the threaded PDF pipeline on the pypdfium2 backend several times.

    The same document is converted six times in a row with a single
    converter instance; every conversion must report
    ``ConversionStatus.SUCCESS``.
    """
    # Route PDF inputs through the threaded pipeline with the pypdfium2 backend.
    pdf_option = PdfFormatOption(
        pipeline_cls=ThreadedStandardPdfPipeline,
        backend=PyPdfiumDocumentBackend,
    )
    doc_converter = DocumentConverter(
        format_options={InputFormat.PDF: pdf_option},
    )

    test_file = "tests/data/pdf/2206.01062.pdf"
    # The loop variable must stay named `i`: the `{i=}` f-strings print its name.
    for i in range(6):
        print(f"iteration {i=}")
        conv_result = doc_converter.convert(test_file)
        assert conv_result.status == ConversionStatus.SUCCESS
        print(f"[{i=}] Success")

    print("All done!")
if __name__ == "__main__": if __name__ == "__main__":
# Run basic performance test # Run basic performance test
test_pipeline_comparison() test_pipeline_comparison()