diff --git a/docling/backend/pypdfium2_backend.py b/docling/backend/pypdfium2_backend.py index bacc4a99..bd3b1b7c 100644 --- a/docling/backend/pypdfium2_backend.py +++ b/docling/backend/pypdfium2_backend.py @@ -229,10 +229,10 @@ class PyPdfiumPageBackend(PdfPageBackend): b=max(cell.rect.to_bounding_box().b for cell in group), ) - assert self._ppage is not None - self.text_page = self._ppage.get_textpage() + assert self.text_page is not None bbox = merged_bbox.to_bottom_left_origin(page_size.height) - merged_text = self.text_page.get_text_bounded(*bbox.as_tuple()) + with pypdfium2_lock: + merged_text = self.text_page.get_text_bounded(*bbox.as_tuple()) return TextCell( index=group[0].index, @@ -255,9 +255,9 @@ class PyPdfiumPageBackend(PdfPageBackend): def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]: AREA_THRESHOLD = 0 # 32 * 32 page_size = self.get_size() - rotation = self._ppage.get_rotation() with pypdfium2_lock: + rotation = self._ppage.get_rotation() for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]): pos = obj.get_pos() if rotation == 90: diff --git a/tests/test_threaded_pipeline.py b/tests/test_threaded_pipeline.py index ec6d3427..5810565c 100644 --- a/tests/test_threaded_pipeline.py +++ b/tests/test_threaded_pipeline.py @@ -5,6 +5,7 @@ from typing import List import pytest +from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docling.datamodel.base_models import ConversionStatus, InputFormat from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import ( @@ -171,6 +172,27 @@ def test_pipeline_comparison(): assert len(sync_doc.texts) == len(threaded_doc.texts) +def test_pypdfium_threaded_pipeline(): + doc_converter = ( + DocumentConverter( # explicitly select the threaded pipeline and the pypdfium2 backend under test.
+ format_options={ + InputFormat.PDF: PdfFormatOption( + pipeline_cls=ThreadedStandardPdfPipeline, + backend=PyPdfiumDocumentBackend, + ), + }, + ) + ) + + test_file = "tests/data/pdf/2206.01062.pdf" + for i in range(6): + print(f"iteration {i=}") + conv_result = doc_converter.convert(test_file) + assert conv_result.status == ConversionStatus.SUCCESS + print(f"[{i=}] Success") + print("All done!") + + if __name__ == "__main__": # Run basic performance test test_pipeline_comparison()