mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 12:48:28 +00:00
fix(pdf): threadsafe for pypdfium2 backend (#2527)
* add threadsafe test
  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* test backend
  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* test threaded pipeline
  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* add test_pypdfium_threaded_pipeline
  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* add more threadsafe blocks
  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* fix threadsafe in pypdfium backend
  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* remove unnecessary tests
  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* restore clean test
  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
---------
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
@@ -229,9 +229,9 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|||||||
b=max(cell.rect.to_bounding_box().b for cell in group),
|
b=max(cell.rect.to_bounding_box().b for cell in group),
|
||||||
)
|
)
|
||||||
|
|
||||||
assert self._ppage is not None
|
assert self.text_page is not None
|
||||||
self.text_page = self._ppage.get_textpage()
|
|
||||||
bbox = merged_bbox.to_bottom_left_origin(page_size.height)
|
bbox = merged_bbox.to_bottom_left_origin(page_size.height)
|
||||||
|
with pypdfium2_lock:
|
||||||
merged_text = self.text_page.get_text_bounded(*bbox.as_tuple())
|
merged_text = self.text_page.get_text_bounded(*bbox.as_tuple())
|
||||||
|
|
||||||
return TextCell(
|
return TextCell(
|
||||||
@@ -255,9 +255,9 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|||||||
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
||||||
AREA_THRESHOLD = 0 # 32 * 32
|
AREA_THRESHOLD = 0 # 32 * 32
|
||||||
page_size = self.get_size()
|
page_size = self.get_size()
|
||||||
rotation = self._ppage.get_rotation()
|
|
||||||
|
|
||||||
with pypdfium2_lock:
|
with pypdfium2_lock:
|
||||||
|
rotation = self._ppage.get_rotation()
|
||||||
for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
|
for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
|
||||||
pos = obj.get_pos()
|
pos = obj.get_pos()
|
||||||
if rotation == 90:
|
if rotation == 90:
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ from typing import List
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||||
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
@@ -171,6 +172,27 @@ def test_pipeline_comparison():
|
|||||||
assert len(sync_doc.texts) == len(threaded_doc.texts)
|
assert len(sync_doc.texts) == len(threaded_doc.texts)
|
||||||
|
|
||||||
|
|
||||||
|
def test_pypdfium_threaded_pipeline():
|
||||||
|
doc_converter = (
|
||||||
|
DocumentConverter( # all of the below is optional, has internal defaults.
|
||||||
|
format_options={
|
||||||
|
InputFormat.PDF: PdfFormatOption(
|
||||||
|
pipeline_cls=ThreadedStandardPdfPipeline,
|
||||||
|
backend=PyPdfiumDocumentBackend,
|
||||||
|
),
|
||||||
|
},
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
test_file = "tests/data/pdf/2206.01062.pdf"
|
||||||
|
for i in range(6):
|
||||||
|
print(f"iteration {i=}")
|
||||||
|
conv_result = doc_converter.convert(test_file)
|
||||||
|
assert conv_result.status == ConversionStatus.SUCCESS
|
||||||
|
print(f"[{i=}] Success")
|
||||||
|
print("All done!")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# Run basic performance test
|
# Run basic performance test
|
||||||
test_pipeline_comparison()
|
test_pipeline_comparison()
|
||||||
|
|||||||
Reference in New Issue
Block a user