fix(pdf): threadsafe for pypdfium2 backend (#2527)

* add threadsafe test

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* test backend

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* test threaded pipeline

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add test_pypdfium_threaded_pipeline

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add more threadsafe blocks

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* fix threadsafe in pypdfium backend

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* remove unnecessary tests

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* restore clean test

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi
2025-10-30 17:58:39 +01:00
committed by GitHub
parent d27fe92e01
commit a51275d080
2 changed files with 26 additions and 4 deletions

View File

@@ -5,6 +5,7 @@ from typing import List
import pytest
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
@@ -171,6 +172,27 @@ def test_pipeline_comparison():
assert len(sync_doc.texts) == len(threaded_doc.texts)
def test_pypdfium_threaded_pipeline():
    """Convert one PDF several times with the threaded pipeline on pypdfium2.

    A single ``DocumentConverter`` is configured to use
    ``ThreadedStandardPdfPipeline`` together with ``PyPdfiumDocumentBackend``
    and is reused for every run; each conversion must report
    ``ConversionStatus.SUCCESS``.

    NOTE(review): presumably the repetition is meant to surface intermittent
    failures when this backend is driven by the threaded pipeline -- confirm
    against the change that introduced the test.
    """
    # Only the pipeline class and the backend are overridden; every other
    # converter setting is left at its internal default.
    pdf_option = PdfFormatOption(
        pipeline_cls=ThreadedStandardPdfPipeline,
        backend=PyPdfiumDocumentBackend,
    )
    doc_converter = DocumentConverter(
        format_options={InputFormat.PDF: pdf_option},
    )

    test_file = "tests/data/pdf/2206.01062.pdf"
    num_runs = 6

    # The loop variable must stay named ``i``: the ``{i=}`` f-string form
    # prints the variable name as part of the message.
    for i in range(num_runs):
        print(f"iteration {i=}")
        conv_result = doc_converter.convert(test_file)
        assert conv_result.status == ConversionStatus.SUCCESS
        print(f"[{i=}] Success")

    print("All done!")
if __name__ == "__main__":
# Run basic performance test
test_pipeline_comparison()