Update threaded test

Signed-off-by: Ubuntu <ubuntu@ip-172-31-30-253.eu-central-1.compute.internal>
This commit is contained in:
Ubuntu 2025-07-18 15:18:21 +00:00
parent 988db91bff
commit 89acdb5db2

View File

@@ -23,7 +23,15 @@ def test_threaded_pipeline_multiple_documents():
"tests/data/pdf/2206.01062.pdf", "tests/data/pdf/2206.01062.pdf",
"tests/data/pdf/2305.03393v1.pdf", "tests/data/pdf/2305.03393v1.pdf",
] ]
# test_files = [str(f) for f in Path("/home/ubuntu/datasets/flat_bench_set").rglob("*.pdf")]
do_ts = False
do_ocr = False
run_threaded = True
run_serial = True
if run_threaded:
# Threaded pipeline # Threaded pipeline
threaded_converter = DocumentConverter( threaded_converter = DocumentConverter(
format_options={ format_options={
@@ -34,8 +42,8 @@ def test_threaded_pipeline_multiple_documents():
table_batch_size=1, table_batch_size=1,
ocr_batch_size=1, ocr_batch_size=1,
batch_timeout_seconds=1.0, batch_timeout_seconds=1.0,
do_table_structure=True, do_table_structure=do_ts,
do_ocr=True, do_ocr=do_ocr,
), ),
) )
} }
@@ -56,17 +64,17 @@ def test_threaded_pipeline_multiple_documents():
del threaded_converter del threaded_converter
print("\nMulti-document Pipeline Comparison:")
print(f"Threaded pipeline: {threaded_time:.2f} seconds") print(f"Threaded pipeline: {threaded_time:.2f} seconds")
if run_serial:
# Standard pipeline # Standard pipeline
standard_converter = DocumentConverter( standard_converter = DocumentConverter(
format_options={ format_options={
InputFormat.PDF: PdfFormatOption( InputFormat.PDF: PdfFormatOption(
pipeline_cls=StandardPdfPipeline, pipeline_cls=StandardPdfPipeline,
pipeline_options=PdfPipelineOptions( pipeline_options=PdfPipelineOptions(
do_table_structure=True, do_table_structure=do_ts,
do_ocr=True, do_ocr=do_ocr,
), ),
) )
} }
@@ -88,15 +96,18 @@ def test_threaded_pipeline_multiple_documents():
del standard_converter del standard_converter
print(f"Standard pipeline: {standard_time:.2f} seconds") print(f"Standard pipeline: {standard_time:.2f} seconds")
print(f"Speedup: {standard_time / threaded_time:.2f}x")
# Verify results # Verify results
if run_threaded and run_serial:
assert len(standard_results) == len(threaded_results) assert len(standard_results) == len(threaded_results)
if run_serial:
for result in standard_results: for result in standard_results:
assert result.status == ConversionStatus.SUCCESS assert result.status == ConversionStatus.SUCCESS
if run_threaded:
for result in threaded_results: for result in threaded_results:
assert result.status == ConversionStatus.SUCCESS assert result.status == ConversionStatus.SUCCESS
if run_serial and run_threaded:
# Basic content comparison # Basic content comparison
for i, (standard_result, threaded_result) in enumerate( for i, (standard_result, threaded_result) in enumerate(
zip(standard_results, threaded_results) zip(standard_results, threaded_results)