Revise pipeline

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Christoph Auer 2025-07-18 14:33:03 +02:00
parent 9fd01f3399
commit 33a24848a0
3 changed files with 455 additions and 529 deletions


@@ -1,8 +1,10 @@
 import hashlib
 import logging
 import sys
+import threading
 import time
 from collections.abc import Iterable, Iterator
+from concurrent.futures import ThreadPoolExecutor
 from functools import partial
 from pathlib import Path
 from typing import Dict, List, Optional, Tuple, Type, Union
@@ -50,6 +52,9 @@ from docling.utils.utils import chunkify

 _log = logging.getLogger(__name__)

+# Module-level lock for pipeline cache
+_pipeline_cache_lock = threading.Lock()
+

 class FormatOption(BaseModel):
     pipeline_cls: Type[BasePipeline]
@@ -287,7 +292,10 @@ class DocumentConverter:
         #with ThreadPoolExecutor(
         #    max_workers=settings.perf.doc_batch_concurrency
         #) as pool:
-        #    yield from pool.map(self.process_document, input_batch)
+        #    yield from pool.map(
+        #        partial(self._process_document, raises_on_error=raises_on_error),
+        #        input_batch,
+        #    )

         # Note: PDF backends are not thread-safe, thread pool usage was disabled.
         for item in map(
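For orientation: the commented-out pool dispatch and the sequential map the commit keeps differ only in concurrency. A minimal, self-contained sketch of both forms (stand-in function and inputs, not Docling's actual internals):

    from concurrent.futures import ThreadPoolExecutor
    from functools import partial

    def process_document(doc: str, raises_on_error: bool = False) -> str:
        # Stand-in for DocumentConverter._process_document.
        return doc.upper()

    input_batch = ["a.pdf", "b.pdf"]

    # Pool form, disabled in this commit because the PDF backends
    # are not thread-safe:
    # with ThreadPoolExecutor(max_workers=2) as pool:
    #     results = list(pool.map(partial(process_document, raises_on_error=True), input_batch))

    # Sequential form the commit keeps; same call shape, one thread:
    results = list(map(partial(process_document, raises_on_error=True), input_batch))
    assert results == ["A.PDF", "B.PDF"]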
@@ -315,6 +323,7 @@ class DocumentConverter:
         # Use a composite key to cache pipelines
         cache_key = (pipeline_class, options_hash)
-        if cache_key not in self.initialized_pipelines:
-            _log.info(
-                f"Initializing pipeline for {pipeline_class.__name__} with options hash {options_hash}"
+        with _pipeline_cache_lock:
+            if cache_key not in self.initialized_pipelines:
+                _log.info(
+                    f"Initializing pipeline for {pipeline_class.__name__} with options hash {options_hash}"

File diff suppressed because it is too large.


@@ -1,50 +1,98 @@
+import logging
 import time
 from pathlib import Path
 from typing import List

 import pytest

-from docling.document_converter import DocumentConverter, PdfFormatOption
-from docling.datamodel.base_models import InputFormat, ConversionStatus
+from docling.datamodel.base_models import ConversionStatus, InputFormat
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
     PdfPipelineOptions,
-    ThreadedPdfPipelineOptions
+    ThreadedPdfPipelineOptions,
 )
+from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
 from docling.pipeline.threaded_standard_pdf_pipeline import ThreadedStandardPdfPipeline


 def test_threaded_pipeline_multiple_documents():
-    """Test threaded pipeline with multiple documents"""
-    converter = DocumentConverter(
+    """Test threaded pipeline with multiple documents and compare with standard pipeline"""
+    test_files = [
+        "tests/data/pdf/2203.01017v2.pdf",
+        "tests/data/pdf/2206.01062.pdf",
+        "tests/data/pdf/2305.03393v1.pdf",
+    ]
+
+    # Standard pipeline
+    standard_converter = DocumentConverter(
         format_options={
             InputFormat.PDF: PdfFormatOption(
-                pipeline_cls=ThreadedStandardPdfPipeline,
-                pipeline_options=ThreadedPdfPipelineOptions(
-                    layout_batch_size=48,
-                    ocr_batch_size=24,
-                    batch_timeout_seconds=1.0,
-                )
+                pipeline_cls=StandardPdfPipeline,
+                pipeline_options=PdfPipelineOptions(
+                    do_table_structure=True,
+                    do_ocr=True,
+                ),
             )
         }
     )

-    # Test threaded pipeline with multiple documents
-    results = []
+    # Threaded pipeline
+    threaded_converter = DocumentConverter(
+        format_options={
+            InputFormat.PDF: PdfFormatOption(
+                pipeline_cls=ThreadedStandardPdfPipeline,
+                pipeline_options=ThreadedPdfPipelineOptions(
+                    layout_batch_size=1,
+                    table_batch_size=1,
+                    ocr_batch_size=1,
+                    batch_timeout_seconds=1.0,
+                    do_table_structure=True,
+                    do_ocr=True,
+                ),
+            )
+        }
+    )
+
+    # Test standard pipeline
+    standard_results = []
     start_time = time.perf_counter()
-    for result in converter.convert_all([
-        "tests/data/pdf/2206.01062.pdf",
-        "tests/data/pdf/2305.03393v1.pdf"
-    ]):
-        results.append(result)
-    end_time = time.perf_counter()
-    conversion_duration = end_time - start_time
-    print(f"Threaded multi-doc conversion took {conversion_duration:.2f} seconds")
-    assert len(results) == 2
-    for result in results:
+    for result in standard_converter.convert_all(test_files, raises_on_error=True):
+        print("Finished converting document with standard pipeline:", result.input.file.name)
+        standard_results.append(result)
+    standard_time = time.perf_counter() - start_time
+
+    del standard_converter
+
+    # Test threaded pipeline
+    threaded_results = []
+    start_time = time.perf_counter()
+    for result in threaded_converter.convert_all(test_files, raises_on_error=True):
+        print("Finished converting document with threaded pipeline:", result.input.file.name)
+        threaded_results.append(result)
+    threaded_time = time.perf_counter() - start_time
+
+    del threaded_converter
+
+    print("\nMulti-document Pipeline Comparison:")
+    print(f"Standard pipeline: {standard_time:.2f} seconds")
+    print(f"Threaded pipeline: {threaded_time:.2f} seconds")
+    print(f"Speedup: {standard_time / threaded_time:.2f}x")
+
+    # Verify results
+    assert len(standard_results) == len(threaded_results)
+    for result in standard_results:
         assert result.status == ConversionStatus.SUCCESS
+    for result in threaded_results:
+        assert result.status == ConversionStatus.SUCCESS
+
+    # Basic content comparison
+    for i, (standard_result, threaded_result) in enumerate(zip(standard_results, threaded_results)):
+        standard_doc = standard_result.document
+        threaded_doc = threaded_result.document
+
+        assert len(standard_doc.pages) == len(threaded_doc.pages), f"Document {i} page count mismatch"
+        assert len(standard_doc.texts) == len(threaded_doc.texts), f"Document {i} text count mismatch"


 def test_pipeline_comparison():
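The new test doubles as a usage reference for the threaded pipeline. Outside pytest, a single-document conversion would look roughly like this (batch sizes illustrative; the test pins them to 1 only to stress batch boundaries):

    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import ThreadedPdfPipelineOptions
    from docling.document_converter import DocumentConverter, PdfFormatOption
    from docling.pipeline.threaded_standard_pdf_pipeline import ThreadedStandardPdfPipeline

    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=ThreadedStandardPdfPipeline,
                pipeline_options=ThreadedPdfPipelineOptions(
                    layout_batch_size=4,
                    table_batch_size=4,
                    ocr_batch_size=4,
                    batch_timeout_seconds=1.0,
                ),
            )
        }
    )

    result = converter.convert("tests/data/pdf/2206.01062.pdf")
    print(result.status, len(result.document.pages))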
@@ -73,7 +121,7 @@ def test_pipeline_comparison():
                     layout_batch_size=1,
                     ocr_batch_size=1,
                     table_batch_size=1,
-                )
+                ),
             )
         }
     )
@@ -82,14 +130,16 @@ def test_pipeline_comparison():
     threaded_results = list(threaded_converter.convert_all([test_file]))
     threaded_time = time.perf_counter() - start_time

-    print(f"\nPipeline Comparison:")
+    print("\nPipeline Comparison:")
     print(f"Sync pipeline: {sync_time:.2f} seconds")
     print(f"Threaded pipeline: {threaded_time:.2f} seconds")
     print(f"Speedup: {sync_time / threaded_time:.2f}x")

     # Verify results are equivalent
     assert len(sync_results) == len(threaded_results) == 1
-    assert sync_results[0].status == threaded_results[0].status == ConversionStatus.SUCCESS
+    assert (
+        sync_results[0].status == threaded_results[0].status == ConversionStatus.SUCCESS
+    )

     # Basic content comparison
     sync_doc = sync_results[0].document
@@ -99,9 +149,6 @@ def test_pipeline_comparison():
     assert len(sync_doc.texts) == len(threaded_doc.texts)


 if __name__ == "__main__":
     # Run basic performance test
     test_pipeline_comparison()
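To run the whole module under pytest instead of the __main__ hook, something like the following should work (test file path is an assumption, since the diff view does not show file names):

    import pytest

    # Path assumed; the diff page hides the file name.
    pytest.main(["-s", "tests/test_threaded_pipeline.py"])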