Revise pipeline

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2025-12-10 21:58:15 +00:00 · 2025-07-18 14:33:03 +02:00
parent 9fd01f3399
commit 33a24848a0
3 changed files with 455 additions and 529 deletions
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@@ -1,8 +1,10 @@
 import hashlib
 import logging
 import sys
 import threading
 import time
 from collections.abc import Iterable, Iterator
 from concurrent.futures import ThreadPoolExecutor
 from functools import partial
 from pathlib import Path
 from typing import Dict, List, Optional, Tuple, Type, Union
@@ -50,6 +52,9 @@ from docling.utils.utils import chunkify
 _log = logging.getLogger(__name__)
 # Module-level lock for pipeline cache
 _pipeline_cache_lock = threading.Lock()
 class FormatOption(BaseModel):
    pipeline_cls: Type[BasePipeline]
@@ -284,10 +289,13 @@ class DocumentConverter:
            _log.info("Going to convert document batch...")
            # parallel processing only within input_batch
-            # with ThreadPoolExecutor(
+            #with ThreadPoolExecutor(
            #    max_workers=settings.perf.doc_batch_concurrency
-            # ) as pool:
+            #) as pool:
-            #   yield from pool.map(self.process_document, input_batch)
+            #    yield from pool.map(
            #        partial(self._process_document, raises_on_error=raises_on_error),
            #        input_batch,
            #    )
             # Note: PDF backends are not thread-safe, so thread pool usage is disabled.
            for item in map(
@@ -315,6 +323,7 @@ class DocumentConverter:
        # Use a composite key to cache pipelines
        cache_key = (pipeline_class, options_hash)
        with _pipeline_cache_lock:
            if cache_key not in self.initialized_pipelines:
                _log.info(
                    f"Initializing pipeline for {pipeline_class.__name__} with options hash {options_hash}"
--- a/docling/pipeline/threaded_standard_pdf_pipeline.py
+++ b/docling/pipeline/threaded_standard_pdf_pipeline.py
--- a/tests/test_threaded_pipeline.py
+++ b/tests/test_threaded_pipeline.py
@@ -1,50 +1,98 @@
 import logging
 import time
 from pathlib import Path
 from typing import List
 import pytest
-from docling.document_converter import DocumentConverter, PdfFormatOption
+from docling.datamodel.base_models import ConversionStatus, InputFormat
 from docling.datamodel.base_models import InputFormat, ConversionStatus
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
-    ThreadedPdfPipelineOptions
+    ThreadedPdfPipelineOptions,
 )
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
 from docling.pipeline.threaded_standard_pdf_pipeline import ThreadedStandardPdfPipeline
 def test_threaded_pipeline_multiple_documents():
-    """Test threaded pipeline with multiple documents"""
+    """Test threaded pipeline with multiple documents and compare with standard pipeline"""
-    converter = DocumentConverter(
+    test_files = [
        "tests/data/pdf/2203.01017v2.pdf",
        "tests/data/pdf/2206.01062.pdf",
        "tests/data/pdf/2305.03393v1.pdf"
    ]
    # Standard pipeline
    standard_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
-                pipeline_cls=ThreadedStandardPdfPipeline,
+                pipeline_cls=StandardPdfPipeline,
-                pipeline_options=ThreadedPdfPipelineOptions(
+                pipeline_options=PdfPipelineOptions(
-                    layout_batch_size=48,
+                    do_table_structure=True,
-                    ocr_batch_size=24,
+                    do_ocr=True,
-                    batch_timeout_seconds=1.0,
+                ),
                )
            )
        }
    )
-    # Test threaded pipeline with multiple documents
+    # Threaded pipeline
-    results = []
+    threaded_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=ThreadedStandardPdfPipeline,
                pipeline_options=ThreadedPdfPipelineOptions(
                    layout_batch_size=1,
                    table_batch_size=1,
                    ocr_batch_size=1,
                    batch_timeout_seconds=1.0,
                    do_table_structure=True,
                    do_ocr=True,
                ),
            )
        }
    )
    # Test standard pipeline
    standard_results = []
    start_time = time.perf_counter()
-    for result in converter.convert_all([
+    for result in standard_converter.convert_all(test_files, raises_on_error=True):
-        "tests/data/pdf/2206.01062.pdf", 
+        print("Finished converting document with standard pipeline:", result.input.file.name)
-        "tests/data/pdf/2305.03393v1.pdf"
+        standard_results.append(result)
-    ]):
+    standard_time = time.perf_counter() - start_time
        results.append(result)
    end_time = time.perf_counter()
-    conversion_duration = end_time - start_time
+    del standard_converter
    print(f"Threaded multi-doc conversion took {conversion_duration:.2f} seconds")
-    assert len(results) == 2
+    # Test threaded pipeline
-    for result in results:
+    threaded_results = []
    start_time = time.perf_counter()
    for result in threaded_converter.convert_all(test_files, raises_on_error=True):
        print("Finished converting document with threaded pipeline:", result.input.file.name)
        threaded_results.append(result)
    threaded_time = time.perf_counter() - start_time
    del threaded_converter
    print("\nMulti-document Pipeline Comparison:")
    print(f"Standard pipeline:  {standard_time:.2f} seconds")
    print(f"Threaded pipeline:  {threaded_time:.2f} seconds")
    print(f"Speedup:            {standard_time / threaded_time:.2f}x")
    # Verify results
    assert len(standard_results) == len(threaded_results)
    for result in standard_results:
        assert result.status == ConversionStatus.SUCCESS
    for result in threaded_results:
        assert result.status == ConversionStatus.SUCCESS
    # Basic content comparison
    for i, (standard_result, threaded_result) in enumerate(zip(standard_results, threaded_results)):
        standard_doc = standard_result.document
        threaded_doc = threaded_result.document
        assert len(standard_doc.pages) == len(threaded_doc.pages), f"Document {i} page count mismatch"
        assert len(standard_doc.texts) == len(threaded_doc.texts), f"Document {i} text count mismatch"
 def test_pipeline_comparison():
@@ -73,7 +121,7 @@ def test_pipeline_comparison():
                    layout_batch_size=1,
                    ocr_batch_size=1,
                    table_batch_size=1,
-                )
+                ),
            )
        }
    )
@@ -82,14 +130,16 @@ def test_pipeline_comparison():
    threaded_results = list(threaded_converter.convert_all([test_file]))
    threaded_time = time.perf_counter() - start_time
-    print(f"\nPipeline Comparison:")
+    print("\nPipeline Comparison:")
    print(f"Sync pipeline:     {sync_time:.2f} seconds")
    print(f"Threaded pipeline: {threaded_time:.2f} seconds")
-    print(f"Speedup:           {sync_time/threaded_time:.2f}x")
+    print(f"Speedup:           {sync_time / threaded_time:.2f}x")
    # Verify results are equivalent
    assert len(sync_results) == len(threaded_results) == 1
-    assert sync_results[0].status == threaded_results[0].status == ConversionStatus.SUCCESS
+    assert (
        sync_results[0].status == threaded_results[0].status == ConversionStatus.SUCCESS
    )
    # Basic content comparison
    sync_doc = sync_results[0].document
@@ -99,9 +149,6 @@ def test_pipeline_comparison():
    assert len(sync_doc.texts) == len(threaded_doc.texts)
 if __name__ == "__main__":
    # Run basic performance test
    test_pipeline_comparison()