mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
Revise pipeline
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
9fd01f3399
commit
33a24848a0
@ -1,8 +1,10 @@
|
|||||||
import hashlib
|
import hashlib
|
||||||
import logging
|
import logging
|
||||||
import sys
|
import sys
|
||||||
|
import threading
|
||||||
import time
|
import time
|
||||||
from collections.abc import Iterable, Iterator
|
from collections.abc import Iterable, Iterator
|
||||||
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
from functools import partial
|
from functools import partial
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Dict, List, Optional, Tuple, Type, Union
|
from typing import Dict, List, Optional, Tuple, Type, Union
|
||||||
@ -50,6 +52,9 @@ from docling.utils.utils import chunkify
|
|||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Module-level lock for pipeline cache
|
||||||
|
_pipeline_cache_lock = threading.Lock()
|
||||||
|
|
||||||
|
|
||||||
class FormatOption(BaseModel):
|
class FormatOption(BaseModel):
|
||||||
pipeline_cls: Type[BasePipeline]
|
pipeline_cls: Type[BasePipeline]
|
||||||
@ -284,10 +289,13 @@ class DocumentConverter:
|
|||||||
_log.info("Going to convert document batch...")
|
_log.info("Going to convert document batch...")
|
||||||
|
|
||||||
# parallel processing only within input_batch
|
# parallel processing only within input_batch
|
||||||
# with ThreadPoolExecutor(
|
#with ThreadPoolExecutor(
|
||||||
# max_workers=settings.perf.doc_batch_concurrency
|
# max_workers=settings.perf.doc_batch_concurrency
|
||||||
# ) as pool:
|
#) as pool:
|
||||||
# yield from pool.map(self.process_document, input_batch)
|
# yield from pool.map(
|
||||||
|
# partial(self._process_document, raises_on_error=raises_on_error),
|
||||||
|
# input_batch,
|
||||||
|
# )
|
||||||
# Note: PDF backends are not thread-safe, thread pool usage was disabled.
|
# Note: PDF backends are not thread-safe, thread pool usage was disabled.
|
||||||
|
|
||||||
for item in map(
|
for item in map(
|
||||||
@ -315,6 +323,7 @@ class DocumentConverter:
|
|||||||
# Use a composite key to cache pipelines
|
# Use a composite key to cache pipelines
|
||||||
cache_key = (pipeline_class, options_hash)
|
cache_key = (pipeline_class, options_hash)
|
||||||
|
|
||||||
|
with _pipeline_cache_lock:
|
||||||
if cache_key not in self.initialized_pipelines:
|
if cache_key not in self.initialized_pipelines:
|
||||||
_log.info(
|
_log.info(
|
||||||
f"Initializing pipeline for {pipeline_class.__name__} with options hash {options_hash}"
|
f"Initializing pipeline for {pipeline_class.__name__} with options hash {options_hash}"
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -1,50 +1,98 @@
|
|||||||
|
import logging
|
||||||
import time
|
import time
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
||||||
from docling.datamodel.base_models import InputFormat, ConversionStatus
|
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
PdfPipelineOptions,
|
PdfPipelineOptions,
|
||||||
ThreadedPdfPipelineOptions
|
ThreadedPdfPipelineOptions,
|
||||||
)
|
)
|
||||||
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
||||||
from docling.pipeline.threaded_standard_pdf_pipeline import ThreadedStandardPdfPipeline
|
from docling.pipeline.threaded_standard_pdf_pipeline import ThreadedStandardPdfPipeline
|
||||||
|
|
||||||
|
|
||||||
def test_threaded_pipeline_multiple_documents():
|
def test_threaded_pipeline_multiple_documents():
|
||||||
"""Test threaded pipeline with multiple documents"""
|
"""Test threaded pipeline with multiple documents and compare with standard pipeline"""
|
||||||
converter = DocumentConverter(
|
test_files = [
|
||||||
|
"tests/data/pdf/2203.01017v2.pdf",
|
||||||
|
"tests/data/pdf/2206.01062.pdf",
|
||||||
|
"tests/data/pdf/2305.03393v1.pdf"
|
||||||
|
]
|
||||||
|
|
||||||
|
# Standard pipeline
|
||||||
|
standard_converter = DocumentConverter(
|
||||||
format_options={
|
format_options={
|
||||||
InputFormat.PDF: PdfFormatOption(
|
InputFormat.PDF: PdfFormatOption(
|
||||||
pipeline_cls=ThreadedStandardPdfPipeline,
|
pipeline_cls=StandardPdfPipeline,
|
||||||
pipeline_options=ThreadedPdfPipelineOptions(
|
pipeline_options=PdfPipelineOptions(
|
||||||
layout_batch_size=48,
|
do_table_structure=True,
|
||||||
ocr_batch_size=24,
|
do_ocr=True,
|
||||||
batch_timeout_seconds=1.0,
|
),
|
||||||
)
|
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
# Test threaded pipeline with multiple documents
|
# Threaded pipeline
|
||||||
results = []
|
threaded_converter = DocumentConverter(
|
||||||
|
format_options={
|
||||||
|
InputFormat.PDF: PdfFormatOption(
|
||||||
|
pipeline_cls=ThreadedStandardPdfPipeline,
|
||||||
|
pipeline_options=ThreadedPdfPipelineOptions(
|
||||||
|
layout_batch_size=1,
|
||||||
|
table_batch_size=1,
|
||||||
|
ocr_batch_size=1,
|
||||||
|
batch_timeout_seconds=1.0,
|
||||||
|
do_table_structure=True,
|
||||||
|
do_ocr=True,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
# Test standard pipeline
|
||||||
|
standard_results = []
|
||||||
start_time = time.perf_counter()
|
start_time = time.perf_counter()
|
||||||
for result in converter.convert_all([
|
for result in standard_converter.convert_all(test_files, raises_on_error=True):
|
||||||
"tests/data/pdf/2206.01062.pdf",
|
print("Finished converting document with standard pipeline:", result.input.file.name)
|
||||||
"tests/data/pdf/2305.03393v1.pdf"
|
standard_results.append(result)
|
||||||
]):
|
standard_time = time.perf_counter() - start_time
|
||||||
results.append(result)
|
|
||||||
end_time = time.perf_counter()
|
|
||||||
|
|
||||||
conversion_duration = end_time - start_time
|
del standard_converter
|
||||||
print(f"Threaded multi-doc conversion took {conversion_duration:.2f} seconds")
|
|
||||||
|
|
||||||
assert len(results) == 2
|
# Test threaded pipeline
|
||||||
for result in results:
|
threaded_results = []
|
||||||
|
start_time = time.perf_counter()
|
||||||
|
for result in threaded_converter.convert_all(test_files, raises_on_error=True):
|
||||||
|
print("Finished converting document with threaded pipeline:", result.input.file.name)
|
||||||
|
threaded_results.append(result)
|
||||||
|
threaded_time = time.perf_counter() - start_time
|
||||||
|
|
||||||
|
del threaded_converter
|
||||||
|
|
||||||
|
print("\nMulti-document Pipeline Comparison:")
|
||||||
|
print(f"Standard pipeline: {standard_time:.2f} seconds")
|
||||||
|
print(f"Threaded pipeline: {threaded_time:.2f} seconds")
|
||||||
|
print(f"Speedup: {standard_time / threaded_time:.2f}x")
|
||||||
|
|
||||||
|
# Verify results
|
||||||
|
assert len(standard_results) == len(threaded_results)
|
||||||
|
for result in standard_results:
|
||||||
assert result.status == ConversionStatus.SUCCESS
|
assert result.status == ConversionStatus.SUCCESS
|
||||||
|
for result in threaded_results:
|
||||||
|
assert result.status == ConversionStatus.SUCCESS
|
||||||
|
|
||||||
|
# Basic content comparison
|
||||||
|
for i, (standard_result, threaded_result) in enumerate(zip(standard_results, threaded_results)):
|
||||||
|
standard_doc = standard_result.document
|
||||||
|
threaded_doc = threaded_result.document
|
||||||
|
|
||||||
|
assert len(standard_doc.pages) == len(threaded_doc.pages), f"Document {i} page count mismatch"
|
||||||
|
assert len(standard_doc.texts) == len(threaded_doc.texts), f"Document {i} text count mismatch"
|
||||||
|
|
||||||
|
|
||||||
def test_pipeline_comparison():
|
def test_pipeline_comparison():
|
||||||
@ -73,7 +121,7 @@ def test_pipeline_comparison():
|
|||||||
layout_batch_size=1,
|
layout_batch_size=1,
|
||||||
ocr_batch_size=1,
|
ocr_batch_size=1,
|
||||||
table_batch_size=1,
|
table_batch_size=1,
|
||||||
)
|
),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
@ -82,14 +130,16 @@ def test_pipeline_comparison():
|
|||||||
threaded_results = list(threaded_converter.convert_all([test_file]))
|
threaded_results = list(threaded_converter.convert_all([test_file]))
|
||||||
threaded_time = time.perf_counter() - start_time
|
threaded_time = time.perf_counter() - start_time
|
||||||
|
|
||||||
print(f"\nPipeline Comparison:")
|
print("\nPipeline Comparison:")
|
||||||
print(f"Sync pipeline: {sync_time:.2f} seconds")
|
print(f"Sync pipeline: {sync_time:.2f} seconds")
|
||||||
print(f"Threaded pipeline: {threaded_time:.2f} seconds")
|
print(f"Threaded pipeline: {threaded_time:.2f} seconds")
|
||||||
print(f"Speedup: {sync_time/threaded_time:.2f}x")
|
print(f"Speedup: {sync_time / threaded_time:.2f}x")
|
||||||
|
|
||||||
# Verify results are equivalent
|
# Verify results are equivalent
|
||||||
assert len(sync_results) == len(threaded_results) == 1
|
assert len(sync_results) == len(threaded_results) == 1
|
||||||
assert sync_results[0].status == threaded_results[0].status == ConversionStatus.SUCCESS
|
assert (
|
||||||
|
sync_results[0].status == threaded_results[0].status == ConversionStatus.SUCCESS
|
||||||
|
)
|
||||||
|
|
||||||
# Basic content comparison
|
# Basic content comparison
|
||||||
sync_doc = sync_results[0].document
|
sync_doc = sync_results[0].document
|
||||||
@ -99,9 +149,6 @@ def test_pipeline_comparison():
|
|||||||
assert len(sync_doc.texts) == len(threaded_doc.texts)
|
assert len(sync_doc.texts) == len(threaded_doc.texts)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# Run basic performance test
|
# Run basic performance test
|
||||||
test_pipeline_comparison()
|
test_pipeline_comparison()
|
Loading…
Reference in New Issue
Block a user