fix: Respect document_timeout in new threaded StandardPdfPipeline (#2653)

* fix: Respect document_timeout in new threaded StandardPdfPipeline

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* add test case to test_options

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* fix: Make sure unprocessed pages do not get into assemble_document

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer
2025-11-20 14:57:14 +01:00
committed by GitHub
parent 54e65d9511
commit 2087c6bf9f
2 changed files with 144 additions and 28 deletions

View File

@@ -15,6 +15,7 @@ from docling.datamodel.pipeline_options import (
TableFormerMode,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.legacy_standard_pdf_pipeline import LegacyStandardPdfPipeline
@pytest.fixture
@@ -118,6 +119,33 @@ def test_page_range(test_doc_path):
assert doc_result.status == ConversionStatus.FAILURE
def test_document_timeout(test_doc_path):
    """Check that ``document_timeout=1`` produces a PARTIAL_SUCCESS status.

    The conversion is run twice: once without specifying ``pipeline_cls``
    and once with ``LegacyStandardPdfPipeline`` selected explicitly. Both
    runs must report ``ConversionStatus.PARTIAL_SUCCESS``.
    """
    # Extra PdfFormatOption keyword arguments for each run; the empty dict
    # means pipeline_cls is not passed at all for the first run.
    pipeline_overrides = (
        {},
        {"pipeline_cls": LegacyStandardPdfPipeline},
    )

    for overrides in pipeline_overrides:
        pdf_option = PdfFormatOption(
            pipeline_options=PdfPipelineOptions(document_timeout=1),
            **overrides,
        )
        converter = DocumentConverter(format_options={InputFormat.PDF: pdf_option})
        result = converter.convert(test_doc_path)
        assert result.status == ConversionStatus.PARTIAL_SUCCESS, (
            "Expected document timeout to be used"
        )
def test_ocr_coverage_threshold(test_doc_path):
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True