From 8a1c4331fbc0fb68739db0a22079128c4683a827 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 23 Jul 2025 09:18:06 +0000
Subject: [PATCH] Initial investigation: analyze ReadingOrderModel timeout issue

Co-authored-by: cau-git <60343111+cau-git@users.noreply.github.com>
---
 test_timeout_fix.py | 95 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 95 insertions(+)
 create mode 100644 test_timeout_fix.py

diff --git a/test_timeout_fix.py b/test_timeout_fix.py
new file mode 100644
index 00000000..8d7cb429
--- /dev/null
+++ b/test_timeout_fix.py
@@ -0,0 +1,95 @@
+#!/usr/bin/env python3
+"""Test to reproduce and validate the fix for document_timeout AssertionError issue."""
+
+import tempfile
+from pathlib import Path
+
+import pytest
+
+from docling.datamodel.base_models import ConversionStatus, InputFormat
+from docling.datamodel.pipeline_options import PdfPipelineOptions
+from docling.document_converter import DocumentConverter, PdfFormatOption
+
+# Sample PDF shared by both tests. The path is relative to the current working
+# directory; each test skips itself when the file cannot be found.
+TEST_DOC_PATH = Path("./tests/data/pdf/2206.01062.pdf")
+
+
+def _build_converter(document_timeout: float) -> DocumentConverter:
+    """Return a PDF converter whose pipeline uses the given document_timeout.
+
+    OCR and table structure are disabled to make processing faster.
+    """
+    pipeline_options = PdfPipelineOptions()
+    pipeline_options.document_timeout = document_timeout
+    pipeline_options.do_ocr = False
+    pipeline_options.do_table_structure = False
+
+    return DocumentConverter(
+        format_options={
+            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
+        }
+    )
+
+
+def test_document_timeout_no_assertion_error():
+    """
+    Test that setting document_timeout doesn't cause an AssertionError in ReadingOrderModel.
+
+    This test validates the fix for the issue where setting pipeline_options.document_timeout
+    would lead to an AssertionError in ReadingOrderModel._readingorder_elements_to_docling_doc
+    when page.size is None for uninitialized pages after timeout.
+    """
+    if not TEST_DOC_PATH.exists():
+        pytest.skip("Test PDF file not found")
+
+    # Very short timeout to trigger the timeout condition
+    converter = _build_converter(document_timeout=0.001)
+
+    # This should not raise an AssertionError even with timeout
+    # Before the fix, this would fail with: AssertionError at line 140 in readingorder_model.py
+    # Only the conversion call is guarded, so failures of the checks further down
+    # surface unchanged; repr() is used because a bare assert has an empty message.
+    try:
+        doc_result = converter.convert(TEST_DOC_PATH, raises_on_error=False)
+    except AssertionError as e:
+        pytest.fail(f"The original AssertionError still occurs: {e!r}")
+
+    # The conversion should complete without throwing an AssertionError
+    # It may result in PARTIAL_SUCCESS due to timeout, but should not crash
+    assert doc_result.status in [ConversionStatus.SUCCESS, ConversionStatus.PARTIAL_SUCCESS], \
+        f"Expected SUCCESS or PARTIAL_SUCCESS, got {doc_result.status}"
+
+    # Verify that we have a document with pages
+    assert doc_result.document is not None, "Document should not be None"
+
+    print(f"Test passed: Conversion completed with status {doc_result.status}")
+    print(f"Document has {doc_result.document.num_pages()} pages")
+
+
+def test_document_timeout_with_longer_timeout():
+    """
+    Test that document_timeout works correctly with a reasonable timeout value.
+    """
+    if not TEST_DOC_PATH.exists():
+        pytest.skip("Test PDF file not found")
+
+    # 10 seconds should be enough for a small document
+    converter = _build_converter(document_timeout=10.0)
+
+    # This should complete successfully
+    doc_result = converter.convert(TEST_DOC_PATH)
+
+    assert doc_result.status == ConversionStatus.SUCCESS, \
+        f"Expected SUCCESS, got {doc_result.status}"
+    assert doc_result.document is not None, "Document should not be None"
+    assert doc_result.document.num_pages() > 0, "Document should have pages"
+
+    print(f"Test passed: Conversion completed successfully with {doc_result.document.num_pages()} pages")
+
+
+if __name__ == "__main__":
+    # Run through pytest so a skipped test is reported as a skip rather than
+    # escaping as an exception, and so the exit code reflects the outcome.
+    # "-s" keeps the progress prints visible, as when the tests were called directly.
+    raise SystemExit(pytest.main([__file__, "-s"]))