From 252e5e214f6b08f2bad62a37779e956a1833d17c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 23 Jul 2025 09:44:10 +0000 Subject: [PATCH] Fix timeout status preservation issue by extending _determine_status method Co-authored-by: cau-git <60343111+cau-git@users.noreply.github.com> --- TIMEOUT_FIX_DOCUMENTATION.md | 54 +++++++++++++++++---- docling/pipeline/base_pipeline.py | 4 +- tests/test_timeout_fix.py | 80 +++++++++++++++++++++++-------- 3 files changed, 107 insertions(+), 31 deletions(-) diff --git a/TIMEOUT_FIX_DOCUMENTATION.md b/TIMEOUT_FIX_DOCUMENTATION.md index c74ace24..3c595ebd 100644 --- a/TIMEOUT_FIX_DOCUMENTATION.md +++ b/TIMEOUT_FIX_DOCUMENTATION.md @@ -10,17 +10,28 @@ assert size is not None, "Page size is not initialized." This error occurred in `ReadingOrderModel._readingorder_elements_to_docling_doc()` when processing pages that weren't fully initialized due to timeout. +Additionally, there was a secondary issue where the `ConversionStatus.PARTIAL_SUCCESS` status that was correctly set during timeout was being overwritten by the `_determine_status` method. + ## Root Cause -When a document processing timeout occurs: -1. The pipeline stops processing pages mid-way through the document -2. Some pages remain uninitialized with `page.size = None` -3. These uninitialized pages are passed to the `ReadingOrderModel` -4. The `ReadingOrderModel` expects all pages to have `size != None`, causing the assertion to fail +The issue had two parts: + +1. **Uninitialized Pages**: When a document processing timeout occurs: + - The pipeline stops processing pages mid-way through the document + - Some pages remain uninitialized with `page.size = None` + - These uninitialized pages are passed to the `ReadingOrderModel` + - The `ReadingOrderModel` expects all pages to have `size != None`, causing the assertion to fail + +2. **Status Overwriting**: The `_determine_status` method would: + - Always start with `ConversionStatus.SUCCESS` + - Only change to `PARTIAL_SUCCESS` based on backend validation issues + - Ignore that timeout might have already set the status to `PARTIAL_SUCCESS` ## Solution -The fix was implemented in `docling/pipeline/base_pipeline.py` (lines 196-206): +The fix was implemented in two parts in `docling/pipeline/base_pipeline.py`: + +### Part 1: Page Filtering (lines 196-206) ```python # Filter out uninitialized pages (those with size=None) that may remain @@ -35,21 +46,44 @@ if len(conv_res.pages) < initial_page_count: ) ``` +### Part 2: Status Preservation (lines 220-221) + +```python +def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus: + # Preserve PARTIAL_SUCCESS status if already set (e.g., due to timeout) + status = ConversionStatus.SUCCESS if conv_res.status != ConversionStatus.PARTIAL_SUCCESS else ConversionStatus.PARTIAL_SUCCESS + + for page in conv_res.pages: + if page._backend is None or not page._backend.is_valid(): + conv_res.errors.append( + ErrorItem( + component_type=DoclingComponentType.DOCUMENT_BACKEND, + module_name=type(page._backend).__name__, + error_message=f"Page {page.page_no} failed to parse.", + ) + ) + status = ConversionStatus.PARTIAL_SUCCESS + + return status +``` + This fix: 1. **Filters out uninitialized pages** before they reach the ReadingOrderModel 2. **Prevents the AssertionError** by ensuring all pages have `size != None` -3. **Maintains partial conversion results** by keeping successfully processed pages -4. 
**Logs the filtering action** for transparency +3. **Preserves timeout-induced PARTIAL_SUCCESS status** through the status determination process +4. **Maintains partial conversion results** by keeping successfully processed pages +5. **Logs the filtering action** for transparency ## Verification The fix has been verified with comprehensive tests that: 1. ✅ Confirm timeout scenarios don't cause AssertionError 2. ✅ Validate that filtered pages are compatible with ReadingOrderModel -3. ✅ Ensure normal processing (without timeout) still works correctly +3. ✅ Ensure timeout-induced PARTIAL_SUCCESS status is preserved +4. ✅ Ensure normal processing (without timeout) still works correctly ## Status ✅ **FIXED** - The issue has been resolved and the fix is working correctly. -The conversion will now complete with `ConversionStatus.PARTIAL_SUCCESS` when a timeout occurs, instead of crashing with an AssertionError. \ No newline at end of file +The conversion will now complete with `ConversionStatus.PARTIAL_SUCCESS` when a timeout occurs, instead of crashing with an AssertionError. The status is properly preserved throughout the pipeline execution. \ No newline at end of file diff --git a/docling/pipeline/base_pipeline.py b/docling/pipeline/base_pipeline.py index 2b168101..ef1d9580 100644 --- a/docling/pipeline/base_pipeline.py +++ b/docling/pipeline/base_pipeline.py @@ -217,7 +217,9 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name. return conv_res def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus: - status = ConversionStatus.SUCCESS + # Preserve PARTIAL_SUCCESS status if already set (e.g., due to timeout) + status = ConversionStatus.SUCCESS if conv_res.status != ConversionStatus.PARTIAL_SUCCESS else ConversionStatus.PARTIAL_SUCCESS + for page in conv_res.pages: if page._backend is None or not page._backend.is_valid(): conv_res.errors.append( diff --git a/tests/test_timeout_fix.py b/tests/test_timeout_fix.py index e258c6d5..919c1dd0 100644 --- a/tests/test_timeout_fix.py +++ b/tests/test_timeout_fix.py @@ -62,10 +62,8 @@ class TimeoutTestPipeline(PaginatedPipeline): return page def _determine_status(self, conv_res): - """Determine conversion status.""" - if len(conv_res.pages) < conv_res.input.page_count: - return ConversionStatus.PARTIAL_SUCCESS - return ConversionStatus.SUCCESS + """Determine conversion status - inherit from parent to test the actual fix.""" + return super()._determine_status(conv_res) @classmethod def get_default_options(cls): @@ -96,16 +94,14 @@ def test_document_timeout_filters_uninitialized_pages(): input_doc.file.name = "test.pdf" input_doc._backend = MockPdfBackend(input_doc, "test.pdf") - conv_res = ConversionResult(input=input_doc) - # Configure pipeline with very short timeout to trigger timeout condition pipeline_options = PdfPipelineOptions() pipeline_options.document_timeout = 0.05 # 50ms timeout - intentionally short pipeline = TimeoutTestPipeline(pipeline_options) - # Process document - this should trigger timeout but not cause AssertionError - result = pipeline._build_document(conv_res) + # Process document using full pipeline execution - this should trigger timeout but not cause AssertionError + result = pipeline.execute(input_doc, raises_on_error=False) # Verify that uninitialized pages (with size=None) were filtered out # This is the key fix - pages with size=None are removed before ReadingOrderModel processes them @@ -119,9 +115,9 @@ def test_document_timeout_filters_uninitialized_pages(): f"Page {page.page_no} should 
have size initialized, got None" # Status should indicate partial success due to timeout - final_status = pipeline._determine_status(result) - assert final_status == ConversionStatus.PARTIAL_SUCCESS, \ - f"Expected PARTIAL_SUCCESS, got {final_status}" + # This validates the fix for status preservation + assert result.status == ConversionStatus.PARTIAL_SUCCESS, \ + f"Expected PARTIAL_SUCCESS, got {result.status}" def test_readingorder_model_compatibility(): @@ -141,16 +137,14 @@ def test_readingorder_model_compatibility(): input_doc.file.name = "test.pdf" input_doc._backend = MockPdfBackend(input_doc, "test.pdf") - conv_res = ConversionResult(input=input_doc) - # Configure pipeline with timeout that allows some pages to be processed pipeline_options = PdfPipelineOptions() pipeline_options.document_timeout = 0.15 # 150ms timeout pipeline = TimeoutTestPipeline(pipeline_options) - # Process document - result = pipeline._build_document(conv_res) + # Process document using full pipeline execution + result = pipeline.execute(input_doc, raises_on_error=False) # Simulate what ReadingOrderModel expects - all pages should have size for page in result.pages: @@ -164,6 +158,50 @@ def test_readingorder_model_compatibility(): "Size should have width attribute" +def test_timeout_status_preservation(): + """ + Test that ConversionStatus.PARTIAL_SUCCESS set during timeout is preserved + by _determine_status method. + + This validates the fix for the issue where _determine_status would overwrite + the PARTIAL_SUCCESS status that was correctly set due to document_timeout. + """ + + # Create a test document + input_doc = Mock(spec=InputDocument) + input_doc.page_count = 10 + input_doc.limits = Mock() + input_doc.limits.page_range = (1, 10) + input_doc.file = Mock() + input_doc.file.name = "test.pdf" + input_doc._backend = MockPdfBackend(input_doc, "test.pdf") + + conv_res = ConversionResult(input=input_doc) + + # Configure pipeline with very short timeout + pipeline_options = PdfPipelineOptions() + pipeline_options.document_timeout = 0.05 # 50ms timeout + + pipeline = TimeoutTestPipeline(pipeline_options) + + # Process document - this should trigger timeout and set PARTIAL_SUCCESS + result = pipeline._build_document(conv_res) + + # Verify that timeout was triggered and status was set to PARTIAL_SUCCESS + assert result.status == ConversionStatus.PARTIAL_SUCCESS, \ + f"Expected PARTIAL_SUCCESS after timeout, got {result.status}" + + # Now test that _determine_status preserves the PARTIAL_SUCCESS status + final_status = pipeline._determine_status(result) + assert final_status == ConversionStatus.PARTIAL_SUCCESS, \ + f"_determine_status should preserve PARTIAL_SUCCESS from timeout, got {final_status}" + + # Also test the full pipeline execution to ensure end-to-end behavior + final_result = pipeline.execute(input_doc, raises_on_error=False) + assert final_result.status == ConversionStatus.PARTIAL_SUCCESS, \ + f"Final result should maintain PARTIAL_SUCCESS status, got {final_result.status}" + + def test_no_timeout_scenario(): """Test that normal processing without timeout works correctly.""" @@ -176,16 +214,14 @@ def test_no_timeout_scenario(): input_doc.file.name = "test.pdf" input_doc._backend = MockPdfBackend(input_doc, "test.pdf") - conv_res = ConversionResult(input=input_doc) - # Configure pipeline with sufficient timeout pipeline_options = PdfPipelineOptions() pipeline_options.document_timeout = 2.0 # 2 seconds - should be enough pipeline = TimeoutTestPipeline(pipeline_options) - # Process document - result = 
pipeline._build_document(conv_res) + # Process document using full pipeline execution + result = pipeline.execute(input_doc, raises_on_error=False) # All pages should be processed successfully without timeout assert len(result.pages) >= 2, \ @@ -194,4 +230,8 @@ def test_no_timeout_scenario(): # All pages should have size initialized for page in result.pages: assert page.size is not None, \ - f"Page {page.page_no} should have size initialized" \ No newline at end of file + f"Page {page.page_no} should have size initialized" + + # Status should be SUCCESS since no timeout occurred + assert result.status == ConversionStatus.SUCCESS, \ + f"Expected SUCCESS without timeout, got {result.status}" \ No newline at end of file
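
Note (not part of the patch): for readers who want to see the status-preservation change in isolation, below is a minimal, self-contained sketch of the pattern that `_determine_status` now follows. It deliberately does not import docling; `Status`, `FakePage`, `FakeResult`, and `determine_status` are simplified stand-ins invented for illustration, mirroring only the control flow added in this patch (start from an already-downgraded status instead of resetting to SUCCESS, then downgrade further on backend errors).

```python
# Minimal sketch of the status-preservation pattern, under the assumption
# stated above. Status, FakePage, FakeResult, and determine_status are
# stand-ins, NOT docling APIs; they mirror the control flow that the patch
# adds to PaginatedPipeline._determine_status.
from dataclasses import dataclass, field
from enum import Enum, auto


class Status(Enum):
    SUCCESS = auto()
    PARTIAL_SUCCESS = auto()


@dataclass
class FakePage:
    page_no: int
    backend_valid: bool  # stands in for page._backend.is_valid()


@dataclass
class FakeResult:
    status: Status = Status.SUCCESS
    pages: list = field(default_factory=list)
    errors: list = field(default_factory=list)


def determine_status(result: FakeResult) -> Status:
    # Start from PARTIAL_SUCCESS if an earlier stage (e.g. a document
    # timeout) already downgraded the result, instead of unconditionally
    # resetting to SUCCESS as the old code did.
    status = (
        Status.PARTIAL_SUCCESS
        if result.status == Status.PARTIAL_SUCCESS
        else Status.SUCCESS
    )
    # Backend validation can still downgrade, exactly as before.
    for page in result.pages:
        if not page.backend_valid:
            result.errors.append(f"Page {page.page_no} failed to parse.")
            status = Status.PARTIAL_SUCCESS
    return status


if __name__ == "__main__":
    # A timeout has already marked the result PARTIAL_SUCCESS; the fixed
    # logic keeps it even though every remaining page is valid.
    timed_out = FakeResult(
        status=Status.PARTIAL_SUCCESS,
        pages=[FakePage(1, True), FakePage(2, True)],
    )
    assert determine_status(timed_out) == Status.PARTIAL_SUCCESS

    # Without a timeout and with valid pages, the status stays SUCCESS.
    clean = FakeResult(pages=[FakePage(1, True)])
    assert determine_status(clean) == Status.SUCCESS
```

In other words, the invariant the patch introduces is "never upgrade an already-downgraded result": the ternary in the real `_determine_status` is just a compact way of seeding the loop with the incoming status when it is already `PARTIAL_SUCCESS`.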