From 252e5e214f6b08f2bad62a37779e956a1833d17c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 23 Jul 2025 09:44:10 +0000 Subject: [PATCH] Fix timeout status preservation issue by extending _determine_status method Co-authored-by: cau-git <60343111+cau-git@users.noreply.github.com> --- TIMEOUT_FIX_DOCUMENTATION.md | 54 +++++++++++++++++---- docling/pipeline/base_pipeline.py | 4 +- tests/test_timeout_fix.py | 80 +++++++++++++++++++++++-------- 3 files changed, 107 insertions(+), 31 deletions(-) diff --git a/TIMEOUT_FIX_DOCUMENTATION.md b/TIMEOUT_FIX_DOCUMENTATION.md index c74ace24..3c595ebd 100644 --- a/TIMEOUT_FIX_DOCUMENTATION.md +++ b/TIMEOUT_FIX_DOCUMENTATION.md @@ -10,17 +10,28 @@ assert size is not None, "Page size is not initialized." This error occurred in `ReadingOrderModel._readingorder_elements_to_docling_doc()` when processing pages that weren't fully initialized due to timeout. +Additionally, there was a secondary issue where the `ConversionStatus.PARTIAL_SUCCESS` status that was correctly set during timeout was being overwritten by the `_determine_status` method. + ## Root Cause -When a document processing timeout occurs: -1. The pipeline stops processing pages mid-way through the document -2. Some pages remain uninitialized with `page.size = None` -3. These uninitialized pages are passed to the `ReadingOrderModel` -4. The `ReadingOrderModel` expects all pages to have `size != None`, causing the assertion to fail +The issue had two parts: + +1. **Uninitialized Pages**: When a document processing timeout occurs: + - The pipeline stops processing pages mid-way through the document + - Some pages remain uninitialized with `page.size = None` + - These uninitialized pages are passed to the `ReadingOrderModel` + - The `ReadingOrderModel` expects all pages to have `size != None`, causing the assertion to fail + +2. **Status Overwriting**: The `_determine_status` method would: + - Always start with `ConversionStatus.SUCCESS` + - Only change to `PARTIAL_SUCCESS` based on backend validation issues + - Ignore that timeout might have already set the status to `PARTIAL_SUCCESS` ## Solution -The fix was implemented in `docling/pipeline/base_pipeline.py` (lines 196-206): +The fix was implemented in two parts in `docling/pipeline/base_pipeline.py`: + +### Part 1: Page Filtering (lines 196-206) ```python # Filter out uninitialized pages (those with size=None) that may remain @@ -35,21 +46,44 @@ if len(conv_res.pages) < initial_page_count: ) ``` +### Part 2: Status Preservation (lines 220-221) + +```python +def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus: + # Preserve PARTIAL_SUCCESS status if already set (e.g., due to timeout) + status = ConversionStatus.SUCCESS if conv_res.status != ConversionStatus.PARTIAL_SUCCESS else ConversionStatus.PARTIAL_SUCCESS + + for page in conv_res.pages: + if page._backend is None or not page._backend.is_valid(): + conv_res.errors.append( + ErrorItem( + component_type=DoclingComponentType.DOCUMENT_BACKEND, + module_name=type(page._backend).__name__, + error_message=f"Page {page.page_no} failed to parse.", + ) + ) + status = ConversionStatus.PARTIAL_SUCCESS + + return status +``` + This fix: 1. **Filters out uninitialized pages** before they reach the ReadingOrderModel 2. **Prevents the AssertionError** by ensuring all pages have `size != None` -3. **Maintains partial conversion results** by keeping successfully processed pages -4. 
**Logs the filtering action** for transparency +3. **Preserves timeout-induced PARTIAL_SUCCESS status** through the status determination process +4. **Maintains partial conversion results** by keeping successfully processed pages +5. **Logs the filtering action** for transparency ## Verification The fix has been verified with comprehensive tests that: 1. ✅ Confirm timeout scenarios don't cause AssertionError 2. ✅ Validate that filtered pages are compatible with ReadingOrderModel -3. ✅ Ensure normal processing (without timeout) still works correctly +3. ✅ Ensure timeout-induced PARTIAL_SUCCESS status is preserved +4. ✅ Ensure normal processing (without timeout) still works correctly ## Status ✅ **FIXED** - The issue has been resolved and the fix is working correctly. -The conversion will now complete with `ConversionStatus.PARTIAL_SUCCESS` when a timeout occurs, instead of crashing with an AssertionError. \ No newline at end of file +The conversion will now complete with `ConversionStatus.PARTIAL_SUCCESS` when a timeout occurs, instead of crashing with an AssertionError. The status is properly preserved throughout the pipeline execution. \ No newline at end of file diff --git a/docling/pipeline/base_pipeline.py b/docling/pipeline/base_pipeline.py index 2b168101..ef1d9580 100644 --- a/docling/pipeline/base_pipeline.py +++ b/docling/pipeline/base_pipeline.py @@ -217,7 +217,9 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name. return conv_res def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus: - status = ConversionStatus.SUCCESS + # Preserve PARTIAL_SUCCESS status if already set (e.g., due to timeout) + status = ConversionStatus.SUCCESS if conv_res.status != ConversionStatus.PARTIAL_SUCCESS else ConversionStatus.PARTIAL_SUCCESS + for page in conv_res.pages: if page._backend is None or not page._backend.is_valid(): conv_res.errors.append( diff --git a/tests/test_timeout_fix.py b/tests/test_timeout_fix.py index e258c6d5..919c1dd0 100644 --- a/tests/test_timeout_fix.py +++ b/tests/test_timeout_fix.py @@ -62,10 +62,8 @@ class TimeoutTestPipeline(PaginatedPipeline): return page def _determine_status(self, conv_res): - """Determine conversion status.""" - if len(conv_res.pages) < conv_res.input.page_count: - return ConversionStatus.PARTIAL_SUCCESS - return ConversionStatus.SUCCESS + """Determine conversion status - inherit from parent to test the actual fix.""" + return super()._determine_status(conv_res) @classmethod def get_default_options(cls): @@ -96,16 +94,14 @@ def test_document_timeout_filters_uninitialized_pages(): input_doc.file.name = "test.pdf" input_doc._backend = MockPdfBackend(input_doc, "test.pdf") - conv_res = ConversionResult(input=input_doc) - # Configure pipeline with very short timeout to trigger timeout condition pipeline_options = PdfPipelineOptions() pipeline_options.document_timeout = 0.05 # 50ms timeout - intentionally short pipeline = TimeoutTestPipeline(pipeline_options) - # Process document - this should trigger timeout but not cause AssertionError - result = pipeline._build_document(conv_res) + # Process document using full pipeline execution - this should trigger timeout but not cause AssertionError + result = pipeline.execute(input_doc, raises_on_error=False) # Verify that uninitialized pages (with size=None) were filtered out # This is the key fix - pages with size=None are removed before ReadingOrderModel processes them @@ -119,9 +115,9 @@ def test_document_timeout_filters_uninitialized_pages(): f"Page {page.page_no} should 
have size initialized, got None" # Status should indicate partial success due to timeout - final_status = pipeline._determine_status(result) - assert final_status == ConversionStatus.PARTIAL_SUCCESS, \ - f"Expected PARTIAL_SUCCESS, got {final_status}" + # This validates the fix for status preservation + assert result.status == ConversionStatus.PARTIAL_SUCCESS, \ + f"Expected PARTIAL_SUCCESS, got {result.status}" def test_readingorder_model_compatibility(): @@ -141,16 +137,14 @@ def test_readingorder_model_compatibility(): input_doc.file.name = "test.pdf" input_doc._backend = MockPdfBackend(input_doc, "test.pdf") - conv_res = ConversionResult(input=input_doc) - # Configure pipeline with timeout that allows some pages to be processed pipeline_options = PdfPipelineOptions() pipeline_options.document_timeout = 0.15 # 150ms timeout pipeline = TimeoutTestPipeline(pipeline_options) - # Process document - result = pipeline._build_document(conv_res) + # Process document using full pipeline execution + result = pipeline.execute(input_doc, raises_on_error=False) # Simulate what ReadingOrderModel expects - all pages should have size for page in result.pages: @@ -164,6 +158,50 @@ def test_readingorder_model_compatibility(): "Size should have width attribute" +def test_timeout_status_preservation(): + """ + Test that ConversionStatus.PARTIAL_SUCCESS set during timeout is preserved + by _determine_status method. + + This validates the fix for the issue where _determine_status would overwrite + the PARTIAL_SUCCESS status that was correctly set due to document_timeout. + """ + + # Create a test document + input_doc = Mock(spec=InputDocument) + input_doc.page_count = 10 + input_doc.limits = Mock() + input_doc.limits.page_range = (1, 10) + input_doc.file = Mock() + input_doc.file.name = "test.pdf" + input_doc._backend = MockPdfBackend(input_doc, "test.pdf") + + conv_res = ConversionResult(input=input_doc) + + # Configure pipeline with very short timeout + pipeline_options = PdfPipelineOptions() + pipeline_options.document_timeout = 0.05 # 50ms timeout + + pipeline = TimeoutTestPipeline(pipeline_options) + + # Process document - this should trigger timeout and set PARTIAL_SUCCESS + result = pipeline._build_document(conv_res) + + # Verify that timeout was triggered and status was set to PARTIAL_SUCCESS + assert result.status == ConversionStatus.PARTIAL_SUCCESS, \ + f"Expected PARTIAL_SUCCESS after timeout, got {result.status}" + + # Now test that _determine_status preserves the PARTIAL_SUCCESS status + final_status = pipeline._determine_status(result) + assert final_status == ConversionStatus.PARTIAL_SUCCESS, \ + f"_determine_status should preserve PARTIAL_SUCCESS from timeout, got {final_status}" + + # Also test the full pipeline execution to ensure end-to-end behavior + final_result = pipeline.execute(input_doc, raises_on_error=False) + assert final_result.status == ConversionStatus.PARTIAL_SUCCESS, \ + f"Final result should maintain PARTIAL_SUCCESS status, got {final_result.status}" + + def test_no_timeout_scenario(): """Test that normal processing without timeout works correctly.""" @@ -176,16 +214,14 @@ def test_no_timeout_scenario(): input_doc.file.name = "test.pdf" input_doc._backend = MockPdfBackend(input_doc, "test.pdf") - conv_res = ConversionResult(input=input_doc) - # Configure pipeline with sufficient timeout pipeline_options = PdfPipelineOptions() pipeline_options.document_timeout = 2.0 # 2 seconds - should be enough pipeline = TimeoutTestPipeline(pipeline_options) - # Process document - result = 
pipeline._build_document(conv_res) + # Process document using full pipeline execution + result = pipeline.execute(input_doc, raises_on_error=False) # All pages should be processed successfully without timeout assert len(result.pages) >= 2, \ @@ -194,4 +230,8 @@ def test_no_timeout_scenario(): # All pages should have size initialized for page in result.pages: assert page.size is not None, \ - f"Page {page.page_no} should have size initialized" \ No newline at end of file + f"Page {page.page_no} should have size initialized" + + # Status should be SUCCESS since no timeout occurred + assert result.status == ConversionStatus.SUCCESS, \ + f"Expected SUCCESS without timeout, got {result.status}" \ No newline at end of file
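
Note (not part of the patch): for readers who want to see the status-preservation change in isolation, below is a minimal, self-contained sketch of the pattern that `_determine_status` now follows. It deliberately does not import docling; `Status`, `FakePage`, `FakeResult`, and `determine_status` are simplified stand-ins invented for illustration, mirroring only the control flow added in this patch (start from an already-downgraded status instead of resetting to SUCCESS, then downgrade further on backend errors).

```python
# Minimal sketch of the status-preservation pattern, under the assumption
# stated above. Status, FakePage, FakeResult, and determine_status are
# stand-ins, NOT docling APIs; they mirror the control flow that the patch
# adds to PaginatedPipeline._determine_status.
from dataclasses import dataclass, field
from enum import Enum, auto


class Status(Enum):
    SUCCESS = auto()
    PARTIAL_SUCCESS = auto()


@dataclass
class FakePage:
    page_no: int
    backend_valid: bool  # stands in for page._backend.is_valid()


@dataclass
class FakeResult:
    status: Status = Status.SUCCESS
    pages: list = field(default_factory=list)
    errors: list = field(default_factory=list)


def determine_status(result: FakeResult) -> Status:
    # Start from PARTIAL_SUCCESS if an earlier stage (e.g. a document
    # timeout) already downgraded the result, instead of unconditionally
    # resetting to SUCCESS as the old code did.
    status = (
        Status.PARTIAL_SUCCESS
        if result.status == Status.PARTIAL_SUCCESS
        else Status.SUCCESS
    )
    # Backend validation can still downgrade, exactly as before.
    for page in result.pages:
        if not page.backend_valid:
            result.errors.append(f"Page {page.page_no} failed to parse.")
            status = Status.PARTIAL_SUCCESS
    return status


if __name__ == "__main__":
    # A timeout has already marked the result PARTIAL_SUCCESS; the fixed
    # logic keeps it even though every remaining page is valid.
    timed_out = FakeResult(
        status=Status.PARTIAL_SUCCESS,
        pages=[FakePage(1, True), FakePage(2, True)],
    )
    assert determine_status(timed_out) == Status.PARTIAL_SUCCESS

    # Without a timeout and with valid pages, the status stays SUCCESS.
    clean = FakeResult(pages=[FakePage(1, True)])
    assert determine_status(clean) == Status.SUCCESS
```

In other words, the invariant the patch introduces is "never upgrade an already-downgraded result": the ternary in the real `_determine_status` is just a compact way of seeding the loop with the incoming status when it is already `PARTIAL_SUCCESS`.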