From 57d77232dc0162975c0ea0ec2fe46ad8113f299f Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Wed, 23 Jul 2025 12:23:03 +0200 Subject: [PATCH] Fix the PARTIAL_SUCCESS case in _determine_status properly Signed-off-by: Christoph Auer --- TIMEOUT_FIX_DOCUMENTATION.md | 89 ----------- docling/pipeline/base_pipeline.py | 10 +- tests/test_timeout_fix.py | 237 ------------------------------ 3 files changed, 7 insertions(+), 329 deletions(-) delete mode 100644 TIMEOUT_FIX_DOCUMENTATION.md delete mode 100644 tests/test_timeout_fix.py diff --git a/TIMEOUT_FIX_DOCUMENTATION.md b/TIMEOUT_FIX_DOCUMENTATION.md deleted file mode 100644 index 3c595ebd..00000000 --- a/TIMEOUT_FIX_DOCUMENTATION.md +++ /dev/null @@ -1,89 +0,0 @@ -# Fix for ReadingOrderModel AssertionError with document_timeout - -## Problem Description - -When `pipeline_options.document_timeout` was set in the latest version of docling (v2.24.0+), an `AssertionError` was raised in the `ReadingOrderModel` at line 132 (previously line 140): - -```python -assert size is not None, "Page size is not initialized." -``` - -This error occurred in `ReadingOrderModel._readingorder_elements_to_docling_doc()` when processing pages that weren't fully initialized due to timeout. - -Additionally, there was a secondary issue where the `ConversionStatus.PARTIAL_SUCCESS` status that was correctly set during timeout was being overwritten by the `_determine_status` method. - -## Root Cause - -The issue had two parts: - -1. **Uninitialized Pages**: When a document processing timeout occurs: - - The pipeline stops processing pages mid-way through the document - - Some pages remain uninitialized with `page.size = None` - - These uninitialized pages are passed to the `ReadingOrderModel` - - The `ReadingOrderModel` expects all pages to have `size != None`, causing the assertion to fail - -2. **Status Overwriting**: The `_determine_status` method would: - - Always start with `ConversionStatus.SUCCESS` - - Only change to `PARTIAL_SUCCESS` based on backend validation issues - - Ignore that timeout might have already set the status to `PARTIAL_SUCCESS` - -## Solution - -The fix was implemented in two parts in `docling/pipeline/base_pipeline.py`: - -### Part 1: Page Filtering (lines 196-206) - -```python -# Filter out uninitialized pages (those with size=None) that may remain -# after timeout or processing failures to prevent assertion errors downstream -initial_page_count = len(conv_res.pages) -conv_res.pages = [page for page in conv_res.pages if page.size is not None] - -if len(conv_res.pages) < initial_page_count: - _log.info( - f"Filtered out {initial_page_count - len(conv_res.pages)} uninitialized pages " - f"due to timeout or processing failures" - ) -``` - -### Part 2: Status Preservation (lines 220-221) - -```python -def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus: - # Preserve PARTIAL_SUCCESS status if already set (e.g., due to timeout) - status = ConversionStatus.SUCCESS if conv_res.status != ConversionStatus.PARTIAL_SUCCESS else ConversionStatus.PARTIAL_SUCCESS - - for page in conv_res.pages: - if page._backend is None or not page._backend.is_valid(): - conv_res.errors.append( - ErrorItem( - component_type=DoclingComponentType.DOCUMENT_BACKEND, - module_name=type(page._backend).__name__, - error_message=f"Page {page.page_no} failed to parse.", - ) - ) - status = ConversionStatus.PARTIAL_SUCCESS - - return status -``` - -This fix: -1. **Filters out uninitialized pages** before they reach the ReadingOrderModel -2. **Prevents the AssertionError** by ensuring all pages have `size != None` -3. **Preserves timeout-induced PARTIAL_SUCCESS status** through the status determination process -4. **Maintains partial conversion results** by keeping successfully processed pages -5. **Logs the filtering action** for transparency - -## Verification - -The fix has been verified with comprehensive tests that: -1. ✅ Confirm timeout scenarios don't cause AssertionError -2. ✅ Validate that filtered pages are compatible with ReadingOrderModel -3. ✅ Ensure timeout-induced PARTIAL_SUCCESS status is preserved -4. ✅ Ensure normal processing (without timeout) still works correctly - -## Status - -✅ **FIXED** - The issue has been resolved and the fix is working correctly. - -The conversion will now complete with `ConversionStatus.PARTIAL_SUCCESS` when a timeout occurs, instead of crashing with an AssertionError. The status is properly preserved throughout the pipeline execution. \ No newline at end of file diff --git a/docling/pipeline/base_pipeline.py b/docling/pipeline/base_pipeline.py index ef1d9580..6944a355 100644 --- a/docling/pipeline/base_pipeline.py +++ b/docling/pipeline/base_pipeline.py @@ -217,9 +217,13 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name. return conv_res def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus: - # Preserve PARTIAL_SUCCESS status if already set (e.g., due to timeout) - status = ConversionStatus.SUCCESS if conv_res.status != ConversionStatus.PARTIAL_SUCCESS else ConversionStatus.PARTIAL_SUCCESS - + status = conv_res.status + if status in [ + ConversionStatus.PENDING, + ConversionStatus.STARTED, + ]: # preserves ConversionStatus.PARTIAL_SUCCESS + status = ConversionStatus.SUCCESS + for page in conv_res.pages: if page._backend is None or not page._backend.is_valid(): conv_res.errors.append( diff --git a/tests/test_timeout_fix.py b/tests/test_timeout_fix.py deleted file mode 100644 index 919c1dd0..00000000 --- a/tests/test_timeout_fix.py +++ /dev/null @@ -1,237 +0,0 @@ -#!/usr/bin/env python3 -"""Test to validate timeout handling and ReadingOrderModel compatibility.""" - -import time -from unittest.mock import Mock - -from docling.datamodel.base_models import Page, ConversionStatus -from docling.datamodel.document import ConversionResult, InputDocument -from docling.datamodel.pipeline_options import PdfPipelineOptions -from docling.pipeline.base_pipeline import PaginatedPipeline -from docling.backend.pdf_backend import PdfDocumentBackend - - -class MockPdfBackend(PdfDocumentBackend): - """Mock PDF backend for testing timeout scenarios.""" - - def __init__(self, in_doc, path_or_stream): - # Skip parent __init__ since we're mocking - self.input_format = getattr(in_doc, 'format', 'PDF') - self.path_or_stream = path_or_stream - self._page_count = 10 - - def load_page(self, page_no): - mock_page = Mock() - mock_page.is_valid.return_value = True - return mock_page - - def page_count(self): - return self._page_count - - def is_valid(self): - return True - - def unload(self): - pass - - -class TimeoutTestPipeline(PaginatedPipeline): - """Test implementation of PaginatedPipeline for timeout testing.""" - - def __init__(self, pipeline_options: PdfPipelineOptions): - super().__init__(pipeline_options) - self.build_pipe = [self.slow_processing_model] - - def slow_processing_model(self, conv_res, page_batch): - """Mock model that takes time to process to trigger timeout.""" - for page in page_batch: - # Simulate slow processing - time.sleep(0.1) - # Only initialize size for first few pages before timeout hits - if page.page_no < 2: # Only first 2 pages get initialized - page.size = Mock() - page.size.height = 800 - page.size.width = 600 - # Other pages remain with size=None to simulate timeout scenario - yield page - - def initialize_page(self, conv_res, page): - """Initialize page with mock backend.""" - page._backend = Mock() - page._backend.is_valid.return_value = True - return page - - def _determine_status(self, conv_res): - """Determine conversion status - inherit from parent to test the actual fix.""" - return super()._determine_status(conv_res) - - @classmethod - def get_default_options(cls): - return PdfPipelineOptions() - - @classmethod - def is_backend_supported(cls, backend): - return True - - -def test_document_timeout_filters_uninitialized_pages(): - """ - Test that setting document_timeout doesn't cause an AssertionError in ReadingOrderModel. - - This test validates the fix for the issue where setting pipeline_options.document_timeout - would lead to an AssertionError in ReadingOrderModel._readingorder_elements_to_docling_doc - when page.size is None for uninitialized pages after timeout. - - The fix filters out pages with size=None after timeout, preventing the assertion error. - """ - - # Create a test document with multiple pages - input_doc = Mock(spec=InputDocument) - input_doc.page_count = 10 - input_doc.limits = Mock() - input_doc.limits.page_range = (1, 10) - input_doc.file = Mock() - input_doc.file.name = "test.pdf" - input_doc._backend = MockPdfBackend(input_doc, "test.pdf") - - # Configure pipeline with very short timeout to trigger timeout condition - pipeline_options = PdfPipelineOptions() - pipeline_options.document_timeout = 0.05 # 50ms timeout - intentionally short - - pipeline = TimeoutTestPipeline(pipeline_options) - - # Process document using full pipeline execution - this should trigger timeout but not cause AssertionError - result = pipeline.execute(input_doc, raises_on_error=False) - - # Verify that uninitialized pages (with size=None) were filtered out - # This is the key fix - pages with size=None are removed before ReadingOrderModel processes them - assert len(result.pages) < input_doc.page_count, \ - "Should have fewer pages after timeout filtering" - - # All remaining pages should have size initialized - # This ensures ReadingOrderModel won't encounter pages with size=None - for page in result.pages: - assert page.size is not None, \ - f"Page {page.page_no} should have size initialized, got None" - - # Status should indicate partial success due to timeout - # This validates the fix for status preservation - assert result.status == ConversionStatus.PARTIAL_SUCCESS, \ - f"Expected PARTIAL_SUCCESS, got {result.status}" - - -def test_readingorder_model_compatibility(): - """ - Test that the filtered pages are compatible with ReadingOrderModel expectations. - - This test ensures that after the timeout filtering fix, all remaining pages - have the required 'size' attribute that ReadingOrderModel expects. - """ - - # Create a test document - input_doc = Mock(spec=InputDocument) - input_doc.page_count = 5 - input_doc.limits = Mock() - input_doc.limits.page_range = (1, 5) - input_doc.file = Mock() - input_doc.file.name = "test.pdf" - input_doc._backend = MockPdfBackend(input_doc, "test.pdf") - - # Configure pipeline with timeout that allows some pages to be processed - pipeline_options = PdfPipelineOptions() - pipeline_options.document_timeout = 0.15 # 150ms timeout - - pipeline = TimeoutTestPipeline(pipeline_options) - - # Process document using full pipeline execution - result = pipeline.execute(input_doc, raises_on_error=False) - - # Simulate what ReadingOrderModel expects - all pages should have size - for page in result.pages: - # This would be the assertion that failed before the fix: - # assert size is not None (line 132 in readingorder_model.py) - assert page.size is not None, \ - "ReadingOrderModel requires all pages to have size != None" - assert hasattr(page.size, 'height'), \ - "Size should have height attribute" - assert hasattr(page.size, 'width'), \ - "Size should have width attribute" - - -def test_timeout_status_preservation(): - """ - Test that ConversionStatus.PARTIAL_SUCCESS set during timeout is preserved - by _determine_status method. - - This validates the fix for the issue where _determine_status would overwrite - the PARTIAL_SUCCESS status that was correctly set due to document_timeout. - """ - - # Create a test document - input_doc = Mock(spec=InputDocument) - input_doc.page_count = 10 - input_doc.limits = Mock() - input_doc.limits.page_range = (1, 10) - input_doc.file = Mock() - input_doc.file.name = "test.pdf" - input_doc._backend = MockPdfBackend(input_doc, "test.pdf") - - conv_res = ConversionResult(input=input_doc) - - # Configure pipeline with very short timeout - pipeline_options = PdfPipelineOptions() - pipeline_options.document_timeout = 0.05 # 50ms timeout - - pipeline = TimeoutTestPipeline(pipeline_options) - - # Process document - this should trigger timeout and set PARTIAL_SUCCESS - result = pipeline._build_document(conv_res) - - # Verify that timeout was triggered and status was set to PARTIAL_SUCCESS - assert result.status == ConversionStatus.PARTIAL_SUCCESS, \ - f"Expected PARTIAL_SUCCESS after timeout, got {result.status}" - - # Now test that _determine_status preserves the PARTIAL_SUCCESS status - final_status = pipeline._determine_status(result) - assert final_status == ConversionStatus.PARTIAL_SUCCESS, \ - f"_determine_status should preserve PARTIAL_SUCCESS from timeout, got {final_status}" - - # Also test the full pipeline execution to ensure end-to-end behavior - final_result = pipeline.execute(input_doc, raises_on_error=False) - assert final_result.status == ConversionStatus.PARTIAL_SUCCESS, \ - f"Final result should maintain PARTIAL_SUCCESS status, got {final_result.status}" - - -def test_no_timeout_scenario(): - """Test that normal processing without timeout works correctly.""" - - # Create a test document - input_doc = Mock(spec=InputDocument) - input_doc.page_count = 3 # Small number to avoid timeout - input_doc.limits = Mock() - input_doc.limits.page_range = (1, 3) - input_doc.file = Mock() - input_doc.file.name = "test.pdf" - input_doc._backend = MockPdfBackend(input_doc, "test.pdf") - - # Configure pipeline with sufficient timeout - pipeline_options = PdfPipelineOptions() - pipeline_options.document_timeout = 2.0 # 2 seconds - should be enough - - pipeline = TimeoutTestPipeline(pipeline_options) - - # Process document using full pipeline execution - result = pipeline.execute(input_doc, raises_on_error=False) - - # All pages should be processed successfully without timeout - assert len(result.pages) >= 2, \ - "Should process at least 2 pages without timeout" - - # All pages should have size initialized - for page in result.pages: - assert page.size is not None, \ - f"Page {page.page_no} should have size initialized" - - # Status should be SUCCESS since no timeout occurred - assert result.status == ConversionStatus.SUCCESS, \ - f"Expected SUCCESS without timeout, got {result.status}" \ No newline at end of file