mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-25 19:44:34 +00:00
Fix timeout status preservation issue by extending _determine_status method
Co-authored-by: cau-git <60343111+cau-git@users.noreply.github.com>
This commit is contained in:
parent
e0a603a33a
commit
252e5e214f
@ -10,17 +10,28 @@ assert size is not None, "Page size is not initialized."
|
|||||||
|
|
||||||
This error occurred in `ReadingOrderModel._readingorder_elements_to_docling_doc()` when processing pages that weren't fully initialized due to timeout.
|
This error occurred in `ReadingOrderModel._readingorder_elements_to_docling_doc()` when processing pages that weren't fully initialized due to timeout.
|
||||||
|
|
||||||
|
Additionally, there was a secondary issue where the `ConversionStatus.PARTIAL_SUCCESS` status that was correctly set during timeout was being overwritten by the `_determine_status` method.
|
||||||
|
|
||||||
## Root Cause
|
## Root Cause
|
||||||
|
|
||||||
When a document processing timeout occurs:
|
The issue had two parts:
|
||||||
1. The pipeline stops processing pages mid-way through the document
|
|
||||||
2. Some pages remain uninitialized with `page.size = None`
|
1. **Uninitialized Pages**: When a document processing timeout occurs:
|
||||||
3. These uninitialized pages are passed to the `ReadingOrderModel`
|
- The pipeline stops processing pages mid-way through the document
|
||||||
4. The `ReadingOrderModel` expects all pages to have `size != None`, causing the assertion to fail
|
- Some pages remain uninitialized with `page.size = None`
|
||||||
|
- These uninitialized pages are passed to the `ReadingOrderModel`
|
||||||
|
- The `ReadingOrderModel` expects all pages to have `size != None`, causing the assertion to fail
|
||||||
|
|
||||||
|
2. **Status Overwriting**: The `_determine_status` method would:
|
||||||
|
- Always start with `ConversionStatus.SUCCESS`
|
||||||
|
- Only change to `PARTIAL_SUCCESS` based on backend validation issues
|
||||||
|
- Ignore that timeout might have already set the status to `PARTIAL_SUCCESS`
|
||||||
|
|
||||||
## Solution
|
## Solution
|
||||||
|
|
||||||
The fix was implemented in `docling/pipeline/base_pipeline.py` (lines 196-206):
|
The fix was implemented in two parts in `docling/pipeline/base_pipeline.py`:
|
||||||
|
|
||||||
|
### Part 1: Page Filtering (lines 196-206)
|
||||||
|
|
||||||
```python
|
```python
|
||||||
# Filter out uninitialized pages (those with size=None) that may remain
|
# Filter out uninitialized pages (those with size=None) that may remain
|
||||||
@ -35,21 +46,44 @@ if len(conv_res.pages) < initial_page_count:
|
|||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Part 2: Status Preservation (lines 220-221)
|
||||||
|
|
||||||
|
```python
|
||||||
|
def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
|
||||||
|
# Preserve PARTIAL_SUCCESS status if already set (e.g., due to timeout)
|
||||||
|
status = ConversionStatus.SUCCESS if conv_res.status != ConversionStatus.PARTIAL_SUCCESS else ConversionStatus.PARTIAL_SUCCESS
|
||||||
|
|
||||||
|
for page in conv_res.pages:
|
||||||
|
if page._backend is None or not page._backend.is_valid():
|
||||||
|
conv_res.errors.append(
|
||||||
|
ErrorItem(
|
||||||
|
component_type=DoclingComponentType.DOCUMENT_BACKEND,
|
||||||
|
module_name=type(page._backend).__name__,
|
||||||
|
error_message=f"Page {page.page_no} failed to parse.",
|
||||||
|
)
|
||||||
|
)
|
||||||
|
status = ConversionStatus.PARTIAL_SUCCESS
|
||||||
|
|
||||||
|
return status
|
||||||
|
```
|
||||||
|
|
||||||
This fix:
|
This fix:
|
||||||
1. **Filters out uninitialized pages** before they reach the ReadingOrderModel
|
1. **Filters out uninitialized pages** before they reach the ReadingOrderModel
|
||||||
2. **Prevents the AssertionError** by ensuring all pages have `size != None`
|
2. **Prevents the AssertionError** by ensuring all pages have `size != None`
|
||||||
3. **Maintains partial conversion results** by keeping successfully processed pages
|
3. **Preserves timeout-induced PARTIAL_SUCCESS status** through the status determination process
|
||||||
4. **Logs the filtering action** for transparency
|
4. **Maintains partial conversion results** by keeping successfully processed pages
|
||||||
|
5. **Logs the filtering action** for transparency
|
||||||
|
|
||||||
## Verification
|
## Verification
|
||||||
|
|
||||||
The fix has been verified with comprehensive tests that:
|
The fix has been verified with comprehensive tests that:
|
||||||
1. ✅ Confirm timeout scenarios don't cause AssertionError
|
1. ✅ Confirm timeout scenarios don't cause AssertionError
|
||||||
2. ✅ Validate that filtered pages are compatible with ReadingOrderModel
|
2. ✅ Validate that filtered pages are compatible with ReadingOrderModel
|
||||||
3. ✅ Ensure normal processing (without timeout) still works correctly
|
3. ✅ Ensure timeout-induced PARTIAL_SUCCESS status is preserved
|
||||||
|
4. ✅ Ensure normal processing (without timeout) still works correctly
|
||||||
|
|
||||||
## Status
|
## Status
|
||||||
|
|
||||||
✅ **FIXED** - The issue has been resolved and the fix is working correctly.
|
✅ **FIXED** - The issue has been resolved and the fix is working correctly.
|
||||||
|
|
||||||
The conversion will now complete with `ConversionStatus.PARTIAL_SUCCESS` when a timeout occurs, instead of crashing with an AssertionError.
|
The conversion will now complete with `ConversionStatus.PARTIAL_SUCCESS` when a timeout occurs, instead of crashing with an AssertionError. The status is properly preserved throughout the pipeline execution.
|
@ -217,7 +217,9 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
|||||||
return conv_res
|
return conv_res
|
||||||
|
|
||||||
def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
|
def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
|
||||||
status = ConversionStatus.SUCCESS
|
# Preserve PARTIAL_SUCCESS status if already set (e.g., due to timeout)
|
||||||
|
status = ConversionStatus.SUCCESS if conv_res.status != ConversionStatus.PARTIAL_SUCCESS else ConversionStatus.PARTIAL_SUCCESS
|
||||||
|
|
||||||
for page in conv_res.pages:
|
for page in conv_res.pages:
|
||||||
if page._backend is None or not page._backend.is_valid():
|
if page._backend is None or not page._backend.is_valid():
|
||||||
conv_res.errors.append(
|
conv_res.errors.append(
|
||||||
|
@ -62,10 +62,8 @@ class TimeoutTestPipeline(PaginatedPipeline):
|
|||||||
return page
|
return page
|
||||||
|
|
||||||
def _determine_status(self, conv_res):
|
def _determine_status(self, conv_res):
|
||||||
"""Determine conversion status."""
|
"""Determine conversion status - inherit from parent to test the actual fix."""
|
||||||
if len(conv_res.pages) < conv_res.input.page_count:
|
return super()._determine_status(conv_res)
|
||||||
return ConversionStatus.PARTIAL_SUCCESS
|
|
||||||
return ConversionStatus.SUCCESS
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_default_options(cls):
|
def get_default_options(cls):
|
||||||
@ -96,16 +94,14 @@ def test_document_timeout_filters_uninitialized_pages():
|
|||||||
input_doc.file.name = "test.pdf"
|
input_doc.file.name = "test.pdf"
|
||||||
input_doc._backend = MockPdfBackend(input_doc, "test.pdf")
|
input_doc._backend = MockPdfBackend(input_doc, "test.pdf")
|
||||||
|
|
||||||
conv_res = ConversionResult(input=input_doc)
|
|
||||||
|
|
||||||
# Configure pipeline with very short timeout to trigger timeout condition
|
# Configure pipeline with very short timeout to trigger timeout condition
|
||||||
pipeline_options = PdfPipelineOptions()
|
pipeline_options = PdfPipelineOptions()
|
||||||
pipeline_options.document_timeout = 0.05 # 50ms timeout - intentionally short
|
pipeline_options.document_timeout = 0.05 # 50ms timeout - intentionally short
|
||||||
|
|
||||||
pipeline = TimeoutTestPipeline(pipeline_options)
|
pipeline = TimeoutTestPipeline(pipeline_options)
|
||||||
|
|
||||||
# Process document - this should trigger timeout but not cause AssertionError
|
# Process document using full pipeline execution - this should trigger timeout but not cause AssertionError
|
||||||
result = pipeline._build_document(conv_res)
|
result = pipeline.execute(input_doc, raises_on_error=False)
|
||||||
|
|
||||||
# Verify that uninitialized pages (with size=None) were filtered out
|
# Verify that uninitialized pages (with size=None) were filtered out
|
||||||
# This is the key fix - pages with size=None are removed before ReadingOrderModel processes them
|
# This is the key fix - pages with size=None are removed before ReadingOrderModel processes them
|
||||||
@ -119,9 +115,9 @@ def test_document_timeout_filters_uninitialized_pages():
|
|||||||
f"Page {page.page_no} should have size initialized, got None"
|
f"Page {page.page_no} should have size initialized, got None"
|
||||||
|
|
||||||
# Status should indicate partial success due to timeout
|
# Status should indicate partial success due to timeout
|
||||||
final_status = pipeline._determine_status(result)
|
# This validates the fix for status preservation
|
||||||
assert final_status == ConversionStatus.PARTIAL_SUCCESS, \
|
assert result.status == ConversionStatus.PARTIAL_SUCCESS, \
|
||||||
f"Expected PARTIAL_SUCCESS, got {final_status}"
|
f"Expected PARTIAL_SUCCESS, got {result.status}"
|
||||||
|
|
||||||
|
|
||||||
def test_readingorder_model_compatibility():
|
def test_readingorder_model_compatibility():
|
||||||
@ -141,16 +137,14 @@ def test_readingorder_model_compatibility():
|
|||||||
input_doc.file.name = "test.pdf"
|
input_doc.file.name = "test.pdf"
|
||||||
input_doc._backend = MockPdfBackend(input_doc, "test.pdf")
|
input_doc._backend = MockPdfBackend(input_doc, "test.pdf")
|
||||||
|
|
||||||
conv_res = ConversionResult(input=input_doc)
|
|
||||||
|
|
||||||
# Configure pipeline with timeout that allows some pages to be processed
|
# Configure pipeline with timeout that allows some pages to be processed
|
||||||
pipeline_options = PdfPipelineOptions()
|
pipeline_options = PdfPipelineOptions()
|
||||||
pipeline_options.document_timeout = 0.15 # 150ms timeout
|
pipeline_options.document_timeout = 0.15 # 150ms timeout
|
||||||
|
|
||||||
pipeline = TimeoutTestPipeline(pipeline_options)
|
pipeline = TimeoutTestPipeline(pipeline_options)
|
||||||
|
|
||||||
# Process document
|
# Process document using full pipeline execution
|
||||||
result = pipeline._build_document(conv_res)
|
result = pipeline.execute(input_doc, raises_on_error=False)
|
||||||
|
|
||||||
# Simulate what ReadingOrderModel expects - all pages should have size
|
# Simulate what ReadingOrderModel expects - all pages should have size
|
||||||
for page in result.pages:
|
for page in result.pages:
|
||||||
@ -164,6 +158,50 @@ def test_readingorder_model_compatibility():
|
|||||||
"Size should have width attribute"
|
"Size should have width attribute"
|
||||||
|
|
||||||
|
|
||||||
|
def test_timeout_status_preservation():
|
||||||
|
"""
|
||||||
|
Test that ConversionStatus.PARTIAL_SUCCESS set during timeout is preserved
|
||||||
|
by _determine_status method.
|
||||||
|
|
||||||
|
This validates the fix for the issue where _determine_status would overwrite
|
||||||
|
the PARTIAL_SUCCESS status that was correctly set due to document_timeout.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Create a test document
|
||||||
|
input_doc = Mock(spec=InputDocument)
|
||||||
|
input_doc.page_count = 10
|
||||||
|
input_doc.limits = Mock()
|
||||||
|
input_doc.limits.page_range = (1, 10)
|
||||||
|
input_doc.file = Mock()
|
||||||
|
input_doc.file.name = "test.pdf"
|
||||||
|
input_doc._backend = MockPdfBackend(input_doc, "test.pdf")
|
||||||
|
|
||||||
|
conv_res = ConversionResult(input=input_doc)
|
||||||
|
|
||||||
|
# Configure pipeline with very short timeout
|
||||||
|
pipeline_options = PdfPipelineOptions()
|
||||||
|
pipeline_options.document_timeout = 0.05 # 50ms timeout
|
||||||
|
|
||||||
|
pipeline = TimeoutTestPipeline(pipeline_options)
|
||||||
|
|
||||||
|
# Process document - this should trigger timeout and set PARTIAL_SUCCESS
|
||||||
|
result = pipeline._build_document(conv_res)
|
||||||
|
|
||||||
|
# Verify that timeout was triggered and status was set to PARTIAL_SUCCESS
|
||||||
|
assert result.status == ConversionStatus.PARTIAL_SUCCESS, \
|
||||||
|
f"Expected PARTIAL_SUCCESS after timeout, got {result.status}"
|
||||||
|
|
||||||
|
# Now test that _determine_status preserves the PARTIAL_SUCCESS status
|
||||||
|
final_status = pipeline._determine_status(result)
|
||||||
|
assert final_status == ConversionStatus.PARTIAL_SUCCESS, \
|
||||||
|
f"_determine_status should preserve PARTIAL_SUCCESS from timeout, got {final_status}"
|
||||||
|
|
||||||
|
# Also test the full pipeline execution to ensure end-to-end behavior
|
||||||
|
final_result = pipeline.execute(input_doc, raises_on_error=False)
|
||||||
|
assert final_result.status == ConversionStatus.PARTIAL_SUCCESS, \
|
||||||
|
f"Final result should maintain PARTIAL_SUCCESS status, got {final_result.status}"
|
||||||
|
|
||||||
|
|
||||||
def test_no_timeout_scenario():
|
def test_no_timeout_scenario():
|
||||||
"""Test that normal processing without timeout works correctly."""
|
"""Test that normal processing without timeout works correctly."""
|
||||||
|
|
||||||
@ -176,16 +214,14 @@ def test_no_timeout_scenario():
|
|||||||
input_doc.file.name = "test.pdf"
|
input_doc.file.name = "test.pdf"
|
||||||
input_doc._backend = MockPdfBackend(input_doc, "test.pdf")
|
input_doc._backend = MockPdfBackend(input_doc, "test.pdf")
|
||||||
|
|
||||||
conv_res = ConversionResult(input=input_doc)
|
|
||||||
|
|
||||||
# Configure pipeline with sufficient timeout
|
# Configure pipeline with sufficient timeout
|
||||||
pipeline_options = PdfPipelineOptions()
|
pipeline_options = PdfPipelineOptions()
|
||||||
pipeline_options.document_timeout = 2.0 # 2 seconds - should be enough
|
pipeline_options.document_timeout = 2.0 # 2 seconds - should be enough
|
||||||
|
|
||||||
pipeline = TimeoutTestPipeline(pipeline_options)
|
pipeline = TimeoutTestPipeline(pipeline_options)
|
||||||
|
|
||||||
# Process document
|
# Process document using full pipeline execution
|
||||||
result = pipeline._build_document(conv_res)
|
result = pipeline.execute(input_doc, raises_on_error=False)
|
||||||
|
|
||||||
# All pages should be processed successfully without timeout
|
# All pages should be processed successfully without timeout
|
||||||
assert len(result.pages) >= 2, \
|
assert len(result.pages) >= 2, \
|
||||||
@ -194,4 +230,8 @@ def test_no_timeout_scenario():
|
|||||||
# All pages should have size initialized
|
# All pages should have size initialized
|
||||||
for page in result.pages:
|
for page in result.pages:
|
||||||
assert page.size is not None, \
|
assert page.size is not None, \
|
||||||
f"Page {page.page_no} should have size initialized"
|
f"Page {page.page_no} should have size initialized"
|
||||||
|
|
||||||
|
# Status should be SUCCESS since no timeout occurred
|
||||||
|
assert result.status == ConversionStatus.SUCCESS, \
|
||||||
|
f"Expected SUCCESS without timeout, got {result.status}"
|
Loading…
Reference in New Issue
Block a user