Mirror of https://github.com/DS4SD/docling.git (synced 2025-07-24 19:14:23 +00:00)

Commit f56de726f3 (parent d6d2dbe2f9): Initial async pdf pipeline

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
@@ -332,3 +332,41 @@ class ProcessingPipeline(str, Enum):
    STANDARD = "standard"
    VLM = "vlm"
    ASR = "asr"


class AsyncPdfPipelineOptions(PdfPipelineOptions):
    """Enhanced options for async pipeline with cross-document batching"""

    # GPU batching configuration - larger than sync defaults
    layout_batch_size: int = 64
    ocr_batch_size: int = 32
    table_batch_size: int = 16

    # Async coordination
    batch_timeout_seconds: float = 2.0
    max_concurrent_extractions: int = 16

    # Queue sizes for backpressure
    extraction_queue_size: int = 100
    model_queue_size_multiplier: float = 2.0  # queue_size = batch_size * multiplier

    # Resource management
    max_gpu_memory_mb: Optional[int] = None
    enable_resource_monitoring: bool = True

    # Safety settings
    enable_exception_isolation: bool = True
    cleanup_validation: bool = True

    @classmethod
    def from_sync_options(
        cls, sync_options: PdfPipelineOptions
    ) -> "AsyncPdfPipelineOptions":
        """Convert sync options to async options"""
        # Start with sync options and override with async defaults
        data = sync_options.model_dump()

        # Remove sync-specific fields if any
        data.pop("page_batch_size", None)  # We don't use fixed page batching

        return cls(**data)
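As a usage illustration (an editor's sketch, not part of the commit): the new options class, imported elsewhere in this diff from docling.datamodel.pipeline_options, can be constructed directly with larger batch sizes or derived from an existing sync configuration via from_sync_options. Field names come from the hunk above; the chosen values are arbitrary.

from docling.datamodel.pipeline_options import (
    AsyncPdfPipelineOptions,
    PdfPipelineOptions,
)

# Direct construction, overriding the GPU batch defaults
async_opts = AsyncPdfPipelineOptions(layout_batch_size=128, ocr_batch_size=64)

# Or derive from an existing sync configuration
sync_opts = PdfPipelineOptions(do_table_structure=True)
async_opts = AsyncPdfPipelineOptions.from_sync_options(sync_opts)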
@@ -1,8 +1,9 @@
+import asyncio
 import hashlib
 import logging
 import sys
 import time
-from collections.abc import Iterable, Iterator
+from collections.abc import AsyncIterable, Iterable, Iterator
 from functools import partial
 from pathlib import Path
 from typing import Dict, List, Optional, Tuple, Type, Union
@@ -217,29 +218,29 @@ class DocumentConverter:
     @validate_call(config=ConfigDict(strict=True))
     def convert(
         self,
-        source: Union[Path, str, DocumentStream],  # TODO review naming
+        source: Union[Path, str, DocumentStream],
         headers: Optional[Dict[str, str]] = None,
         raises_on_error: bool = True,
         max_num_pages: int = sys.maxsize,
         max_file_size: int = sys.maxsize,
         page_range: PageRange = DEFAULT_PAGE_RANGE,
     ) -> ConversionResult:
-        all_res = self.convert_all(
+        for result in self.convert_all(
             source=[source],
+            headers=headers,
             raises_on_error=raises_on_error,
             max_num_pages=max_num_pages,
             max_file_size=max_file_size,
-            headers=headers,
             page_range=page_range,
-        )
-        return next(all_res)
+        ):
+            return result

     @validate_call(config=ConfigDict(strict=True))
     def convert_all(
         self,
-        source: Iterable[Union[Path, str, DocumentStream]],  # TODO review naming
+        source: Iterable[Union[Path, str, DocumentStream]],
         headers: Optional[Dict[str, str]] = None,
-        raises_on_error: bool = True,  # True: raises on first conversion error; False: does not raise on conv error
+        raises_on_error: bool = True,
         max_num_pages: int = sys.maxsize,
         max_file_size: int = sys.maxsize,
         page_range: PageRange = DEFAULT_PAGE_RANGE,
@@ -250,7 +251,10 @@ class DocumentConverter:
             page_range=page_range,
         )
         conv_input = _DocumentConversionInput(
-            path_or_stream_iterator=source, limits=limits, headers=headers
+            path_or_stream_iterator=source,
+            allowed_formats=self.allowed_formats,
+            limits=limits,
+            headers=headers,
         )
         conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)

@@ -272,6 +276,107 @@ class DocumentConverter:
                "Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
            )

    async def convert_all_async(
        self,
        source: Iterable[Union[Path, str, DocumentStream]],
        headers: Optional[Dict[str, str]] = None,
        raises_on_error: bool = True,
        max_num_pages: int = sys.maxsize,
        max_file_size: int = sys.maxsize,
        page_range: PageRange = DEFAULT_PAGE_RANGE,
    ) -> AsyncIterable[ConversionResult]:
        """
        Async version of convert_all with cross-document batching.

        Yields results as they complete, not necessarily in input order.
        """
        limits = DocumentLimits(
            max_num_pages=max_num_pages,
            max_file_size=max_file_size,
            page_range=page_range,
        )
        conv_input = _DocumentConversionInput(
            path_or_stream_iterator=source, limits=limits, headers=headers
        )

        # Create async document stream
        async def doc_stream():
            for doc in conv_input.docs(self.format_to_options):
                yield doc

        # Check if we have async-capable pipelines
        has_async = False
        for format_opt in self.format_to_options.values():
            if hasattr(format_opt.pipeline_cls, "execute_stream"):
                has_async = True
                break

        if has_async:
            # Use async pipeline for cross-document batching
            # For now, assume PDF pipeline handles all async processing
            pdf_format_opt = self.format_to_options.get(InputFormat.PDF)

            if pdf_format_opt is None:
                return

            pipeline_cls = pdf_format_opt.pipeline_cls
            if hasattr(pipeline_cls, "execute_stream"):
                # Initialize async pipeline
                pipeline_options = self.format_to_options[
                    InputFormat.PDF
                ].pipeline_options

                # Convert to async options if needed
                from docling.datamodel.pipeline_options import AsyncPdfPipelineOptions

                if not isinstance(pipeline_options, AsyncPdfPipelineOptions):
                    pipeline_options = AsyncPdfPipelineOptions.from_sync_options(
                        pipeline_options
                    )

                pipeline = pipeline_cls(pipeline_options)

                # Process all documents through async pipeline
                async for result in pipeline.execute_stream(doc_stream()):
                    yield result
            else:
                # Fallback to sequential async processing
                async for doc in doc_stream():
                    result = await asyncio.to_thread(
                        self._process_document, doc, raises_on_error
                    )
                    yield result
        else:
            # All pipelines are sync, process sequentially with threading
            async for doc in doc_stream():
                result = await asyncio.to_thread(
                    self._process_document, doc, raises_on_error
                )
                yield result

    async def convert_async(
        self,
        source: Union[Path, str, DocumentStream],
        headers: Optional[Dict[str, str]] = None,
        raises_on_error: bool = True,
        max_num_pages: int = sys.maxsize,
        max_file_size: int = sys.maxsize,
        page_range: PageRange = DEFAULT_PAGE_RANGE,
    ) -> ConversionResult:
        """Async convenience method for single document conversion."""
        async for result in self.convert_all_async(
            [source],
            headers=headers,
            raises_on_error=raises_on_error,
            max_num_pages=max_num_pages,
            max_file_size=max_file_size,
            page_range=page_range,
        ):
            return result

        # If no results were yielded, raise an error
        raise RuntimeError(f"No conversion result produced for source: {source}")

    def _convert(
        self, conv_input: _DocumentConversionInput, raises_on_error: bool
    ) -> Iterator[ConversionResult]:
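For orientation, the hunks above extend DocumentConverter. A caller would drive the new API from an event loop roughly as follows (an illustrative sketch, not part of the diff; with a stock converter this falls back to per-document threading unless an async-capable pipeline class is registered):

import asyncio

from docling.document_converter import DocumentConverter


async def main(paths):
    converter = DocumentConverter()
    # Results are yielded as documents complete, not in input order
    async for result in converter.convert_all_async(paths):
        print(result.input.file.name, result.status)


asyncio.run(main(["report-a.pdf", "report-b.pdf"]))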
docling/pipeline/async_base_pipeline.py (new file, 54 lines)
@@ -0,0 +1,54 @@
import asyncio
import logging
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from typing import AsyncIterable, Dict, Optional, Set, Tuple

from docling.backend.pdf_backend import PdfPageBackend
from docling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult, InputDocument
from docling.datamodel.pipeline_options import PipelineOptions

_log = logging.getLogger(__name__)


@dataclass
class DocumentTracker:
    """Tracks document processing state for resource management"""

    doc_id: str
    total_pages: int
    processed_pages: int = 0
    page_backends: Dict[int, PdfPageBackend] = field(
        default_factory=dict
    )  # page_no -> backend
    conv_result: Optional[ConversionResult] = None


class AsyncPipeline(ABC):
    """Base class for async pipeline implementations"""

    def __init__(self, pipeline_options: PipelineOptions):
        self.pipeline_options = pipeline_options
        self.keep_images = False
        self.keep_backend = False

    @abstractmethod
    async def execute_stream(
        self, input_docs: AsyncIterable[InputDocument]
    ) -> AsyncIterable[ConversionResult]:
        """Process multiple documents with cross-document batching"""

    async def execute_single(
        self, in_doc: InputDocument, raises_on_error: bool = True
    ) -> ConversionResult:
        """Process a single document - for backward compatibility"""

        async def single_doc_stream():
            yield in_doc

        async for result in self.execute_stream(single_doc_stream()):
            return result

        # Should never reach here
        raise RuntimeError("No result produced for document")
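To illustrate the contract of this base class (an editor's sketch, not part of the commit): a concrete subclass only needs to implement execute_stream as an async generator; execute_single then works by wrapping the document in a one-element stream, as defined above. EchoPipeline is a hypothetical name.

from typing import AsyncIterable

from docling.datamodel.document import ConversionResult, InputDocument
from docling.pipeline.async_base_pipeline import AsyncPipeline


class EchoPipeline(AsyncPipeline):
    """Do-nothing pipeline, used only to show the interface."""

    async def execute_stream(
        self, input_docs: AsyncIterable[InputDocument]
    ) -> AsyncIterable[ConversionResult]:
        async for in_doc in input_docs:
            # A real pipeline would extract, batch and assemble pages here
            yield ConversionResult(input=in_doc)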
docling/pipeline/async_standard_pdf_pipeline.py (new file, 945 lines)
@@ -0,0 +1,945 @@
import asyncio
import logging
import time
from collections import defaultdict
from dataclasses import dataclass, field
from typing import Any, AsyncIterable, Dict, List, Optional, Tuple

from docling.backend.pdf_backend import PdfDocumentBackend
from docling.datamodel.base_models import ConversionStatus, Page
from docling.datamodel.document import ConversionResult, InputDocument
from docling.datamodel.pipeline_options import AsyncPdfPipelineOptions
from docling.datamodel.settings import settings
from docling.models.base_ocr_model import BaseOcrModel
from docling.models.code_formula_model import CodeFormulaModel, CodeFormulaModelOptions
from docling.models.document_picture_classifier import (
    DocumentPictureClassifier,
    DocumentPictureClassifierOptions,
)
from docling.models.factories import get_ocr_factory, get_picture_description_factory

# Import the same models used by StandardPdfPipeline
from docling.models.layout_model import LayoutModel
from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
from docling.models.page_preprocessing_model import (
    PagePreprocessingModel,
    PagePreprocessingOptions,
)
from docling.models.readingorder_model import ReadingOrderModel, ReadingOrderOptions
from docling.models.table_structure_model import TableStructureModel
from docling.pipeline.async_base_pipeline import AsyncPipeline
from docling.pipeline.resource_manager import (
    AsyncPageTracker,
    ConversionResultAccumulator,
)
from docling.utils.profiling import ProfilingScope, TimeRecorder

_log = logging.getLogger(__name__)


@dataclass
class PageBatch:
    """Represents a batch of pages to process through models"""

    pages: List[Page] = field(default_factory=list)
    conv_results: List[ConversionResult] = field(default_factory=list)
    start_time: float = field(default_factory=time.time)


@dataclass
class QueueTerminator:
    """Sentinel value for proper queue termination tracking"""

    stage: str
    error: Optional[Exception] = None


class AsyncStandardPdfPipeline(AsyncPipeline):
    """Async pipeline implementation with cross-document batching using structured concurrency"""

    def __init__(self, pipeline_options: AsyncPdfPipelineOptions):
        super().__init__(pipeline_options)
        self.pipeline_options: AsyncPdfPipelineOptions = pipeline_options

        # Resource management
        self.page_tracker = AsyncPageTracker(
            keep_images=self._should_keep_images(),
            keep_backend=self._should_keep_backend(),
        )

        # Initialize models (same as StandardPdfPipeline)
        self._initialize_models()

    def _should_keep_images(self) -> bool:
        """Determine if images should be kept (same logic as StandardPdfPipeline)"""
        return (
            self.pipeline_options.generate_page_images
            or self.pipeline_options.generate_picture_images
            or self.pipeline_options.generate_table_images
        )

    def _should_keep_backend(self) -> bool:
        """Determine if backend should be kept"""
        return (
            self.pipeline_options.do_formula_enrichment
            or self.pipeline_options.do_code_enrichment
            or self.pipeline_options.do_picture_classification
            or self.pipeline_options.do_picture_description
        )

    def _initialize_models(self):
        """Initialize all models (matching StandardPdfPipeline)"""
        artifacts_path = self._get_artifacts_path()

        self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions())

        # Build pipeline stages
        self.preprocessing_model = PagePreprocessingModel(
            options=PagePreprocessingOptions(
                images_scale=self.pipeline_options.images_scale,
            )
        )

        self.ocr_model = self._get_ocr_model(artifacts_path)

        self.layout_model = LayoutModel(
            artifacts_path=artifacts_path,
            accelerator_options=self.pipeline_options.accelerator_options,
            options=self.pipeline_options.layout_options,
        )

        self.table_model = TableStructureModel(
            enabled=self.pipeline_options.do_table_structure,
            artifacts_path=artifacts_path,
            options=self.pipeline_options.table_structure_options,
            accelerator_options=self.pipeline_options.accelerator_options,
        )

        self.assemble_model = PageAssembleModel(options=PageAssembleOptions())

        # Enrichment models
        self.code_formula_model = CodeFormulaModel(
            enabled=self.pipeline_options.do_code_enrichment
            or self.pipeline_options.do_formula_enrichment,
            artifacts_path=artifacts_path,
            options=CodeFormulaModelOptions(
                do_code_enrichment=self.pipeline_options.do_code_enrichment,
                do_formula_enrichment=self.pipeline_options.do_formula_enrichment,
            ),
            accelerator_options=self.pipeline_options.accelerator_options,
        )

        self.picture_classifier = DocumentPictureClassifier(
            enabled=self.pipeline_options.do_picture_classification,
            artifacts_path=artifacts_path,
            options=DocumentPictureClassifierOptions(),
            accelerator_options=self.pipeline_options.accelerator_options,
        )

        self.picture_description_model = self._get_picture_description_model(
            artifacts_path
        )

    def _get_artifacts_path(self) -> Optional[str]:
        """Get artifacts path (same as StandardPdfPipeline)"""
        from pathlib import Path

        artifacts_path = None
        if self.pipeline_options.artifacts_path is not None:
            artifacts_path = Path(self.pipeline_options.artifacts_path).expanduser()
        elif settings.artifacts_path is not None:
            artifacts_path = Path(settings.artifacts_path).expanduser()

        if artifacts_path is not None and not artifacts_path.is_dir():
            raise RuntimeError(
                f"The value of {artifacts_path=} is not valid. "
                "When defined, it must point to a folder containing all models required by the pipeline."
            )
        return artifacts_path

    def _get_ocr_model(self, artifacts_path: Optional[str] = None) -> BaseOcrModel:
        """Get OCR model (same as StandardPdfPipeline)"""
        factory = get_ocr_factory(
            allow_external_plugins=self.pipeline_options.allow_external_plugins
        )
        return factory.create_instance(
            options=self.pipeline_options.ocr_options,
            enabled=self.pipeline_options.do_ocr,
            artifacts_path=artifacts_path,
            accelerator_options=self.pipeline_options.accelerator_options,
        )

    def _get_picture_description_model(self, artifacts_path: Optional[str] = None):
        """Get picture description model (same as StandardPdfPipeline)"""
        factory = get_picture_description_factory(
            allow_external_plugins=self.pipeline_options.allow_external_plugins
        )
        return factory.create_instance(
            options=self.pipeline_options.picture_description_options,
            enabled=self.pipeline_options.do_picture_description,
            enable_remote_services=self.pipeline_options.enable_remote_services,
            artifacts_path=artifacts_path,
            accelerator_options=self.pipeline_options.accelerator_options,
        )

    async def execute_stream(
        self, input_docs: AsyncIterable[InputDocument]
    ) -> AsyncIterable[ConversionResult]:
        """Main async processing with structured concurrency and proper exception handling"""
        # Create queues for pipeline stages
        page_queue = asyncio.Queue(maxsize=self.pipeline_options.extraction_queue_size)
        completed_queue = asyncio.Queue()
        completed_docs = asyncio.Queue()

        # Track active documents for proper termination
        doc_tracker = {"active_docs": 0, "extraction_done": False}
        doc_lock = asyncio.Lock()

        # Create exception event for coordinated shutdown
        exception_event = asyncio.Event()

        async def track_document_start():
            async with doc_lock:
                doc_tracker["active_docs"] += 1

        async def track_document_complete():
            async with doc_lock:
                doc_tracker["active_docs"] -= 1
                if doc_tracker["extraction_done"] and doc_tracker["active_docs"] == 0:
                    # All documents completed
                    await completed_docs.put(None)

        try:
            async with asyncio.TaskGroup() as tg:
                # Start all tasks
                tg.create_task(
                    self._extract_documents_wrapper(
                        input_docs,
                        page_queue,
                        track_document_start,
                        exception_event,
                        doc_tracker,
                        doc_lock,
                    )
                )
                tg.create_task(
                    self._process_pages_wrapper(
                        page_queue, completed_queue, exception_event
                    )
                )
                tg.create_task(
                    self._aggregate_results_wrapper(
                        completed_queue,
                        completed_docs,
                        track_document_complete,
                        exception_event,
                    )
                )

                # Yield results as they complete
                async for result in self._yield_results(
                    completed_docs, exception_event
                ):
                    yield result

        except* Exception as eg:
            # Handle exception group from TaskGroup
            _log.error(f"Pipeline failed with exceptions: {eg.exceptions}")
            # Re-raise the first exception
            raise (eg.exceptions[0] if eg.exceptions else RuntimeError("Unknown error"))
        finally:
            # Ensure cleanup
            await self.page_tracker.cleanup_all()
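The coordination above follows a common structured-concurrency shape: producer, worker and consumer tasks joined by queues, a sentinel object to mark end of stream, and a TaskGroup so that one failure cancels the rest. A standalone toy sketch of that shape (illustrative names only; asyncio.TaskGroup requires Python 3.11+):

import asyncio

SENTINEL = object()  # stands in for QueueTerminator


async def producer(q: asyncio.Queue):
    for i in range(5):
        await q.put(i)
    await q.put(SENTINEL)


async def worker(inq: asyncio.Queue, outq: asyncio.Queue):
    while True:
        item = await inq.get()
        if item is SENTINEL:
            await outq.put(SENTINEL)  # propagate termination downstream
            break
        await outq.put(item * 2)


async def main():
    a, b = asyncio.Queue(), asyncio.Queue()
    async with asyncio.TaskGroup() as tg:
        tg.create_task(producer(a))
        tg.create_task(worker(a, b))
        while (item := await b.get()) is not SENTINEL:
            print(item)


asyncio.run(main())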
    async def _extract_documents_wrapper(
        self,
        input_docs: AsyncIterable[InputDocument],
        page_queue: asyncio.Queue,
        track_document_start,
        exception_event: asyncio.Event,
        doc_tracker: Dict[str, Any],
        doc_lock: asyncio.Lock,
    ):
        """Wrapper for document extraction with exception handling"""
        try:
            await self._extract_documents_safe(
                input_docs,
                page_queue,
                track_document_start,
                exception_event,
            )
        except Exception:
            exception_event.set()
            raise
        finally:
            async with doc_lock:
                doc_tracker["extraction_done"] = True
            # Send termination signal
            await page_queue.put(QueueTerminator("extraction"))

    async def _process_pages_wrapper(
        self,
        page_queue: asyncio.Queue,
        completed_queue: asyncio.Queue,
        exception_event: asyncio.Event,
    ):
        """Wrapper for page processing with exception handling"""
        try:
            await self._process_pages_safe(page_queue, completed_queue, exception_event)
        except Exception:
            exception_event.set()
            raise
        finally:
            # Send termination signal
            await completed_queue.put(QueueTerminator("processing"))

    async def _aggregate_results_wrapper(
        self,
        completed_queue: asyncio.Queue,
        completed_docs: asyncio.Queue,
        track_document_complete,
        exception_event: asyncio.Event,
    ):
        """Wrapper for result aggregation with exception handling"""
        try:
            await self._aggregate_results_safe(
                completed_queue,
                completed_docs,
                track_document_complete,
                exception_event,
            )
        except Exception:
            exception_event.set()
            raise

    async def _yield_results(
        self, completed_docs: asyncio.Queue, exception_event: asyncio.Event
    ):
        """Yield results as they complete"""
        while True:
            if exception_event.is_set():
                break

            try:
                result = await asyncio.wait_for(completed_docs.get(), timeout=1.0)
                if result is None:
                    break
                yield result
            except asyncio.TimeoutError:
                continue
            except Exception:
                exception_event.set()
                raise

    async def _extract_documents_safe(
        self,
        input_docs: AsyncIterable[InputDocument],
        page_queue: asyncio.Queue,
        track_document_start,
        exception_event: asyncio.Event,
    ) -> None:
        """Extract pages from documents with exception handling"""
        async for in_doc in input_docs:
            if exception_event.is_set():
                break

            await track_document_start()
            conv_res = ConversionResult(input=in_doc)

            # Validate backend
            if not isinstance(conv_res.input._backend, PdfDocumentBackend):
                conv_res.status = ConversionStatus.FAILURE
                await page_queue.put((None, conv_res))  # Signal failed document
                continue

            try:
                # Initialize document
                total_pages = conv_res.input.page_count
                await self.page_tracker.register_document(conv_res, total_pages)

                # Extract pages with limited concurrency
                semaphore = asyncio.Semaphore(
                    self.pipeline_options.max_concurrent_extractions
                )

                async def extract_page(page_no: int):
                    if exception_event.is_set():
                        return

                    async with semaphore:
                        # Create page
                        page = Page(page_no=page_no)
                        conv_res.pages.append(page)

                        # Initialize page backend
                        page._backend = await asyncio.to_thread(
                            conv_res.input._backend.load_page, page_no
                        )

                        if page._backend is not None and page._backend.is_valid():
                            page.size = page._backend.get_size()
                            await self.page_tracker.track_page_loaded(page, conv_res)

                        # Send to processing queue
                        await page_queue.put((page, conv_res))

                # Extract all pages concurrently
                async with asyncio.TaskGroup() as tg:
                    for i in range(total_pages):
                        if exception_event.is_set():
                            break
                        start_page, end_page = conv_res.input.limits.page_range
                        if (start_page - 1) <= i <= (end_page - 1):
                            tg.create_task(extract_page(i))

            except Exception as e:
                _log.error(f"Failed to extract document {in_doc.file.name}: {e}")
                conv_res.status = ConversionStatus.FAILURE
                # Signal document failure
                await page_queue.put((None, conv_res))
                raise
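The page extraction above bounds blocking backend calls with a semaphore and fans them out inside a TaskGroup. A stripped-down sketch of just that pattern (load_page below is a placeholder, not the docling backend call):

import asyncio
import time


def load_page(page_no: int) -> str:
    time.sleep(0.1)  # placeholder for blocking page-loading work
    return f"page-{page_no}"


async def extract_all(total_pages: int, max_concurrent: int = 4) -> list:
    semaphore = asyncio.Semaphore(max_concurrent)
    results = []

    async def extract(page_no: int) -> None:
        async with semaphore:
            # Run the blocking loader in a thread so the event loop stays free
            results.append(await asyncio.to_thread(load_page, page_no))

    async with asyncio.TaskGroup() as tg:
        for i in range(total_pages):
            tg.create_task(extract(i))
    return results


print(asyncio.run(extract_all(8)))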
    async def _process_pages_safe(
        self,
        page_queue: asyncio.Queue,
        completed_queue: asyncio.Queue,
        exception_event: asyncio.Event,
    ) -> None:
        """Process pages through model pipeline with proper termination"""
        # Process batches through each model stage
        preprocessing_queue = asyncio.Queue()
        ocr_queue = asyncio.Queue()
        layout_queue = asyncio.Queue()
        table_queue = asyncio.Queue()
        assemble_queue = asyncio.Queue()

        # Start processing stages using TaskGroup
        async with asyncio.TaskGroup() as tg:
            # Preprocessing stage
            tg.create_task(
                self._batch_process_stage_safe(
                    page_queue,
                    preprocessing_queue,
                    self._preprocess_batch,
                    1,
                    0,  # No batching for preprocessing
                    "preprocessing",
                    exception_event,
                )
            )

            # OCR stage
            tg.create_task(
                self._batch_process_stage_safe(
                    preprocessing_queue,
                    ocr_queue,
                    self._ocr_batch,
                    self.pipeline_options.ocr_batch_size,
                    self.pipeline_options.batch_timeout_seconds,
                    "ocr",
                    exception_event,
                )
            )

            # Layout stage
            tg.create_task(
                self._batch_process_stage_safe(
                    ocr_queue,
                    layout_queue,
                    self._layout_batch,
                    self.pipeline_options.layout_batch_size,
                    self.pipeline_options.batch_timeout_seconds,
                    "layout",
                    exception_event,
                )
            )

            # Table stage
            tg.create_task(
                self._batch_process_stage_safe(
                    layout_queue,
                    table_queue,
                    self._table_batch,
                    self.pipeline_options.table_batch_size,
                    self.pipeline_options.batch_timeout_seconds,
                    "table",
                    exception_event,
                )
            )

            # Assembly stage
            tg.create_task(
                self._batch_process_stage_safe(
                    table_queue,
                    assemble_queue,
                    self._assemble_batch,
                    1,
                    0,  # No batching for assembly
                    "assembly",
                    exception_event,
                )
            )

            # Finalization stage
            tg.create_task(
                self._finalize_pages_safe(
                    assemble_queue, completed_queue, exception_event
                )
            )

    async def _batch_process_stage_safe(
        self,
        input_queue: asyncio.Queue,
        output_queue: asyncio.Queue,
        process_func,
        batch_size: int,
        timeout: float,
        stage_name: str,
        exception_event: asyncio.Event,
    ) -> None:
        """Generic batch processing stage with proper termination handling"""
        batch = PageBatch()

        try:
            while not exception_event.is_set():
                # Collect batch
                try:
                    # Get first item or wait for timeout
                    if not batch.pages:
                        item = await input_queue.get()

                        # Check for termination
                        if isinstance(item, QueueTerminator):
                            # Propagate termination signal
                            await output_queue.put(item)
                            break

                        # Handle failed document signal
                        if item[0] is None:
                            # Pass through failure signal
                            await output_queue.put(item)
                            continue

                        batch.pages.append(item[0])
                        batch.conv_results.append(item[1])

                    # Try to fill batch up to batch_size
                    while len(batch.pages) < batch_size:
                        remaining_time = timeout - (time.time() - batch.start_time)
                        if remaining_time <= 0:
                            break

                        try:
                            item = await asyncio.wait_for(
                                input_queue.get(), timeout=remaining_time
                            )

                            # Check for termination
                            if isinstance(item, QueueTerminator):
                                # Put it back and process current batch
                                await input_queue.put(item)
                                break

                            # Handle failed document signal
                            if item[0] is None:
                                # Put it back and process current batch
                                await input_queue.put(item)
                                break

                            batch.pages.append(item[0])
                            batch.conv_results.append(item[1])
                        except asyncio.TimeoutError:
                            break

                    # Process batch
                    if batch.pages:
                        processed = await process_func(batch)

                        # Send results to output queue
                        for page, conv_res in processed:
                            await output_queue.put((page, conv_res))

                        # Clear batch
                        batch = PageBatch()

                except Exception as e:
                    _log.error(f"Error in {stage_name} batch processing: {e}")
                    # Send failed items downstream
                    for page, conv_res in zip(batch.pages, batch.conv_results):
                        await output_queue.put((page, conv_res))
                    batch = PageBatch()
                    raise

        except Exception as e:
            # Set exception event and propagate termination
            exception_event.set()
            await output_queue.put(QueueTerminator(stage_name, error=e))
            raise
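The batch-collection loop above is the piece that trades a little latency for GPU utilisation: take one item, then keep pulling until the batch is full or the timeout expires. Isolated as a helper, that collection step looks roughly like this (an illustrative sketch, not part of the commit):

import asyncio
import time


async def collect_batch(queue: asyncio.Queue, batch_size: int, timeout: float) -> list:
    batch = [await queue.get()]  # block until the first item arrives
    deadline = time.time() + timeout
    while len(batch) < batch_size:
        remaining = deadline - time.time()
        if remaining <= 0:
            break
        try:
            batch.append(await asyncio.wait_for(queue.get(), timeout=remaining))
        except asyncio.TimeoutError:
            break
    return batch

With the defaults from the options hunk (batch_timeout_seconds = 2.0, ocr_batch_size = 32), an OCR stage therefore ships whatever pages have arrived within two seconds rather than waiting indefinitely for a full batch.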
    async def _preprocess_batch(
        self, batch: PageBatch
    ) -> List[Tuple[Page, ConversionResult]]:
        """Preprocess pages (no actual batching needed)"""
        results = []
        for page, conv_res in zip(batch.pages, batch.conv_results):
            processed_page = await asyncio.to_thread(
                lambda: next(iter(self.preprocessing_model(conv_res, [page])))
            )
            results.append((processed_page, conv_res))
        return results

    async def _ocr_batch(self, batch: PageBatch) -> List[Tuple[Page, ConversionResult]]:
        """Process OCR in batch"""
        # Group by conversion result for proper context
        grouped = defaultdict(list)
        for page, conv_res in zip(batch.pages, batch.conv_results):
            grouped[id(conv_res)].append(page)

        results = []
        for conv_res_id, pages in grouped.items():
            # Find the conv_res
            conv_res = next(
                cr
                for p, cr in zip(batch.pages, batch.conv_results)
                if id(cr) == conv_res_id
            )

            # Process batch through OCR model
            processed_pages = await asyncio.to_thread(
                lambda: list(self.ocr_model(conv_res, pages))
            )

            for page in processed_pages:
                results.append((page, conv_res))

        return results

    async def _layout_batch(
        self, batch: PageBatch
    ) -> List[Tuple[Page, ConversionResult]]:
        """Process layout in batch"""
        # Similar batching as OCR
        grouped = defaultdict(list)
        for page, conv_res in zip(batch.pages, batch.conv_results):
            grouped[id(conv_res)].append(page)

        results = []
        for conv_res_id, pages in grouped.items():
            conv_res = next(
                cr
                for p, cr in zip(batch.pages, batch.conv_results)
                if id(cr) == conv_res_id
            )

            processed_pages = await asyncio.to_thread(
                lambda: list(self.layout_model(conv_res, pages))
            )

            for page in processed_pages:
                results.append((page, conv_res))

        return results

    async def _table_batch(
        self, batch: PageBatch
    ) -> List[Tuple[Page, ConversionResult]]:
        """Process tables in batch"""
        grouped = defaultdict(list)
        for page, conv_res in zip(batch.pages, batch.conv_results):
            grouped[id(conv_res)].append(page)

        results = []
        for conv_res_id, pages in grouped.items():
            conv_res = next(
                cr
                for p, cr in zip(batch.pages, batch.conv_results)
                if id(cr) == conv_res_id
            )

            processed_pages = await asyncio.to_thread(
                lambda: list(self.table_model(conv_res, pages))
            )

            for page in processed_pages:
                results.append((page, conv_res))

        return results

    async def _assemble_batch(
        self, batch: PageBatch
    ) -> List[Tuple[Page, ConversionResult]]:
        """Assemble pages (no actual batching needed)"""
        results = []
        for page, conv_res in zip(batch.pages, batch.conv_results):
            assembled_page = await asyncio.to_thread(
                lambda: next(iter(self.assemble_model(conv_res, [page])))
            )
            results.append((assembled_page, conv_res))
        return results

    async def _finalize_pages_safe(
        self,
        input_queue: asyncio.Queue,
        output_queue: asyncio.Queue,
        exception_event: asyncio.Event,
    ) -> None:
        """Finalize pages and track completion with proper termination"""
        try:
            while not exception_event.is_set():
                item = await input_queue.get()

                # Check for termination
                if isinstance(item, QueueTerminator):
                    # Propagate termination signal
                    await output_queue.put(item)
                    break

                # Handle failed document signal
                if item[0] is None:
                    # Pass through failure signal
                    await output_queue.put(item)
                    continue

                page, conv_res = item

                # Track page completion for resource cleanup
                await self.page_tracker.track_page_completion(page, conv_res)

                # Send to output
                await output_queue.put((page, conv_res))

        except Exception as e:
            exception_event.set()
            await output_queue.put(QueueTerminator("finalization", error=e))
            raise

    async def _aggregate_results_safe(
        self,
        completed_queue: asyncio.Queue,
        completed_docs: asyncio.Queue,
        track_document_complete,
        exception_event: asyncio.Event,
    ) -> None:
        """Aggregate completed pages into documents with proper termination"""
        doc_pages = defaultdict(list)
        failed_docs = set()

        try:
            while not exception_event.is_set():
                item = await completed_queue.get()

                # Check for termination
                if isinstance(item, QueueTerminator):
                    # Finalize any remaining documents
                    for conv_res_id, pages in doc_pages.items():
                        if conv_res_id not in failed_docs:
                            # Find conv_res from first page
                            conv_res = pages[0][1]
                            await self._finalize_document(conv_res)
                            await completed_docs.put(conv_res)
                            await track_document_complete()
                    break

                # Handle failed document signal
                if item[0] is None:
                    conv_res = item[1]
                    doc_id = id(conv_res)
                    failed_docs.add(doc_id)
                    # Send failed document immediately
                    await completed_docs.put(conv_res)
                    await track_document_complete()
                    continue

                page, conv_res = item
                doc_id = id(conv_res)

                if doc_id not in failed_docs:
                    doc_pages[doc_id].append((page, conv_res))

                    # Check if document is complete
                    if len(doc_pages[doc_id]) == len(conv_res.pages):
                        await self._finalize_document(conv_res)
                        await completed_docs.put(conv_res)
                        await track_document_complete()
                        del doc_pages[doc_id]

        except Exception:
            exception_event.set()
            # Try to send any completed documents before failing
            for conv_res_id, pages in doc_pages.items():
                if conv_res_id not in failed_docs and pages:
                    conv_res = pages[0][1]
                    conv_res.status = ConversionStatus.PARTIAL_SUCCESS
                    await completed_docs.put(conv_res)
                    await track_document_complete()
            raise

    async def _finalize_document(self, conv_res: ConversionResult) -> None:
        """Finalize a complete document (same as StandardPdfPipeline._assemble_document)"""
        # This matches the logic from StandardPdfPipeline
        import warnings

        import numpy as np

        from docling.datamodel.base_models import AssembledUnit

        all_elements = []
        all_headers = []
        all_body = []

        with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):
            for p in conv_res.pages:
                if p.assembled is not None:
                    for el in p.assembled.body:
                        all_body.append(el)
                    for el in p.assembled.headers:
                        all_headers.append(el)
                    for el in p.assembled.elements:
                        all_elements.append(el)

            conv_res.assembled = AssembledUnit(
                elements=all_elements, headers=all_headers, body=all_body
            )

            conv_res.document = self.reading_order_model(conv_res)

            # Generate page images in the output
            if self.pipeline_options.generate_page_images:
                for page in conv_res.pages:
                    if page.image is not None:
                        page_no = page.page_no + 1
                        from docling_core.types.doc import ImageRef

                        conv_res.document.pages[page_no].image = ImageRef.from_pil(
                            page.image, dpi=int(72 * self.pipeline_options.images_scale)
                        )

            # Handle picture/table images (same as StandardPdfPipeline)
            self._generate_element_images(conv_res)

            # Aggregate confidence values
            self._aggregate_confidence(conv_res)

        # Run enrichment pipeline
        await self._enrich_document(conv_res)

        # Set final status
        conv_res.status = self._determine_status(conv_res)

    def _generate_element_images(self, conv_res: ConversionResult) -> None:
        """Generate images for elements (same as StandardPdfPipeline)"""
        import warnings

        from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem

        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=DeprecationWarning)
            if (
                self.pipeline_options.generate_picture_images
                or self.pipeline_options.generate_table_images
            ):
                scale = self.pipeline_options.images_scale
                for element, _level in conv_res.document.iterate_items():
                    if not isinstance(element, DocItem) or len(element.prov) == 0:
                        continue
                    if (
                        isinstance(element, PictureItem)
                        and self.pipeline_options.generate_picture_images
                    ) or (
                        isinstance(element, TableItem)
                        and self.pipeline_options.generate_table_images
                    ):
                        page_ix = element.prov[0].page_no - 1
                        page = next(
                            (p for p in conv_res.pages if p.page_no == page_ix), None
                        )
                        if (
                            page is not None
                            and page.size is not None
                            and page.image is not None
                        ):
                            crop_bbox = (
                                element.prov[0]
                                .bbox.scaled(scale=scale)
                                .to_top_left_origin(
                                    page_height=page.size.height * scale
                                )
                            )
                            cropped_im = page.image.crop(crop_bbox.as_tuple())
                            element.image = ImageRef.from_pil(
                                cropped_im, dpi=int(72 * scale)
                            )

    def _aggregate_confidence(self, conv_res: ConversionResult) -> None:
        """Aggregate confidence scores (same as StandardPdfPipeline)"""
        import warnings

        import numpy as np

        if len(conv_res.pages) > 0:
            with warnings.catch_warnings():
                warnings.filterwarnings(
                    "ignore",
                    category=RuntimeWarning,
                    message="Mean of empty slice|All-NaN slice encountered",
                )
                conv_res.confidence.layout_score = float(
                    np.nanmean(
                        [c.layout_score for c in conv_res.confidence.pages.values()]
                    )
                )
                conv_res.confidence.parse_score = float(
                    np.nanquantile(
                        [c.parse_score for c in conv_res.confidence.pages.values()],
                        q=0.1,
                    )
                )
                conv_res.confidence.table_score = float(
                    np.nanmean(
                        [c.table_score for c in conv_res.confidence.pages.values()]
                    )
                )
                conv_res.confidence.ocr_score = float(
                    np.nanmean(
                        [c.ocr_score for c in conv_res.confidence.pages.values()]
                    )
                )

    async def _enrich_document(self, conv_res: ConversionResult) -> None:
        """Run enrichment models on document"""
        # Run enrichment models (same as base pipeline but async)
        from docling.utils.utils import chunkify

        enrichment_models = [
            self.code_formula_model,
            self.picture_classifier,
            self.picture_description_model,
        ]

        for model in enrichment_models:
            if model is None or not getattr(model, "enabled", True):
                continue

            # Prepare elements
            elements_to_process = []
            for doc_element, _level in conv_res.document.iterate_items():
                prepared = model.prepare_element(conv_res=conv_res, element=doc_element)
                if prepared is not None:
                    elements_to_process.append(prepared)

            # Process in batches
            for element_batch in chunkify(
                elements_to_process, model.elements_batch_size
            ):
                # Run model in thread to avoid blocking
                await asyncio.to_thread(
                    lambda: list(model(conv_res.document, element_batch))
                )

    def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
        """Determine conversion status"""
        # Simple implementation - could be enhanced
        if conv_res.pages and conv_res.document:
            return ConversionStatus.SUCCESS
        else:
            return ConversionStatus.FAILURE
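Putting the pieces together (an illustrative sketch, not part of the commit): convert_all_async selects the async path when the registered PDF pipeline_cls exposes execute_stream, so the new pipeline can be wired in through the converter's existing format_options mechanism; PdfFormatOption is the pre-existing docling hook assumed here.

import asyncio

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import AsyncPdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.async_standard_pdf_pipeline import AsyncStandardPdfPipeline


async def main(paths):
    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=AsyncStandardPdfPipeline,
                pipeline_options=AsyncPdfPipelineOptions(layout_batch_size=64),
            )
        }
    )
    async for result in converter.convert_all_async(paths):
        print(result.input.file.name, result.status)


asyncio.run(main(["report-1.pdf", "report-2.pdf"]))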