Mirror of https://github.com/DS4SD/docling.git (synced 2025-12-08 12:48:28 +00:00)
feat: Use threading in the standard pipeline and move old behavior to legacy (#2452)
* rename standard to legacy. Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* remove old standard pipeline. Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* move threaded to standard. Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* add backwards compatible threaded pipeline. Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* Updates for threaded pipeline to lower memory requirements. Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* updating deps seem to remove the corrupted double-linked list error. Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* update pinning. Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* use main lock. Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* add more threadsafe blocks. Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* rename batch_timeout_seconds. Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
docling/datamodel/pipeline_options.py
@@ -361,15 +361,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
     generate_parsed_pages: bool = False
 
 
-class ProcessingPipeline(str, Enum):
-    STANDARD = "standard"
-    VLM = "vlm"
-    ASR = "asr"
-
-
-class ThreadedPdfPipelineOptions(PdfPipelineOptions):
-    """Pipeline options for the threaded PDF pipeline with batching and backpressure control"""
-
+    ### Arguments for threaded PDF pipeline with batching and backpressure control
     # Batch sizes for different stages
     ocr_batch_size: int = 4
@@ -377,7 +369,18 @@ class ThreadedPdfPipelineOptions(PdfPipelineOptions):
     table_batch_size: int = 4
 
     # Timing control
-    batch_timeout_seconds: float = 2.0
+    batch_polling_interval_seconds: float = 0.5
 
     # Backpressure and queue control
     queue_max_size: int = 100
+
+
+class ProcessingPipeline(str, Enum):
+    LEGACY = "legacy"
+    STANDARD = "standard"
+    VLM = "vlm"
+    ASR = "asr"
+
+
+class ThreadedPdfPipelineOptions(PdfPipelineOptions):
+    """Pipeline options for the threaded PDF pipeline with batching and backpressure control"""
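The batching and back-pressure knobs appear to move onto PdfPipelineOptions itself, while ThreadedPdfPipelineOptions is kept as a thin backwards-compatible subclass. A minimal sketch of tuning them; the field names and defaults are taken from this diff, the concrete values below are purely illustrative:

    from docling.datamodel.pipeline_options import ThreadedPdfPipelineOptions

    # Defaults in the diff: batch sizes 4, batch_polling_interval_seconds 0.5
    # (renamed from batch_timeout_seconds), queue_max_size 100.
    opts = ThreadedPdfPipelineOptions(
        ocr_batch_size=8,
        layout_batch_size=8,
        table_batch_size=4,
        batch_polling_interval_seconds=0.25,
        queue_max_size=50,
    )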
docling/models/layout_model.py
@@ -167,6 +167,10 @@ class LayoutModel(BasePageModel):
                 valid_pages.append(page)
                 valid_page_images.append(page_image)
 
+        print(f"{len(pages)=}, {pages[0].page_no}-{pages[-1].page_no}")
+        print(f"{len(valid_pages)=}")
+        print(f"{len(valid_page_images)=}")
+
         # Process all valid pages with batch prediction
         batch_predictions = []
         if valid_page_images:
docling/pipeline/legacy_standard_pdf_pipeline.py (new file, 242 lines)
@@ -0,0 +1,242 @@
import logging
import warnings
from pathlib import Path
from typing import Optional, cast

import numpy as np
from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem

from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend
from docling.datamodel.base_models import AssembledUnit, Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.layout_model_specs import LayoutModelConfig
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.settings import settings
from docling.models.base_ocr_model import BaseOcrModel
from docling.models.code_formula_model import CodeFormulaModel, CodeFormulaModelOptions
from docling.models.factories import get_ocr_factory
from docling.models.layout_model import LayoutModel
from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
from docling.models.page_preprocessing_model import (
    PagePreprocessingModel,
    PagePreprocessingOptions,
)
from docling.models.readingorder_model import ReadingOrderModel, ReadingOrderOptions
from docling.models.table_structure_model import TableStructureModel
from docling.pipeline.base_pipeline import PaginatedPipeline
from docling.utils.model_downloader import download_models
from docling.utils.profiling import ProfilingScope, TimeRecorder

_log = logging.getLogger(__name__)


class LegacyStandardPdfPipeline(PaginatedPipeline):
    def __init__(self, pipeline_options: PdfPipelineOptions):
        super().__init__(pipeline_options)
        self.pipeline_options: PdfPipelineOptions

        with warnings.catch_warnings():  # deprecated generate_table_images
            warnings.filterwarnings("ignore", category=DeprecationWarning)
            self.keep_images = (
                self.pipeline_options.generate_page_images
                or self.pipeline_options.generate_picture_images
                or self.pipeline_options.generate_table_images
            )

        self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions())

        ocr_model = self.get_ocr_model(artifacts_path=self.artifacts_path)

        self.build_pipe = [
            # Pre-processing
            PagePreprocessingModel(
                options=PagePreprocessingOptions(
                    images_scale=pipeline_options.images_scale,
                )
            ),
            # OCR
            ocr_model,
            # Layout model
            LayoutModel(
                artifacts_path=self.artifacts_path,
                accelerator_options=pipeline_options.accelerator_options,
                options=pipeline_options.layout_options,
            ),
            # Table structure model
            TableStructureModel(
                enabled=pipeline_options.do_table_structure,
                artifacts_path=self.artifacts_path,
                options=pipeline_options.table_structure_options,
                accelerator_options=pipeline_options.accelerator_options,
            ),
            # Page assemble
            PageAssembleModel(options=PageAssembleOptions()),
        ]

        self.enrichment_pipe = [
            # Code Formula Enrichment Model
            CodeFormulaModel(
                enabled=pipeline_options.do_code_enrichment
                or pipeline_options.do_formula_enrichment,
                artifacts_path=self.artifacts_path,
                options=CodeFormulaModelOptions(
                    do_code_enrichment=pipeline_options.do_code_enrichment,
                    do_formula_enrichment=pipeline_options.do_formula_enrichment,
                ),
                accelerator_options=pipeline_options.accelerator_options,
            ),
            *self.enrichment_pipe,
        ]

        if (
            self.pipeline_options.do_formula_enrichment
            or self.pipeline_options.do_code_enrichment
            or self.pipeline_options.do_picture_classification
            or self.pipeline_options.do_picture_description
        ):
            self.keep_backend = True

    @staticmethod
    def download_models_hf(
        local_dir: Optional[Path] = None, force: bool = False
    ) -> Path:
        warnings.warn(
            "The usage of LegacyStandardPdfPipeline.download_models_hf() is deprecated "
            "use instead the utility `docling-tools models download`, or "
            "the upstream method docling.utils.models_downloader.download_all()",
            DeprecationWarning,
            stacklevel=3,
        )

        output_dir = download_models(output_dir=local_dir, force=force, progress=False)
        return output_dir

    def get_ocr_model(self, artifacts_path: Optional[Path] = None) -> BaseOcrModel:
        factory = get_ocr_factory(
            allow_external_plugins=self.pipeline_options.allow_external_plugins
        )
        return factory.create_instance(
            options=self.pipeline_options.ocr_options,
            enabled=self.pipeline_options.do_ocr,
            artifacts_path=artifacts_path,
            accelerator_options=self.pipeline_options.accelerator_options,
        )

    def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
        with TimeRecorder(conv_res, "page_init"):
            page._backend = conv_res.input._backend.load_page(page.page_no)  # type: ignore
            if page._backend is not None and page._backend.is_valid():
                page.size = page._backend.get_size()

        return page

    def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
        all_elements = []
        all_headers = []
        all_body = []

        with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):
            for p in conv_res.pages:
                if p.assembled is not None:
                    for el in p.assembled.body:
                        all_body.append(el)
                    for el in p.assembled.headers:
                        all_headers.append(el)
                    for el in p.assembled.elements:
                        all_elements.append(el)

            conv_res.assembled = AssembledUnit(
                elements=all_elements, headers=all_headers, body=all_body
            )

            conv_res.document = self.reading_order_model(conv_res)

            # Generate page images in the output
            if self.pipeline_options.generate_page_images:
                for page in conv_res.pages:
                    assert page.image is not None
                    page_no = page.page_no + 1
                    conv_res.document.pages[page_no].image = ImageRef.from_pil(
                        page.image, dpi=int(72 * self.pipeline_options.images_scale)
                    )

            # Generate images of the requested element types
            with warnings.catch_warnings():  # deprecated generate_table_images
                warnings.filterwarnings("ignore", category=DeprecationWarning)
                if (
                    self.pipeline_options.generate_picture_images
                    or self.pipeline_options.generate_table_images
                ):
                    scale = self.pipeline_options.images_scale
                    for element, _level in conv_res.document.iterate_items():
                        if not isinstance(element, DocItem) or len(element.prov) == 0:
                            continue
                        if (
                            isinstance(element, PictureItem)
                            and self.pipeline_options.generate_picture_images
                        ) or (
                            isinstance(element, TableItem)
                            and self.pipeline_options.generate_table_images
                        ):
                            page_ix = element.prov[0].page_no - 1
                            page = next(
                                (p for p in conv_res.pages if p.page_no == page_ix),
                                cast("Page", None),
                            )
                            assert page is not None
                            assert page.size is not None
                            assert page.image is not None

                            crop_bbox = (
                                element.prov[0]
                                .bbox.scaled(scale=scale)
                                .to_top_left_origin(
                                    page_height=page.size.height * scale
                                )
                            )

                            cropped_im = page.image.crop(crop_bbox.as_tuple())
                            element.image = ImageRef.from_pil(
                                cropped_im, dpi=int(72 * scale)
                            )

            # Aggregate confidence values for document:
            if len(conv_res.pages) > 0:
                with warnings.catch_warnings():
                    warnings.filterwarnings(
                        "ignore",
                        category=RuntimeWarning,
                        message="Mean of empty slice|All-NaN slice encountered",
                    )
                    conv_res.confidence.layout_score = float(
                        np.nanmean(
                            [c.layout_score for c in conv_res.confidence.pages.values()]
                        )
                    )
                    conv_res.confidence.parse_score = float(
                        np.nanquantile(
                            [c.parse_score for c in conv_res.confidence.pages.values()],
                            q=0.1,  # parse score should relate to worst 10% of pages.
                        )
                    )
                    conv_res.confidence.table_score = float(
                        np.nanmean(
                            [c.table_score for c in conv_res.confidence.pages.values()]
                        )
                    )
                    conv_res.confidence.ocr_score = float(
                        np.nanmean(
                            [c.ocr_score for c in conv_res.confidence.pages.values()]
                        )
                    )

        return conv_res

    @classmethod
    def get_default_options(cls) -> PdfPipelineOptions:
        return PdfPipelineOptions()

    @classmethod
    def is_backend_supported(cls, backend: AbstractDocumentBackend):
        return isinstance(backend, PdfDocumentBackend)
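The pre-threading behavior stays available through this class. A minimal sketch of opting back into it explicitly; the DocumentConverter/PdfFormatOption wiring is assumed from the wider docling API and is not part of this diff:

    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import PdfPipelineOptions
    from docling.document_converter import DocumentConverter, PdfFormatOption
    from docling.pipeline.legacy_standard_pdf_pipeline import LegacyStandardPdfPipeline

    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=LegacyStandardPdfPipeline,  # single-threaded, pre-#2452 behavior
                pipeline_options=PdfPipelineOptions(),
            )
        }
    )
    result = converter.convert("document.pdf")  # hypothetical input path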
docling/pipeline/standard_pdf_pipeline.py: replaced by the threaded implementation. The removed lines correspond to the previous standard pipeline, which now lives in docling/pipeline/legacy_standard_pdf_pipeline.py (above); the hunks below list the new content.
@@ -1,19 +1,39 @@
"""Thread-safe, production-ready PDF pipeline
================================================
A self-contained, thread-safe PDF conversion pipeline exploiting parallelism between pipeline stages and models.

* **Per-run isolation** - every :py:meth:`execute` call uses its own bounded queues and worker
  threads so that concurrent invocations never share mutable state.
* **Deterministic run identifiers** - pages are tracked with an internal *run-id* instead of
  relying on :pyfunc:`id`, which may clash after garbage collection.
* **Explicit back-pressure & shutdown** - producers block on full queues; queue *close()*
  propagates downstream so stages terminate deterministically without sentinels.
* **Minimal shared state** - heavyweight models are initialised once per pipeline instance
  and only read by worker threads; no runtime mutability is exposed.
* **Strict typing & clean API usage** - code is fully annotated and respects *coding_rules.md*.
"""

from __future__ import annotations

import itertools
import logging
import threading
import time
import warnings
from collections import defaultdict, deque
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Callable, Iterable, List, Optional, Sequence, Tuple, cast

import numpy as np
from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem

from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend
from docling.datamodel.base_models import AssembledUnit, ConversionStatus, Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import ThreadedPdfPipelineOptions
from docling.datamodel.settings import settings
from docling.models.code_formula_model import CodeFormulaModel, CodeFormulaModelOptions
from docling.models.factories import get_ocr_factory
from docling.models.layout_model import LayoutModel
@@ -24,132 +44,588 @@ from docling.models.page_preprocessing_model import (
)
from docling.models.readingorder_model import ReadingOrderModel, ReadingOrderOptions
from docling.models.table_structure_model import TableStructureModel
from docling.pipeline.base_pipeline import ConvertPipeline
from docling.utils.profiling import ProfilingScope, TimeRecorder
from docling.utils.utils import chunkify

_log = logging.getLogger(__name__)

# ──────────────────────────────────────────────────────────────────────────────
# Helper data structures
# ──────────────────────────────────────────────────────────────────────────────


@dataclass
class ThreadedItem:
    """Envelope that travels between pipeline stages."""

    payload: Optional[Page]
    run_id: int  # Unique per *execute* call, monotonic across pipeline instance
    page_no: int
    conv_res: ConversionResult
    error: Optional[Exception] = None
    is_failed: bool = False


@dataclass
class ProcessingResult:
    """Aggregated outcome of a pipeline run."""

    pages: List[Page] = field(default_factory=list)
    failed_pages: List[Tuple[int, Exception]] = field(default_factory=list)
    total_expected: int = 0

    @property
    def success_count(self) -> int:
        return len(self.pages)

    @property
    def failure_count(self) -> int:
        return len(self.failed_pages)

    @property
    def is_partial_success(self) -> bool:
        return 0 < self.success_count < self.total_expected

    @property
    def is_complete_failure(self) -> bool:
        return self.success_count == 0 and self.failure_count > 0


class ThreadedQueue:
    """Bounded queue with blocking put/ get_batch and explicit *close()* semantics."""

    __slots__ = ("_closed", "_items", "_lock", "_max", "_not_empty", "_not_full")

    def __init__(self, max_size: int) -> None:
        self._max: int = max_size
        self._items: deque[ThreadedItem] = deque()
        self._lock = threading.Lock()
        self._not_full = threading.Condition(self._lock)
        self._not_empty = threading.Condition(self._lock)
        self._closed = False

    # ---------------------------------------------------------------- put()
    def put(self, item: ThreadedItem, timeout: Optional[float] | None = None) -> bool:
        """Block until queue accepts *item* or is closed. Returns *False* if closed."""
        with self._not_full:
            if self._closed:
                return False
            start = time.monotonic()
            while len(self._items) >= self._max and not self._closed:
                if timeout is not None:
                    remaining = timeout - (time.monotonic() - start)
                    if remaining <= 0:
                        return False
                    self._not_full.wait(remaining)
                else:
                    self._not_full.wait()
            if self._closed:
                return False
            self._items.append(item)
            self._not_empty.notify()
            return True

    # ------------------------------------------------------------ get_batch()
    def get_batch(
        self, size: int, timeout: Optional[float] | None = None
    ) -> List[ThreadedItem]:
        """Return up to *size* items. Blocks until ≥1 item present or queue closed/timeout."""
        with self._not_empty:
            start = time.monotonic()
            while not self._items and not self._closed:
                if timeout is not None:
                    remaining = timeout - (time.monotonic() - start)
                    if remaining <= 0:
                        return []
                    self._not_empty.wait(remaining)
                else:
                    self._not_empty.wait()
            batch: List[ThreadedItem] = []
            while self._items and len(batch) < size:
                batch.append(self._items.popleft())
            if batch:
                self._not_full.notify_all()
            return batch

    # ---------------------------------------------------------------- close()
    def close(self) -> None:
        with self._lock:
            self._closed = True
            self._not_empty.notify_all()
            self._not_full.notify_all()

    # -------------------------------------------------------------- property
    @property
    def closed(self) -> bool:
        return self._closed


class ThreadedPipelineStage:
    """A single pipeline stage backed by one worker thread."""

    def __init__(
        self,
        *,
        name: str,
        model: Any,
        batch_size: int,
        batch_timeout: float,
        queue_max_size: int,
        postprocess: Optional[Callable[[ThreadedItem], None]] = None,
    ) -> None:
        self.name = name
        self.model = model
        self.batch_size = batch_size
        self.batch_timeout = batch_timeout
        self.input_queue = ThreadedQueue(queue_max_size)
        self._outputs: list[ThreadedQueue] = []
        self._thread: Optional[threading.Thread] = None
        self._running = False
        self._postprocess = postprocess

    # ---------------------------------------------------------------- wiring
    def add_output_queue(self, q: ThreadedQueue) -> None:
        self._outputs.append(q)

    # -------------------------------------------------------------- lifecycle
    def start(self) -> None:
        if self._running:
            return
        self._running = True
        self._thread = threading.Thread(
            target=self._run, name=f"Stage-{self.name}", daemon=True
        )
        self._thread.start()

    def stop(self) -> None:
        if not self._running:
            return
        self._running = False
        self.input_queue.close()
        if self._thread is not None:
            self._thread.join(timeout=30.0)
            if self._thread.is_alive():
                _log.warning("Stage %s did not terminate cleanly within 30s", self.name)

    # ------------------------------------------------------------------ _run
    def _run(self) -> None:
        try:
            while self._running:
                batch = self.input_queue.get_batch(self.batch_size, self.batch_timeout)
                if not batch and self.input_queue.closed:
                    break
                processed = self._process_batch(batch)
                self._emit(processed)
        except Exception:  # pragma: no cover - top-level guard
            _log.exception("Fatal error in stage %s", self.name)
        finally:
            for q in self._outputs:
                q.close()

    # ----------------------------------------------------- _process_batch()
    def _process_batch(self, batch: Sequence[ThreadedItem]) -> list[ThreadedItem]:
        """Run *model* on *batch* grouped by run_id to maximise batching."""
        groups: dict[int, list[ThreadedItem]] = defaultdict(list)
        for itm in batch:
            groups[itm.run_id].append(itm)

        result: list[ThreadedItem] = []
        for rid, items in groups.items():
            good: list[ThreadedItem] = [i for i in items if not i.is_failed]
            if not good:
                result.extend(items)
                continue
            try:
                # Filter out None payloads and ensure type safety
                pages_with_payloads = [
                    (i, i.payload) for i in good if i.payload is not None
                ]
                if len(pages_with_payloads) != len(good):
                    # Some items have None payloads, mark all as failed
                    for it in items:
                        it.is_failed = True
                        it.error = RuntimeError("Page payload is None")
                    result.extend(items)
                    continue

                pages: List[Page] = [payload for _, payload in pages_with_payloads]
                processed_pages = list(self.model(good[0].conv_res, pages))  # type: ignore[arg-type]
                if len(processed_pages) != len(pages):  # strict mismatch guard
                    raise RuntimeError(
                        f"Model {self.name} returned wrong number of pages"
                    )
                for idx, page in enumerate(processed_pages):
                    result.append(
                        ThreadedItem(
                            payload=page,
                            run_id=rid,
                            page_no=good[idx].page_no,
                            conv_res=good[idx].conv_res,
                        )
                    )
            except Exception as exc:
                _log.error("Stage %s failed for run %d: %s", self.name, rid, exc)
                for it in items:
                    it.is_failed = True
                    it.error = exc
                result.extend(items)
        return result

    # -------------------------------------------------------------- _emit()
    def _emit(self, items: Iterable[ThreadedItem]) -> None:
        for item in items:
            if self._postprocess is not None:
                self._postprocess(item)
            for q in self._outputs:
                if not q.put(item):
                    _log.error("Output queue closed while emitting from %s", self.name)

class PreprocessThreadedStage(ThreadedPipelineStage):
    """Pipeline stage that lazily loads PDF backends just-in-time."""

    def __init__(
        self,
        *,
        batch_timeout: float,
        queue_max_size: int,
        model: Any,
    ) -> None:
        super().__init__(
            name="preprocess",
            model=model,
            batch_size=1,
            batch_timeout=batch_timeout,
            queue_max_size=queue_max_size,
        )

    def _process_batch(self, batch: Sequence[ThreadedItem]) -> list[ThreadedItem]:
        groups: dict[int, list[ThreadedItem]] = defaultdict(list)
        for itm in batch:
            groups[itm.run_id].append(itm)

        result: list[ThreadedItem] = []
        for rid, items in groups.items():
            good = [i for i in items if not i.is_failed]
            if not good:
                result.extend(items)
                continue
            try:
                pages_with_payloads: list[tuple[ThreadedItem, Page]] = []
                for it in good:
                    page = it.payload
                    if page is None:
                        raise RuntimeError("Page payload is None")
                    if page._backend is None:
                        backend = it.conv_res.input._backend
                        assert isinstance(backend, PdfDocumentBackend), (
                            "Threaded pipeline only supports PdfDocumentBackend."
                        )
                        page_backend = backend.load_page(page.page_no)
                        page._backend = page_backend
                        if page_backend.is_valid():
                            page.size = page_backend.get_size()
                    pages_with_payloads.append((it, page))

                pages = [payload for _, payload in pages_with_payloads]
                processed_pages = list(
                    self.model(good[0].conv_res, pages)  # type: ignore[arg-type]
                )
                if len(processed_pages) != len(pages):
                    raise RuntimeError(
                        "PagePreprocessingModel returned unexpected number of pages"
                    )
                for idx, processed_page in enumerate(processed_pages):
                    result.append(
                        ThreadedItem(
                            payload=processed_page,
                            run_id=rid,
                            page_no=good[idx].page_no,
                            conv_res=good[idx].conv_res,
                        )
                    )
            except Exception as exc:
                _log.error("Stage preprocess failed for run %d: %s", rid, exc)
                for it in items:
                    it.is_failed = True
                    it.error = exc
                result.extend(items)
        return result


@dataclass
class RunContext:
    """Wiring for a single *execute* call."""

    stages: list[ThreadedPipelineStage]
    first_stage: ThreadedPipelineStage
    output_queue: ThreadedQueue


# ──────────────────────────────────────────────────────────────────────────────
# Main pipeline
# ──────────────────────────────────────────────────────────────────────────────

class StandardPdfPipeline(ConvertPipeline):
    """High-performance PDF pipeline with multi-threaded stages."""

    def __init__(self, pipeline_options: ThreadedPdfPipelineOptions) -> None:
        super().__init__(pipeline_options)
        self.pipeline_options: ThreadedPdfPipelineOptions = pipeline_options
        self._run_seq = itertools.count(1)  # deterministic, monotonic run ids

        # initialise heavy models once
        self._init_models()

    # ────────────────────────────────────────────────────────────────────────
    # Heavy-model initialisation & helpers
    # ────────────────────────────────────────────────────────────────────────

    def _init_models(self) -> None:
        art_path = self.artifacts_path
        self.keep_images = (
            self.pipeline_options.generate_page_images
            or self.pipeline_options.generate_picture_images
            or self.pipeline_options.generate_table_images
        )
        self.preprocessing_model = PagePreprocessingModel(
            options=PagePreprocessingOptions(
                images_scale=self.pipeline_options.images_scale
            )
        )
        self.ocr_model = self._make_ocr_model(art_path)
        self.layout_model = LayoutModel(
            artifacts_path=art_path,
            accelerator_options=self.pipeline_options.accelerator_options,
            options=self.pipeline_options.layout_options,
        )
        self.table_model = TableStructureModel(
            enabled=self.pipeline_options.do_table_structure,
            artifacts_path=art_path,
            options=self.pipeline_options.table_structure_options,
            accelerator_options=self.pipeline_options.accelerator_options,
        )
        self.assemble_model = PageAssembleModel(options=PageAssembleOptions())
        self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions())

        # --- optional enrichment ------------------------------------------------
        self.enrichment_pipe = [
            # Code Formula Enrichment Model
            CodeFormulaModel(
                enabled=self.pipeline_options.do_code_enrichment
                or self.pipeline_options.do_formula_enrichment,
                artifacts_path=self.artifacts_path,
                options=CodeFormulaModelOptions(
                    do_code_enrichment=self.pipeline_options.do_code_enrichment,
                    do_formula_enrichment=self.pipeline_options.do_formula_enrichment,
                ),
                accelerator_options=self.pipeline_options.accelerator_options,
            ),
            *self.enrichment_pipe,
        ]

        self.keep_backend = any(
            (
                self.pipeline_options.do_formula_enrichment,
                self.pipeline_options.do_code_enrichment,
                self.pipeline_options.do_picture_classification,
                self.pipeline_options.do_picture_description,
            )
        )

    # ---------------------------------------------------------------- helpers
    def _make_ocr_model(self, art_path: Optional[Path]) -> Any:
        factory = get_ocr_factory(
            allow_external_plugins=self.pipeline_options.allow_external_plugins
        )
        return factory.create_instance(
            options=self.pipeline_options.ocr_options,
            enabled=self.pipeline_options.do_ocr,
            artifacts_path=art_path,
            accelerator_options=self.pipeline_options.accelerator_options,
        )

    def _release_page_resources(self, item: ThreadedItem) -> None:
        page = item.payload
        if page is None:
            return
        if not self.keep_images:
            page._image_cache = {}
        if not self.keep_backend and page._backend is not None:
            page._backend.unload()
            page._backend = None
        if not self.pipeline_options.generate_parsed_pages:
            page.parsed_page = None

    # ────────────────────────────────────────────────────────────────────────
    # Build - thread pipeline
    # ────────────────────────────────────────────────────────────────────────

    def _create_run_ctx(self) -> RunContext:
        opts = self.pipeline_options
        preprocess = PreprocessThreadedStage(
            batch_timeout=opts.batch_polling_interval_seconds,
            queue_max_size=opts.queue_max_size,
            model=self.preprocessing_model,
        )
        ocr = ThreadedPipelineStage(
            name="ocr",
            model=self.ocr_model,
            batch_size=opts.ocr_batch_size,
            batch_timeout=opts.batch_polling_interval_seconds,
            queue_max_size=opts.queue_max_size,
        )
        layout = ThreadedPipelineStage(
            name="layout",
            model=self.layout_model,
            batch_size=opts.layout_batch_size,
            batch_timeout=opts.batch_polling_interval_seconds,
            queue_max_size=opts.queue_max_size,
        )
        table = ThreadedPipelineStage(
            name="table",
            model=self.table_model,
            batch_size=opts.table_batch_size,
            batch_timeout=opts.batch_polling_interval_seconds,
            queue_max_size=opts.queue_max_size,
        )
        assemble = ThreadedPipelineStage(
            name="assemble",
            model=self.assemble_model,
            batch_size=1,
            batch_timeout=opts.batch_polling_interval_seconds,
            queue_max_size=opts.queue_max_size,
            postprocess=self._release_page_resources,
        )

        # wire stages
        output_q = ThreadedQueue(opts.queue_max_size)
        preprocess.add_output_queue(ocr.input_queue)
        ocr.add_output_queue(layout.input_queue)
        layout.add_output_queue(table.input_queue)
        table.add_output_queue(assemble.input_queue)
        assemble.add_output_queue(output_q)

        stages = [preprocess, ocr, layout, table, assemble]
        return RunContext(stages=stages, first_stage=preprocess, output_queue=output_q)

    # --------------------------------------------------------------------- build
    def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
        """Stream-build the document while interleaving producer and consumer work."""
        run_id = next(self._run_seq)
        assert isinstance(conv_res.input._backend, PdfDocumentBackend)

        # Collect page placeholders; backends are loaded lazily in preprocess stage
        start_page, end_page = conv_res.input.limits.page_range
        pages: list[Page] = []
        for i in range(conv_res.input.page_count):
            if start_page - 1 <= i <= end_page - 1:
                page = Page(page_no=i)
                conv_res.pages.append(page)
                pages.append(page)

        if not pages:
            conv_res.status = ConversionStatus.FAILURE
            return conv_res

        total_pages: int = len(pages)
        ctx: RunContext = self._create_run_ctx()
        for st in ctx.stages:
            st.start()

        proc = ProcessingResult(total_expected=total_pages)
        fed_idx: int = 0  # number of pages successfully queued
        batch_size: int = 32  # drain chunk
        try:
            while proc.success_count + proc.failure_count < total_pages:
                # 1) feed - try to enqueue until the first queue is full
                while fed_idx < total_pages:
                    ok = ctx.first_stage.input_queue.put(
                        ThreadedItem(
                            payload=pages[fed_idx],
                            run_id=run_id,
                            page_no=pages[fed_idx].page_no,
                            conv_res=conv_res,
                        ),
                        timeout=0.0,  # non-blocking try-put
                    )
                    if ok:
                        fed_idx += 1
                        if fed_idx == total_pages:
                            ctx.first_stage.input_queue.close()
                    else:  # queue full - switch to draining
                        break

                # 2) drain - pull whatever is ready from the output side
                out_batch = ctx.output_queue.get_batch(batch_size, timeout=0.05)
                for itm in out_batch:
                    if itm.run_id != run_id:
                        continue
                    if itm.is_failed or itm.error:
                        proc.failed_pages.append(
                            (itm.page_no, itm.error or RuntimeError("unknown error"))
                        )
                    else:
                        assert itm.payload is not None
                        proc.pages.append(itm.payload)

                # 3) failure safety - downstream closed early -> mark missing pages failed
                if not out_batch and ctx.output_queue.closed:
                    missing = total_pages - (proc.success_count + proc.failure_count)
                    if missing > 0:
                        proc.failed_pages.extend(
                            [(-1, RuntimeError("pipeline terminated early"))] * missing
                        )
                    break
        finally:
            for st in ctx.stages:
                st.stop()
            ctx.output_queue.close()

        self._integrate_results(conv_res, proc)
        return conv_res

    # ---------------------------------------------------- integrate_results()
    def _integrate_results(
        self, conv_res: ConversionResult, proc: ProcessingResult
    ) -> None:
        page_map = {p.page_no: p for p in proc.pages}
        conv_res.pages = [
            page_map.get(p.page_no, p)
            for p in conv_res.pages
            if p.page_no in page_map
            or not any(fp == p.page_no for fp, _ in proc.failed_pages)
        ]
        if proc.is_complete_failure:
            conv_res.status = ConversionStatus.FAILURE
        elif proc.is_partial_success:
            conv_res.status = ConversionStatus.PARTIAL_SUCCESS
        else:
            conv_res.status = ConversionStatus.SUCCESS
        if not self.keep_images:
            for p in conv_res.pages:
                p._image_cache = {}
        for p in conv_res.pages:
            if not self.keep_backend and p._backend is not None:
                p._backend.unload()
            if not self.pipeline_options.generate_parsed_pages:
                del p.parsed_page
                p.parsed_page = None

    # ---------------------------------------------------------------- assemble
    def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
        elements, headers, body = [], [], []

        with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):
            for p in conv_res.pages:
                if p.assembled:
                    elements.extend(p.assembled.elements)
                    headers.extend(p.assembled.headers)
                    body.extend(p.assembled.body)

            conv_res.assembled = AssembledUnit(
                elements=elements, headers=headers, body=body
            )

            conv_res.document = self.reading_order_model(conv_res)

            # Generate page images in the output
@@ -233,10 +709,21 @@ class StandardPdfPipeline(PaginatedPipeline):
        return conv_res

    # ---------------------------------------------------------------- misc
    @classmethod
    def get_default_options(cls) -> ThreadedPdfPipelineOptions:
        return ThreadedPdfPipelineOptions()

    @classmethod
    def is_backend_supported(cls, backend: AbstractDocumentBackend) -> bool:
        return isinstance(backend, PdfDocumentBackend)

    def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
        return conv_res.status

    def _unload(self, conv_res: ConversionResult) -> None:
        for p in conv_res.pages:
            if p._backend is not None:
                p._backend.unload()
        if conv_res.input._backend:
            conv_res.input._backend.unload()
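The stage workers above communicate only through ThreadedQueue, so back-pressure and shutdown fall out of its put/get_batch/close semantics. A standalone sketch of those semantics, using the class exactly as defined in this file; the tiny capacity and the placeholder payload/conv_res values are illustrative only:

    import threading

    from docling.pipeline.standard_pdf_pipeline import ThreadedItem, ThreadedQueue

    q = ThreadedQueue(max_size=2)  # small capacity so the producer hits back-pressure quickly

    def produce() -> None:
        for page_no in range(5):
            # Placeholder payload/conv_res, only to exercise the queue in isolation.
            item = ThreadedItem(payload=None, run_id=1, page_no=page_no, conv_res=None)  # type: ignore[arg-type]
            if not q.put(item):  # blocks while the queue is full; returns False only after close()
                break
        q.close()  # wakes any blocked consumer and ends the stream

    threading.Thread(target=produce, daemon=True).start()

    drained = []
    while True:
        batch = q.get_batch(size=3, timeout=0.5)  # up to 3 items, or [] on timeout / closed queue
        drained.extend(item.page_no for item in batch)
        if not batch and q.closed:
            break

    print(drained)  # [0, 1, 2, 3, 4]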
docling/pipeline/threaded_standard_pdf_pipeline.py: reduced to a backwards-compatible alias. The 647 removed lines are the former threaded implementation, which has moved to docling/pipeline/standard_pdf_pipeline.py (above).
@@ -1,647 +1,5 @@
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline


class ThreadedStandardPdfPipeline(StandardPdfPipeline):
    """Backwards compatible import for ThreadedStandardPdfPipeline."""
import itertools
|
|
||||||
import logging
|
|
||||||
import threading
|
|
||||||
import time
|
|
||||||
import warnings
|
|
||||||
from collections import defaultdict, deque
|
|
||||||
from dataclasses import dataclass, field
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Any, Iterable, List, Optional, Sequence, Tuple, cast
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
|
|
||||||
|
|
||||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
|
||||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
|
||||||
from docling.datamodel.base_models import AssembledUnit, ConversionStatus, Page
|
|
||||||
from docling.datamodel.document import ConversionResult
|
|
||||||
from docling.datamodel.pipeline_options import ThreadedPdfPipelineOptions
|
|
||||||
from docling.datamodel.settings import settings
|
|
||||||
from docling.models.code_formula_model import CodeFormulaModel, CodeFormulaModelOptions
|
|
||||||
from docling.models.factories import get_ocr_factory
|
|
||||||
from docling.models.layout_model import LayoutModel
|
|
||||||
from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
|
|
||||||
from docling.models.page_preprocessing_model import (
|
|
||||||
PagePreprocessingModel,
|
|
||||||
PagePreprocessingOptions,
|
|
||||||
)
|
|
||||||
from docling.models.readingorder_model import ReadingOrderModel, ReadingOrderOptions
|
|
||||||
from docling.models.table_structure_model import TableStructureModel
|
|
||||||
from docling.pipeline.base_pipeline import ConvertPipeline
|
|
||||||
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
|
||||||
from docling.utils.utils import chunkify
|
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
# ──────────────────────────────────────────────────────────────────────────────
|
|
||||||
# Helper data structures
|
|
||||||
# ──────────────────────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class ThreadedItem:
|
|
||||||
"""Envelope that travels between pipeline stages."""
|
|
||||||
|
|
||||||
payload: Optional[Page]
|
|
||||||
run_id: int # Unique per *execute* call, monotonic across pipeline instance
|
|
||||||
page_no: int
|
|
||||||
conv_res: ConversionResult
|
|
||||||
error: Optional[Exception] = None
|
|
||||||
is_failed: bool = False
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class ProcessingResult:
|
|
||||||
"""Aggregated outcome of a pipeline run."""
|
|
||||||
|
|
||||||
pages: List[Page] = field(default_factory=list)
|
|
||||||
failed_pages: List[Tuple[int, Exception]] = field(default_factory=list)
|
|
||||||
total_expected: int = 0
|
|
||||||
|
|
||||||
@property
|
|
||||||
def success_count(self) -> int:
|
|
||||||
return len(self.pages)
|
|
||||||
|
|
||||||
@property
|
|
||||||
def failure_count(self) -> int:
|
|
||||||
return len(self.failed_pages)
|
|
||||||
|
|
||||||
@property
|
|
||||||
def is_partial_success(self) -> bool:
|
|
||||||
return 0 < self.success_count < self.total_expected
|
|
||||||
|
|
||||||
@property
|
|
||||||
def is_complete_failure(self) -> bool:
|
|
||||||
return self.success_count == 0 and self.failure_count > 0
|
|
||||||
|
|
||||||
|
|
||||||
class ThreadedQueue:
|
|
||||||
"""Bounded queue with blocking put/ get_batch and explicit *close()* semantics."""
|
|
||||||
|
|
||||||
__slots__ = ("_closed", "_items", "_lock", "_max", "_not_empty", "_not_full")
|
|
||||||
|
|
||||||
def __init__(self, max_size: int) -> None:
|
|
||||||
self._max: int = max_size
|
|
||||||
self._items: deque[ThreadedItem] = deque()
|
|
||||||
self._lock = threading.Lock()
|
|
||||||
self._not_full = threading.Condition(self._lock)
|
|
||||||
self._not_empty = threading.Condition(self._lock)
|
|
||||||
self._closed = False
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------- put()
|
|
||||||
def put(self, item: ThreadedItem, timeout: Optional[float] | None = None) -> bool:
|
|
||||||
"""Block until queue accepts *item* or is closed. Returns *False* if closed."""
|
|
||||||
with self._not_full:
|
|
||||||
if self._closed:
|
|
||||||
return False
|
|
||||||
start = time.monotonic()
|
|
||||||
while len(self._items) >= self._max and not self._closed:
|
|
||||||
if timeout is not None:
|
|
||||||
remaining = timeout - (time.monotonic() - start)
|
|
||||||
if remaining <= 0:
|
|
||||||
return False
|
|
||||||
self._not_full.wait(remaining)
|
|
||||||
else:
|
|
||||||
self._not_full.wait()
|
|
||||||
if self._closed:
|
|
||||||
return False
|
|
||||||
self._items.append(item)
|
|
||||||
self._not_empty.notify()
|
|
||||||
return True
|
|
||||||
|
|
||||||
# ------------------------------------------------------------ get_batch()
|
|
||||||
def get_batch(
|
|
||||||
self, size: int, timeout: Optional[float] | None = None
|
|
||||||
) -> List[ThreadedItem]:
|
|
||||||
"""Return up to *size* items. Blocks until ≥1 item present or queue closed/timeout."""
|
|
||||||
with self._not_empty:
|
|
||||||
start = time.monotonic()
|
|
||||||
while not self._items and not self._closed:
|
|
||||||
if timeout is not None:
|
|
||||||
remaining = timeout - (time.monotonic() - start)
|
|
||||||
if remaining <= 0:
|
|
||||||
return []
|
|
||||||
self._not_empty.wait(remaining)
|
|
||||||
else:
|
|
||||||
self._not_empty.wait()
|
|
||||||
batch: List[ThreadedItem] = []
|
|
||||||
while self._items and len(batch) < size:
|
|
||||||
batch.append(self._items.popleft())
|
|
||||||
if batch:
|
|
||||||
self._not_full.notify_all()
|
|
||||||
return batch
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------- close()
|
|
||||||
def close(self) -> None:
|
|
||||||
with self._lock:
|
|
||||||
self._closed = True
|
|
||||||
self._not_empty.notify_all()
|
|
||||||
self._not_full.notify_all()
|
|
||||||
|
|
||||||
# -------------------------------------------------------------- property
|
|
||||||
@property
|
|
||||||
def closed(self) -> bool:
|
|
||||||
return self._closed
|
|
||||||
|
|
||||||
|
|
||||||
class ThreadedPipelineStage:
|
|
||||||
"""A single pipeline stage backed by one worker thread."""
|
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
*,
|
|
||||||
name: str,
|
|
||||||
model: Any,
|
|
||||||
batch_size: int,
|
|
||||||
batch_timeout: float,
|
|
||||||
queue_max_size: int,
|
|
||||||
) -> None:
|
|
||||||
self.name = name
|
|
||||||
self.model = model
|
|
||||||
self.batch_size = batch_size
|
|
||||||
self.batch_timeout = batch_timeout
|
|
||||||
self.input_queue = ThreadedQueue(queue_max_size)
|
|
||||||
self._outputs: list[ThreadedQueue] = []
|
|
||||||
self._thread: Optional[threading.Thread] = None
|
|
||||||
self._running = False
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------- wiring
|
|
||||||
def add_output_queue(self, q: ThreadedQueue) -> None:
|
|
||||||
self._outputs.append(q)
|
|
||||||
|
|
||||||
# -------------------------------------------------------------- lifecycle
|
|
||||||
def start(self) -> None:
|
|
||||||
if self._running:
|
|
||||||
return
|
|
||||||
self._running = True
|
|
||||||
self._thread = threading.Thread(
|
|
||||||
target=self._run, name=f"Stage-{self.name}", daemon=True
|
|
||||||
)
|
|
||||||
self._thread.start()
|
|
||||||
|
|
||||||
def stop(self) -> None:
|
|
||||||
if not self._running:
|
|
||||||
return
|
|
||||||
self._running = False
|
|
||||||
self.input_queue.close()
|
|
||||||
if self._thread is not None:
|
|
||||||
self._thread.join(timeout=30.0)
|
|
||||||
if self._thread.is_alive():
|
|
||||||
_log.warning("Stage %s did not terminate cleanly within 30s", self.name)
|
|
||||||
|
|
||||||
# ------------------------------------------------------------------ _run
|
|
||||||
def _run(self) -> None:
|
|
||||||
try:
|
|
||||||
while self._running:
|
|
||||||
batch = self.input_queue.get_batch(self.batch_size, self.batch_timeout)
|
|
||||||
if not batch and self.input_queue.closed:
|
|
||||||
break
|
|
||||||
processed = self._process_batch(batch)
|
|
||||||
self._emit(processed)
|
|
||||||
except Exception: # pragma: no cover - top-level guard
|
|
||||||
_log.exception("Fatal error in stage %s", self.name)
|
|
||||||
finally:
|
|
||||||
for q in self._outputs:
|
|
||||||
q.close()
|
|
||||||
|
|
||||||
    # ----------------------------------------------------- _process_batch()
    def _process_batch(self, batch: Sequence[ThreadedItem]) -> list[ThreadedItem]:
        """Run *model* on *batch* grouped by run_id to maximise batching."""
        groups: dict[int, list[ThreadedItem]] = defaultdict(list)
        for itm in batch:
            groups[itm.run_id].append(itm)

        result: list[ThreadedItem] = []
        for rid, items in groups.items():
            good: list[ThreadedItem] = [i for i in items if not i.is_failed]
            if not good:
                result.extend(items)
                continue
            try:
                # Filter out None payloads and ensure type safety
                pages_with_payloads = [
                    (i, i.payload) for i in good if i.payload is not None
                ]
                if len(pages_with_payloads) != len(good):
                    # Some items have None payloads, mark all as failed
                    for it in items:
                        it.is_failed = True
                        it.error = RuntimeError("Page payload is None")
                    result.extend(items)
                    continue

                pages: List[Page] = [payload for _, payload in pages_with_payloads]
                processed_pages = list(self.model(good[0].conv_res, pages))  # type: ignore[arg-type]
                if len(processed_pages) != len(pages):  # strict mismatch guard
                    raise RuntimeError(
                        f"Model {self.name} returned wrong number of pages"
                    )
                for idx, page in enumerate(processed_pages):
                    result.append(
                        ThreadedItem(
                            payload=page,
                            run_id=rid,
                            page_no=good[idx].page_no,
                            conv_res=good[idx].conv_res,
                        )
                    )
            except Exception as exc:
                _log.error("Stage %s failed for run %d: %s", self.name, rid, exc)
                for it in items:
                    it.is_failed = True
                    it.error = exc
                result.extend(items)
        return result

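    # Note on batching: when several documents are converted concurrently, one
    # physical batch may mix pages from different runs. Grouping by `run_id`
    # above ensures each model call only sees pages of a single ConversionResult
    # (passed as `good[0].conv_res`), while failed items are passed through
    # untouched so the error surfaces at the end of the pipeline.
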
    # -------------------------------------------------------------- _emit()
    def _emit(self, items: Iterable[ThreadedItem]) -> None:
        for item in items:
            for q in self._outputs:
                if not q.put(item):
                    _log.error("Output queue closed while emitting from %s", self.name)

@dataclass
class RunContext:
    """Wiring for a single *execute* call."""

    stages: list[ThreadedPipelineStage]
    first_stage: ThreadedPipelineStage
    output_queue: ThreadedQueue


# ──────────────────────────────────────────────────────────────────────────────
# Main pipeline
# ──────────────────────────────────────────────────────────────────────────────

class ThreadedStandardPdfPipeline(ConvertPipeline):
    """High-performance PDF pipeline with multi-threaded stages."""

    def __init__(self, pipeline_options: ThreadedPdfPipelineOptions) -> None:
        super().__init__(pipeline_options)
        self.pipeline_options: ThreadedPdfPipelineOptions = pipeline_options
        self._run_seq = itertools.count(1)  # deterministic, monotonic run ids

        # initialise heavy models once
        self._init_models()

    # ────────────────────────────────────────────────────────────────────────
    # Heavy-model initialisation & helpers
    # ────────────────────────────────────────────────────────────────────────

    def _init_models(self) -> None:
        art_path = self.artifacts_path
        self.keep_images = (
            self.pipeline_options.generate_page_images
            or self.pipeline_options.generate_picture_images
            or self.pipeline_options.generate_table_images
        )
        self.preprocessing_model = PagePreprocessingModel(
            options=PagePreprocessingOptions(
                images_scale=self.pipeline_options.images_scale
            )
        )
        self.ocr_model = self._make_ocr_model(art_path)
        self.layout_model = LayoutModel(
            artifacts_path=art_path,
            accelerator_options=self.pipeline_options.accelerator_options,
            options=self.pipeline_options.layout_options,
        )
        self.table_model = TableStructureModel(
            enabled=self.pipeline_options.do_table_structure,
            artifacts_path=art_path,
            options=self.pipeline_options.table_structure_options,
            accelerator_options=self.pipeline_options.accelerator_options,
        )
        self.assemble_model = PageAssembleModel(options=PageAssembleOptions())
        self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions())

        # --- optional enrichment ------------------------------------------------
        self.enrichment_pipe = [
            # Code Formula Enrichment Model
            CodeFormulaModel(
                enabled=self.pipeline_options.do_code_enrichment
                or self.pipeline_options.do_formula_enrichment,
                artifacts_path=self.artifacts_path,
                options=CodeFormulaModelOptions(
                    do_code_enrichment=self.pipeline_options.do_code_enrichment,
                    do_formula_enrichment=self.pipeline_options.do_formula_enrichment,
                ),
                accelerator_options=self.pipeline_options.accelerator_options,
            ),
            *self.enrichment_pipe,
        ]

        self.keep_backend = any(
            (
                self.pipeline_options.do_formula_enrichment,
                self.pipeline_options.do_code_enrichment,
                self.pipeline_options.do_picture_classification,
                self.pipeline_options.do_picture_description,
            )
        )

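    # Note: the CodeFormulaModel above is prepended to whatever enrichment models
    # were already registered (`*self.enrichment_pipe`), and `keep_backend` keeps
    # the page backends loaded whenever at least one enrichment option still needs
    # access to the original page content after assembly.
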
    # ---------------------------------------------------------------- helpers
    def _make_ocr_model(self, art_path: Optional[Path]) -> Any:
        factory = get_ocr_factory(
            allow_external_plugins=self.pipeline_options.allow_external_plugins
        )
        return factory.create_instance(
            options=self.pipeline_options.ocr_options,
            enabled=self.pipeline_options.do_ocr,
            artifacts_path=art_path,
            accelerator_options=self.pipeline_options.accelerator_options,
        )

    # ────────────────────────────────────────────────────────────────────────
    # Build - thread pipeline
    # ────────────────────────────────────────────────────────────────────────

    def _create_run_ctx(self) -> RunContext:
        opts = self.pipeline_options
        preprocess = ThreadedPipelineStage(
            name="preprocess",
            model=self.preprocessing_model,
            batch_size=1,
            batch_timeout=opts.batch_timeout_seconds,
            queue_max_size=opts.queue_max_size,
        )
        ocr = ThreadedPipelineStage(
            name="ocr",
            model=self.ocr_model,
            batch_size=opts.ocr_batch_size,
            batch_timeout=opts.batch_timeout_seconds,
            queue_max_size=opts.queue_max_size,
        )
        layout = ThreadedPipelineStage(
            name="layout",
            model=self.layout_model,
            batch_size=opts.layout_batch_size,
            batch_timeout=opts.batch_timeout_seconds,
            queue_max_size=opts.queue_max_size,
        )
        table = ThreadedPipelineStage(
            name="table",
            model=self.table_model,
            batch_size=opts.table_batch_size,
            batch_timeout=opts.batch_timeout_seconds,
            queue_max_size=opts.queue_max_size,
        )
        assemble = ThreadedPipelineStage(
            name="assemble",
            model=self.assemble_model,
            batch_size=1,
            batch_timeout=opts.batch_timeout_seconds,
            queue_max_size=opts.queue_max_size,
        )

        # wire stages
        output_q = ThreadedQueue(opts.queue_max_size)
        preprocess.add_output_queue(ocr.input_queue)
        ocr.add_output_queue(layout.input_queue)
        layout.add_output_queue(table.input_queue)
        table.add_output_queue(assemble.input_queue)
        assemble.add_output_queue(output_q)

        stages = [preprocess, ocr, layout, table, assemble]
        return RunContext(stages=stages, first_stage=preprocess, output_queue=output_q)

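    # Note: the stages above form a linear chain
    #   preprocess -> ocr -> layout -> table -> assemble -> output_q
    # and every stage owns a bounded input queue of size `queue_max_size`, so a
    # slow stage naturally back-pressures the stages feeding it.
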
    # --------------------------------------------------------------------- build
    def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
        """Stream-build the document while interleaving producer and consumer work."""
        run_id = next(self._run_seq)
        assert isinstance(conv_res.input._backend, PdfDocumentBackend)
        backend = conv_res.input._backend

        # preload & initialise pages -------------------------------------------------------------
        start_page, end_page = conv_res.input.limits.page_range
        pages: list[Page] = []
        for i in range(conv_res.input.page_count):
            if start_page - 1 <= i <= end_page - 1:
                page = Page(page_no=i)
                page._backend = backend.load_page(i)
                if page._backend and page._backend.is_valid():
                    page.size = page._backend.get_size()
                    conv_res.pages.append(page)
                    pages.append(page)

        if not pages:
            conv_res.status = ConversionStatus.FAILURE
            return conv_res

        total_pages: int = len(pages)
        ctx: RunContext = self._create_run_ctx()
        for st in ctx.stages:
            st.start()

        proc = ProcessingResult(total_expected=total_pages)
        fed_idx: int = 0  # number of pages successfully queued
        batch_size: int = 32  # drain chunk
        try:
            while proc.success_count + proc.failure_count < total_pages:
                # 1) feed - try to enqueue until the first queue is full
                while fed_idx < total_pages:
                    ok = ctx.first_stage.input_queue.put(
                        ThreadedItem(
                            payload=pages[fed_idx],
                            run_id=run_id,
                            page_no=pages[fed_idx].page_no,
                            conv_res=conv_res,
                        ),
                        timeout=0.0,  # non-blocking try-put
                    )
                    if ok:
                        fed_idx += 1
                        if fed_idx == total_pages:
                            ctx.first_stage.input_queue.close()
                    else:  # queue full - switch to draining
                        break

                # 2) drain - pull whatever is ready from the output side
                out_batch = ctx.output_queue.get_batch(batch_size, timeout=0.05)
                for itm in out_batch:
                    if itm.run_id != run_id:
                        continue
                    if itm.is_failed or itm.error:
                        proc.failed_pages.append(
                            (itm.page_no, itm.error or RuntimeError("unknown error"))
                        )
                    else:
                        assert itm.payload is not None
                        proc.pages.append(itm.payload)

                # 3) failure safety - downstream closed early -> mark missing pages failed
                if not out_batch and ctx.output_queue.closed:
                    missing = total_pages - (proc.success_count + proc.failure_count)
                    if missing > 0:
                        proc.failed_pages.extend(
                            [(-1, RuntimeError("pipeline terminated early"))] * missing
                        )
                    break
        finally:
            for st in ctx.stages:
                st.stop()
            ctx.output_queue.close()

        self._integrate_results(conv_res, proc)
        return conv_res

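    # Note: the feed/drain loop above interleaves producer and consumer work:
    # pages are enqueued with a non-blocking put until the first queue is full,
    # then finished pages are drained from the output queue in chunks of up to
    # `batch_size` (32). This bounds the number of in-flight pages per run and
    # keeps peak memory roughly proportional to the configured queue sizes.
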
    # ---------------------------------------------------- integrate_results()
    def _integrate_results(
        self, conv_res: ConversionResult, proc: ProcessingResult
    ) -> None:
        page_map = {p.page_no: p for p in proc.pages}
        conv_res.pages = [
            page_map.get(p.page_no, p)
            for p in conv_res.pages
            if p.page_no in page_map
            or not any(fp == p.page_no for fp, _ in proc.failed_pages)
        ]
        if proc.is_complete_failure:
            conv_res.status = ConversionStatus.FAILURE
        elif proc.is_partial_success:
            conv_res.status = ConversionStatus.PARTIAL_SUCCESS
        else:
            conv_res.status = ConversionStatus.SUCCESS
        if not self.keep_images:
            for p in conv_res.pages:
                p._image_cache = {}
        for p in conv_res.pages:
            if not self.keep_backend and p._backend is not None:
                p._backend.unload()
            if not self.pipeline_options.generate_parsed_pages:
                del p.parsed_page
                p.parsed_page = None

    # ---------------------------------------------------------------- assemble
    def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
        elements, headers, body = [], [], []
        with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):
            for p in conv_res.pages:
                if p.assembled:
                    elements.extend(p.assembled.elements)
                    headers.extend(p.assembled.headers)
                    body.extend(p.assembled.body)
            conv_res.assembled = AssembledUnit(
                elements=elements, headers=headers, body=body
            )
            conv_res.document = self.reading_order_model(conv_res)

            # Generate page images in the output
            if self.pipeline_options.generate_page_images:
                for page in conv_res.pages:
                    assert page.image is not None
                    page_no = page.page_no + 1
                    conv_res.document.pages[page_no].image = ImageRef.from_pil(
                        page.image, dpi=int(72 * self.pipeline_options.images_scale)
                    )

            # Generate images of the requested element types
            with warnings.catch_warnings():  # deprecated generate_table_images
                warnings.filterwarnings("ignore", category=DeprecationWarning)
                if (
                    self.pipeline_options.generate_picture_images
                    or self.pipeline_options.generate_table_images
                ):
                    scale = self.pipeline_options.images_scale
                    for element, _level in conv_res.document.iterate_items():
                        if not isinstance(element, DocItem) or len(element.prov) == 0:
                            continue
                        if (
                            isinstance(element, PictureItem)
                            and self.pipeline_options.generate_picture_images
                        ) or (
                            isinstance(element, TableItem)
                            and self.pipeline_options.generate_table_images
                        ):
                            page_ix = element.prov[0].page_no - 1
                            page = next(
                                (p for p in conv_res.pages if p.page_no == page_ix),
                                cast("Page", None),
                            )
                            assert page is not None
                            assert page.size is not None
                            assert page.image is not None

                            crop_bbox = (
                                element.prov[0]
                                .bbox.scaled(scale=scale)
                                .to_top_left_origin(
                                    page_height=page.size.height * scale
                                )
                            )

                            cropped_im = page.image.crop(crop_bbox.as_tuple())
                            element.image = ImageRef.from_pil(
                                cropped_im, dpi=int(72 * scale)
                            )

            # Aggregate confidence values for document:
            if len(conv_res.pages) > 0:
                with warnings.catch_warnings():
                    warnings.filterwarnings(
                        "ignore",
                        category=RuntimeWarning,
                        message="Mean of empty slice|All-NaN slice encountered",
                    )
                    conv_res.confidence.layout_score = float(
                        np.nanmean(
                            [c.layout_score for c in conv_res.confidence.pages.values()]
                        )
                    )
                    conv_res.confidence.parse_score = float(
                        np.nanquantile(
                            [c.parse_score for c in conv_res.confidence.pages.values()],
                            q=0.1,  # parse score should relate to worst 10% of pages.
                        )
                    )
                    conv_res.confidence.table_score = float(
                        np.nanmean(
                            [c.table_score for c in conv_res.confidence.pages.values()]
                        )
                    )
                    conv_res.confidence.ocr_score = float(
                        np.nanmean(
                            [c.ocr_score for c in conv_res.confidence.pages.values()]
                        )
                    )

        return conv_res

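    # Note: the document-level confidence scores above are simple aggregates of
    # the per-page values: nanmean for layout, table and OCR, and the 10th
    # percentile (np.nanquantile, q=0.1) for parsing, so the parse score is
    # driven by the worst-parsed pages.
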
    # ---------------------------------------------------------------- misc
    @classmethod
    def get_default_options(cls) -> ThreadedPdfPipelineOptions:
        return ThreadedPdfPipelineOptions()

    @classmethod
    def is_backend_supported(cls, backend: AbstractDocumentBackend) -> bool:
        return isinstance(backend, PdfDocumentBackend)

    def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
        return conv_res.status

    def _unload(self, conv_res: ConversionResult) -> None:
        for p in conv_res.pages:
            if p._backend is not None:
                p._backend.unload()
        if conv_res.input._backend:
            conv_res.input._backend.unload()
@@ -42,7 +42,7 @@ def test_threaded_pipeline_multiple_documents():
        layout_batch_size=1,
        table_batch_size=1,
        ocr_batch_size=1,
        batch_timeout_seconds=1.0,
        batch_polling_interval_seconds=1.0,
        do_table_structure=do_ts,
        do_ocr=do_ocr,
    ),
uv.lock (generated, 5 changes)
@@ -5943,6 +5943,9 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/20/8a/b35a615ae6f04550d696bb179c414538b3b477999435fdd4ad75b76139e4/pybase64-1.4.2-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:a370dea7b1cee2a36a4d5445d4e09cc243816c5bc8def61f602db5a6f5438e52", size = 54320, upload-time = "2025-07-27T13:03:27.495Z" },
    { url = "https://files.pythonhosted.org/packages/d3/a9/8bd4f9bcc53689f1b457ecefed1eaa080e4949d65a62c31a38b7253d5226/pybase64-1.4.2-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:9aa4de83f02e462a6f4e066811c71d6af31b52d7484de635582d0e3ec3d6cc3e", size = 56482, upload-time = "2025-07-27T13:03:28.942Z" },
    { url = "https://files.pythonhosted.org/packages/75/e5/4a7735b54a1191f61c3f5c2952212c85c2d6b06eb5fb3671c7603395f70c/pybase64-1.4.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:83a1c2f9ed00fee8f064d548c8654a480741131f280e5750bb32475b7ec8ee38", size = 70959, upload-time = "2025-07-27T13:03:30.171Z" },
    { url = "https://files.pythonhosted.org/packages/f4/56/5337f27a8b8d2d6693f46f7b36bae47895e5820bfa259b0072574a4e1057/pybase64-1.4.2-cp313-cp313-android_21_arm64_v8a.whl", hash = "sha256:0f331aa59549de21f690b6ccc79360ffed1155c3cfbc852eb5c097c0b8565a2b", size = 33888, upload-time = "2025-07-27T13:03:35.698Z" },
    { url = "https://files.pythonhosted.org/packages/e3/ff/470768f0fe6de0aa302a8cb1bdf2f9f5cffc3f69e60466153be68bc953aa/pybase64-1.4.2-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:69d3f0445b0faeef7bb7f93bf8c18d850785e2a77f12835f49e524cc54af04e7", size = 30914, upload-time = "2025-07-27T13:03:38.475Z" },
    { url = "https://files.pythonhosted.org/packages/75/6b/d328736662665e0892409dc410353ebef175b1be5eb6bab1dad579efa6df/pybase64-1.4.2-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:2372b257b1f4dd512f317fb27e77d313afd137334de64c87de8374027aacd88a", size = 31380, upload-time = "2025-07-27T13:03:39.7Z" },
    { url = "https://files.pythonhosted.org/packages/ca/96/7ff718f87c67f4147c181b73d0928897cefa17dc75d7abc6e37730d5908f/pybase64-1.4.2-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:fb794502b4b1ec91c4ca5d283ae71aef65e3de7721057bd9e2b3ec79f7a62d7d", size = 38230, upload-time = "2025-07-27T13:03:41.637Z" },
    { url = "https://files.pythonhosted.org/packages/71/ab/db4dbdfccb9ca874d6ce34a0784761471885d96730de85cee3d300381529/pybase64-1.4.2-cp313-cp313-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:d377d48acf53abf4b926c2a7a24a19deb092f366a04ffd856bf4b3aa330b025d", size = 71608, upload-time = "2025-07-27T13:03:47.01Z" },
    { url = "https://files.pythonhosted.org/packages/f2/58/7f2cef1ceccc682088958448d56727369de83fa6b29148478f4d2acd107a/pybase64-1.4.2-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.whl", hash = "sha256:ab9cdb6a8176a5cb967f53e6ad60e40c83caaa1ae31c5e1b29e5c8f507f17538", size = 56413, upload-time = "2025-07-27T13:03:49.908Z" },
@@ -5964,6 +5967,8 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/95/f0/c392c4ac8ccb7a34b28377c21faa2395313e3c676d76c382642e19a20703/pybase64-1.4.2-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:ad59362fc267bf15498a318c9e076686e4beeb0dfe09b457fabbc2b32468b97a", size = 58103, upload-time = "2025-07-27T13:04:29.996Z" },
    { url = "https://files.pythonhosted.org/packages/32/30/00ab21316e7df8f526aa3e3dc06f74de6711d51c65b020575d0105a025b2/pybase64-1.4.2-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:01593bd064e7dcd6c86d04e94e44acfe364049500c20ac68ca1e708fbb2ca970", size = 60779, upload-time = "2025-07-27T13:04:31.549Z" },
    { url = "https://files.pythonhosted.org/packages/a6/65/114ca81839b1805ce4a2b7d58bc16e95634734a2059991f6382fc71caf3e/pybase64-1.4.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:5b81547ad8ea271c79fdf10da89a1e9313cb15edcba2a17adf8871735e9c02a0", size = 74684, upload-time = "2025-07-27T13:04:32.976Z" },
    { url = "https://files.pythonhosted.org/packages/99/bf/00a87d951473ce96c8c08af22b6983e681bfabdb78dd2dcf7ee58eac0932/pybase64-1.4.2-cp314-cp314-ios_13_0_arm64_iphoneos.whl", hash = "sha256:4157ad277a32cf4f02a975dffc62a3c67d73dfa4609b2c1978ef47e722b18b8e", size = 30924, upload-time = "2025-07-27T13:04:39.189Z" },
    { url = "https://files.pythonhosted.org/packages/ae/43/dee58c9d60e60e6fb32dc6da722d84592e22f13c277297eb4ce6baf99a99/pybase64-1.4.2-cp314-cp314-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:e113267dc349cf624eb4f4fbf53fd77835e1aa048ac6877399af426aab435757", size = 31390, upload-time = "2025-07-27T13:04:40.995Z" },
    { url = "https://files.pythonhosted.org/packages/e1/11/b28906fc2e330b8b1ab4bc845a7bef808b8506734e90ed79c6062b095112/pybase64-1.4.2-cp314-cp314-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:cea5aaf218fd9c5c23afacfe86fd4464dfedc1a0316dd3b5b4075b068cc67df0", size = 38212, upload-time = "2025-07-27T13:04:42.729Z" },
    { url = "https://files.pythonhosted.org/packages/e4/2e/851eb51284b97354ee5dfa1309624ab90920696e91a33cd85b13d20cc5c1/pybase64-1.4.2-cp314-cp314-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:a3e54dcf0d0305ec88473c9d0009f698cabf86f88a8a10090efeff2879c421bb", size = 71674, upload-time = "2025-07-27T13:04:49.294Z" },
    { url = "https://files.pythonhosted.org/packages/a4/8e/3479266bc0e65f6cc48b3938d4a83bff045330649869d950a378f2ddece0/pybase64-1.4.2-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.whl", hash = "sha256:753da25d4fd20be7bda2746f545935773beea12d5cb5ec56ec2d2960796477b1", size = 56461, upload-time = "2025-07-27T13:04:52.37Z" },