Merge from main

Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2025-07-27 12:34:22 +00:00 · 2024-12-12 20:17:24 +01:00 · 2024-12-12 20:17:24 +01:00 · 1aaf34056f
commit 1aaf34056f
parent ccab2db1d4 d1d0ddd924
5 changed files with 44 additions and 5 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,19 @@
+## [v2.11.0](https://github.com/DS4SD/docling/releases/tag/v2.11.0) - 2024-12-12
+
+### Feature
+
+* Add timeout limit to document parsing job. DS4SD#270 ([#552](https://github.com/DS4SD/docling/issues/552)) ([`3da166e`](https://github.com/DS4SD/docling/commit/3da166eafa3c119de961510341cb92397652c222))
+
+### Fix
+
+* Do not import python modules from deepsearch-glm ([#569](https://github.com/DS4SD/docling/issues/569)) ([`aee9c0b`](https://github.com/DS4SD/docling/commit/aee9c0b324a07190ad03ad3a6266e76c465d4cdf))
+* Handle no result from RapidOcr reader ([#558](https://github.com/DS4SD/docling/issues/558)) ([`f45499c`](https://github.com/DS4SD/docling/commit/f45499ce9349fe55538dfb36d74c395e9193d9b1))
+* Make enum serializable with human-readable value ([#555](https://github.com/DS4SD/docling/issues/555)) ([`a7df337`](https://github.com/DS4SD/docling/commit/a7df337654fa5fa7633af8740fb5e4cc4a06f250))
+
+### Documentation
+
+* Update chunking usage docs, minor reorg ([#550](https://github.com/DS4SD/docling/issues/550)) ([`d0c9e8e`](https://github.com/DS4SD/docling/commit/d0c9e8e508d7edef5e733be6cdea2cea0a9a0695))
+
 ## [v2.10.0](https://github.com/DS4SD/docling/releases/tag/v2.10.0) - 2024-12-09

 ### Feature
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -256,6 +256,13 @@ def convert(
    device: Annotated[
        AcceleratorDevice, typer.Option(..., help="Accelerator device")
    ] = AcceleratorDevice.AUTO,
+    document_timeout: Annotated[
+        Optional[float],
+        typer.Option(
+            ...,
+            help="The timeout for processing each document, in seconds.",
+        ),
+    ] = None,
 ):
    if verbose == 0:
        logging.basicConfig(level=logging.WARNING)
@@ -341,6 +348,7 @@ def convert(
            do_ocr=ocr,
            ocr_options=ocr_options,
            do_table_structure=True,
+            document_timeout=document_timeout,
        )
        pipeline_options.table_structure_options.do_cell_matching = (
            True  # do_cell_matching
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -214,9 +214,10 @@ class PipelineOptions(BaseModel):
    """Base pipeline options."""

    create_legacy_output: bool = (
-        True  # This defautl will be set to False on a future version of docling
+        True  # This default will be set to False on a future version of docling
    )
    accelerator_options: AcceleratorOptions = AcceleratorOptions()
+    document_timeout: Optional[float] = None


 class PdfPipelineOptions(PipelineOptions):
--- a/docling/pipeline/base_pipeline.py
+++ b/docling/pipeline/base_pipeline.py
@@ -126,6 +126,7 @@ class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
            # conv_res.status = ConversionStatus.FAILURE
            # return conv_res

+        total_elapsed_time = 0.0
        with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):

            for i in range(0, conv_res.input.page_count):
@@ -136,7 +137,7 @@ class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
                for page_batch in chunkify(
                    conv_res.pages, settings.perf.page_batch_size
                ):
-                    start_pb_time = time.time()
+                    start_batch_time = time.monotonic()

                    # 1. Initialise the page resources
                    init_pages = map(
@@ -149,8 +150,21 @@ class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
                    for p in pipeline_pages:  # Must exhaust!
                        pass

-                    end_pb_time = time.time() - start_pb_time
-                    _log.debug(f"Finished converting page batch time={end_pb_time:.3f}")
+                    end_batch_time = time.monotonic()
+                    total_elapsed_time += end_batch_time - start_batch_time
+                    if (
+                        self.pipeline_options.document_timeout is not None
+                        and total_elapsed_time > self.pipeline_options.document_timeout
+                    ):
+                        _log.warning(
+                            f"Document processing time ({total_elapsed_time:.3f} seconds) exceeded the specified timeout of {self.pipeline_options.document_timeout:.3f} seconds"
+                        )
+                        conv_res.status = ConversionStatus.PARTIAL_SUCCESS
+                        break
+
+                    _log.debug(
+                        f"Finished converting page batch time={end_batch_time - start_batch_time:.3f}"
+                    )

            except Exception as e:
                conv_res.status = ConversionStatus.FAILURE
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "docling"
-version = "2.10.0"  # DO NOT EDIT, updated automatically
+version = "2.11.0"  # DO NOT EDIT, updated automatically
 description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
 authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
 license = "MIT"