diff --git a/CHANGELOG.md b/CHANGELOG.md index 3561c135..a422acd5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,19 @@ +## [v2.11.0](https://github.com/DS4SD/docling/releases/tag/v2.11.0) - 2024-12-12 + +### Feature + +* Add timeout limit to document parsing job. DS4SD#270 ([#552](https://github.com/DS4SD/docling/issues/552)) ([`3da166e`](https://github.com/DS4SD/docling/commit/3da166eafa3c119de961510341cb92397652c222)) + +### Fix + +* Do not import python modules from deepsearch-glm ([#569](https://github.com/DS4SD/docling/issues/569)) ([`aee9c0b`](https://github.com/DS4SD/docling/commit/aee9c0b324a07190ad03ad3a6266e76c465d4cdf)) +* Handle no result from RapidOcr reader ([#558](https://github.com/DS4SD/docling/issues/558)) ([`f45499c`](https://github.com/DS4SD/docling/commit/f45499ce9349fe55538dfb36d74c395e9193d9b1)) +* Make enum serializable with human-readable value ([#555](https://github.com/DS4SD/docling/issues/555)) ([`a7df337`](https://github.com/DS4SD/docling/commit/a7df337654fa5fa7633af8740fb5e4cc4a06f250)) + +### Documentation + +* Update chunking usage docs, minor reorg ([#550](https://github.com/DS4SD/docling/issues/550)) ([`d0c9e8e`](https://github.com/DS4SD/docling/commit/d0c9e8e508d7edef5e733be6cdea2cea0a9a0695)) + ## [v2.10.0](https://github.com/DS4SD/docling/releases/tag/v2.10.0) - 2024-12-09 ### Feature diff --git a/docling/cli/main.py b/docling/cli/main.py index d20c6332..8d8fc83c 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -256,6 +256,13 @@ def convert( device: Annotated[ AcceleratorDevice, typer.Option(..., help="Accelerator device") ] = AcceleratorDevice.AUTO, + document_timeout: Annotated[ + Optional[float], + typer.Option( + ..., + help="The timeout for processing each document, in seconds.", + ), + ] = None, ): if verbose == 0: logging.basicConfig(level=logging.WARNING) @@ -341,6 +348,7 @@ def convert( do_ocr=ocr, ocr_options=ocr_options, do_table_structure=True, + document_timeout=document_timeout, ) 
pipeline_options.table_structure_options.do_cell_matching = ( True # do_cell_matching diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index 52af10c4..6916a83f 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -214,9 +214,10 @@ class PipelineOptions(BaseModel): """Base pipeline options.""" create_legacy_output: bool = ( - True # This defautl will be set to False on a future version of docling + True # This default will be set to False on a future version of docling ) accelerator_options: AcceleratorOptions = AcceleratorOptions() + document_timeout: Optional[float] = None class PdfPipelineOptions(PipelineOptions): diff --git a/docling/pipeline/base_pipeline.py b/docling/pipeline/base_pipeline.py index 5013ad58..5d3b7686 100644 --- a/docling/pipeline/base_pipeline.py +++ b/docling/pipeline/base_pipeline.py @@ -126,6 +126,7 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name. # conv_res.status = ConversionStatus.FAILURE # return conv_res + total_elapsed_time = 0.0 with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT): for i in range(0, conv_res.input.page_count): @@ -136,7 +137,7 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name. for page_batch in chunkify( conv_res.pages, settings.perf.page_batch_size ): - start_pb_time = time.time() + start_batch_time = time.monotonic() # 1. Initialise the page resources init_pages = map( @@ -149,8 +150,21 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name. for p in pipeline_pages: # Must exhaust! 
pass - end_pb_time = time.time() - start_pb_time - _log.debug(f"Finished converting page batch time={end_pb_time:.3f}") + end_batch_time = time.monotonic() + total_elapsed_time += end_batch_time - start_batch_time + if ( + self.pipeline_options.document_timeout is not None + and total_elapsed_time > self.pipeline_options.document_timeout + ): + _log.warning( + f"Document processing time ({total_elapsed_time:.3f} seconds) exceeded the specified timeout of {self.pipeline_options.document_timeout:.3f} seconds" + ) + conv_res.status = ConversionStatus.PARTIAL_SUCCESS + break + + _log.debug( + f"Finished converting page batch time={end_batch_time - start_batch_time:.3f}" + ) except Exception as e: conv_res.status = ConversionStatus.FAILURE diff --git a/pyproject.toml b/pyproject.toml index 653bcd73..6e514ebb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "docling" -version = "2.10.0" # DO NOT EDIT, updated automatically +version = "2.11.0" # DO NOT EDIT, updated automatically description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications." authors = ["Christoph Auer ", "Michele Dolfi ", "Maxim Lysak ", "Nikos Livathinos ", "Ahmed Nassar ", "Panos Vagenas ", "Peter Staar "] license = "MIT"