Mirror of https://github.com/DS4SD/docling.git, synced 2025-07-27 12:34:22 +00:00

Commit 1aaf34056f: Merge from main

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

Files changed: CHANGELOG.md (16 lines), docling/cli/main.py, docling/datamodel/pipeline_options.py, docling/pipeline/base_pipeline.py, pyproject.toml
CHANGELOG.md:

@@ -1,3 +1,19 @@
+## [v2.11.0](https://github.com/DS4SD/docling/releases/tag/v2.11.0) - 2024-12-12
+
+### Feature
+
+* Add timeout limit to document parsing job. DS4SD#270 ([#552](https://github.com/DS4SD/docling/issues/552)) ([`3da166e`](https://github.com/DS4SD/docling/commit/3da166eafa3c119de961510341cb92397652c222))
+
+### Fix
+
+* Do not import python modules from deepsearch-glm ([#569](https://github.com/DS4SD/docling/issues/569)) ([`aee9c0b`](https://github.com/DS4SD/docling/commit/aee9c0b324a07190ad03ad3a6266e76c465d4cdf))
+* Handle no result from RapidOcr reader ([#558](https://github.com/DS4SD/docling/issues/558)) ([`f45499c`](https://github.com/DS4SD/docling/commit/f45499ce9349fe55538dfb36d74c395e9193d9b1))
+* Make enum serializable with human-readable value ([#555](https://github.com/DS4SD/docling/issues/555)) ([`a7df337`](https://github.com/DS4SD/docling/commit/a7df337654fa5fa7633af8740fb5e4cc4a06f250))
+
+### Documentation
+
+* Update chunking usage docs, minor reorg ([#550](https://github.com/DS4SD/docling/issues/550)) ([`d0c9e8e`](https://github.com/DS4SD/docling/commit/d0c9e8e508d7edef5e733be6cdea2cea0a9a0695))
+
 ## [v2.10.0](https://github.com/DS4SD/docling/releases/tag/v2.10.0) - 2024-12-09
 
 ### Feature
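The headline feature in v2.11.0 is the per-document timeout from #552. A minimal usage sketch from the Python side, assuming docling's documented `DocumentConverter` / `PdfFormatOption` wiring (the input file name is hypothetical):

```python
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

# Give up on any single document after 60 seconds of processing.
pipeline_options = PdfPipelineOptions(document_timeout=60.0)

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)

result = converter.convert("report.pdf")  # hypothetical input file
# A timed-out conversion is reported as PARTIAL_SUCCESS rather than raising.
print(result.status)
```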
docling/cli/main.py:

@@ -256,6 +256,13 @@ def convert(
     device: Annotated[
         AcceleratorDevice, typer.Option(..., help="Accelerator device")
     ] = AcceleratorDevice.AUTO,
+    document_timeout: Annotated[
+        Optional[float],
+        typer.Option(
+            ...,
+            help="The timeout for processing each document, in seconds.",
+        ),
+    ] = None,
 ):
     if verbose == 0:
         logging.basicConfig(level=logging.WARNING)
@@ -341,6 +348,7 @@ def convert(
             do_ocr=ocr,
             ocr_options=ocr_options,
             do_table_structure=True,
+            document_timeout=document_timeout,
         )
         pipeline_options.table_structure_options.do_cell_matching = (
             True  # do_cell_matching
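Since typer derives option names from parameter names by default, the new argument should surface on the command line as `--document-timeout` (e.g. `docling --document-timeout 60 report.pdf`); the flag spelling is inferred from typer's conventions rather than shown in this diff.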
docling/datamodel/pipeline_options.py:

@@ -214,9 +214,10 @@ class PipelineOptions(BaseModel):
     """Base pipeline options."""
 
     create_legacy_output: bool = (
-        True  # This defautl will be set to False on a future version of docling
+        True  # This default will be set to False on a future version of docling
     )
     accelerator_options: AcceleratorOptions = AcceleratorOptions()
+    document_timeout: Optional[float] = None
 
 
 class PdfPipelineOptions(PipelineOptions):
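Note that `document_timeout` is declared on the base `PipelineOptions` model rather than on `PdfPipelineOptions`, so every options subclass inherits it, and the `Optional[float] = None` default means no timeout is enforced unless one is set explicitly. A small sketch of the resulting behaviour:

```python
from docling.datamodel.pipeline_options import PdfPipelineOptions

opts = PdfPipelineOptions()
assert opts.document_timeout is None  # no limit by default

opts = PdfPipelineOptions(document_timeout=120.0)  # field inherited from PipelineOptions
```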
docling/pipeline/base_pipeline.py:

@@ -126,6 +126,7 @@ class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
             # conv_res.status = ConversionStatus.FAILURE
             # return conv_res
 
+        total_elapsed_time = 0.0
         with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
 
             for i in range(0, conv_res.input.page_count):
@@ -136,7 +137,7 @@ class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
                 for page_batch in chunkify(
                     conv_res.pages, settings.perf.page_batch_size
                 ):
-                    start_pb_time = time.time()
+                    start_batch_time = time.monotonic()
 
                     # 1. Initialise the page resources
                     init_pages = map(
@@ -149,8 +150,21 @@ class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
                     for p in pipeline_pages:  # Must exhaust!
                         pass
 
-                    end_pb_time = time.time() - start_pb_time
-                    _log.debug(f"Finished converting page batch time={end_pb_time:.3f}")
+                    end_batch_time = time.monotonic()
+                    total_elapsed_time += end_batch_time - start_batch_time
+                    if (
+                        self.pipeline_options.document_timeout is not None
+                        and total_elapsed_time > self.pipeline_options.document_timeout
+                    ):
+                        _log.warning(
+                            f"Document processing time ({total_elapsed_time:.3f} seconds) exceeded the specified timeout of {self.pipeline_options.document_timeout:.3f} seconds"
+                        )
+                        conv_res.status = ConversionStatus.PARTIAL_SUCCESS
+                        break
+
+                    _log.debug(
+                        f"Finished converting page batch time={end_batch_time:.3f}"
+                    )
 
             except Exception as e:
                 conv_res.status = ConversionStatus.FAILURE
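Two details in this hunk are worth calling out. First, the clock switches from `time.time()` to `time.monotonic()`: the wall clock can jump backwards or forwards (NTP corrections, DST changes), while the monotonic clock only moves forward, making it the correct choice for measuring elapsed intervals. Second, the timeout is checked once per page batch, so a document can overrun the limit by up to one batch's processing time before the loop breaks. A stripped-down sketch of the pattern, using generic names that are not docling's:

```python
import logging
import time

log = logging.getLogger(__name__)

def process_batches(batches, timeout=None):
    """Run each batch, stopping early once accumulated time exceeds `timeout`."""
    total_elapsed = 0.0
    for batch in batches:
        start = time.monotonic()
        for work_item in batch:
            work_item()  # stand-in for the real per-page processing
        total_elapsed += time.monotonic() - start
        if timeout is not None and total_elapsed > timeout:
            log.warning(
                "Processing time (%.3f s) exceeded the timeout of %.3f s",
                total_elapsed, timeout,
            )
            return "partial"  # mirrors ConversionStatus.PARTIAL_SUCCESS
    return "complete"
```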
|
@ -1,6 +1,6 @@
|
|||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "docling"
|
name = "docling"
|
||||||
version = "2.10.0" # DO NOT EDIT, updated automatically
|
version = "2.11.0" # DO NOT EDIT, updated automatically
|
||||||
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
|
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
|
||||||
authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
|
authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
|
||||||
license = "MIT"
|
license = "MIT"
|
||||||
|