mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-01 15:02:21 +00:00
perf: prevent temp file leftovers, reuse core type (#487)
* chore: reuse DocumentStream from docling-core Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> * update docling-core version Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> * [skip ci] document import line Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> * fix: use new resolve_source_to_x functions to avoid tempfile leftovers (#490) use new resolve_source_to_x functions Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> --------- Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
This commit is contained in:
parent
2f4d38f4da
commit
b80b35c7c9
@ -2,6 +2,7 @@ import importlib
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import tempfile
|
||||
import time
|
||||
import warnings
|
||||
from enum import Enum
|
||||
@ -9,7 +10,7 @@ from pathlib import Path
|
||||
from typing import Annotated, Dict, Iterable, List, Optional, Type
|
||||
|
||||
import typer
|
||||
from docling_core.utils.file import resolve_file_source
|
||||
from docling_core.utils.file import resolve_source_to_path
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
||||
@ -256,95 +257,98 @@ def convert(
|
||||
if from_formats is None:
|
||||
from_formats = [e for e in InputFormat]
|
||||
|
||||
input_doc_paths: List[Path] = []
|
||||
for src in input_sources:
|
||||
source = resolve_file_source(source=src)
|
||||
if not source.exists():
|
||||
err_console.print(
|
||||
f"[red]Error: The input file {source} does not exist.[/red]"
|
||||
)
|
||||
raise typer.Abort()
|
||||
elif source.is_dir():
|
||||
for fmt in from_formats:
|
||||
for ext in FormatToExtensions[fmt]:
|
||||
input_doc_paths.extend(list(source.glob(f"**/*.{ext}")))
|
||||
input_doc_paths.extend(list(source.glob(f"**/*.{ext.upper()}")))
|
||||
with tempfile.TemporaryDirectory() as tempdir:
|
||||
input_doc_paths: List[Path] = []
|
||||
for src in input_sources:
|
||||
source = resolve_source_to_path(source=src, workdir=Path(tempdir))
|
||||
if not source.exists():
|
||||
err_console.print(
|
||||
f"[red]Error: The input file {source} does not exist.[/red]"
|
||||
)
|
||||
raise typer.Abort()
|
||||
elif source.is_dir():
|
||||
for fmt in from_formats:
|
||||
for ext in FormatToExtensions[fmt]:
|
||||
input_doc_paths.extend(list(source.glob(f"**/*.{ext}")))
|
||||
input_doc_paths.extend(list(source.glob(f"**/*.{ext.upper()}")))
|
||||
else:
|
||||
input_doc_paths.append(source)
|
||||
|
||||
if to_formats is None:
|
||||
to_formats = [OutputFormat.MARKDOWN]
|
||||
|
||||
export_json = OutputFormat.JSON in to_formats
|
||||
export_md = OutputFormat.MARKDOWN in to_formats
|
||||
export_txt = OutputFormat.TEXT in to_formats
|
||||
export_doctags = OutputFormat.DOCTAGS in to_formats
|
||||
|
||||
if ocr_engine == OcrEngine.EASYOCR:
|
||||
ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
|
||||
elif ocr_engine == OcrEngine.TESSERACT_CLI:
|
||||
ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
|
||||
elif ocr_engine == OcrEngine.TESSERACT:
|
||||
ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
|
||||
elif ocr_engine == OcrEngine.OCRMAC:
|
||||
ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
|
||||
elif ocr_engine == OcrEngine.RAPIDOCR:
|
||||
ocr_options = RapidOcrOptions(force_full_page_ocr=force_ocr)
|
||||
else:
|
||||
input_doc_paths.append(source)
|
||||
raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
|
||||
|
||||
if to_formats is None:
|
||||
to_formats = [OutputFormat.MARKDOWN]
|
||||
ocr_lang_list = _split_list(ocr_lang)
|
||||
if ocr_lang_list is not None:
|
||||
ocr_options.lang = ocr_lang_list
|
||||
|
||||
export_json = OutputFormat.JSON in to_formats
|
||||
export_md = OutputFormat.MARKDOWN in to_formats
|
||||
export_txt = OutputFormat.TEXT in to_formats
|
||||
export_doctags = OutputFormat.DOCTAGS in to_formats
|
||||
|
||||
if ocr_engine == OcrEngine.EASYOCR:
|
||||
ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
|
||||
elif ocr_engine == OcrEngine.TESSERACT_CLI:
|
||||
ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
|
||||
elif ocr_engine == OcrEngine.TESSERACT:
|
||||
ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
|
||||
elif ocr_engine == OcrEngine.OCRMAC:
|
||||
ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
|
||||
elif ocr_engine == OcrEngine.RAPIDOCR:
|
||||
ocr_options = RapidOcrOptions(force_full_page_ocr=force_ocr)
|
||||
else:
|
||||
raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
|
||||
|
||||
ocr_lang_list = _split_list(ocr_lang)
|
||||
if ocr_lang_list is not None:
|
||||
ocr_options.lang = ocr_lang_list
|
||||
|
||||
pipeline_options = PdfPipelineOptions(
|
||||
do_ocr=ocr,
|
||||
ocr_options=ocr_options,
|
||||
do_table_structure=True,
|
||||
)
|
||||
pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching
|
||||
pipeline_options.table_structure_options.mode = table_mode
|
||||
|
||||
if artifacts_path is not None:
|
||||
pipeline_options.artifacts_path = artifacts_path
|
||||
|
||||
if pdf_backend == PdfBackend.DLPARSE_V1:
|
||||
backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
|
||||
elif pdf_backend == PdfBackend.DLPARSE_V2:
|
||||
backend = DoclingParseV2DocumentBackend
|
||||
elif pdf_backend == PdfBackend.PYPDFIUM2:
|
||||
backend = PyPdfiumDocumentBackend
|
||||
else:
|
||||
raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
|
||||
|
||||
format_options: Dict[InputFormat, FormatOption] = {
|
||||
InputFormat.PDF: PdfFormatOption(
|
||||
pipeline_options=pipeline_options,
|
||||
backend=backend, # pdf_backend
|
||||
pipeline_options = PdfPipelineOptions(
|
||||
do_ocr=ocr,
|
||||
ocr_options=ocr_options,
|
||||
do_table_structure=True,
|
||||
)
|
||||
}
|
||||
doc_converter = DocumentConverter(
|
||||
allowed_formats=from_formats,
|
||||
format_options=format_options,
|
||||
)
|
||||
pipeline_options.table_structure_options.do_cell_matching = (
|
||||
True # do_cell_matching
|
||||
)
|
||||
pipeline_options.table_structure_options.mode = table_mode
|
||||
|
||||
start_time = time.time()
|
||||
if artifacts_path is not None:
|
||||
pipeline_options.artifacts_path = artifacts_path
|
||||
|
||||
conv_results = doc_converter.convert_all(
|
||||
input_doc_paths, raises_on_error=abort_on_error
|
||||
)
|
||||
if pdf_backend == PdfBackend.DLPARSE_V1:
|
||||
backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
|
||||
elif pdf_backend == PdfBackend.DLPARSE_V2:
|
||||
backend = DoclingParseV2DocumentBackend
|
||||
elif pdf_backend == PdfBackend.PYPDFIUM2:
|
||||
backend = PyPdfiumDocumentBackend
|
||||
else:
|
||||
raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
|
||||
|
||||
output.mkdir(parents=True, exist_ok=True)
|
||||
export_documents(
|
||||
conv_results,
|
||||
output_dir=output,
|
||||
export_json=export_json,
|
||||
export_md=export_md,
|
||||
export_txt=export_txt,
|
||||
export_doctags=export_doctags,
|
||||
)
|
||||
format_options: Dict[InputFormat, FormatOption] = {
|
||||
InputFormat.PDF: PdfFormatOption(
|
||||
pipeline_options=pipeline_options,
|
||||
backend=backend, # pdf_backend
|
||||
)
|
||||
}
|
||||
doc_converter = DocumentConverter(
|
||||
allowed_formats=from_formats,
|
||||
format_options=format_options,
|
||||
)
|
||||
|
||||
end_time = time.time() - start_time
|
||||
start_time = time.time()
|
||||
|
||||
conv_results = doc_converter.convert_all(
|
||||
input_doc_paths, raises_on_error=abort_on_error
|
||||
)
|
||||
|
||||
output.mkdir(parents=True, exist_ok=True)
|
||||
export_documents(
|
||||
conv_results,
|
||||
output_dir=output,
|
||||
export_json=export_json,
|
||||
export_md=export_md,
|
||||
export_txt=export_txt,
|
||||
export_doctags=export_doctags,
|
||||
)
|
||||
|
||||
end_time = time.time() - start_time
|
||||
|
||||
_log.info(f"All documents were converted in {end_time:.2f} seconds.")
|
||||
|
||||
|
@ -1,5 +1,4 @@
|
||||
from enum import Enum, auto
|
||||
from io import BytesIO
|
||||
from typing import TYPE_CHECKING, Dict, List, Optional, Union
|
||||
|
||||
from docling_core.types.doc import (
|
||||
@ -9,6 +8,9 @@ from docling_core.types.doc import (
|
||||
Size,
|
||||
TableCell,
|
||||
)
|
||||
from docling_core.types.io import ( # DO NOT REMOVE; explicitly exposed from this location
|
||||
DocumentStream,
|
||||
)
|
||||
from PIL.Image import Image
|
||||
from pydantic import BaseModel, ConfigDict
|
||||
|
||||
@ -210,10 +212,3 @@ class Page(BaseModel):
|
||||
@property
|
||||
def image(self) -> Optional[Image]:
|
||||
return self.get_image(scale=self._default_image_scale)
|
||||
|
||||
|
||||
class DocumentStream(BaseModel):
|
||||
model_config = ConfigDict(arbitrary_types_allowed=True)
|
||||
|
||||
name: str
|
||||
stream: BytesIO
|
||||
|
@ -32,7 +32,7 @@ from docling_core.types.legacy_doc.document import (
|
||||
)
|
||||
from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
|
||||
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
|
||||
from docling_core.utils.file import resolve_file_source
|
||||
from docling_core.utils.file import resolve_source_to_stream
|
||||
from pydantic import BaseModel
|
||||
from typing_extensions import deprecated
|
||||
|
||||
@ -459,7 +459,7 @@ class _DocumentConversionInput(BaseModel):
|
||||
self, format_options: Dict[InputFormat, "FormatOption"]
|
||||
) -> Iterable[InputDocument]:
|
||||
for item in self.path_or_stream_iterator:
|
||||
obj = resolve_file_source(item) if isinstance(item, str) else item
|
||||
obj = resolve_source_to_stream(item) if isinstance(item, str) else item
|
||||
format = self._guess_format(obj)
|
||||
if format not in format_options.keys():
|
||||
_log.info(
|
||||
|
37
poetry.lock
generated
37
poetry.lock
generated
@ -913,6 +913,7 @@ pillow = ">=10.3.0,<11.0.0"
|
||||
pydantic = ">=2.6.0,<2.10"
|
||||
pyyaml = ">=5.1,<7.0.0"
|
||||
tabulate = ">=0.9.0,<0.10.0"
|
||||
typing-extensions = ">=4.12.2,<5.0.0"
|
||||
|
||||
[[package]]
|
||||
name = "docling-ibm-models"
|
||||
@ -3200,6 +3201,7 @@ files = [
|
||||
{file = "nh3-0.2.19-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:00810cd5275f5c3f44b9eb0e521d1a841ee2f8023622de39ffc7d88bd533d8e0"},
|
||||
{file = "nh3-0.2.19-cp38-abi3-win32.whl", hash = "sha256:7e98621856b0a911c21faa5eef8f8ea3e691526c2433f9afc2be713cb6fbdb48"},
|
||||
{file = "nh3-0.2.19-cp38-abi3-win_amd64.whl", hash = "sha256:75c7cafb840f24430b009f7368945cb5ca88b2b54bb384ebfba495f16bc9c121"},
|
||||
{file = "nh3-0.2.19.tar.gz", hash = "sha256:790056b54c068ff8dceb443eaefb696b84beff58cca6c07afd754d17692a4804"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -6034,6 +6036,7 @@ description = "A set of python modules for machine learning and data mining"
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
files = [
|
||||
<<<<<<< HEAD
|
||||
{file = "scikit_learn-1.6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:366fb3fa47dce90afed3d6106183f4978d6f24cfd595c2373424171b915ee718"},
|
||||
{file = "scikit_learn-1.6.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:59cd96a8d9f8dfd546f5d6e9787e1b989e981388d7803abbc9efdcde61e47460"},
|
||||
{file = "scikit_learn-1.6.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:efa7a579606c73a0b3d210e33ea410ea9e1af7933fe324cb7e6fbafae4ea5948"},
|
||||
@ -6064,6 +6067,34 @@ files = [
|
||||
{file = "scikit_learn-1.6.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5c3fa7d3dd5a0ec2d0baba0d644916fa2ab180ee37850c5d536245df916946bd"},
|
||||
{file = "scikit_learn-1.6.0-cp39-cp39-win_amd64.whl", hash = "sha256:df778486a32518cda33818b7e3ce48c78cef1d5f640a6bc9d97c6d2e71449a51"},
|
||||
{file = "scikit_learn-1.6.0.tar.gz", hash = "sha256:9d58481f9f7499dff4196927aedd4285a0baec8caa3790efbe205f13de37dd6e"},
|
||||
=======
|
||||
{file = "scikit_learn-1.5.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:299406827fb9a4f862626d0fe6c122f5f87f8910b86fe5daa4c32dcd742139b6"},
|
||||
{file = "scikit_learn-1.5.2-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:2d4cad1119c77930b235579ad0dc25e65c917e756fe80cab96aa3b9428bd3fb0"},
|
||||
{file = "scikit_learn-1.5.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c412ccc2ad9bf3755915e3908e677b367ebc8d010acbb3f182814524f2e5540"},
|
||||
{file = "scikit_learn-1.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a686885a4b3818d9e62904d91b57fa757fc2bed3e465c8b177be652f4dd37c8"},
|
||||
{file = "scikit_learn-1.5.2-cp310-cp310-win_amd64.whl", hash = "sha256:c15b1ca23d7c5f33cc2cb0a0d6aaacf893792271cddff0edbd6a40e8319bc113"},
|
||||
{file = "scikit_learn-1.5.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:03b6158efa3faaf1feea3faa884c840ebd61b6484167c711548fce208ea09445"},
|
||||
{file = "scikit_learn-1.5.2-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:1ff45e26928d3b4eb767a8f14a9a6efbf1cbff7c05d1fb0f95f211a89fd4f5de"},
|
||||
{file = "scikit_learn-1.5.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f763897fe92d0e903aa4847b0aec0e68cadfff77e8a0687cabd946c89d17e675"},
|
||||
{file = "scikit_learn-1.5.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f8b0ccd4a902836493e026c03256e8b206656f91fbcc4fde28c57a5b752561f1"},
|
||||
{file = "scikit_learn-1.5.2-cp311-cp311-win_amd64.whl", hash = "sha256:6c16d84a0d45e4894832b3c4d0bf73050939e21b99b01b6fd59cbb0cf39163b6"},
|
||||
{file = "scikit_learn-1.5.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:f932a02c3f4956dfb981391ab24bda1dbd90fe3d628e4b42caef3e041c67707a"},
|
||||
{file = "scikit_learn-1.5.2-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:3b923d119d65b7bd555c73be5423bf06c0105678ce7e1f558cb4b40b0a5502b1"},
|
||||
{file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f60021ec1574e56632be2a36b946f8143bf4e5e6af4a06d85281adc22938e0dd"},
|
||||
{file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:394397841449853c2290a32050382edaec3da89e35b3e03d6cc966aebc6a8ae6"},
|
||||
{file = "scikit_learn-1.5.2-cp312-cp312-win_amd64.whl", hash = "sha256:57cc1786cfd6bd118220a92ede80270132aa353647684efa385a74244a41e3b1"},
|
||||
{file = "scikit_learn-1.5.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e9a702e2de732bbb20d3bad29ebd77fc05a6b427dc49964300340e4c9328b3f5"},
|
||||
{file = "scikit_learn-1.5.2-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:b0768ad641981f5d3a198430a1d31c3e044ed2e8a6f22166b4d546a5116d7908"},
|
||||
{file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:178ddd0a5cb0044464fc1bfc4cca5b1833bfc7bb022d70b05db8530da4bb3dd3"},
|
||||
{file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7284ade780084d94505632241bf78c44ab3b6f1e8ccab3d2af58e0e950f9c12"},
|
||||
{file = "scikit_learn-1.5.2-cp313-cp313-win_amd64.whl", hash = "sha256:b7b0f9a0b1040830d38c39b91b3a44e1b643f4b36e36567b80b7c6bd2202a27f"},
|
||||
{file = "scikit_learn-1.5.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:757c7d514ddb00ae249832fe87100d9c73c6ea91423802872d9e74970a0e40b9"},
|
||||
{file = "scikit_learn-1.5.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:52788f48b5d8bca5c0736c175fa6bdaab2ef00a8f536cda698db61bd89c551c1"},
|
||||
{file = "scikit_learn-1.5.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:643964678f4b5fbdc95cbf8aec638acc7aa70f5f79ee2cdad1eec3df4ba6ead8"},
|
||||
{file = "scikit_learn-1.5.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ca64b3089a6d9b9363cd3546f8978229dcbb737aceb2c12144ee3f70f95684b7"},
|
||||
{file = "scikit_learn-1.5.2-cp39-cp39-win_amd64.whl", hash = "sha256:3bed4909ba187aca80580fe2ef370d9180dcf18e621a27c4cf2ef10d279a7efe"},
|
||||
{file = "scikit_learn-1.5.2.tar.gz", hash = "sha256:b4237ed7b3fdd0a4882792e68ef2545d5baa50aca3bb45aa7df468138ad8f94d"},
|
||||
>>>>>>> 051789d (perf: prevent temp file leftovers, reuse core type (#487))
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
@ -7687,4 +7718,8 @@ tesserocr = ["tesserocr"]
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.9"
|
||||
content-hash = "3e66a54bd0433581e4909003124e2b79b42bdd1fb90d17c037f3294aeff56aa9"
|
||||
<<<<<<< HEAD
|
||||
content-hash = "3e66a54bd0433581e4909003124e2b79b42bdd1fb90d17c037f3294aeff56aa9"
|
||||
=======
|
||||
content-hash = "ee3b3d938295f0057567c10fb808a0d95ed2fe9a32f459d489b4b29aacf710c8"
|
||||
>>>>>>> 051789d (perf: prevent temp file leftovers, reuse core type (#487))
|
||||
|
@ -26,7 +26,7 @@ packages = [{include = "docling"}]
|
||||
######################
|
||||
python = "^3.9"
|
||||
pydantic = ">=2.0.0,<2.10"
|
||||
docling-core = "^2.5.1"
|
||||
docling-core = "^2.6.1"
|
||||
docling-ibm-models = "^2.0.6"
|
||||
deepsearch-glm = "^0.26.1"
|
||||
filetype = "^1.2.0"
|
||||
|
Loading…
Reference in New Issue
Block a user