Merge remote-tracking branch 'origin/main' into fix-numpy-pinning

Michele Dolfi 2024-12-03 10:51:10 +01:00
commit e9c6462629
6 changed files with 103 additions and 100 deletions

docling/cli/main.py

@@ -2,6 +2,7 @@ import importlib
 import json
 import logging
 import re
+import tempfile
 import time
 import warnings
 from enum import Enum
@@ -9,7 +10,7 @@ from pathlib import Path
 from typing import Annotated, Dict, Iterable, List, Optional, Type
 import typer
-from docling_core.utils.file import resolve_file_source
+from docling_core.utils.file import resolve_source_to_path
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
@@ -256,95 +257,98 @@ def convert(
     if from_formats is None:
         from_formats = [e for e in InputFormat]
-    input_doc_paths: List[Path] = []
-    for src in input_sources:
-        source = resolve_file_source(source=src)
-        if not source.exists():
-            err_console.print(
-                f"[red]Error: The input file {source} does not exist.[/red]"
-            )
-            raise typer.Abort()
-        elif source.is_dir():
-            for fmt in from_formats:
-                for ext in FormatToExtensions[fmt]:
-                    input_doc_paths.extend(list(source.glob(f"**/*.{ext}")))
-                    input_doc_paths.extend(list(source.glob(f"**/*.{ext.upper()}")))
-        else:
-            input_doc_paths.append(source)
-    if to_formats is None:
-        to_formats = [OutputFormat.MARKDOWN]
-    export_json = OutputFormat.JSON in to_formats
-    export_md = OutputFormat.MARKDOWN in to_formats
-    export_txt = OutputFormat.TEXT in to_formats
-    export_doctags = OutputFormat.DOCTAGS in to_formats
-    if ocr_engine == OcrEngine.EASYOCR:
-        ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
-    elif ocr_engine == OcrEngine.TESSERACT_CLI:
-        ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
-    elif ocr_engine == OcrEngine.TESSERACT:
-        ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
-    elif ocr_engine == OcrEngine.OCRMAC:
-        ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
-    elif ocr_engine == OcrEngine.RAPIDOCR:
-        ocr_options = RapidOcrOptions(force_full_page_ocr=force_ocr)
-    else:
-        raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
-    ocr_lang_list = _split_list(ocr_lang)
-    if ocr_lang_list is not None:
-        ocr_options.lang = ocr_lang_list
-    pipeline_options = PdfPipelineOptions(
-        do_ocr=ocr,
-        ocr_options=ocr_options,
-        do_table_structure=True,
-    )
-    pipeline_options.table_structure_options.do_cell_matching = True  # do_cell_matching
-    pipeline_options.table_structure_options.mode = table_mode
-    if artifacts_path is not None:
-        pipeline_options.artifacts_path = artifacts_path
-    if pdf_backend == PdfBackend.DLPARSE_V1:
-        backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
-    elif pdf_backend == PdfBackend.DLPARSE_V2:
-        backend = DoclingParseV2DocumentBackend
-    elif pdf_backend == PdfBackend.PYPDFIUM2:
-        backend = PyPdfiumDocumentBackend
-    else:
-        raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
-    format_options: Dict[InputFormat, FormatOption] = {
-        InputFormat.PDF: PdfFormatOption(
-            pipeline_options=pipeline_options,
-            backend=backend,  # pdf_backend
-        )
-    }
-    doc_converter = DocumentConverter(
-        allowed_formats=from_formats,
-        format_options=format_options,
-    )
-    start_time = time.time()
-    conv_results = doc_converter.convert_all(
-        input_doc_paths, raises_on_error=abort_on_error
-    )
-    output.mkdir(parents=True, exist_ok=True)
-    export_documents(
-        conv_results,
-        output_dir=output,
-        export_json=export_json,
-        export_md=export_md,
-        export_txt=export_txt,
-        export_doctags=export_doctags,
-    )
-    end_time = time.time() - start_time
+    with tempfile.TemporaryDirectory() as tempdir:
+        input_doc_paths: List[Path] = []
+        for src in input_sources:
+            source = resolve_source_to_path(source=src, workdir=Path(tempdir))
+            if not source.exists():
+                err_console.print(
+                    f"[red]Error: The input file {source} does not exist.[/red]"
+                )
+                raise typer.Abort()
+            elif source.is_dir():
+                for fmt in from_formats:
+                    for ext in FormatToExtensions[fmt]:
+                        input_doc_paths.extend(list(source.glob(f"**/*.{ext}")))
+                        input_doc_paths.extend(list(source.glob(f"**/*.{ext.upper()}")))
+            else:
+                input_doc_paths.append(source)
+        if to_formats is None:
+            to_formats = [OutputFormat.MARKDOWN]
+        export_json = OutputFormat.JSON in to_formats
+        export_md = OutputFormat.MARKDOWN in to_formats
+        export_txt = OutputFormat.TEXT in to_formats
+        export_doctags = OutputFormat.DOCTAGS in to_formats
+        if ocr_engine == OcrEngine.EASYOCR:
+            ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
+        elif ocr_engine == OcrEngine.TESSERACT_CLI:
+            ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
+        elif ocr_engine == OcrEngine.TESSERACT:
+            ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
+        elif ocr_engine == OcrEngine.OCRMAC:
+            ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
+        elif ocr_engine == OcrEngine.RAPIDOCR:
+            ocr_options = RapidOcrOptions(force_full_page_ocr=force_ocr)
+        else:
+            raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
+        ocr_lang_list = _split_list(ocr_lang)
+        if ocr_lang_list is not None:
+            ocr_options.lang = ocr_lang_list
+        pipeline_options = PdfPipelineOptions(
+            do_ocr=ocr,
+            ocr_options=ocr_options,
+            do_table_structure=True,
+        )
+        pipeline_options.table_structure_options.do_cell_matching = (
+            True  # do_cell_matching
+        )
+        pipeline_options.table_structure_options.mode = table_mode
+        if artifacts_path is not None:
+            pipeline_options.artifacts_path = artifacts_path
+        if pdf_backend == PdfBackend.DLPARSE_V1:
+            backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
+        elif pdf_backend == PdfBackend.DLPARSE_V2:
+            backend = DoclingParseV2DocumentBackend
+        elif pdf_backend == PdfBackend.PYPDFIUM2:
+            backend = PyPdfiumDocumentBackend
+        else:
+            raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
+        format_options: Dict[InputFormat, FormatOption] = {
+            InputFormat.PDF: PdfFormatOption(
+                pipeline_options=pipeline_options,
+                backend=backend,  # pdf_backend
+            )
+        }
+        doc_converter = DocumentConverter(
+            allowed_formats=from_formats,
+            format_options=format_options,
+        )
+        start_time = time.time()
+        conv_results = doc_converter.convert_all(
+            input_doc_paths, raises_on_error=abort_on_error
+        )
+        output.mkdir(parents=True, exist_ok=True)
+        export_documents(
+            conv_results,
+            output_dir=output,
+            export_json=export_json,
+            export_md=export_md,
+            export_txt=export_txt,
+            export_doctags=export_doctags,
+        )
+        end_time = time.time() - start_time
     _log.info(f"All documents were converted in {end_time:.2f} seconds.")

docling/datamodel/base_models.py

@@ -1,5 +1,4 @@
 from enum import Enum, auto
-from io import BytesIO
 from typing import TYPE_CHECKING, Dict, List, Optional, Union
 from docling_core.types.doc import (
@@ -9,6 +8,9 @@ from docling_core.types.doc import (
     Size,
     TableCell,
 )
+from docling_core.types.io import (  # DO NOT REMOVE; explicitly exposed from this location
+    DocumentStream,
+)
 from PIL.Image import Image
 from pydantic import BaseModel, ConfigDict
@@ -207,10 +209,3 @@ class Page(BaseModel):
     @property
     def image(self) -> Optional[Image]:
         return self.get_image(scale=self._default_image_scale)
-class DocumentStream(BaseModel):
-    model_config = ConfigDict(arbitrary_types_allowed=True)
-    name: str
-    stream: BytesIO

docling/datamodel/document.py

@@ -32,7 +32,7 @@ from docling_core.types.legacy_doc.document import (
 )
 from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
 from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
-from docling_core.utils.file import resolve_file_source
+from docling_core.utils.file import resolve_source_to_stream
 from pydantic import BaseModel
 from typing_extensions import deprecated
@@ -459,7 +459,7 @@
         self, format_options: Dict[InputFormat, "FormatOption"]
     ) -> Iterable[InputDocument]:
         for item in self.path_or_stream_iterator:
-            obj = resolve_file_source(item) if isinstance(item, str) else item
+            obj = resolve_source_to_stream(item) if isinstance(item, str) else item
             format = self._guess_format(obj)
             if format not in format_options.keys():
                 _log.info(

docling/models/tesseract_ocr_cli_model.py

@@ -1,5 +1,6 @@
 import io
 import logging
+import os
 import tempfile
 from subprocess import DEVNULL, PIPE, Popen
 from typing import Iterable, Optional, Tuple
@@ -130,14 +131,17 @@ class TesseractOcrCliModel(BaseOcrModel):
                 high_res_image = page._backend.get_page_image(
                     scale=self.scale, cropbox=ocr_rect
                 )
-                with tempfile.NamedTemporaryFile(
-                    suffix=".png", mode="w"
-                ) as image_file:
-                    fname = image_file.name
-                    high_res_image.save(fname)
+                try:
+                    with tempfile.NamedTemporaryFile(
+                        suffix=".png", mode="w+b", delete=False
+                    ) as image_file:
+                        fname = image_file.name
+                        high_res_image.save(image_file)
                     df = self._run_tesseract(fname)
+                finally:
+                    if os.path.exists(fname):
+                        os.remove(fname)
                 # _log.info(df)

poetry.lock (generated)

@ -7647,4 +7647,4 @@ tesserocr = ["tesserocr"]
[metadata]
lock-version = "2.0"
python-versions = "^3.9"
content-hash = "cbcb4f196d6d2631ce82af90af2d356c557c9dcd2c12bb7ee193043962ba729f"
content-hash = "33ee730cf750e618ec005ad44ad09617bc8f95632b30ac02b5290a03a33bdf5b"

pyproject.toml

@ -26,7 +26,7 @@ packages = [{include = "docling"}]
######################
python = "^3.9"
pydantic = ">=2.0.0,<2.10"
docling-core = "^2.5.1"
docling-core = "^2.6.1"
docling-ibm-models = "^2.0.6"
deepsearch-glm = "^0.26.1"
filetype = "^1.2.0"