Merge branch 'DS4SD:main' into simonas/base-options

2025-08-02 07:22:14 +00:00 · 2024-12-03 16:25:35 +02:00 · 2024-12-03 16:25:35 +02:00 · 1c14a2ac56
commit 1c14a2ac56
parent 1cd30ed448 34c7c79858
12 changed files with 933 additions and 807 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,3 +1,21 @@
 ## [v2.8.2](https://github.com/DS4SD/docling/releases/tag/v2.8.2) - 2024-12-03
 ### Fix
 * ParserError EOF inside string (#470) ([#472](https://github.com/DS4SD/docling/issues/472)) ([`c90c41c`](https://github.com/DS4SD/docling/commit/c90c41c391de4366db554d7a71ce9a35467c981e))
 * PermissionError when using tesseract_ocr_cli_model ([#496](https://github.com/DS4SD/docling/issues/496)) ([`d3f84b2`](https://github.com/DS4SD/docling/commit/d3f84b2457125feacd0c21d6513e7ae69a308ea5))
 ### Documentation
 * Add styling for faq ([#502](https://github.com/DS4SD/docling/issues/502)) ([`5ba3807`](https://github.com/DS4SD/docling/commit/5ba3807f315a01b1a4e8df9bab40e34a4238205a))
 * Typo in faq ([#484](https://github.com/DS4SD/docling/issues/484)) ([`33cff98`](https://github.com/DS4SD/docling/commit/33cff98d360c02a382a66850c696a0cf511659ac))
 * Add automatic api reference ([#475](https://github.com/DS4SD/docling/issues/475)) ([`d487210`](https://github.com/DS4SD/docling/commit/d4872103b8f24e38b37a8cd3ac414d3e02e7d6e8))
 * Introduce faq section ([#468](https://github.com/DS4SD/docling/issues/468)) ([`8ccb3c6`](https://github.com/DS4SD/docling/commit/8ccb3c6db69318789af7deec26cfa2a3fd71302e))
 ### Performance
 * Prevent temp file leftovers, reuse core type ([#487](https://github.com/DS4SD/docling/issues/487)) ([`051789d`](https://github.com/DS4SD/docling/commit/051789d01706d3823dd6307eca4dc5faacd1b7ce))
 ## [v2.8.1](https://github.com/DS4SD/docling/releases/tag/v2.8.1) - 2024-11-29
 ### Fix
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@ -2,6 +2,7 @@ import importlib
 import json
 import logging
 import re
 import tempfile
 import time
 import warnings
 from enum import Enum
@ -9,7 +10,7 @@ from pathlib import Path
 from typing import Annotated, Dict, Iterable, List, Optional, Type
 import typer
-from docling_core.utils.file import resolve_file_source
+from docling_core.utils.file import resolve_source_to_path
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
@ -256,9 +257,10 @@ def convert(
    if from_formats is None:
        from_formats = [e for e in InputFormat]
    with tempfile.TemporaryDirectory() as tempdir:
        input_doc_paths: List[Path] = []
        for src in input_sources:
-        source = resolve_file_source(source=src)
+            source = resolve_source_to_path(source=src, workdir=Path(tempdir))
            if not source.exists():
                err_console.print(
                    f"[red]Error: The input file {source} does not exist.[/red]"
@ -302,7 +304,9 @@ def convert(
            ocr_options=ocr_options,
            do_table_structure=True,
        )
-    pipeline_options.table_structure_options.do_cell_matching = True  # do_cell_matching
+        pipeline_options.table_structure_options.do_cell_matching = (
            True  # do_cell_matching
        )
        pipeline_options.table_structure_options.mode = table_mode
        if artifacts_path is not None:
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@ -1,5 +1,4 @@
 from enum import Enum, auto
 from io import BytesIO
 from typing import TYPE_CHECKING, Dict, List, Optional, Union
 from docling_core.types.doc import (
@ -9,6 +8,9 @@ from docling_core.types.doc import (
    Size,
    TableCell,
 )
 from docling_core.types.io import (  # DO NOT REMOVE; explicitly exposed from this location
    DocumentStream,
 )
 from PIL.Image import Image
 from pydantic import BaseModel, ConfigDict
@ -22,6 +24,7 @@ class ConversionStatus(str, Enum):
    FAILURE = auto()
    SUCCESS = auto()
    PARTIAL_SUCCESS = auto()
    SKIPPED = auto()
 class InputFormat(str, Enum):
@ -93,6 +96,7 @@ class DoclingComponentType(str, Enum):
    DOCUMENT_BACKEND = auto()
    MODEL = auto()
    DOC_ASSEMBLER = auto()
    USER_INPUT = auto()
 class ErrorItem(BaseModel):
@ -207,10 +211,3 @@ class Page(BaseModel):
    @property
    def image(self) -> Optional[Image]:
        return self.get_image(scale=self._default_image_scale)
 class DocumentStream(BaseModel):
    model_config = ConfigDict(arbitrary_types_allowed=True)
    name: str
    stream: BytesIO
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@ -3,7 +3,7 @@ import re
 from enum import Enum
 from io import BytesIO
 from pathlib import Path, PurePath
-from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Type, Union
+from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Set, Type, Union
 import filetype
 from docling_core.types.doc import (
@ -32,7 +32,7 @@ from docling_core.types.legacy_doc.document import (
 )
 from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
 from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
-from docling_core.utils.file import resolve_file_source
+from docling_core.utils.file import resolve_source_to_stream
 from pydantic import BaseModel
 from typing_extensions import deprecated
@ -164,12 +164,6 @@ class InputDocument(BaseModel):
        backend: Type[AbstractDocumentBackend],
        path_or_stream: Union[BytesIO, Path],
    ) -> None:
        if backend is None:
            raise RuntimeError(
                f"No backend configuration provided for file {self.file.name} with format {self.format}. "
                f"Please check your format configuration on DocumentConverter."
            )
        self._backend = backend(self, path_or_stream=path_or_stream)
        if not self._backend.is_valid():
            self.valid = False
@ -450,6 +444,25 @@ class ConversionResult(BaseModel):
        return ds_doc
 class _DummyBackend(AbstractDocumentBackend):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
    def is_valid(self) -> bool:
        return False
    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        return set()
    @classmethod
    def supports_pagination(cls) -> bool:
        return False
    def unload(self):
        return super().unload()
 class _DocumentConversionInput(BaseModel):
    path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
@ -459,13 +472,14 @@ class _DocumentConversionInput(BaseModel):
        self, format_options: Dict[InputFormat, "FormatOption"]
    ) -> Iterable[InputDocument]:
        for item in self.path_or_stream_iterator:
-            obj = resolve_file_source(item) if isinstance(item, str) else item
+            obj = resolve_source_to_stream(item) if isinstance(item, str) else item
            format = self._guess_format(obj)
            backend: Type[AbstractDocumentBackend]
            if format not in format_options.keys():
-                _log.info(
+                _log.error(
-                    f"Skipping input document {obj.name} because it isn't matching any of the allowed formats."
+                    f"Input document {obj.name} does not match any allowed format."
                )
-                continue
+                backend = _DummyBackend
            else:
                backend = format_options[format].backend
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@ -15,7 +15,13 @@ from docling.backend.md_backend import MarkdownDocumentBackend
 from docling.backend.msexcel_backend import MsExcelDocumentBackend
 from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
 from docling.backend.msword_backend import MsWordDocumentBackend
-from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
+from docling.datamodel.base_models import (
    ConversionStatus,
    DoclingComponentType,
    DocumentStream,
    ErrorItem,
    InputFormat,
 )
 from docling.datamodel.document import (
    ConversionResult,
    InputDocument,
@ -23,6 +29,7 @@ from docling.datamodel.document import (
 )
 from docling.datamodel.pipeline_options import PipelineOptions
 from docling.datamodel.settings import DocumentLimits, settings
 from docling.exceptions import ConversionError
 from docling.pipeline.base_pipeline import BasePipeline
 from docling.pipeline.simple_pipeline import SimplePipeline
 from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
@ -85,7 +92,8 @@ class ImageFormatOption(FormatOption):
    backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
-_format_to_default_options = {
+def _get_default_option(format: InputFormat) -> FormatOption:
    format_to_default_options = {
        InputFormat.XLSX: FormatOption(
            pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend
        ),
@ -111,6 +119,10 @@ _format_to_default_options = {
            pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
        ),
    }
    if (options := format_to_default_options.get(format)) is not None:
        return options
    else:
        raise RuntimeError(f"No default options configured for {format}")
 class DocumentConverter:
@ -121,36 +133,26 @@ class DocumentConverter:
        allowed_formats: Optional[List[InputFormat]] = None,
        format_options: Optional[Dict[InputFormat, FormatOption]] = None,
    ):
-        self.allowed_formats = allowed_formats
+        self.allowed_formats = (
-        self.format_to_options = format_options
+            allowed_formats if allowed_formats is not None else [e for e in InputFormat]
-
+        )
-        if self.allowed_formats is None:
+        self.format_to_options = {
-            # if self.format_to_options is not None:
+            format: (
-            #    self.allowed_formats = self.format_to_options.keys()
+                _get_default_option(format=format)
-            # else:
+                if (custom_option := (format_options or {}).get(format)) is None
-            self.allowed_formats = [e for e in InputFormat]  # all formats
+                else custom_option
-
+            )
-        if self.format_to_options is None:
+            for format in self.allowed_formats
-            self.format_to_options = _format_to_default_options
+        }
        else:
            for f in self.allowed_formats:
                if f not in self.format_to_options.keys():
                    _log.debug(f"Requested format {f} will use default options.")
                    self.format_to_options[f] = _format_to_default_options[f]
            remove_keys = []
            for f in self.format_to_options.keys():
                if f not in self.allowed_formats:
                    remove_keys.append(f)
            for f in remove_keys:
                self.format_to_options.pop(f)
        self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}
    def initialize_pipeline(self, format: InputFormat):
        """Initialize the conversion pipeline for the selected format."""
-        self._get_pipeline(doc_format=format)
+        pipeline = self._get_pipeline(doc_format=format)
        if pipeline is None:
            raise ConversionError(
                f"No pipeline could be initialized for format {format}"
            )
    @validate_call(config=ConfigDict(strict=True))
    def convert(
@ -186,22 +188,28 @@ class DocumentConverter:
            limits=limits,
        )
        conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)
        had_result = False
        for conv_res in conv_res_iter:
            had_result = True
            if raises_on_error and conv_res.status not in {
                ConversionStatus.SUCCESS,
                ConversionStatus.PARTIAL_SUCCESS,
            }:
-                raise RuntimeError(
+                raise ConversionError(
                    f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}"
                )
            else:
                yield conv_res
        if not had_result and raises_on_error:
            raise ConversionError(
                f"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
            )
    def _convert(
        self, conv_input: _DocumentConversionInput, raises_on_error: bool
    ) -> Iterator[ConversionResult]:
        assert self.format_to_options is not None
        start_time = time.monotonic()
        for input_batch in chunkify(
@ -223,27 +231,22 @@ class DocumentConverter:
            ):
                elapsed = time.monotonic() - start_time
                start_time = time.monotonic()
                if item is not None:
                _log.info(
                    f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
                )
                yield item
                else:
                    _log.info(f"Skipped a document. We lost {elapsed:.2f} sec.")
    def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
        assert self.format_to_options is not None
        fopt = self.format_to_options.get(doc_format)
        if fopt is None:
-            raise RuntimeError(f"Could not get pipeline for {doc_format}")
+            return None
        else:
            pipeline_class = fopt.pipeline_cls
            pipeline_options = fopt.pipeline_options
-        assert pipeline_options is not None
+        if pipeline_options is None:
            return None
        # TODO this will ignore if different options have been defined for the same pipeline class.
        if (
            pipeline_class not in self.initialized_pipelines
@ -257,11 +260,26 @@ class DocumentConverter:
    def _process_document(
        self, in_doc: InputDocument, raises_on_error: bool
-    ) -> Optional[ConversionResult]:
+    ) -> ConversionResult:
        assert self.allowed_formats is not None
        assert in_doc.format in self.allowed_formats
        valid = (
            self.allowed_formats is not None and in_doc.format in self.allowed_formats
        )
        if valid:
            conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
        else:
            error_message = f"File format not allowed: {in_doc.file}"
            if raises_on_error:
                raise ConversionError(error_message)
            else:
                error_item = ErrorItem(
                    component_type=DoclingComponentType.USER_INPUT,
                    module_name="",
                    error_message=error_message,
                )
                conv_res = ConversionResult(
                    input=in_doc, status=ConversionStatus.SKIPPED, errors=[error_item]
                )
        return conv_res
@ -270,26 +288,28 @@ class DocumentConverter:
    ) -> ConversionResult:
        if in_doc.valid:
            pipeline = self._get_pipeline(in_doc.format)
-            if pipeline is None:  # Can't find a default pipeline. Should this raise?
+            if pipeline is not None:
                conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
            else:
                if raises_on_error:
-                    raise RuntimeError(
+                    raise ConversionError(
                        f"No pipeline could be initialized for {in_doc.file}."
                    )
                else:
-                    conv_res = ConversionResult(input=in_doc)
+                    conv_res = ConversionResult(
-                    conv_res.status = ConversionStatus.FAILURE
+                        input=in_doc,
-                    return conv_res
+                        status=ConversionStatus.FAILURE,
-
+                    )
            conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
        else:
            if raises_on_error:
-                raise RuntimeError(f"Input document {in_doc.file} is not valid.")
+                raise ConversionError(f"Input document {in_doc.file} is not valid.")
            else:
                # invalid doc or not of desired format
-                conv_res = ConversionResult(input=in_doc)
+                conv_res = ConversionResult(
-                conv_res.status = ConversionStatus.FAILURE
+                    input=in_doc,
                    status=ConversionStatus.FAILURE,
                )
                # TODO add error log why it failed.
        return conv_res
--- a/docling/exceptions.py
+++ b/docling/exceptions.py
@ -0,0 +1,6 @@
 class BaseError(RuntimeError):
    pass
 class ConversionError(BaseError):
    pass
--- a/docling/models/tesseract_ocr_cli_model.py
+++ b/docling/models/tesseract_ocr_cli_model.py
@ -1,5 +1,7 @@
 import csv
 import io
 import logging
 import os
 import tempfile
 from subprocess import DEVNULL, PIPE, Popen
 from typing import Iterable, Optional, Tuple
@ -95,7 +97,7 @@ class TesseractOcrCliModel(BaseOcrModel):
        # _log.info(decoded_data)
        # Read the TSV file generated by Tesseract
-        df = pd.read_csv(io.StringIO(decoded_data), sep="\t")
+        df = pd.read_csv(io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t")
        # Display the dataframe (optional)
        # _log.info("df: ", df.head())
@ -130,14 +132,17 @@ class TesseractOcrCliModel(BaseOcrModel):
                        high_res_image = page._backend.get_page_image(
                            scale=self.scale, cropbox=ocr_rect
                        )
-
+                        try:
                            with tempfile.NamedTemporaryFile(
-                            suffix=".png", mode="w"
+                                suffix=".png", mode="w+b", delete=False
                            ) as image_file:
                                fname = image_file.name
-                            high_res_image.save(fname)
+                                high_res_image.save(image_file)
                            df = self._run_tesseract(fname)
                        finally:
                            if os.path.exists(fname):
                                os.remove(fname)
                        # _log.info(df)
--- a/docs/faq.md
+++ b/docs/faq.md
@ -3,7 +3,9 @@
 This is a collection of frequently asked questions gathered from user questions on <https://github.com/DS4SD/docling/discussions>.
-### Python 3.13 support
+??? question "Is Python 3.13 supported?"
    ### Is Python 3.13 supported?
    Full support for Python 3.13 is currently waiting for [pytorch](https://github.com/pytorch/pytorch).
@ -15,7 +17,7 @@ python3.13 -m venv venv
    source ./venv/bin/activate
    # Install torch nightly builds, see https://pytorch.org/
-pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu
+    pip3 install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cpu
    # Install docling
    pip3 install docling
@ -29,8 +31,14 @@ _Note: we are disabling OCR since easyocr and the nightly torch builds have some
    Source: Issue [#136](https://github.com/DS4SD/docling/issues/136)
 ??? question "Install conflicts with numpy (python 3.13)"
    ### Install conflicts with numpy (python 3.13)
    When using `docling-ibm-models>=2.0.7` and `deepsearch-glm>=0.26.2` these issues should not show up anymore.
    Docling supports numpy versions `>=1.24.4,<3.0.0` which should match all usages.
    **For older versions**
    This has been observed installing docling and langchain via poetry.
@ -54,19 +62,20 @@ numpy = [
    ]
    ```
    Source: Issue [#283](https://github.com/DS4SD/docling/issues/283#issuecomment-2465035868)
-### GPU support
+??? question "Are text styles (bold, underline, etc) supported?"
-TBA
+    ### Are text styles (bold, underline, etc) supported?
    Currently text styles are not supported in the `DoclingDocument` format.
    If you are interested in contributing this feature, please open a discussion topic to brainstorm on the design.
    _Note: this is not a simple topic_
-### Text styles (bold, underline, etc)
+??? question "How do I run completely offline?"
 TBA
    ### How do I run completely offline?
@ -89,6 +98,7 @@ converter = DocumentConverter(
    Source: Issue [#326](https://github.com/DS4SD/docling/issues/326)
 ??? question "Which model weights are needed to run Docling?"
    ### Which model weights are needed to run Docling?
    Model weights are needed for the AI models used in the PDF pipeline. Other document types (docx, pptx, etc) do not have any such requirement.
@ -98,6 +108,7 @@ For processing PDF documents, Docling requires the model weights from <https://h
    When OCR is enabled, some engines also require model artifacts. For example EasyOCR, for which Docling has [special pipeline options](https://github.com/DS4SD/docling/blob/main/docling/datamodel/pipeline_options.py#L68) to control the runtime behavior.
 ??? question "SSL error downloading model weights"
    ### SSL error downloading model weights
@ -114,6 +125,8 @@ Possible solutions were
    - Use [pip-system-certs](https://pypi.org/project/pip-system-certs/) to use the latest trusted certificates on your system.
 ??? question "Which OCR languages are supported?"
    ### Which OCR languages are supported?
    Docling supports multiple OCR engines, each of which has its own list of supported languages.
--- a/poetry.lock
+++ b/poetry.lock
--- a/pyproject.toml
+++ b/pyproject.toml
@ -1,6 +1,6 @@
 [tool.poetry]
 name = "docling"
-version = "2.8.1"  # DO NOT EDIT, updated automatically
+version = "2.8.2"  # DO NOT EDIT, updated automatically
 description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
 authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
 license = "MIT"
@ -26,7 +26,7 @@ packages = [{include = "docling"}]
 ######################
 python = "^3.9"
 pydantic = ">=2.0.0,<2.10"
-docling-core = "^2.5.1"
+docling-core = "^2.6.1"
 docling-ibm-models = "^2.0.6"
 deepsearch-glm = "^0.26.1"
 filetype = "^1.2.0"
@ -90,10 +90,13 @@ langchain-huggingface = "^0.0.3"
 langchain-milvus = "^0.1.4"
 langchain-text-splitters = "^0.2.4"
 [tool.poetry.group.constraints]
 optional = true
 [tool.poetry.group.constraints.dependencies]
 numpy = [
-    { version = "^2.1.0", markers = 'python_version >= "3.13"' },
+    { version = ">=1.24.4,<3.0.0", markers = 'python_version >= "3.10"' },
-    { version = "^1.24.4", markers = 'python_version < "3.13"' },
+    { version = ">=1.24.4,<2.1.0", markers = 'python_version < "3.10"' },
 ]
 [tool.poetry.group.mac_intel]
--- a/tests/test_interfaces.py
+++ b/tests/test_interfaces.py
@ -10,7 +10,7 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
 from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2
-GENERATE = True
+GENERATE = False
 def get_pdf_path():
--- a/tests/test_invalid_input.py
+++ b/tests/test_invalid_input.py
@ -0,0 +1,45 @@
 from io import BytesIO
 from pathlib import Path
 import pytest
 from docling.datamodel.base_models import ConversionStatus, DocumentStream
 from docling.document_converter import ConversionError, DocumentConverter
 def get_pdf_path():
    pdf_path = Path("./tests/data/2305.03393v1-pg9.pdf")
    return pdf_path
@pytest.fixture
 def converter():
    converter = DocumentConverter()
    return converter
 def test_convert_unsupported_doc_format_wout_exception(converter: DocumentConverter):
    result = converter.convert(
        DocumentStream(name="input.xyz", stream=BytesIO(b"xyz")), raises_on_error=False
    )
    assert result.status == ConversionStatus.SKIPPED
 def test_convert_unsupported_doc_format_with_exception(converter: DocumentConverter):
    with pytest.raises(ConversionError):
        converter.convert(
            DocumentStream(name="input.xyz", stream=BytesIO(b"xyz")),
            raises_on_error=True,
        )
 def test_convert_too_small_filesize_limit_wout_exception(converter: DocumentConverter):
    result = converter.convert(get_pdf_path(), max_file_size=1, raises_on_error=False)
    assert result.status == ConversionStatus.FAILURE
 def test_convert_too_small_filesize_limit_with_exception(converter: DocumentConverter):
    with pytest.raises(ConversionError):
        converter.convert(get_pdf_path(), max_file_size=1, raises_on_error=True)