improve handling of unsupported types

- Introduced new explicit exception types instead of `RuntimeError` - Introduced new `ConversionStatus` value for unsupported formats - Tidied up converter member typing & removed asserts Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
2025-07-31 14:34:40 +00:00 · 2024-11-26 21:33:32 +01:00 · 2024-11-26 21:33:32 +01:00 · 0bb1e203b6
commit 0bb1e203b6
parent bafe673b97
5 changed files with 94 additions and 69 deletions
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@ -22,6 +22,7 @@ class ConversionStatus(str, Enum):
    FAILURE = auto()
    SUCCESS = auto()
    PARTIAL_SUCCESS = auto()
+    UNSUPPORTED = auto()


 class InputFormat(str, Enum):
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@ -3,7 +3,7 @@ import re
 from enum import Enum
 from io import BytesIO
 from pathlib import Path, PurePath
-from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Type, Union
+from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Set, Type, Union

 import filetype
 from docling_core.types.doc import (
@ -164,12 +164,6 @@ class InputDocument(BaseModel):
        backend: Type[AbstractDocumentBackend],
        path_or_stream: Union[BytesIO, Path],
    ) -> None:
-        if backend is None:
-            raise RuntimeError(
-                f"No backend configuration provided for file {self.file.name} with format {self.format}. "
-                f"Please check your format configuration on DocumentConverter."
-            )
-
        self._backend = backend(self, path_or_stream=path_or_stream)
        if not self._backend.is_valid():
            self.valid = False
@ -450,6 +444,25 @@ class ConversionResult(BaseModel):
        return ds_doc


+class _DummyBackend(AbstractDocumentBackend):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def is_valid(self) -> bool:
+        return False
+
+    @classmethod
+    def supported_formats(cls) -> Set[InputFormat]:
+        return set()
+
+    @classmethod
+    def supports_pagination(cls) -> bool:
+        return False
+
+    def unload(self):
+        return super().unload()
+
+
 class _DocumentConversionInput(BaseModel):

    path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
@ -461,11 +474,12 @@ class _DocumentConversionInput(BaseModel):
        for item in self.path_or_stream_iterator:
            obj = resolve_file_source(item) if isinstance(item, str) else item
            format = self._guess_format(obj)
+            backend: Type[AbstractDocumentBackend]
            if format not in format_options.keys():
-                _log.info(
-                    f"Skipping input document {obj.name} because it isn't matching any of the allowed formats."
+                _log.error(
+                    f"Input document {obj.name} does not match any allowed format."
                )
-                continue
+                backend = _DummyBackend
            else:
                backend = format_options[format].backend

--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@ -23,6 +23,7 @@ from docling.datamodel.document import (
 )
 from docling.datamodel.pipeline_options import PipelineOptions
 from docling.datamodel.settings import DocumentLimits, settings
+from docling.exceptions import ConversionError
 from docling.pipeline.base_pipeline import BasePipeline
 from docling.pipeline.simple_pipeline import SimplePipeline
 from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
@ -121,18 +122,13 @@ class DocumentConverter:
        allowed_formats: Optional[List[InputFormat]] = None,
        format_options: Optional[Dict[InputFormat, FormatOption]] = None,
    ):
-        self.allowed_formats = allowed_formats
-        self.format_to_options = format_options
-
-        if self.allowed_formats is None:
-            # if self.format_to_options is not None:
-            #    self.allowed_formats = self.format_to_options.keys()
-            # else:
-            self.allowed_formats = [e for e in InputFormat]  # all formats
-
-        if self.format_to_options is None:
-            self.format_to_options = _format_to_default_options
-        else:
+        self.allowed_formats = (
+            allowed_formats if allowed_formats is not None else [e for e in InputFormat]
+        )
+        self.format_to_options = (
+            format_options if format_options is not None else _format_to_default_options
+        )
+        if format_options is not None:
            for f in self.allowed_formats:
                if f not in self.format_to_options.keys():
                    _log.debug(f"Requested format {f} will use default options.")
@ -150,7 +146,11 @@ class DocumentConverter:

    def initialize_pipeline(self, format: InputFormat):
        """Initialize the conversion pipeline for the selected format."""
-        self._get_pipeline(doc_format=format)
+        pipeline = self._get_pipeline(doc_format=format)
+        if pipeline is None:
+            raise ConversionError(
+                f"No pipeline could be initialized for format {format}"
+            )

    @validate_call(config=ConfigDict(strict=True))
    def convert(
@ -159,7 +159,7 @@ class DocumentConverter:
        raises_on_error: bool = True,
        max_num_pages: int = sys.maxsize,
        max_file_size: int = sys.maxsize,
-    ) -> Optional[ConversionResult]:
+    ) -> ConversionResult:

        all_res = self.convert_all(
            source=[source],
@ -167,7 +167,7 @@ class DocumentConverter:
            max_num_pages=max_num_pages,
            max_file_size=max_file_size,
        )
-        return next(all_res, None)
+        return next(all_res)

    @validate_call(config=ConfigDict(strict=True))
    def convert_all(
@ -194,22 +194,20 @@ class DocumentConverter:
                ConversionStatus.SUCCESS,
                ConversionStatus.PARTIAL_SUCCESS,
            }:
-                raise RuntimeError(
+                raise ConversionError(
                    f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}"
                )
            else:
                yield conv_res

        if not had_result and raises_on_error:
-            raise RuntimeError(
+            raise ConversionError(
                f"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
            )

    def _convert(
        self, conv_input: _DocumentConversionInput, raises_on_error: bool
    ) -> Iterator[ConversionResult]:
-        assert self.format_to_options is not None
-
        start_time = time.monotonic()

        for input_batch in chunkify(
@ -231,27 +229,22 @@ class DocumentConverter:
            ):
                elapsed = time.monotonic() - start_time
                start_time = time.monotonic()
-
-                if item is not None:
-                    _log.info(
-                        f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
-                    )
-                    yield item
-                else:
-                    _log.info(f"Skipped a document. We lost {elapsed:.2f} sec.")
+                _log.info(
+                    f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
+                )
+                yield item

    def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
-        assert self.format_to_options is not None
-
        fopt = self.format_to_options.get(doc_format)

        if fopt is None:
-            raise RuntimeError(f"Could not get pipeline for {doc_format}")
+            return None
        else:
            pipeline_class = fopt.pipeline_cls
            pipeline_options = fopt.pipeline_options

-        assert pipeline_options is not None
+        if pipeline_options is None:
+            return None
        # TODO this will ignore if different options have been defined for the same pipeline class.
        if (
            pipeline_class not in self.initialized_pipelines
@ -265,11 +258,20 @@ class DocumentConverter:

    def _process_document(
        self, in_doc: InputDocument, raises_on_error: bool
-    ) -> Optional[ConversionResult]:
-        assert self.allowed_formats is not None
-        assert in_doc.format in self.allowed_formats
+    ) -> ConversionResult:

-        conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
+        valid = (
+            self.allowed_formats is not None and in_doc.format in self.allowed_formats
+        )
+        if valid:
+            conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
+        else:
+            if raises_on_error:
+                raise ConversionError(f"Unsupported format in: {in_doc.file}")
+            else:
+                conv_res = ConversionResult(
+                    input=in_doc, status=ConversionStatus.UNSUPPORTED
+                )

        return conv_res

@ -278,26 +280,28 @@ class DocumentConverter:
    ) -> ConversionResult:
        if in_doc.valid:
            pipeline = self._get_pipeline(in_doc.format)
-            if pipeline is None:  # Can't find a default pipeline. Should this raise?
+            if pipeline is not None:
+                conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
+            else:
                if raises_on_error:
-                    raise RuntimeError(
+                    raise ConversionError(
                        f"No pipeline could be initialized for {in_doc.file}."
                    )
                else:
-                    conv_res = ConversionResult(input=in_doc)
-                    conv_res.status = ConversionStatus.FAILURE
-                    return conv_res
-
-            conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
-
+                    conv_res = ConversionResult(
+                        input=in_doc,
+                        status=ConversionStatus.FAILURE,
+                    )
        else:
            if raises_on_error:
-                raise RuntimeError(f"Input document {in_doc.file} is not valid.")
+                raise ConversionError(f"Input document {in_doc.file} is not valid.")

            else:
                # invalid doc or not of desired format
-                conv_res = ConversionResult(input=in_doc)
-                conv_res.status = ConversionStatus.FAILURE
+                conv_res = ConversionResult(
+                    input=in_doc,
+                    status=ConversionStatus.FAILURE,
+                )
                # TODO add error log why it failed.

        return conv_res
--- a/docling/exceptions.py
+++ b/docling/exceptions.py
@ -0,0 +1,6 @@
+class BaseError(RuntimeError):
+    pass
+
+
+class ConversionError(BaseError):
+    pass
--- a/tests/test_invalid_input.py
+++ b/tests/test_invalid_input.py
@ -4,7 +4,7 @@ from pathlib import Path
 import pytest

 from docling.datamodel.base_models import ConversionStatus, DocumentStream
-from docling.document_converter import DocumentConverter
+from docling.document_converter import ConversionError, DocumentConverter


 def get_pdf_path():
@ -20,26 +20,26 @@ def converter():
    return converter


-def test_convert_invalid_doc(converter: DocumentConverter):
-
-    # Test with unrecognizable file format (xyz)
+def test_convert_unsupported_doc_format_wout_exception(converter: DocumentConverter):
    result = converter.convert(
        DocumentStream(name="input.xyz", stream=BytesIO(b"xyz")), raises_on_error=False
    )
-    assert result is None  # No result comes back at all, since this file is skipped.
+    assert result.status == ConversionStatus.UNSUPPORTED

-    with pytest.raises(RuntimeError):
-        result = converter.convert(
+
+def test_convert_unsupported_doc_format_with_exception(converter: DocumentConverter):
+    with pytest.raises(ConversionError):
+        converter.convert(
            DocumentStream(name="input.xyz", stream=BytesIO(b"xyz")),
            raises_on_error=True,
        )

-    # Test with too small filesize limit
+
+def test_convert_too_small_filesize_limit_wout_exception(converter: DocumentConverter):
    result = converter.convert(get_pdf_path(), max_file_size=1, raises_on_error=False)
-    assert result is not None
    assert result.status == ConversionStatus.FAILURE

-    with pytest.raises(RuntimeError):
-        result = converter.convert(
-            get_pdf_path(), max_file_size=1, raises_on_error=True
-        )
+
+def test_convert_too_small_filesize_limit_with_exception(converter: DocumentConverter):
+    with pytest.raises(ConversionError):
+        converter.convert(get_pdf_path(), max_file_size=1, raises_on_error=True)