Improve handling of unsupported input formats

- Introduced new explicit exception types instead of `RuntimeError`
- Introduced new `ConversionStatus` value for unsupported formats
- Tidied up converter member typing & removed asserts

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
This commit is contained in:
Panos Vagenas 2024-11-26 21:33:32 +01:00
parent bafe673b97
commit 0bb1e203b6
5 changed files with 94 additions and 69 deletions

View File

@ -22,6 +22,7 @@ class ConversionStatus(str, Enum):
FAILURE = auto() FAILURE = auto()
SUCCESS = auto() SUCCESS = auto()
PARTIAL_SUCCESS = auto() PARTIAL_SUCCESS = auto()
UNSUPPORTED = auto()
class InputFormat(str, Enum): class InputFormat(str, Enum):

View File

@ -3,7 +3,7 @@ import re
from enum import Enum from enum import Enum
from io import BytesIO from io import BytesIO
from pathlib import Path, PurePath from pathlib import Path, PurePath
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Type, Union from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Set, Type, Union
import filetype import filetype
from docling_core.types.doc import ( from docling_core.types.doc import (
@ -164,12 +164,6 @@ class InputDocument(BaseModel):
backend: Type[AbstractDocumentBackend], backend: Type[AbstractDocumentBackend],
path_or_stream: Union[BytesIO, Path], path_or_stream: Union[BytesIO, Path],
) -> None: ) -> None:
if backend is None:
raise RuntimeError(
f"No backend configuration provided for file {self.file.name} with format {self.format}. "
f"Please check your format configuration on DocumentConverter."
)
self._backend = backend(self, path_or_stream=path_or_stream) self._backend = backend(self, path_or_stream=path_or_stream)
if not self._backend.is_valid(): if not self._backend.is_valid():
self.valid = False self.valid = False
@ -450,6 +444,25 @@ class ConversionResult(BaseModel):
return ds_doc return ds_doc
class _DummyBackend(AbstractDocumentBackend):
    """Placeholder backend that never validates.

    Selected for an input whose guessed format has no entry in the
    configured format options, so the document is still represented
    instead of being skipped. It declares no supported formats and
    always reports itself as invalid.
    """

    # NOTE(review): the pass-through overrides below add no behavior of their
    # own; presumably the base class declares them abstract -- confirm before
    # removing either one.
    def __init__(self, *args, **kwargs):
        """Forward all arguments unchanged to the base backend."""
        super().__init__(*args, **kwargs)

    def is_valid(self) -> bool:
        """Return ``False``: this backend can never process a document."""
        return False

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        """Return an empty set: no input format is handled."""
        return set()

    @classmethod
    def supports_pagination(cls) -> bool:
        """Return ``False``: there are no pages to expose."""
        return False

    def unload(self):
        """Delegate resource release to the base implementation."""
        return super().unload()
class _DocumentConversionInput(BaseModel): class _DocumentConversionInput(BaseModel):
path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]] path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
@ -461,11 +474,12 @@ class _DocumentConversionInput(BaseModel):
for item in self.path_or_stream_iterator: for item in self.path_or_stream_iterator:
obj = resolve_file_source(item) if isinstance(item, str) else item obj = resolve_file_source(item) if isinstance(item, str) else item
format = self._guess_format(obj) format = self._guess_format(obj)
backend: Type[AbstractDocumentBackend]
if format not in format_options.keys(): if format not in format_options.keys():
_log.info( _log.error(
f"Skipping input document {obj.name} because it isn't matching any of the allowed formats." f"Input document {obj.name} does not match any allowed format."
) )
continue backend = _DummyBackend
else: else:
backend = format_options[format].backend backend = format_options[format].backend

View File

@ -23,6 +23,7 @@ from docling.datamodel.document import (
) )
from docling.datamodel.pipeline_options import PipelineOptions from docling.datamodel.pipeline_options import PipelineOptions
from docling.datamodel.settings import DocumentLimits, settings from docling.datamodel.settings import DocumentLimits, settings
from docling.exceptions import ConversionError
from docling.pipeline.base_pipeline import BasePipeline from docling.pipeline.base_pipeline import BasePipeline
from docling.pipeline.simple_pipeline import SimplePipeline from docling.pipeline.simple_pipeline import SimplePipeline
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
@ -121,18 +122,13 @@ class DocumentConverter:
allowed_formats: Optional[List[InputFormat]] = None, allowed_formats: Optional[List[InputFormat]] = None,
format_options: Optional[Dict[InputFormat, FormatOption]] = None, format_options: Optional[Dict[InputFormat, FormatOption]] = None,
): ):
self.allowed_formats = allowed_formats self.allowed_formats = (
self.format_to_options = format_options allowed_formats if allowed_formats is not None else [e for e in InputFormat]
)
if self.allowed_formats is None: self.format_to_options = (
# if self.format_to_options is not None: format_options if format_options is not None else _format_to_default_options
# self.allowed_formats = self.format_to_options.keys() )
# else: if format_options is not None:
self.allowed_formats = [e for e in InputFormat] # all formats
if self.format_to_options is None:
self.format_to_options = _format_to_default_options
else:
for f in self.allowed_formats: for f in self.allowed_formats:
if f not in self.format_to_options.keys(): if f not in self.format_to_options.keys():
_log.debug(f"Requested format {f} will use default options.") _log.debug(f"Requested format {f} will use default options.")
@ -150,7 +146,11 @@ class DocumentConverter:
def initialize_pipeline(self, format: InputFormat): def initialize_pipeline(self, format: InputFormat):
"""Initialize the conversion pipeline for the selected format.""" """Initialize the conversion pipeline for the selected format."""
self._get_pipeline(doc_format=format) pipeline = self._get_pipeline(doc_format=format)
if pipeline is None:
raise ConversionError(
f"No pipeline could be initialized for format {format}"
)
@validate_call(config=ConfigDict(strict=True)) @validate_call(config=ConfigDict(strict=True))
def convert( def convert(
@ -159,7 +159,7 @@ class DocumentConverter:
raises_on_error: bool = True, raises_on_error: bool = True,
max_num_pages: int = sys.maxsize, max_num_pages: int = sys.maxsize,
max_file_size: int = sys.maxsize, max_file_size: int = sys.maxsize,
) -> Optional[ConversionResult]: ) -> ConversionResult:
all_res = self.convert_all( all_res = self.convert_all(
source=[source], source=[source],
@ -167,7 +167,7 @@ class DocumentConverter:
max_num_pages=max_num_pages, max_num_pages=max_num_pages,
max_file_size=max_file_size, max_file_size=max_file_size,
) )
return next(all_res, None) return next(all_res)
@validate_call(config=ConfigDict(strict=True)) @validate_call(config=ConfigDict(strict=True))
def convert_all( def convert_all(
@ -194,22 +194,20 @@ class DocumentConverter:
ConversionStatus.SUCCESS, ConversionStatus.SUCCESS,
ConversionStatus.PARTIAL_SUCCESS, ConversionStatus.PARTIAL_SUCCESS,
}: }:
raise RuntimeError( raise ConversionError(
f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}" f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}"
) )
else: else:
yield conv_res yield conv_res
if not had_result and raises_on_error: if not had_result and raises_on_error:
raise RuntimeError( raise ConversionError(
f"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats." f"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
) )
def _convert( def _convert(
self, conv_input: _DocumentConversionInput, raises_on_error: bool self, conv_input: _DocumentConversionInput, raises_on_error: bool
) -> Iterator[ConversionResult]: ) -> Iterator[ConversionResult]:
assert self.format_to_options is not None
start_time = time.monotonic() start_time = time.monotonic()
for input_batch in chunkify( for input_batch in chunkify(
@ -231,27 +229,22 @@ class DocumentConverter:
): ):
elapsed = time.monotonic() - start_time elapsed = time.monotonic() - start_time
start_time = time.monotonic() start_time = time.monotonic()
_log.info(
if item is not None: f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
_log.info( )
f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec." yield item
)
yield item
else:
_log.info(f"Skipped a document. We lost {elapsed:.2f} sec.")
def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]: def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
assert self.format_to_options is not None
fopt = self.format_to_options.get(doc_format) fopt = self.format_to_options.get(doc_format)
if fopt is None: if fopt is None:
raise RuntimeError(f"Could not get pipeline for {doc_format}") return None
else: else:
pipeline_class = fopt.pipeline_cls pipeline_class = fopt.pipeline_cls
pipeline_options = fopt.pipeline_options pipeline_options = fopt.pipeline_options
assert pipeline_options is not None if pipeline_options is None:
return None
# TODO this will ignore if different options have been defined for the same pipeline class. # TODO this will ignore if different options have been defined for the same pipeline class.
if ( if (
pipeline_class not in self.initialized_pipelines pipeline_class not in self.initialized_pipelines
@ -265,11 +258,20 @@ class DocumentConverter:
def _process_document( def _process_document(
self, in_doc: InputDocument, raises_on_error: bool self, in_doc: InputDocument, raises_on_error: bool
) -> Optional[ConversionResult]: ) -> ConversionResult:
assert self.allowed_formats is not None
assert in_doc.format in self.allowed_formats
conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error) valid = (
self.allowed_formats is not None and in_doc.format in self.allowed_formats
)
if valid:
conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
else:
if raises_on_error:
raise ConversionError(f"Unsupported format in: {in_doc.file}")
else:
conv_res = ConversionResult(
input=in_doc, status=ConversionStatus.UNSUPPORTED
)
return conv_res return conv_res
@ -278,26 +280,28 @@ class DocumentConverter:
) -> ConversionResult: ) -> ConversionResult:
if in_doc.valid: if in_doc.valid:
pipeline = self._get_pipeline(in_doc.format) pipeline = self._get_pipeline(in_doc.format)
if pipeline is None: # Can't find a default pipeline. Should this raise? if pipeline is not None:
conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
else:
if raises_on_error: if raises_on_error:
raise RuntimeError( raise ConversionError(
f"No pipeline could be initialized for {in_doc.file}." f"No pipeline could be initialized for {in_doc.file}."
) )
else: else:
conv_res = ConversionResult(input=in_doc) conv_res = ConversionResult(
conv_res.status = ConversionStatus.FAILURE input=in_doc,
return conv_res status=ConversionStatus.FAILURE,
)
conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
else: else:
if raises_on_error: if raises_on_error:
raise RuntimeError(f"Input document {in_doc.file} is not valid.") raise ConversionError(f"Input document {in_doc.file} is not valid.")
else: else:
# invalid doc or not of desired format # invalid doc or not of desired format
conv_res = ConversionResult(input=in_doc) conv_res = ConversionResult(
conv_res.status = ConversionStatus.FAILURE input=in_doc,
status=ConversionStatus.FAILURE,
)
# TODO add error log why it failed. # TODO add error log why it failed.
return conv_res return conv_res

6
docling/exceptions.py Normal file
View File

@ -0,0 +1,6 @@
class BaseError(RuntimeError):
    """Root of the package's exception hierarchy.

    Derives from ``RuntimeError`` so that callers catching ``RuntimeError``
    also catch every subclass of this error.
    """


class ConversionError(BaseError):
    """Raised when a document conversion cannot be carried out."""

View File

@ -4,7 +4,7 @@ from pathlib import Path
import pytest import pytest
from docling.datamodel.base_models import ConversionStatus, DocumentStream from docling.datamodel.base_models import ConversionStatus, DocumentStream
from docling.document_converter import DocumentConverter from docling.document_converter import ConversionError, DocumentConverter
def get_pdf_path(): def get_pdf_path():
@ -20,26 +20,26 @@ def converter():
return converter return converter
def test_convert_invalid_doc(converter: DocumentConverter): def test_convert_unsupported_doc_format_wout_exception(converter: DocumentConverter):
# Test with unrecognizable file format (xyz)
result = converter.convert( result = converter.convert(
DocumentStream(name="input.xyz", stream=BytesIO(b"xyz")), raises_on_error=False DocumentStream(name="input.xyz", stream=BytesIO(b"xyz")), raises_on_error=False
) )
assert result is None # No result comes back at all, since this file is skipped. assert result.status == ConversionStatus.UNSUPPORTED
with pytest.raises(RuntimeError):
result = converter.convert( def test_convert_unsupported_doc_format_with_exception(converter: DocumentConverter):
with pytest.raises(ConversionError):
converter.convert(
DocumentStream(name="input.xyz", stream=BytesIO(b"xyz")), DocumentStream(name="input.xyz", stream=BytesIO(b"xyz")),
raises_on_error=True, raises_on_error=True,
) )
# Test with too small filesize limit
def test_convert_too_small_filesize_limit_wout_exception(converter: DocumentConverter):
result = converter.convert(get_pdf_path(), max_file_size=1, raises_on_error=False) result = converter.convert(get_pdf_path(), max_file_size=1, raises_on_error=False)
assert result is not None
assert result.status == ConversionStatus.FAILURE assert result.status == ConversionStatus.FAILURE
with pytest.raises(RuntimeError):
result = converter.convert( def test_convert_too_small_filesize_limit_with_exception(converter: DocumentConverter):
get_pdf_path(), max_file_size=1, raises_on_error=True with pytest.raises(ConversionError):
) converter.convert(get_pdf_path(), max_file_size=1, raises_on_error=True)