From 0bb1e203b6c5b08de9b6cadf566d8e63f4a1ff12 Mon Sep 17 00:00:00 2001
From: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
Date: Tue, 26 Nov 2024 21:33:32 +0100
Subject: [PATCH] improve handling of unsupported types

- Introduced new explicit exception types instead of `RuntimeError`
- Introduced new `ConversionStatus` value for unsupported formats
- Tidied up converter member typing & removed asserts

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
---
 docling/datamodel/base_models.py |  1 +
 docling/datamodel/document.py    | 34 +++++++----
 docling/document_converter.py    | 96 +++++++++++++++++---------------
 docling/exceptions.py            |  6 ++
 tests/test_invalid_input.py      | 26 ++++-----
 5 files changed, 94 insertions(+), 69 deletions(-)
 create mode 100644 docling/exceptions.py

diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py
index 311d6d01..6cf5aa5a 100644
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@@ -22,6 +22,7 @@ class ConversionStatus(str, Enum):
     FAILURE = auto()
     SUCCESS = auto()
     PARTIAL_SUCCESS = auto()
+    UNSUPPORTED = auto()
 
 
 class InputFormat(str, Enum):
diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py
index be4e9a12..d7befd01 100644
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@@ -3,7 +3,7 @@ import re
 from enum import Enum
 from io import BytesIO
 from pathlib import Path, PurePath
-from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Type, Union
+from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Set, Type, Union
 
 import filetype
 from docling_core.types.doc import (
@@ -164,12 +164,6 @@ class InputDocument(BaseModel):
         backend: Type[AbstractDocumentBackend],
         path_or_stream: Union[BytesIO, Path],
     ) -> None:
-        if backend is None:
-            raise RuntimeError(
-                f"No backend configuration provided for file {self.file.name} with format {self.format}. "
-                f"Please check your format configuration on DocumentConverter."
-            )
-
         self._backend = backend(self, path_or_stream=path_or_stream)
         if not self._backend.is_valid():
             self.valid = False
@@ -450,6 +444,25 @@ class ConversionResult(BaseModel):
         return ds_doc
 
 
+class _DummyBackend(AbstractDocumentBackend):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def is_valid(self) -> bool:
+        return False
+
+    @classmethod
+    def supported_formats(cls) -> Set[InputFormat]:
+        return set()
+
+    @classmethod
+    def supports_pagination(cls) -> bool:
+        return False
+
+    def unload(self):
+        return super().unload()
+
+
 class _DocumentConversionInput(BaseModel):
 
     path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
@@ -461,11 +474,12 @@ class _DocumentConversionInput(BaseModel):
         for item in self.path_or_stream_iterator:
             obj = resolve_file_source(item) if isinstance(item, str) else item
             format = self._guess_format(obj)
+            backend: Type[AbstractDocumentBackend]
             if format not in format_options.keys():
-                _log.info(
-                    f"Skipping input document {obj.name} because it isn't matching any of the allowed formats."
+                _log.error(
+                    f"Input document {obj.name} does not match any allowed format."
                 )
-                continue
+                backend = _DummyBackend
             else:
                 backend = format_options[format].backend
 
diff --git a/docling/document_converter.py b/docling/document_converter.py
index fae111be..82c50a0f 100644
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@@ -23,6 +23,7 @@ from docling.datamodel.document import (
 )
 from docling.datamodel.pipeline_options import PipelineOptions
 from docling.datamodel.settings import DocumentLimits, settings
+from docling.exceptions import ConversionError
 from docling.pipeline.base_pipeline import BasePipeline
 from docling.pipeline.simple_pipeline import SimplePipeline
 from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
@@ -121,18 +122,13 @@ class DocumentConverter:
         allowed_formats: Optional[List[InputFormat]] = None,
         format_options: Optional[Dict[InputFormat, FormatOption]] = None,
     ):
-        self.allowed_formats = allowed_formats
-        self.format_to_options = format_options
-
-        if self.allowed_formats is None:
-            # if self.format_to_options is not None:
-            #     self.allowed_formats = self.format_to_options.keys()
-            # else:
-            self.allowed_formats = [e for e in InputFormat]  # all formats
-
-        if self.format_to_options is None:
-            self.format_to_options = _format_to_default_options
-        else:
+        self.allowed_formats = (
+            allowed_formats if allowed_formats is not None else [e for e in InputFormat]
+        )
+        self.format_to_options = (
+            format_options if format_options is not None else _format_to_default_options
+        )
+        if format_options is not None:
             for f in self.allowed_formats:
                 if f not in self.format_to_options.keys():
                     _log.debug(f"Requested format {f} will use default options.")
@@ -150,7 +146,11 @@ class DocumentConverter:
 
     def initialize_pipeline(self, format: InputFormat):
         """Initialize the conversion pipeline for the selected format."""
-        self._get_pipeline(doc_format=format)
+        pipeline = self._get_pipeline(doc_format=format)
+        if pipeline is None:
+            raise ConversionError(
+                f"No pipeline could be initialized for format {format}"
+            )
 
     @validate_call(config=ConfigDict(strict=True))
     def convert(
@@ -159,7 +159,7 @@
         raises_on_error: bool = True,
         max_num_pages: int = sys.maxsize,
         max_file_size: int = sys.maxsize,
-    ) -> Optional[ConversionResult]:
+    ) -> ConversionResult:
 
         all_res = self.convert_all(
             source=[source],
@@ -167,7 +167,7 @@
             max_num_pages=max_num_pages,
             max_file_size=max_file_size,
         )
-        return next(all_res, None)
+        return next(all_res)
 
     @validate_call(config=ConfigDict(strict=True))
     def convert_all(
@@ -194,22 +194,20 @@
                     ConversionStatus.SUCCESS,
                     ConversionStatus.PARTIAL_SUCCESS,
                 }:
-                    raise RuntimeError(
+                    raise ConversionError(
                         f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}"
                     )
                 else:
                     yield conv_res
 
         if not had_result and raises_on_error:
-            raise RuntimeError(
+            raise ConversionError(
                 f"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
             )
 
     def _convert(
         self, conv_input: _DocumentConversionInput, raises_on_error: bool
     ) -> Iterator[ConversionResult]:
-        assert self.format_to_options is not None
-
         start_time = time.monotonic()
 
         for input_batch in chunkify(
@@ -231,27 +229,22 @@
             ):
                 elapsed = time.monotonic() - start_time
                 start_time = time.monotonic()
-
-                if item is not None:
-                    _log.info(
-                        f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
-                    )
-                    yield item
-                else:
-                    _log.info(f"Skipped a document. We lost {elapsed:.2f} sec.")
+                _log.info(
+                    f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
+                )
+                yield item
 
     def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
-        assert self.format_to_options is not None
-
         fopt = self.format_to_options.get(doc_format)
 
         if fopt is None:
-            raise RuntimeError(f"Could not get pipeline for {doc_format}")
+            return None
         else:
             pipeline_class = fopt.pipeline_cls
             pipeline_options = fopt.pipeline_options
-
-        assert pipeline_options is not None
+            if pipeline_options is None:
+                return None
+
         # TODO this will ignore if different options have been defined for the same pipeline class.
         if (
             pipeline_class not in self.initialized_pipelines
@@ -265,11 +258,20 @@
 
     def _process_document(
         self, in_doc: InputDocument, raises_on_error: bool
-    ) -> Optional[ConversionResult]:
-        assert self.allowed_formats is not None
-        assert in_doc.format in self.allowed_formats
+    ) -> ConversionResult:
 
-        conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
+        valid = (
+            self.allowed_formats is not None and in_doc.format in self.allowed_formats
+        )
+        if valid:
+            conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
+        else:
+            if raises_on_error:
+                raise ConversionError(f"Unsupported format in: {in_doc.file}")
+            else:
+                conv_res = ConversionResult(
+                    input=in_doc, status=ConversionStatus.UNSUPPORTED
+                )
 
         return conv_res
 
@@ -278,26 +280,28 @@
     ) -> ConversionResult:
         if in_doc.valid:
             pipeline = self._get_pipeline(in_doc.format)
-            if pipeline is None:  # Can't find a default pipeline. Should this raise?
+            if pipeline is not None:
+                conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
+            else:
                 if raises_on_error:
-                    raise RuntimeError(
+                    raise ConversionError(
                         f"No pipeline could be initialized for {in_doc.file}."
                     )
                 else:
-                    conv_res = ConversionResult(input=in_doc)
-                    conv_res.status = ConversionStatus.FAILURE
-                    return conv_res
-
-            conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
-
+                    conv_res = ConversionResult(
+                        input=in_doc,
+                        status=ConversionStatus.FAILURE,
+                    )
         else:
             if raises_on_error:
-                raise RuntimeError(f"Input document {in_doc.file} is not valid.")
+                raise ConversionError(f"Input document {in_doc.file} is not valid.")
 
             else:
                 # invalid doc or not of desired format
-                conv_res = ConversionResult(input=in_doc)
-                conv_res.status = ConversionStatus.FAILURE
+                conv_res = ConversionResult(
+                    input=in_doc,
+                    status=ConversionStatus.FAILURE,
+                )
                 # TODO add error log why it failed.
 
         return conv_res
diff --git a/docling/exceptions.py b/docling/exceptions.py
new file mode 100644
index 00000000..13145b9c
--- /dev/null
+++ b/docling/exceptions.py
@@ -0,0 +1,6 @@
+class BaseError(RuntimeError):
+    pass
+
+
+class ConversionError(BaseError):
+    pass
diff --git a/tests/test_invalid_input.py b/tests/test_invalid_input.py
index cbd12d09..6e6fc53f 100644
--- a/tests/test_invalid_input.py
+++ b/tests/test_invalid_input.py
@@ -4,7 +4,7 @@ from pathlib import Path
 import pytest
 
 from docling.datamodel.base_models import ConversionStatus, DocumentStream
-from docling.document_converter import DocumentConverter
+from docling.document_converter import ConversionError, DocumentConverter
 
 
 def get_pdf_path():
@@ -20,26 +20,26 @@ def converter():
     return converter
 
 
-def test_convert_invalid_doc(converter: DocumentConverter):
-
-    # Test with unrecognizable file format (xyz)
+def test_convert_unsupported_doc_format_wout_exception(converter: DocumentConverter):
     result = converter.convert(
         DocumentStream(name="input.xyz", stream=BytesIO(b"xyz")), raises_on_error=False
     )
-    assert result is None  # No result comes back at all, since this file is skipped.
+    assert result.status == ConversionStatus.UNSUPPORTED
 
-    with pytest.raises(RuntimeError):
-        result = converter.convert(
+
+def test_convert_unsupported_doc_format_with_exception(converter: DocumentConverter):
+    with pytest.raises(ConversionError):
+        converter.convert(
            DocumentStream(name="input.xyz", stream=BytesIO(b"xyz")),
            raises_on_error=True,
        )
 
-    # Test with too small filesize limit
+
+def test_convert_too_small_filesize_limit_wout_exception(converter: DocumentConverter):
     result = converter.convert(get_pdf_path(), max_file_size=1, raises_on_error=False)
-    assert result is not None
     assert result.status == ConversionStatus.FAILURE
 
-    with pytest.raises(RuntimeError):
-        result = converter.convert(
-            get_pdf_path(), max_file_size=1, raises_on_error=True
-        )
+
+def test_convert_too_small_filesize_limit_with_exception(converter: DocumentConverter):
+    with pytest.raises(ConversionError):
+        converter.convert(get_pdf_path(), max_file_size=1, raises_on_error=True)
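Usage sketch (mirrors the updated tests above; the `input.xyz` name and dummy payload are purely illustrative, not files shipped with the repo): with `raises_on_error=False` an unsupported input now comes back as a regular `ConversionResult` carrying the new `UNSUPPORTED` status instead of `None`, while `raises_on_error=True` surfaces the new `ConversionError`. Since `BaseError` derives from `RuntimeError`, existing `except RuntimeError` handlers keep working.

    from io import BytesIO

    from docling.datamodel.base_models import ConversionStatus, DocumentStream
    from docling.document_converter import DocumentConverter
    from docling.exceptions import ConversionError

    converter = DocumentConverter()

    # Non-raising mode: an unsupported format yields a result object with the
    # new UNSUPPORTED status instead of the previous behavior of returning None.
    result = converter.convert(
        DocumentStream(name="input.xyz", stream=BytesIO(b"xyz")),
        raises_on_error=False,
    )
    assert result.status == ConversionStatus.UNSUPPORTED

    # Raising mode: the same input raises the new explicit exception type.
    try:
        converter.convert(
            DocumentStream(name="input.xyz", stream=BytesIO(b"xyz")),
            raises_on_error=True,
        )
    except ConversionError as err:
        print(f"conversion rejected: {err}")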