mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-02 07:22:14 +00:00
improve handling of unsupported types
- Introduced new explicit exception types instead of `RuntimeError`
- Introduced new `ConversionStatus` value for unsupported formats
- Tidied up converter member typing & removed asserts

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
This commit is contained in:
parent
bafe673b97
commit
0bb1e203b6
@ -22,6 +22,7 @@ class ConversionStatus(str, Enum):
|
|||||||
FAILURE = auto()
|
FAILURE = auto()
|
||||||
SUCCESS = auto()
|
SUCCESS = auto()
|
||||||
PARTIAL_SUCCESS = auto()
|
PARTIAL_SUCCESS = auto()
|
||||||
|
UNSUPPORTED = auto()
|
||||||
|
|
||||||
|
|
||||||
class InputFormat(str, Enum):
|
class InputFormat(str, Enum):
|
||||||
|
@ -3,7 +3,7 @@ import re
|
|||||||
from enum import Enum
|
from enum import Enum
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pathlib import Path, PurePath
|
from pathlib import Path, PurePath
|
||||||
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Type, Union
|
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Set, Type, Union
|
||||||
|
|
||||||
import filetype
|
import filetype
|
||||||
from docling_core.types.doc import (
|
from docling_core.types.doc import (
|
||||||
@ -164,12 +164,6 @@ class InputDocument(BaseModel):
|
|||||||
backend: Type[AbstractDocumentBackend],
|
backend: Type[AbstractDocumentBackend],
|
||||||
path_or_stream: Union[BytesIO, Path],
|
path_or_stream: Union[BytesIO, Path],
|
||||||
) -> None:
|
) -> None:
|
||||||
if backend is None:
|
|
||||||
raise RuntimeError(
|
|
||||||
f"No backend configuration provided for file {self.file.name} with format {self.format}. "
|
|
||||||
f"Please check your format configuration on DocumentConverter."
|
|
||||||
)
|
|
||||||
|
|
||||||
self._backend = backend(self, path_or_stream=path_or_stream)
|
self._backend = backend(self, path_or_stream=path_or_stream)
|
||||||
if not self._backend.is_valid():
|
if not self._backend.is_valid():
|
||||||
self.valid = False
|
self.valid = False
|
||||||
@ -450,6 +444,25 @@ class ConversionResult(BaseModel):
|
|||||||
return ds_doc
|
return ds_doc
|
||||||
|
|
||||||
|
|
||||||
|
class _DummyBackend(AbstractDocumentBackend):
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
super().__init__(*args, **kwargs)
|
||||||
|
|
||||||
|
def is_valid(self) -> bool:
|
||||||
|
return False
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def supported_formats(cls) -> Set[InputFormat]:
|
||||||
|
return set()
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def supports_pagination(cls) -> bool:
|
||||||
|
return False
|
||||||
|
|
||||||
|
def unload(self):
|
||||||
|
return super().unload()
|
||||||
|
|
||||||
|
|
||||||
class _DocumentConversionInput(BaseModel):
|
class _DocumentConversionInput(BaseModel):
|
||||||
|
|
||||||
path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
|
path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
|
||||||
@ -461,11 +474,12 @@ class _DocumentConversionInput(BaseModel):
|
|||||||
for item in self.path_or_stream_iterator:
|
for item in self.path_or_stream_iterator:
|
||||||
obj = resolve_file_source(item) if isinstance(item, str) else item
|
obj = resolve_file_source(item) if isinstance(item, str) else item
|
||||||
format = self._guess_format(obj)
|
format = self._guess_format(obj)
|
||||||
|
backend: Type[AbstractDocumentBackend]
|
||||||
if format not in format_options.keys():
|
if format not in format_options.keys():
|
||||||
_log.info(
|
_log.error(
|
||||||
f"Skipping input document {obj.name} because it isn't matching any of the allowed formats."
|
f"Input document {obj.name} does not match any allowed format."
|
||||||
)
|
)
|
||||||
continue
|
backend = _DummyBackend
|
||||||
else:
|
else:
|
||||||
backend = format_options[format].backend
|
backend = format_options[format].backend
|
||||||
|
|
||||||
|
@ -23,6 +23,7 @@ from docling.datamodel.document import (
|
|||||||
)
|
)
|
||||||
from docling.datamodel.pipeline_options import PipelineOptions
|
from docling.datamodel.pipeline_options import PipelineOptions
|
||||||
from docling.datamodel.settings import DocumentLimits, settings
|
from docling.datamodel.settings import DocumentLimits, settings
|
||||||
|
from docling.exceptions import ConversionError
|
||||||
from docling.pipeline.base_pipeline import BasePipeline
|
from docling.pipeline.base_pipeline import BasePipeline
|
||||||
from docling.pipeline.simple_pipeline import SimplePipeline
|
from docling.pipeline.simple_pipeline import SimplePipeline
|
||||||
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
||||||
@ -121,18 +122,13 @@ class DocumentConverter:
|
|||||||
allowed_formats: Optional[List[InputFormat]] = None,
|
allowed_formats: Optional[List[InputFormat]] = None,
|
||||||
format_options: Optional[Dict[InputFormat, FormatOption]] = None,
|
format_options: Optional[Dict[InputFormat, FormatOption]] = None,
|
||||||
):
|
):
|
||||||
self.allowed_formats = allowed_formats
|
self.allowed_formats = (
|
||||||
self.format_to_options = format_options
|
allowed_formats if allowed_formats is not None else [e for e in InputFormat]
|
||||||
|
)
|
||||||
if self.allowed_formats is None:
|
self.format_to_options = (
|
||||||
# if self.format_to_options is not None:
|
format_options if format_options is not None else _format_to_default_options
|
||||||
# self.allowed_formats = self.format_to_options.keys()
|
)
|
||||||
# else:
|
if format_options is not None:
|
||||||
self.allowed_formats = [e for e in InputFormat] # all formats
|
|
||||||
|
|
||||||
if self.format_to_options is None:
|
|
||||||
self.format_to_options = _format_to_default_options
|
|
||||||
else:
|
|
||||||
for f in self.allowed_formats:
|
for f in self.allowed_formats:
|
||||||
if f not in self.format_to_options.keys():
|
if f not in self.format_to_options.keys():
|
||||||
_log.debug(f"Requested format {f} will use default options.")
|
_log.debug(f"Requested format {f} will use default options.")
|
||||||
@ -150,7 +146,11 @@ class DocumentConverter:
|
|||||||
|
|
||||||
def initialize_pipeline(self, format: InputFormat):
|
def initialize_pipeline(self, format: InputFormat):
|
||||||
"""Initialize the conversion pipeline for the selected format."""
|
"""Initialize the conversion pipeline for the selected format."""
|
||||||
self._get_pipeline(doc_format=format)
|
pipeline = self._get_pipeline(doc_format=format)
|
||||||
|
if pipeline is None:
|
||||||
|
raise ConversionError(
|
||||||
|
f"No pipeline could be initialized for format {format}"
|
||||||
|
)
|
||||||
|
|
||||||
@validate_call(config=ConfigDict(strict=True))
|
@validate_call(config=ConfigDict(strict=True))
|
||||||
def convert(
|
def convert(
|
||||||
@ -159,7 +159,7 @@ class DocumentConverter:
|
|||||||
raises_on_error: bool = True,
|
raises_on_error: bool = True,
|
||||||
max_num_pages: int = sys.maxsize,
|
max_num_pages: int = sys.maxsize,
|
||||||
max_file_size: int = sys.maxsize,
|
max_file_size: int = sys.maxsize,
|
||||||
) -> Optional[ConversionResult]:
|
) -> ConversionResult:
|
||||||
|
|
||||||
all_res = self.convert_all(
|
all_res = self.convert_all(
|
||||||
source=[source],
|
source=[source],
|
||||||
@ -167,7 +167,7 @@ class DocumentConverter:
|
|||||||
max_num_pages=max_num_pages,
|
max_num_pages=max_num_pages,
|
||||||
max_file_size=max_file_size,
|
max_file_size=max_file_size,
|
||||||
)
|
)
|
||||||
return next(all_res, None)
|
return next(all_res)
|
||||||
|
|
||||||
@validate_call(config=ConfigDict(strict=True))
|
@validate_call(config=ConfigDict(strict=True))
|
||||||
def convert_all(
|
def convert_all(
|
||||||
@ -194,22 +194,20 @@ class DocumentConverter:
|
|||||||
ConversionStatus.SUCCESS,
|
ConversionStatus.SUCCESS,
|
||||||
ConversionStatus.PARTIAL_SUCCESS,
|
ConversionStatus.PARTIAL_SUCCESS,
|
||||||
}:
|
}:
|
||||||
raise RuntimeError(
|
raise ConversionError(
|
||||||
f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}"
|
f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}"
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
yield conv_res
|
yield conv_res
|
||||||
|
|
||||||
if not had_result and raises_on_error:
|
if not had_result and raises_on_error:
|
||||||
raise RuntimeError(
|
raise ConversionError(
|
||||||
f"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
|
f"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
|
||||||
)
|
)
|
||||||
|
|
||||||
def _convert(
|
def _convert(
|
||||||
self, conv_input: _DocumentConversionInput, raises_on_error: bool
|
self, conv_input: _DocumentConversionInput, raises_on_error: bool
|
||||||
) -> Iterator[ConversionResult]:
|
) -> Iterator[ConversionResult]:
|
||||||
assert self.format_to_options is not None
|
|
||||||
|
|
||||||
start_time = time.monotonic()
|
start_time = time.monotonic()
|
||||||
|
|
||||||
for input_batch in chunkify(
|
for input_batch in chunkify(
|
||||||
@ -231,27 +229,22 @@ class DocumentConverter:
|
|||||||
):
|
):
|
||||||
elapsed = time.monotonic() - start_time
|
elapsed = time.monotonic() - start_time
|
||||||
start_time = time.monotonic()
|
start_time = time.monotonic()
|
||||||
|
|
||||||
if item is not None:
|
|
||||||
_log.info(
|
_log.info(
|
||||||
f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
|
f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
|
||||||
)
|
)
|
||||||
yield item
|
yield item
|
||||||
else:
|
|
||||||
_log.info(f"Skipped a document. We lost {elapsed:.2f} sec.")
|
|
||||||
|
|
||||||
def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
|
def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
|
||||||
assert self.format_to_options is not None
|
|
||||||
|
|
||||||
fopt = self.format_to_options.get(doc_format)
|
fopt = self.format_to_options.get(doc_format)
|
||||||
|
|
||||||
if fopt is None:
|
if fopt is None:
|
||||||
raise RuntimeError(f"Could not get pipeline for {doc_format}")
|
return None
|
||||||
else:
|
else:
|
||||||
pipeline_class = fopt.pipeline_cls
|
pipeline_class = fopt.pipeline_cls
|
||||||
pipeline_options = fopt.pipeline_options
|
pipeline_options = fopt.pipeline_options
|
||||||
|
|
||||||
assert pipeline_options is not None
|
if pipeline_options is None:
|
||||||
|
return None
|
||||||
# TODO this will ignore if different options have been defined for the same pipeline class.
|
# TODO this will ignore if different options have been defined for the same pipeline class.
|
||||||
if (
|
if (
|
||||||
pipeline_class not in self.initialized_pipelines
|
pipeline_class not in self.initialized_pipelines
|
||||||
@ -265,11 +258,20 @@ class DocumentConverter:
|
|||||||
|
|
||||||
def _process_document(
|
def _process_document(
|
||||||
self, in_doc: InputDocument, raises_on_error: bool
|
self, in_doc: InputDocument, raises_on_error: bool
|
||||||
) -> Optional[ConversionResult]:
|
) -> ConversionResult:
|
||||||
assert self.allowed_formats is not None
|
|
||||||
assert in_doc.format in self.allowed_formats
|
|
||||||
|
|
||||||
|
valid = (
|
||||||
|
self.allowed_formats is not None and in_doc.format in self.allowed_formats
|
||||||
|
)
|
||||||
|
if valid:
|
||||||
conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
|
conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
|
||||||
|
else:
|
||||||
|
if raises_on_error:
|
||||||
|
raise ConversionError(f"Unsupported format in: {in_doc.file}")
|
||||||
|
else:
|
||||||
|
conv_res = ConversionResult(
|
||||||
|
input=in_doc, status=ConversionStatus.UNSUPPORTED
|
||||||
|
)
|
||||||
|
|
||||||
return conv_res
|
return conv_res
|
||||||
|
|
||||||
@ -278,26 +280,28 @@ class DocumentConverter:
|
|||||||
) -> ConversionResult:
|
) -> ConversionResult:
|
||||||
if in_doc.valid:
|
if in_doc.valid:
|
||||||
pipeline = self._get_pipeline(in_doc.format)
|
pipeline = self._get_pipeline(in_doc.format)
|
||||||
if pipeline is None: # Can't find a default pipeline. Should this raise?
|
if pipeline is not None:
|
||||||
|
conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
|
||||||
|
else:
|
||||||
if raises_on_error:
|
if raises_on_error:
|
||||||
raise RuntimeError(
|
raise ConversionError(
|
||||||
f"No pipeline could be initialized for {in_doc.file}."
|
f"No pipeline could be initialized for {in_doc.file}."
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
conv_res = ConversionResult(input=in_doc)
|
conv_res = ConversionResult(
|
||||||
conv_res.status = ConversionStatus.FAILURE
|
input=in_doc,
|
||||||
return conv_res
|
status=ConversionStatus.FAILURE,
|
||||||
|
)
|
||||||
conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
|
|
||||||
|
|
||||||
else:
|
else:
|
||||||
if raises_on_error:
|
if raises_on_error:
|
||||||
raise RuntimeError(f"Input document {in_doc.file} is not valid.")
|
raise ConversionError(f"Input document {in_doc.file} is not valid.")
|
||||||
|
|
||||||
else:
|
else:
|
||||||
# invalid doc or not of desired format
|
# invalid doc or not of desired format
|
||||||
conv_res = ConversionResult(input=in_doc)
|
conv_res = ConversionResult(
|
||||||
conv_res.status = ConversionStatus.FAILURE
|
input=in_doc,
|
||||||
|
status=ConversionStatus.FAILURE,
|
||||||
|
)
|
||||||
# TODO add error log why it failed.
|
# TODO add error log why it failed.
|
||||||
|
|
||||||
return conv_res
|
return conv_res
|
||||||
|
6
docling/exceptions.py
Normal file
6
docling/exceptions.py
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
class BaseError(RuntimeError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class ConversionError(BaseError):
|
||||||
|
pass
|
@ -4,7 +4,7 @@ from pathlib import Path
|
|||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from docling.datamodel.base_models import ConversionStatus, DocumentStream
|
from docling.datamodel.base_models import ConversionStatus, DocumentStream
|
||||||
from docling.document_converter import DocumentConverter
|
from docling.document_converter import ConversionError, DocumentConverter
|
||||||
|
|
||||||
|
|
||||||
def get_pdf_path():
|
def get_pdf_path():
|
||||||
@ -20,26 +20,26 @@ def converter():
|
|||||||
return converter
|
return converter
|
||||||
|
|
||||||
|
|
||||||
def test_convert_invalid_doc(converter: DocumentConverter):
|
def test_convert_unsupported_doc_format_wout_exception(converter: DocumentConverter):
|
||||||
|
|
||||||
# Test with unrecognizable file format (xyz)
|
|
||||||
result = converter.convert(
|
result = converter.convert(
|
||||||
DocumentStream(name="input.xyz", stream=BytesIO(b"xyz")), raises_on_error=False
|
DocumentStream(name="input.xyz", stream=BytesIO(b"xyz")), raises_on_error=False
|
||||||
)
|
)
|
||||||
assert result is None # No result comes back at all, since this file is skipped.
|
assert result.status == ConversionStatus.UNSUPPORTED
|
||||||
|
|
||||||
with pytest.raises(RuntimeError):
|
|
||||||
result = converter.convert(
|
def test_convert_unsupported_doc_format_with_exception(converter: DocumentConverter):
|
||||||
|
with pytest.raises(ConversionError):
|
||||||
|
converter.convert(
|
||||||
DocumentStream(name="input.xyz", stream=BytesIO(b"xyz")),
|
DocumentStream(name="input.xyz", stream=BytesIO(b"xyz")),
|
||||||
raises_on_error=True,
|
raises_on_error=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Test with too small filesize limit
|
|
||||||
|
def test_convert_too_small_filesize_limit_wout_exception(converter: DocumentConverter):
|
||||||
result = converter.convert(get_pdf_path(), max_file_size=1, raises_on_error=False)
|
result = converter.convert(get_pdf_path(), max_file_size=1, raises_on_error=False)
|
||||||
assert result is not None
|
|
||||||
assert result.status == ConversionStatus.FAILURE
|
assert result.status == ConversionStatus.FAILURE
|
||||||
|
|
||||||
with pytest.raises(RuntimeError):
|
|
||||||
result = converter.convert(
|
def test_convert_too_small_filesize_limit_with_exception(converter: DocumentConverter):
|
||||||
get_pdf_path(), max_file_size=1, raises_on_error=True
|
with pytest.raises(ConversionError):
|
||||||
)
|
converter.convert(get_pdf_path(), max_file_size=1, raises_on_error=True)
|
||||||
|
Loading…
Reference in New Issue
Block a user