diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py
index 6cf5aa5a..86dc6ee5 100644
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@@ -22,7 +22,7 @@ class ConversionStatus(str, Enum):
     FAILURE = auto()
     SUCCESS = auto()
     PARTIAL_SUCCESS = auto()
-    UNSUPPORTED = auto()
+    SKIPPED = auto()
 
 
 class InputFormat(str, Enum):
@@ -94,6 +94,7 @@ class DoclingComponentType(str, Enum):
     DOCUMENT_BACKEND = auto()
     MODEL = auto()
     DOC_ASSEMBLER = auto()
+    USER_INPUT = auto()
 
 
 class ErrorItem(BaseModel):
diff --git a/docling/document_converter.py b/docling/document_converter.py
index 054e03f2..503a4c5b 100644
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@@ -15,7 +15,13 @@ from docling.backend.md_backend import MarkdownDocumentBackend
 from docling.backend.msexcel_backend import MsExcelDocumentBackend
 from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
 from docling.backend.msword_backend import MsWordDocumentBackend
-from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
+from docling.datamodel.base_models import (
+    ConversionStatus,
+    DoclingComponentType,
+    DocumentStream,
+    ErrorItem,
+    InputFormat,
+)
 from docling.datamodel.document import (
     ConversionResult,
     InputDocument,
@@ -262,11 +268,17 @@ class DocumentConverter:
         if valid:
             conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
         else:
+            error_message = f"File format not allowed: {in_doc.file}"
             if raises_on_error:
-                raise ConversionError(f"Unsupported format in: {in_doc.file}")
+                raise ConversionError(error_message)
             else:
+                error_item = ErrorItem(
+                    component_type=DoclingComponentType.USER_INPUT,
+                    module_name="",
+                    error_message=error_message,
+                )
                 conv_res = ConversionResult(
-                    input=in_doc, status=ConversionStatus.UNSUPPORTED
+                    input=in_doc, status=ConversionStatus.SKIPPED, errors=[error_item]
                 )
 
         return conv_res
diff --git a/tests/test_invalid_input.py b/tests/test_invalid_input.py
index 6e6fc53f..f40d79e4 100644
--- a/tests/test_invalid_input.py
+++ b/tests/test_invalid_input.py
@@ -24,7 +24,7 @@ def test_convert_unsupported_doc_format_wout_exception(converter: DocumentConver
     result = converter.convert(
         DocumentStream(name="input.xyz", stream=BytesIO(b"xyz")), raises_on_error=False
     )
-    assert result.status == ConversionStatus.UNSUPPORTED
+    assert result.status == ConversionStatus.SKIPPED
 
 
 def test_convert_unsupported_doc_format_with_exception(converter: DocumentConverter):