From 8e57c85bf4019937ca867aaf39221e1ed3a953be Mon Sep 17 00:00:00 2001 From: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Date: Mon, 2 Dec 2024 13:32:05 +0100 Subject: [PATCH] rename new status, populate ConversionResult errors Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> --- docling/datamodel/base_models.py | 3 ++- docling/document_converter.py | 18 +++++++++++++++--- tests/test_invalid_input.py | 2 +- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py index 6cf5aa5a..86dc6ee5 100644 --- a/docling/datamodel/base_models.py +++ b/docling/datamodel/base_models.py @@ -22,7 +22,7 @@ class ConversionStatus(str, Enum): FAILURE = auto() SUCCESS = auto() PARTIAL_SUCCESS = auto() - UNSUPPORTED = auto() + SKIPPED = auto() class InputFormat(str, Enum): @@ -94,6 +94,7 @@ class DoclingComponentType(str, Enum): DOCUMENT_BACKEND = auto() MODEL = auto() DOC_ASSEMBLER = auto() + USER_INPUT = auto() class ErrorItem(BaseModel): diff --git a/docling/document_converter.py b/docling/document_converter.py index 054e03f2..503a4c5b 100644 --- a/docling/document_converter.py +++ b/docling/document_converter.py @@ -15,7 +15,13 @@ from docling.backend.md_backend import MarkdownDocumentBackend from docling.backend.msexcel_backend import MsExcelDocumentBackend from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend from docling.backend.msword_backend import MsWordDocumentBackend -from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat +from docling.datamodel.base_models import ( + ConversionStatus, + DoclingComponentType, + DocumentStream, + ErrorItem, + InputFormat, +) from docling.datamodel.document import ( ConversionResult, InputDocument, @@ -262,11 +268,17 @@ class DocumentConverter: if valid: conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error) else: + error_message = f"File format not allowed: {in_doc.file}" if raises_on_error: - raise ConversionError(f"Unsupported format in: {in_doc.file}") + raise ConversionError(error_message) else: + error_item = ErrorItem( + component_type=DoclingComponentType.USER_INPUT, + module_name="", + error_message=error_message, + ) conv_res = ConversionResult( - input=in_doc, status=ConversionStatus.UNSUPPORTED + input=in_doc, status=ConversionStatus.SKIPPED, errors=[error_item] ) return conv_res diff --git a/tests/test_invalid_input.py b/tests/test_invalid_input.py index 6e6fc53f..f40d79e4 100644 --- a/tests/test_invalid_input.py +++ b/tests/test_invalid_input.py @@ -24,7 +24,7 @@ def test_convert_unsupported_doc_format_wout_exception(converter: DocumentConver result = converter.convert( DocumentStream(name="input.xyz", stream=BytesIO(b"xyz")), raises_on_error=False ) - assert result.status == ConversionStatus.UNSUPPORTED + assert result.status == ConversionStatus.SKIPPED def test_convert_unsupported_doc_format_with_exception(converter: DocumentConverter):