diff --git a/docling/document_converter.py b/docling/document_converter.py index 74e6f84a..9635078e 100644 --- a/docling/document_converter.py +++ b/docling/document_converter.py @@ -159,7 +159,7 @@ class DocumentConverter: raises_on_error: bool = True, max_num_pages: int = sys.maxsize, max_file_size: int = sys.maxsize, - ) -> ConversionResult: + ) -> Optional[ConversionResult]: all_res = self.convert_all( source=[source], @@ -167,7 +167,7 @@ class DocumentConverter: max_num_pages=max_num_pages, max_file_size=max_file_size, ) - return next(all_res) + return next(all_res, None) @validate_call(config=ConfigDict(strict=True)) def convert_all( @@ -186,7 +186,10 @@ class DocumentConverter: limits=limits, ) conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error) + + had_result = False for conv_res in conv_res_iter: + had_result = True if raises_on_error and conv_res.status not in { ConversionStatus.SUCCESS, ConversionStatus.PARTIAL_SUCCESS, @@ -197,6 +200,14 @@ class DocumentConverter: else: yield conv_res + if not had_result: + if raises_on_error: + raise RuntimeError( + f"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats." + ) + else: + return None + def _convert( self, conv_input: _DocumentConversionInput, raises_on_error: bool ) -> Iterator[ConversionResult]: diff --git a/tests/test_invalid_input.py b/tests/test_invalid_input.py new file mode 100644 index 00000000..0429612b --- /dev/null +++ b/tests/test_invalid_input.py @@ -0,0 +1,51 @@ +from io import BytesIO +from pathlib import Path + +import pytest + +from docling.backend.docling_parse_backend import DoclingParseDocumentBackend +from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat +from docling.datamodel.pipeline_options import PdfPipelineOptions +from docling.document_converter import DocumentConverter, PdfFormatOption + +from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2 + +GENERATE = False + + +def get_pdf_path(): + + pdf_path = Path("./tests/data/2305.03393v1-pg9.pdf") + return pdf_path + + +@pytest.fixture +def converter(): + converter = DocumentConverter() + + return converter + + +def test_convert_invalid_doc(converter: DocumentConverter): + + # Test with unrecognizable file format (xyz) + result = converter.convert( + DocumentStream(name="input.xyz", stream=BytesIO(b"xyz")), raises_on_error=False + ) + assert result is None # No result comes back at all, since this file is skipped. + + with pytest.raises(RuntimeError): + result = converter.convert( + DocumentStream(name="input.xyz", stream=BytesIO(b"xyz")), + raises_on_error=True, + ) + + # Test with too small filesize limit + result = converter.convert(get_pdf_path(), max_file_size=1, raises_on_error=False) + assert result is not None + assert result.status == ConversionStatus.FAILURE + + with pytest.raises(RuntimeError): + result = converter.convert( + get_pdf_path(), max_file_size=1, raises_on_error=True + )