Fixes for DPv4 backend init, better test coverage

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2025-12-08 20:58:11 +00:00 · 2025-03-17 09:26:31 +01:00
parent e34c0750a7
commit fe45d30942
3 changed files with 54 additions and 14 deletions
--- a/tests/test_input_doc.py
+++ b/tests/test_input_doc.py
@@ -1,6 +1,9 @@
 from io import BytesIO
 from pathlib import Path

+from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
+from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
+from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.base_models import DocumentStream, InputFormat
 from docling.datamodel.document import InputDocument, _DocumentConversionInput
@@ -40,7 +43,39 @@ def test_in_doc_from_invalid_buf():
    assert doc.valid == False


+def test_image_in_pdf_backend():
+
+    in_doc = InputDocument(
+        path_or_stream=Path("tests/data/2305.03393v1-pg9-img.png"),
+        format=InputFormat.IMAGE,
+        backend=PyPdfiumDocumentBackend,
+    )
+
+    assert in_doc.valid
+    in_doc = InputDocument(
+        path_or_stream=Path("tests/data/2305.03393v1-pg9-img.png"),
+        format=InputFormat.IMAGE,
+        backend=DoclingParseDocumentBackend,
+    )
+    assert in_doc.valid
+
+    in_doc = InputDocument(
+        path_or_stream=Path("tests/data/2305.03393v1-pg9-img.png"),
+        format=InputFormat.IMAGE,
+        backend=DoclingParseV2DocumentBackend,
+    )
+    assert in_doc.valid
+
+    in_doc = InputDocument(
+        path_or_stream=Path("tests/data/2305.03393v1-pg9-img.png"),
+        format=InputFormat.IMAGE,
+        backend=DoclingParseV4DocumentBackend,
+    )
+    assert in_doc.valid
+
+
 def test_in_doc_with_page_range():
+
    test_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
    limits = DocumentLimits()
    limits.page_range = (1, 10)
@@ -192,7 +227,7 @@ def _make_input_doc(path):
    in_doc = InputDocument(
        path_or_stream=path,
        format=InputFormat.PDF,
-        backend=PyPdfiumDocumentBackend,
+        backend=DoclingParseV4DocumentBackend,
    )
    return in_doc

@@ -202,6 +237,6 @@ def _make_input_doc_from_stream(doc_stream):
        path_or_stream=doc_stream.stream,
        format=InputFormat.PDF,
        filename=doc_stream.name,
-        backend=PyPdfiumDocumentBackend,
+        backend=DoclingParseV4DocumentBackend,
    )
    return in_doc
--- a/tests/test_legacy_format_transform.py
+++ b/tests/test_legacy_format_transform.py
@@ -4,6 +4,7 @@ from pathlib import Path
 import pytest

 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
+from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import PdfPipelineOptions
 from docling.document_converter import DocumentConverter, PdfFormatOption
@@ -30,7 +31,7 @@ def get_converter():
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
            InputFormat.IMAGE: PdfFormatOption(
-                pipeline_options=pipeline_options, backend=DoclingParseDocumentBackend
+                pipeline_options=pipeline_options,
            ),
        }
    )