feat: add the Image backend (#2627)

* feat: add the Image backend

  Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* fixed the pre-commit

  Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* Fixed single- versus multi-frame image formats

  Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* fix: Proper usage of ImageDocumentBackend in the pipeline, deprecate old code.

  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* fix: Adapt tests

  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* fix: correct mets_gbs backend test

  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* fix: Make ImagePageBackend.get_bitmap_rects() yield

  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
2025-12-08 20:58:11 +00:00 · 2025-11-17 11:37:22 +01:00
parent ae30373ee7
commit 3495b73de8
12 changed files with 494 additions and 82 deletions
--- a/tests/test_input_doc.py
+++ b/tests/test_input_doc.py
@@ -4,9 +4,6 @@ from pathlib import Path
 import pytest
 from pydantic import ValidationError

-from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
-from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
-from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
 from docling.backend.html_backend import HTMLDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.backend_options import (
@@ -17,7 +14,7 @@ from docling.datamodel.backend_options import (
 from docling.datamodel.base_models import DocumentStream, InputFormat
 from docling.datamodel.document import InputDocument, _DocumentConversionInput
 from docling.datamodel.settings import DocumentLimits
-from docling.document_converter import PdfFormatOption
+from docling.document_converter import ImageFormatOption, PdfFormatOption


 def test_in_doc_from_valid_path():
@@ -51,36 +48,6 @@ def test_in_doc_from_invalid_buf():
    assert doc.valid is False


-def test_image_in_pdf_backend():
-    in_doc = InputDocument(
-        path_or_stream=Path("tests/data/2305.03393v1-pg9-img.png"),
-        format=InputFormat.IMAGE,
-        backend=PyPdfiumDocumentBackend,
-    )
-
-    assert in_doc.valid
-    in_doc = InputDocument(
-        path_or_stream=Path("tests/data/2305.03393v1-pg9-img.png"),
-        format=InputFormat.IMAGE,
-        backend=DoclingParseDocumentBackend,
-    )
-    assert in_doc.valid
-
-    in_doc = InputDocument(
-        path_or_stream=Path("tests/data/2305.03393v1-pg9-img.png"),
-        format=InputFormat.IMAGE,
-        backend=DoclingParseV2DocumentBackend,
-    )
-    assert in_doc.valid
-
-    in_doc = InputDocument(
-        path_or_stream=Path("tests/data/2305.03393v1-pg9-img.png"),
-        format=InputFormat.IMAGE,
-        backend=DoclingParseV4DocumentBackend,
-    )
-    assert in_doc.valid
-
-
 def test_in_doc_with_page_range():
    test_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
    limits = DocumentLimits()
@@ -297,7 +264,7 @@ def test_tiff_two_pages():
    doc = InputDocument(
        path_or_stream=tiff_path,
        format=InputFormat.IMAGE,
-        backend=PdfFormatOption().backend,  # use default backend
+        backend=ImageFormatOption().backend,  # use default backend
    )
    assert doc.valid is True
    assert doc.page_count == 2