feat: add the Image backend (#2627)

* feat: add the Image backend Signed-off-by: Peter Staar <taa@zurich.ibm.com> * fixed the pre-commit Signed-off-by: Peter Staar <taa@zurich.ibm.com> * Fixed single- versus multi-frame image formats Signed-off-by: Peter Staar <taa@zurich.ibm.com> * fix: Proper usage of ImageDocumentBackend in the pipeline, deprecate old code. Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * fix: Adapt tests Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * fix: correct mets_gbs backend test Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * fix: Make ImagePageBackend.get_bitmap_rects() yield Signed-off-by: Christoph Auer <cau@zurich.ibm.com> --------- Signed-off-by: Peter Staar <taa@zurich.ibm.com> Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
2025-12-08 20:58:11 +00:00 · 2025-11-17 11:37:22 +01:00
parent ae30373ee7
commit 3495b73de8
12 changed files with 494 additions and 82 deletions
--- a/tests/test_backend_image_native.py
+++ b/tests/test_backend_image_native.py
@@ -0,0 +1,218 @@
+from io import BytesIO
+from pathlib import Path
+
+import pytest
+from docling_core.types.doc import BoundingBox, CoordOrigin
+from PIL import Image
+
+from docling.backend.image_backend import ImageDocumentBackend, _ImagePageBackend
+from docling.datamodel.base_models import DocumentStream, InputFormat
+from docling.datamodel.document import InputDocument, _DocumentConversionInput
+from docling.document_converter import DocumentConverter, ImageFormatOption
+from docling.document_extractor import DocumentExtractor
+
+
+def _make_png_stream(
+    width: int = 64, height: int = 48, color=(123, 45, 67)
+) -> DocumentStream:
+    """Build an in-memory solid-color RGB PNG wrapped as a DocumentStream."""
+    png_bytes = BytesIO()
+    Image.new("RGB", (width, height), color).save(png_bytes, format="PNG")
+    png_bytes.seek(0)  # rewind so the consumer reads from the first byte
+    return DocumentStream(name="test.png", stream=png_bytes)
+
+
+def _make_multipage_tiff_stream(num_pages: int = 3, size=(32, 32)) -> DocumentStream:
+    """Build an in-memory TIFF holding ``num_pages`` solid-color RGB frames."""
+    colors = [(k * 10 % 255, k * 20 % 255, k * 30 % 255) for k in range(num_pages)]
+    pages = [Image.new("RGB", size, rgb) for rgb in colors]
+    tiff_bytes = BytesIO()
+    # save_all + append_images writes every frame into the one TIFF container.
+    pages[0].save(tiff_bytes, format="TIFF", save_all=True, append_images=pages[1:])
+    tiff_bytes.seek(0)
+    return DocumentStream(name="test.tiff", stream=tiff_bytes)
+
+
+def test_docs_builder_uses_image_backend_for_image_stream():
+    """A PNG stream is resolved to one IMAGE document on ImageDocumentBackend."""
+    png_input = _DocumentConversionInput(path_or_stream_iterator=[_make_png_stream()])
+    # Only IMAGE is mapped, using a default-constructed ImageFormatOption.
+    options_by_format = {InputFormat.IMAGE: ImageFormatOption()}
+
+    built = list(png_input.docs(options_by_format))
+    assert len(built) == 1
+    (image_doc,) = built
+    assert image_doc.format == InputFormat.IMAGE
+    assert isinstance(image_doc._backend, ImageDocumentBackend)
+    assert image_doc.page_count == 1
+
+
+def test_docs_builder_multipage_tiff_counts_frames():
+    """A 4-frame TIFF stream yields a document reporting four pages."""
+    tiff_input = _DocumentConversionInput(
+        path_or_stream_iterator=[_make_multipage_tiff_stream(num_pages=4)]
+    )
+    first_doc = next(tiff_input.docs({InputFormat.IMAGE: ImageFormatOption()}))
+    assert isinstance(first_doc._backend, ImageDocumentBackend)
+    assert first_doc.page_count == 4
+
+
+def test_converter_default_maps_image_to_image_backend():
+    default_converter = DocumentConverter(allowed_formats=[InputFormat.IMAGE])
+    image_option = default_converter.format_to_options[InputFormat.IMAGE]
+    assert image_option.backend is ImageDocumentBackend  # default IMAGE backend
+
+
+def test_extractor_default_maps_image_to_image_backend():
+    default_extractor = DocumentExtractor(allowed_formats=[InputFormat.IMAGE])
+    image_option = default_extractor.extraction_format_to_options[InputFormat.IMAGE]
+    assert image_option.backend is ImageDocumentBackend  # default IMAGE backend
+
+
+def _get_backend_from_stream(stream: DocumentStream):
+    """Wrap ``stream`` in an IMAGE InputDocument and return its backend object."""
+    # ``_backend`` is private; tests read it to exercise the backend directly.
+    return InputDocument(
+        path_or_stream=stream.stream,
+        format=InputFormat.IMAGE,
+        backend=ImageDocumentBackend,
+        filename=stream.name,
+    )._backend
+
+
+def test_num_pages_single():
+    """A single-frame PNG is reported as exactly one page."""
+    png = _make_png_stream(width=100, height=80)
+    backend = _get_backend_from_stream(png)
+    assert backend.page_count() == 1
+
+
+def test_num_pages_multipage():
+    """A five-frame TIFF is reported as five pages."""
+    tiff = _make_multipage_tiff_stream(num_pages=5, size=(64, 64))
+    backend = _get_backend_from_stream(tiff)
+    assert backend.page_count() == 5
+
+
+def test_get_size():
+    """Page size equals the pixel dimensions of the source image."""
+    expected_w, expected_h = 120, 90
+    backend = _get_backend_from_stream(
+        _make_png_stream(width=expected_w, height=expected_h)
+    )
+    page: _ImagePageBackend = backend.load_page(0)
+    page_size = page.get_size()
+    assert (page_size.width, page_size.height) == (expected_w, expected_h)
+
+
+def test_get_page_image_full():
+    """Without arguments, get_page_image returns the image at its native size."""
+    expected_w, expected_h = 100, 80
+    png = _make_png_stream(width=expected_w, height=expected_h)
+    page: _ImagePageBackend = _get_backend_from_stream(png).load_page(0)
+    rendered = page.get_page_image()
+    # Compare each dimension separately so a failure names the offending axis.
+    assert rendered.width == expected_w
+    assert rendered.height == expected_h
+
+
+def test_get_page_image_scaled():
+    """Passing ``scale`` multiplies both rendered dimensions by that factor."""
+    base_w, base_h = 100, 80
+    factor = 2.0
+    png = _make_png_stream(width=base_w, height=base_h)
+    backend = _get_backend_from_stream(png)
+    page: _ImagePageBackend = backend.load_page(0)
+    rendered = page.get_page_image(scale=factor)
+    assert rendered.width == round(base_w * factor)
+    assert rendered.height == round(base_h * factor)
+
+
+def test_crop_page_image():
+    """A crop box yields an image sized to the box, not to the full page."""
+    png = _make_png_stream(width=200, height=150)
+    page: _ImagePageBackend = _get_backend_from_stream(png).load_page(0)
+
+    # 100x90 px region placed inside the 200x150 px page (top-left origin).
+    left, top, right, bottom = 50, 30, 150, 120
+    region = BoundingBox(
+        l=left, t=top, r=right, b=bottom, coord_origin=CoordOrigin.TOPLEFT
+    )
+    cropped = page.get_page_image(cropbox=region)
+    assert (cropped.width, cropped.height) == (right - left, bottom - top)
+
+
+def test_crop_page_image_scaled():
+    """Crop and scale combine: the result is the crop size times ``scale``."""
+    factor = 0.5
+    crop_w, crop_h = 100, 90  # r - l and b - t of the box below
+    png = _make_png_stream(width=200, height=150)
+    backend = _get_backend_from_stream(png)
+    page: _ImagePageBackend = backend.load_page(0)
+
+    region = BoundingBox(l=50, t=30, r=150, b=120, coord_origin=CoordOrigin.TOPLEFT)
+    scaled_crop = page.get_page_image(scale=factor, cropbox=region)
+    assert scaled_crop.width == round(crop_w * factor)
+    assert scaled_crop.height == round(crop_h * factor)
+
+
+def test_get_bitmap_rects():
+    """get_bitmap_rects yields a single rectangle covering the whole page."""
+    page_w, page_h = 100, 80
+    png = _make_png_stream(width=page_w, height=page_h)
+    page: _ImagePageBackend = _get_backend_from_stream(png).load_page(0)
+
+    # Materialize the iterable so the number of rectangles can be checked.
+    found = list(page.get_bitmap_rects())
+    assert len(found) == 1
+    (rect,) = found
+    assert rect.l == 0.0
+    assert rect.t == 0.0
+    assert rect.r == float(page_w)
+    assert rect.b == float(page_h)
+    assert rect.coord_origin == CoordOrigin.TOPLEFT
+
+
+def test_get_bitmap_rects_scaled():
+    """With ``scale``, the single page rectangle grows by the same factor."""
+    page_w, page_h = 100, 80
+    factor = 2.0
+    png = _make_png_stream(width=page_w, height=page_h)
+    backend = _get_backend_from_stream(png)
+    page: _ImagePageBackend = backend.load_page(0)
+
+    found = list(page.get_bitmap_rects(scale=factor))
+    assert len(found) == 1
+    rect = found[0]
+    assert rect.l == 0.0  # the rectangle stays anchored at the origin
+    assert rect.t == 0.0
+    assert rect.r == float(page_w * factor)
+    assert rect.b == float(page_h * factor)
+    assert rect.coord_origin == CoordOrigin.TOPLEFT
+
+
+def test_get_text_in_rect():
+    """get_text_in_rect returns an empty string for a plain image page."""
+    backend = _get_backend_from_stream(_make_png_stream())
+    page: _ImagePageBackend = backend.load_page(0)
+
+    # Arbitrary region; the test only checks that no text is returned.
+    region = BoundingBox(l=10, t=10, r=50, b=50, coord_origin=CoordOrigin.TOPLEFT)
+    extracted = page.get_text_in_rect(region)
+    assert extracted == ""
+
+
+def test_multipage_access():
+    """Every frame of a multi-page TIFF loads as a valid page of the right size."""
+    frame_count = 4
+    side = 64
+    tiff = _make_multipage_tiff_stream(num_pages=frame_count, size=(side, side))
+    backend = _get_backend_from_stream(tiff)
+    assert backend.page_count() == frame_count
+
+    # Load each frame by index and check it independently.
+    for page_no in range(frame_count):
+        page = backend.load_page(page_no)
+        assert page.is_valid()
+        page_size = page.get_size()
+        assert (page_size.width, page_size.height) == (side, side)
--- a/tests/test_backend_mets_gbs.py
+++ b/tests/test_backend_mets_gbs.py
@@ -15,7 +15,7 @@ def test_doc_path():
 def _get_backend(pdf_doc):
    in_doc = InputDocument(
        path_or_stream=pdf_doc,
-        format=InputFormat.PDF,
+        format=InputFormat.METS_GBS,
        backend=MetsGbsDocumentBackend,
    )

--- a/tests/test_backend_webp.py
+++ b/tests/test_backend_webp.py
@@ -2,6 +2,8 @@ import sys
 from pathlib import Path
 from typing import List

+from pydantic.type_adapter import R
+
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import ConversionResult, DoclingDocument
 from docling.datamodel.pipeline_options import (
@@ -72,7 +74,9 @@ def test_e2e_webp_conversions():
        for webp_path in webp_paths:
            print(f"converting {webp_path}")

-            doc_result: ConversionResult = converter.convert(webp_path)
+            doc_result: ConversionResult = converter.convert(
+                webp_path, raises_on_error=True
+            )

            verify_conversion_result_v2(
                input_path=webp_path,
--- a/tests/test_input_doc.py
+++ b/tests/test_input_doc.py
@@ -4,9 +4,6 @@ from pathlib import Path
 import pytest
 from pydantic import ValidationError

-from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
-from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
-from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
 from docling.backend.html_backend import HTMLDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.backend_options import (
@@ -17,7 +14,7 @@ from docling.datamodel.backend_options import (
 from docling.datamodel.base_models import DocumentStream, InputFormat
 from docling.datamodel.document import InputDocument, _DocumentConversionInput
 from docling.datamodel.settings import DocumentLimits
-from docling.document_converter import PdfFormatOption
+from docling.document_converter import ImageFormatOption, PdfFormatOption


 def test_in_doc_from_valid_path():
@@ -51,36 +48,6 @@ def test_in_doc_from_invalid_buf():
    assert doc.valid is False


-def test_image_in_pdf_backend():
-    in_doc = InputDocument(
-        path_or_stream=Path("tests/data/2305.03393v1-pg9-img.png"),
-        format=InputFormat.IMAGE,
-        backend=PyPdfiumDocumentBackend,
-    )
-
-    assert in_doc.valid
-    in_doc = InputDocument(
-        path_or_stream=Path("tests/data/2305.03393v1-pg9-img.png"),
-        format=InputFormat.IMAGE,
-        backend=DoclingParseDocumentBackend,
-    )
-    assert in_doc.valid
-
-    in_doc = InputDocument(
-        path_or_stream=Path("tests/data/2305.03393v1-pg9-img.png"),
-        format=InputFormat.IMAGE,
-        backend=DoclingParseV2DocumentBackend,
-    )
-    assert in_doc.valid
-
-    in_doc = InputDocument(
-        path_or_stream=Path("tests/data/2305.03393v1-pg9-img.png"),
-        format=InputFormat.IMAGE,
-        backend=DoclingParseV4DocumentBackend,
-    )
-    assert in_doc.valid
-
-
 def test_in_doc_with_page_range():
    test_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
    limits = DocumentLimits()
@@ -297,7 +264,7 @@ def test_tiff_two_pages():
    doc = InputDocument(
        path_or_stream=tiff_path,
        format=InputFormat.IMAGE,
-        backend=PdfFormatOption().backend,  # use default backend
+        backend=ImageFormatOption().backend,  # use default backend
    )
    assert doc.valid is True
    assert doc.page_count == 2