mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 20:58:11 +00:00
feat: add the Image backend (#2627)
* feat: add the Image backend
  Signed-off-by: Peter Staar <taa@zurich.ibm.com>
* fixed the pre-commit
  Signed-off-by: Peter Staar <taa@zurich.ibm.com>
* Fixed single- versus multi-frame image formats
  Signed-off-by: Peter Staar <taa@zurich.ibm.com>
* fix: Proper usage of ImageDocumentBackend in the pipeline, deprecate old code.
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* fix: Adapt tests
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* fix: correct mets_gbs backend test
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* fix: Make ImagePageBackend.get_bitmap_rects() yield
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---------
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
committed by
GitHub
parent
ae30373ee7
commit
3495b73de8
218
tests/test_backend_image_native.py
Normal file
218
tests/test_backend_image_native.py
Normal file
@@ -0,0 +1,218 @@
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
from PIL import Image
|
||||
|
||||
from docling.backend.image_backend import ImageDocumentBackend, _ImagePageBackend
|
||||
from docling.datamodel.base_models import DocumentStream, InputFormat
|
||||
from docling.datamodel.document import InputDocument, _DocumentConversionInput
|
||||
from docling.document_converter import DocumentConverter, ImageFormatOption
|
||||
from docling.document_extractor import DocumentExtractor
|
||||
|
||||
|
||||
def _make_png_stream(
    width: int = 64, height: int = 48, color=(123, 45, 67)
) -> DocumentStream:
    """Return a ``DocumentStream`` named ``test.png`` holding an in-memory PNG.

    The image is a solid RGB rectangle of ``width`` x ``height`` filled with
    ``color``.
    """
    payload = BytesIO()
    Image.new("RGB", (width, height), color).save(payload, format="PNG")
    # Rewind so the consumer reads the encoded bytes from the beginning.
    payload.seek(0)
    return DocumentStream(name="test.png", stream=payload)
|
||||
|
||||
|
||||
def _make_multipage_tiff_stream(num_pages: int = 3, size=(32, 32)) -> DocumentStream:
    """Return a ``DocumentStream`` named ``test.tiff`` with ``num_pages`` frames.

    Every frame is a solid RGB image of ``size`` whose fill color is derived
    from the frame index.
    """
    pages = []
    for idx in range(num_pages):
        fill = (idx * 10 % 255, idx * 20 % 255, idx * 30 % 255)
        pages.append(Image.new("RGB", size, fill))

    payload = BytesIO()
    # The first frame drives the save; the remaining frames are appended to it.
    head, tail = pages[0], pages[1:]
    head.save(payload, format="TIFF", save_all=True, append_images=tail)
    payload.seek(0)
    return DocumentStream(name="test.tiff", stream=payload)
|
||||
|
||||
|
||||
def test_docs_builder_uses_image_backend_for_image_stream():
    """A PNG stream run through the docs builder yields one IMAGE document."""
    # Format options mapping IMAGE -> ImageFormatOption (which carries
    # ImageDocumentBackend).
    options_by_format = {InputFormat.IMAGE: ImageFormatOption()}
    conversion_input = _DocumentConversionInput(
        path_or_stream_iterator=[_make_png_stream()]
    )

    built = list(conversion_input.docs(options_by_format))
    assert len(built) == 1

    doc = built[0]
    assert doc.format == InputFormat.IMAGE
    assert isinstance(doc._backend, ImageDocumentBackend)
    assert doc.page_count == 1
|
||||
|
||||
|
||||
def test_docs_builder_multipage_tiff_counts_frames():
    """The docs builder reports one page per frame of a multi-frame TIFF."""
    frame_count = 4
    tiff = _make_multipage_tiff_stream(num_pages=frame_count)
    builder_input = _DocumentConversionInput(path_or_stream_iterator=[tiff])

    doc_iter = builder_input.docs({InputFormat.IMAGE: ImageFormatOption()})
    first_doc = next(doc_iter)

    assert isinstance(first_doc._backend, ImageDocumentBackend)
    assert first_doc.page_count == frame_count
|
||||
|
||||
|
||||
def test_converter_default_maps_image_to_image_backend():
    """DocumentConverter's default IMAGE option points at ImageDocumentBackend."""
    image_option = DocumentConverter(
        allowed_formats=[InputFormat.IMAGE]
    ).format_to_options[InputFormat.IMAGE]
    assert image_option.backend is ImageDocumentBackend
|
||||
|
||||
|
||||
def test_extractor_default_maps_image_to_image_backend():
    """DocumentExtractor's default IMAGE option points at ImageDocumentBackend."""
    image_option = DocumentExtractor(
        allowed_formats=[InputFormat.IMAGE]
    ).extraction_format_to_options[InputFormat.IMAGE]
    assert image_option.backend is ImageDocumentBackend
|
||||
|
||||
|
||||
def _get_backend_from_stream(stream: DocumentStream):
    """Wrap ``stream`` in an IMAGE ``InputDocument`` and return its backend.

    The document is built with ``ImageDocumentBackend`` and takes its filename
    from ``stream.name``.
    """
    return InputDocument(
        path_or_stream=stream.stream,
        format=InputFormat.IMAGE,
        backend=ImageDocumentBackend,
        filename=stream.name,
    )._backend
|
||||
|
||||
|
||||
def test_num_pages_single():
    """A single-frame PNG is reported as exactly one page."""
    backend = _get_backend_from_stream(_make_png_stream(width=100, height=80))
    assert backend.page_count() == 1
|
||||
|
||||
|
||||
def test_num_pages_multipage():
    """A five-frame TIFF is reported as five pages."""
    tiff = _make_multipage_tiff_stream(num_pages=5, size=(64, 64))
    backend = _get_backend_from_stream(tiff)
    assert backend.page_count() == 5
|
||||
|
||||
|
||||
def test_get_size():
    """``get_size()`` matches the width and height the image was created with."""
    expected_w, expected_h = 120, 90
    backend = _get_backend_from_stream(
        _make_png_stream(width=expected_w, height=expected_h)
    )
    page: _ImagePageBackend = backend.load_page(0)

    page_size = page.get_size()
    assert (page_size.width, page_size.height) == (expected_w, expected_h)
|
||||
|
||||
|
||||
def test_get_page_image_full():
    """With no arguments, ``get_page_image()`` returns the image at its original size."""
    expected_w, expected_h = 100, 80
    backend = _get_backend_from_stream(
        _make_png_stream(width=expected_w, height=expected_h)
    )
    page: _ImagePageBackend = backend.load_page(0)

    rendered = page.get_page_image()
    assert (rendered.width, rendered.height) == (expected_w, expected_h)
|
||||
|
||||
|
||||
def test_get_page_image_scaled():
    """``get_page_image(scale=...)`` multiplies both dimensions by the scale."""
    base_w, base_h = 100, 80
    factor = 2.0
    backend = _get_backend_from_stream(_make_png_stream(width=base_w, height=base_h))
    page: _ImagePageBackend = backend.load_page(0)

    rendered = page.get_page_image(scale=factor)
    expected = (round(base_w * factor), round(base_h * factor))
    assert (rendered.width, rendered.height) == expected
|
||||
|
||||
|
||||
def test_crop_page_image():
    """``get_page_image(cropbox=...)`` returns an image sized to the crop box."""
    backend = _get_backend_from_stream(_make_png_stream(width=200, height=150))
    page: _ImagePageBackend = backend.load_page(0)

    # Region taken from the center of the 200x150 page.
    left, top, right, bottom = 50, 30, 150, 120
    region = BoundingBox(
        l=left, t=top, r=right, b=bottom, coord_origin=CoordOrigin.TOPLEFT
    )

    cropped = page.get_page_image(cropbox=region)
    assert cropped.width == right - left  # 100
    assert cropped.height == bottom - top  # 90
|
||||
|
||||
|
||||
def test_crop_page_image_scaled():
    """Cropping and scaling together yield the crop-box size times the scale."""
    factor = 0.5
    backend = _get_backend_from_stream(_make_png_stream(width=200, height=150))
    page: _ImagePageBackend = backend.load_page(0)

    left, top, right, bottom = 50, 30, 150, 120
    region = BoundingBox(
        l=left, t=top, r=right, b=bottom, coord_origin=CoordOrigin.TOPLEFT
    )

    cropped = page.get_page_image(scale=factor, cropbox=region)
    # Expected size is the cropped extent (100 x 90) multiplied by the scale.
    assert cropped.width == round((right - left) * factor)
    assert cropped.height == round((bottom - top) * factor)
|
||||
|
||||
|
||||
def test_get_bitmap_rects():
    """``get_bitmap_rects()`` yields a single top-left rect covering the whole page."""
    page_w, page_h = 100, 80
    backend = _get_backend_from_stream(_make_png_stream(width=page_w, height=page_h))
    page: _ImagePageBackend = backend.load_page(0)

    found = list(page.get_bitmap_rects())
    assert len(found) == 1

    rect = found[0]
    assert (rect.l, rect.t) == (0.0, 0.0)
    assert (rect.r, rect.b) == (float(page_w), float(page_h))
    assert rect.coord_origin == CoordOrigin.TOPLEFT
|
||||
|
||||
|
||||
def test_get_bitmap_rects_scaled():
    """``get_bitmap_rects(scale=...)`` yields one rect with its extent scaled."""
    page_w, page_h = 100, 80
    factor = 2.0
    backend = _get_backend_from_stream(_make_png_stream(width=page_w, height=page_h))
    page: _ImagePageBackend = backend.load_page(0)

    found = list(page.get_bitmap_rects(scale=factor))
    assert len(found) == 1

    rect = found[0]
    assert (rect.l, rect.t) == (0.0, 0.0)
    assert (rect.r, rect.b) == (float(page_w * factor), float(page_h * factor))
    assert rect.coord_origin == CoordOrigin.TOPLEFT
|
||||
|
||||
|
||||
def test_get_text_in_rect():
    """``get_text_in_rect()`` gives an empty string for an image page (no OCR)."""
    backend = _get_backend_from_stream(_make_png_stream())
    page: _ImagePageBackend = backend.load_page(0)

    region = BoundingBox(l=10, t=10, r=50, b=50, coord_origin=CoordOrigin.TOPLEFT)
    assert page.get_text_in_rect(region) == ""
|
||||
|
||||
|
||||
def test_multipage_access():
    """Every frame of a multi-frame TIFF loads as a valid page of the right size."""
    frame_count = 4
    edge = 64
    backend = _get_backend_from_stream(
        _make_multipage_tiff_stream(num_pages=frame_count, size=(edge, edge))
    )
    assert backend.page_count() == frame_count

    # Load each page in turn and check it independently.
    for page_no in range(frame_count):
        page = backend.load_page(page_no)
        assert page.is_valid()
        dims = page.get_size()
        assert (dims.width, dims.height) == (edge, edge)
|
||||
@@ -15,7 +15,7 @@ def test_doc_path():
|
||||
def _get_backend(pdf_doc):
|
||||
in_doc = InputDocument(
|
||||
path_or_stream=pdf_doc,
|
||||
format=InputFormat.PDF,
|
||||
format=InputFormat.METS_GBS,
|
||||
backend=MetsGbsDocumentBackend,
|
||||
)
|
||||
|
||||
|
||||
@@ -2,6 +2,8 @@ import sys
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
from pydantic.type_adapter import R
|
||||
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import ConversionResult, DoclingDocument
|
||||
from docling.datamodel.pipeline_options import (
|
||||
@@ -72,7 +74,9 @@ def test_e2e_webp_conversions():
|
||||
for webp_path in webp_paths:
|
||||
print(f"converting {webp_path}")
|
||||
|
||||
doc_result: ConversionResult = converter.convert(webp_path)
|
||||
doc_result: ConversionResult = converter.convert(
|
||||
webp_path, raises_on_error=True
|
||||
)
|
||||
|
||||
verify_conversion_result_v2(
|
||||
input_path=webp_path,
|
||||
|
||||
@@ -4,9 +4,6 @@ from pathlib import Path
|
||||
import pytest
|
||||
from pydantic import ValidationError
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
||||
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
|
||||
from docling.backend.html_backend import HTMLDocumentBackend
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.backend_options import (
|
||||
@@ -17,7 +14,7 @@ from docling.datamodel.backend_options import (
|
||||
from docling.datamodel.base_models import DocumentStream, InputFormat
|
||||
from docling.datamodel.document import InputDocument, _DocumentConversionInput
|
||||
from docling.datamodel.settings import DocumentLimits
|
||||
from docling.document_converter import PdfFormatOption
|
||||
from docling.document_converter import ImageFormatOption, PdfFormatOption
|
||||
|
||||
|
||||
def test_in_doc_from_valid_path():
|
||||
@@ -51,36 +48,6 @@ def test_in_doc_from_invalid_buf():
|
||||
assert doc.valid is False
|
||||
|
||||
|
||||
def test_image_in_pdf_backend():
|
||||
in_doc = InputDocument(
|
||||
path_or_stream=Path("tests/data/2305.03393v1-pg9-img.png"),
|
||||
format=InputFormat.IMAGE,
|
||||
backend=PyPdfiumDocumentBackend,
|
||||
)
|
||||
|
||||
assert in_doc.valid
|
||||
in_doc = InputDocument(
|
||||
path_or_stream=Path("tests/data/2305.03393v1-pg9-img.png"),
|
||||
format=InputFormat.IMAGE,
|
||||
backend=DoclingParseDocumentBackend,
|
||||
)
|
||||
assert in_doc.valid
|
||||
|
||||
in_doc = InputDocument(
|
||||
path_or_stream=Path("tests/data/2305.03393v1-pg9-img.png"),
|
||||
format=InputFormat.IMAGE,
|
||||
backend=DoclingParseV2DocumentBackend,
|
||||
)
|
||||
assert in_doc.valid
|
||||
|
||||
in_doc = InputDocument(
|
||||
path_or_stream=Path("tests/data/2305.03393v1-pg9-img.png"),
|
||||
format=InputFormat.IMAGE,
|
||||
backend=DoclingParseV4DocumentBackend,
|
||||
)
|
||||
assert in_doc.valid
|
||||
|
||||
|
||||
def test_in_doc_with_page_range():
|
||||
test_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
|
||||
limits = DocumentLimits()
|
||||
@@ -297,7 +264,7 @@ def test_tiff_two_pages():
|
||||
doc = InputDocument(
|
||||
path_or_stream=tiff_path,
|
||||
format=InputFormat.IMAGE,
|
||||
backend=PdfFormatOption().backend, # use default backend
|
||||
backend=ImageFormatOption().backend, # use default backend
|
||||
)
|
||||
assert doc.valid is True
|
||||
assert doc.page_count == 2
|
||||
|
||||
Reference in New Issue
Block a user