feat: add the Image backend (#2627)

* feat: add the Image backend

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* fixed the pre-commit

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* Fixed single- versus multi-frame image formats

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* fix: Proper usage of ImageDocumentBackend in the pipeline, deprecate old code.

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* fix: Adapt tests

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* fix: correct mets_gbs backend test

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* fix: Make ImagePageBackend.get_bitmap_rects() yield

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Peter W. J. Staar
2025-11-17 11:37:22 +01:00
committed by GitHub
parent ae30373ee7
commit 3495b73de8
12 changed files with 494 additions and 82 deletions

View File

@@ -0,0 +1,188 @@
import logging
from io import BytesIO
from pathlib import Path
from typing import Iterable, List, Optional, Union
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import (
BoundingRectangle,
PdfPageBoundaryType,
PdfPageGeometry,
SegmentedPdfPage,
TextCell,
)
from PIL import Image
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.backend_options import PdfBackendOptions
from docling.datamodel.base_models import InputFormat, Size
from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
class _ImagePageBackend(PdfPageBackend):
    """Page backend that serves one in-memory PIL image as a single page.

    A raw image has no text layer, so the text accessors return empty
    results. Page geometry is taken from the pixel dimensions of the image.
    """

    def __init__(self, image: Image.Image):
        # Keep the frame; validity only records whether one was supplied.
        self._image: Optional[Image.Image] = image
        self.valid: bool = self._image is not None

    def is_valid(self) -> bool:
        """Return the validity flag computed at construction time."""
        return self.valid

    def _full_page_bbox(self, origin: CoordOrigin) -> BoundingBox:
        """Build a box spanning (0, 0)-(width, height) tagged with *origin*."""
        size = self.get_size()
        return BoundingBox(
            l=0.0,
            t=0.0,
            r=float(size.width),
            b=float(size.height),
            coord_origin=origin,
        )

    def get_text_in_rect(self, bbox: BoundingBox) -> str:
        """Return an empty string: raw images have no text without OCR."""
        return ""

    def get_segmented_page(self) -> SegmentedPdfPage:
        """Return a segmented page with page geometry but no text cells.

        Every boundary box (art, bleed, crop, media, trim) is set to the
        same full-page box.
        """
        assert self._image is not None
        # NOTE(review): the box uses t=0 / b=height, the same numbers as the
        # TOPLEFT box in get_bitmap_rects, but is tagged BOTTOMLEFT - confirm
        # that this is the intended orientation.
        page_box = self._full_page_bbox(CoordOrigin.BOTTOMLEFT)
        geometry = PdfPageGeometry(
            angle=0.0,
            rect=BoundingRectangle.from_bounding_box(page_box),
            boundary_type=PdfPageBoundaryType.CROP_BOX,
            art_bbox=page_box,
            bleed_bbox=page_box,
            crop_bbox=page_box,
            media_bbox=page_box,
            trim_bbox=page_box,
        )
        return SegmentedPdfPage(
            dimension=geometry,
            char_cells=[],
            word_cells=[],
            textline_cells=[],
            has_chars=False,
            has_words=False,
            has_lines=False,
        )

    def get_text_cells(self) -> Iterable[TextCell]:
        """Return an empty list: there are no text cells on a raw image."""
        return []

    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
        """Yield one TOPLEFT box covering the whole page, scaled by *scale*.

        This is a generator, so the assertion and the box construction run
        only once the result is iterated.
        """
        assert self._image is not None
        whole_page = self._full_page_bbox(CoordOrigin.TOPLEFT)
        yield whole_page if scale == 1 else whole_page.scaled(scale=scale)

    def get_page_image(
        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
    ) -> Image.Image:
        """Return the page image, optionally cropped and then rescaled.

        Args:
            scale: Factor applied to the (possibly cropped) image size; each
                resulting dimension is at least one pixel.
            cropbox: Region to cut out. A box whose origin is not TOPLEFT is
                converted to TOPLEFT using the image height first.

        Returns:
            The resulting image. With no crop box and ``scale == 1`` this is
            the stored image object itself, not a copy.
        """
        assert self._image is not None
        result = self._image

        if cropbox is not None:
            region = cropbox
            if region.coord_origin != CoordOrigin.TOPLEFT:
                region = region.to_top_left_origin(result.height)
            x0, y0, x1, y1 = region.as_tuple()
            # Round to whole pixels and clamp: lower edges at 0, upper edges
            # at the image size.
            pixel_box = (
                max(0, round(x0)),
                max(0, round(y0)),
                min(result.width, round(x1)),
                min(result.height, round(y1)),
            )
            result = result.crop(pixel_box)

        if scale == 1:
            return result

        target_size = (
            max(1, round(result.width * scale)),
            max(1, round(result.height * scale)),
        )
        return result.resize(target_size)

    def get_size(self) -> Size:
        """Return the pixel width and height of the image as a ``Size``."""
        assert self._image is not None
        picture = self._image
        return Size(width=picture.width, height=picture.height)

    def unload(self):
        """Drop the reference to the image so it can be garbage-collected."""
        self._image = None
class ImageDocumentBackend(PdfDocumentBackend):
    """Image-native backend that bypasses pypdfium2.

    Notes:
        - Subclasses PdfDocumentBackend to satisfy pipeline type checks.
        - Intentionally avoids calling PdfDocumentBackend.__init__ to skip
          the image→PDF conversion and any pypdfium2 usage.
        - Handles multi-frame images (e.g. multi-page TIFF) by extracting
          every frame eagerly into its own RGB Image object, so pages do not
          share a decoder when they are processed in parallel.
    """

    def __init__(
        self,
        in_doc: InputDocument,
        path_or_stream: Union[BytesIO, Path],
        # NOTE: this default instance is created once, when the class body is
        # evaluated, and is therefore shared by every call that omits it.
        options: PdfBackendOptions = PdfBackendOptions(),
    ):
        """Decode all frames of the input image into memory.

        Args:
            in_doc: The input document this backend belongs to.
            path_or_stream: File path or in-memory stream holding the image.
            options: Backend options, stored on ``self.options``.

        Raises:
            RuntimeError: If the input format is not supported by this
                backend, or if the image cannot be opened or decoded.
        """
        # Bypass PdfDocumentBackend.__init__ to avoid image→PDF conversion
        AbstractDocumentBackend.__init__(self, in_doc, path_or_stream, options)
        self.options: PdfBackendOptions = options

        if self.input_format not in self.supported_formats():
            raise RuntimeError(
                f"Incompatible file format {self.input_format} was passed to ImageDocumentBackend."
            )

        # Load frames eagerly for thread-safety across pages
        self._frames: List[Image.Image] = []
        try:
            # The context manager releases a file handle that Pillow opened
            # itself (path input) once all frames are decoded; multi-frame
            # images otherwise keep it open. A caller-supplied stream is left
            # open, unlike with Image.close().
            with Image.open(self.path_or_stream) as img:  # type: ignore[arg-type]
                # Handle multi-frame and single-frame images
                # - multiframe formats: TIFF, GIF, ICO
                # - singleframe formats: JPEG (.jpg, .jpeg), PNG (.png), BMP,
                #   WEBP (unless animated), HEIC
                frame_count = getattr(img, "n_frames", 1)
                if frame_count > 1:
                    for frame_index in range(frame_count):
                        img.seek(frame_index)
                        # convert() already returns a new image, so no extra
                        # copy() is needed to detach the frame from `img`.
                        self._frames.append(img.convert("RGB"))
                else:
                    self._frames.append(img.convert("RGB"))
        except Exception as e:
            raise RuntimeError(f"Could not load image for document {self.file}") from e

    def is_valid(self) -> bool:
        """Return True when at least one frame is loaded."""
        return len(self._frames) > 0

    def page_count(self) -> int:
        """Return the number of loaded frames; each frame is one page."""
        return len(self._frames)

    def load_page(self, page_no: int) -> _ImagePageBackend:
        """Return a page backend for the zero-based frame index *page_no*.

        Raises:
            IndexError: If *page_no* is negative or not below the page count.
        """
        if not (0 <= page_no < len(self._frames)):
            raise IndexError(f"Page index out of range: {page_no}")
        return _ImagePageBackend(self._frames[page_no])

    @classmethod
    def supported_formats(cls) -> set[InputFormat]:
        """Return the formats this backend accepts (IMAGE only)."""
        # Only IMAGE here; PDF handling remains in PDF-oriented backends
        return {InputFormat.IMAGE}

    @classmethod
    def supports_pagination(cls) -> bool:
        """Return True: frames are exposed as pages."""
        return True

    def unload(self):
        """Run the parent unload, then drop all decoded frames."""
        super().unload()
        self._frames = []

View File

@@ -60,38 +60,7 @@ class PdfDocumentBackend(PaginatedDocumentBackend):
super().__init__(in_doc, path_or_stream, options) super().__init__(in_doc, path_or_stream, options)
self.options: PdfBackendOptions self.options: PdfBackendOptions
if self.input_format is not InputFormat.PDF: if self.input_format not in self.supported_formats():
if self.input_format is InputFormat.IMAGE:
buf = BytesIO()
img = Image.open(self.path_or_stream)
# Handle multi-page TIFF images
if hasattr(img, "n_frames") and img.n_frames > 1:
# Extract all frames from multi-page image
frames = []
try:
for i in range(img.n_frames):
img.seek(i)
frame = img.copy().convert("RGB")
frames.append(frame)
except EOFError:
pass
# Save as multi-page PDF
if frames:
frames[0].save(
buf, "PDF", save_all=True, append_images=frames[1:]
)
else:
# Fallback to single page if frame extraction fails
img.convert("RGB").save(buf, "PDF")
else:
# Single page image - convert to RGB and save
img.convert("RGB").save(buf, "PDF")
buf.seek(0)
self.path_or_stream = buf
elif self.input_format not in self.supported_formats():
raise RuntimeError( raise RuntimeError(
f"Incompatible file format {self.input_format} was passed to a PdfDocumentBackend. Valid format are {','.join(self.supported_formats())}." f"Incompatible file format {self.input_format} was passed to a PdfDocumentBackend. Valid format are {','.join(self.supported_formats())}."
) )
@@ -106,7 +75,7 @@ class PdfDocumentBackend(PaginatedDocumentBackend):
@classmethod @classmethod
def supported_formats(cls) -> Set[InputFormat]: def supported_formats(cls) -> Set[InputFormat]:
return {InputFormat.PDF, InputFormat.IMAGE} return {InputFormat.PDF}
@classmethod @classmethod
def supports_pagination(cls) -> bool: def supports_pagination(cls) -> bool:

View File

@@ -26,6 +26,7 @@ from rich.console import Console
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.backend.image_backend import ImageDocumentBackend
from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend from docling.backend.pdf_backend import PdfDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
@@ -698,9 +699,16 @@ def convert( # noqa: C901
if artifacts_path is not None: if artifacts_path is not None:
simple_format_option.artifacts_path = artifacts_path simple_format_option.artifacts_path = artifacts_path
# Use image-native backend for IMAGE to avoid pypdfium2 locking
image_format_option = PdfFormatOption(
pipeline_options=pipeline_options,
backend=ImageDocumentBackend,
backend_options=pdf_backend_options,
)
format_options = { format_options = {
InputFormat.PDF: pdf_format_option, InputFormat.PDF: pdf_format_option,
InputFormat.IMAGE: pdf_format_option, InputFormat.IMAGE: image_format_option,
InputFormat.METS_GBS: mets_gbs_format_option, InputFormat.METS_GBS: mets_gbs_format_option,
InputFormat.DOCX: WordFormatOption( InputFormat.DOCX: WordFormatOption(
pipeline_options=simple_format_option pipeline_options=simple_format_option

View File

@@ -164,6 +164,7 @@ class DoclingComponentType(str, Enum):
MODEL = "model" MODEL = "model"
DOC_ASSEMBLER = "doc_assembler" DOC_ASSEMBLER = "doc_assembler"
USER_INPUT = "user_input" USER_INPUT = "user_input"
PIPELINE = "pipeline"
class VlmStopReason(str, Enum): class VlmStopReason(str, Enum):

View File

@@ -3,6 +3,7 @@ import logging
import sys import sys
import threading import threading
import time import time
import warnings
from collections.abc import Iterable, Iterator from collections.abc import Iterable, Iterator
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
from datetime import datetime from datetime import datetime
@@ -21,6 +22,7 @@ from docling.backend.asciidoc_backend import AsciiDocBackend
from docling.backend.csv_backend import CsvDocumentBackend from docling.backend.csv_backend import CsvDocumentBackend
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.backend.html_backend import HTMLDocumentBackend from docling.backend.html_backend import HTMLDocumentBackend
from docling.backend.image_backend import ImageDocumentBackend
from docling.backend.json.docling_json_backend import DoclingJSONBackend from docling.backend.json.docling_json_backend import DoclingJSONBackend
from docling.backend.md_backend import MarkdownDocumentBackend from docling.backend.md_backend import MarkdownDocumentBackend
from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend
@@ -129,7 +131,7 @@ class XMLJatsFormatOption(FormatOption):
class ImageFormatOption(FormatOption): class ImageFormatOption(FormatOption):
pipeline_cls: Type = StandardPdfPipeline pipeline_cls: Type = StandardPdfPipeline
backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend backend: Type[AbstractDocumentBackend] = ImageDocumentBackend
class PdfFormatOption(FormatOption): class PdfFormatOption(FormatOption):
@@ -184,10 +186,35 @@ class DocumentConverter:
self.allowed_formats = ( self.allowed_formats = (
allowed_formats if allowed_formats is not None else list(InputFormat) allowed_formats if allowed_formats is not None else list(InputFormat)
) )
# Normalize format options: ensure IMAGE format uses ImageDocumentBackend
# for backward compatibility (old code might use PdfFormatOption or other backends for images)
normalized_format_options: dict[InputFormat, FormatOption] = {}
if format_options:
for format, option in format_options.items():
if (
format == InputFormat.IMAGE
and option.backend is not ImageDocumentBackend
):
warnings.warn(
f"Using {option.backend.__name__} for InputFormat.IMAGE is deprecated. "
"Images should use ImageDocumentBackend via ImageFormatOption. "
"Automatically correcting the backend, please update your code to avoid this warning.",
DeprecationWarning,
stacklevel=2,
)
# Convert to ImageFormatOption while preserving pipeline and backend options
normalized_format_options[format] = ImageFormatOption(
pipeline_options=option.pipeline_options,
backend_options=option.backend_options,
)
else:
normalized_format_options[format] = option
self.format_to_options: dict[InputFormat, FormatOption] = { self.format_to_options: dict[InputFormat, FormatOption] = {
format: ( format: (
_get_default_option(format=format) _get_default_option(format=format)
if (custom_option := (format_options or {}).get(format)) is None if (custom_option := normalized_format_options.get(format)) is None
else custom_option else custom_option
) )
for format in self.allowed_formats for format in self.allowed_formats
@@ -263,8 +290,12 @@ class DocumentConverter:
ConversionStatus.SUCCESS, ConversionStatus.SUCCESS,
ConversionStatus.PARTIAL_SUCCESS, ConversionStatus.PARTIAL_SUCCESS,
}: }:
error_details = ""
if conv_res.errors:
error_messages = [err.error_message for err in conv_res.errors]
error_details = f" Errors: {'; '.join(error_messages)}"
raise ConversionError( raise ConversionError(
f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}" f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}.{error_details}"
) )
else: else:
yield conv_res yield conv_res

View File

@@ -14,6 +14,7 @@ from pydantic import ConfigDict, model_validator, validate_call
from typing_extensions import Self from typing_extensions import Self
from docling.backend.abstract_backend import AbstractDocumentBackend from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.image_backend import ImageDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import ( from docling.datamodel.base_models import (
BaseFormatOption, BaseFormatOption,
@@ -72,7 +73,7 @@ def _get_default_extraction_option(fmt: InputFormat) -> ExtractionFormatOption:
the format registry between convert/extract. the format registry between convert/extract.
""" """
format_to_default_backend: dict[InputFormat, Type[AbstractDocumentBackend]] = { format_to_default_backend: dict[InputFormat, Type[AbstractDocumentBackend]] = {
InputFormat.IMAGE: PyPdfiumDocumentBackend, InputFormat.IMAGE: ImageDocumentBackend,
InputFormat.PDF: PyPdfiumDocumentBackend, InputFormat.PDF: PyPdfiumDocumentBackend,
} }

View File

@@ -76,8 +76,15 @@ class BasePipeline(ABC):
conv_res.status = self._determine_status(conv_res) conv_res.status = self._determine_status(conv_res)
except Exception as e: except Exception as e:
conv_res.status = ConversionStatus.FAILURE conv_res.status = ConversionStatus.FAILURE
if raises_on_error: if not raises_on_error:
raise e error_item = ErrorItem(
component_type=DoclingComponentType.PIPELINE,
module_name=self.__class__.__name__,
error_message=str(e),
)
conv_res.errors.append(error_item)
else:
raise RuntimeError(f"Pipeline {self.__class__.__name__} failed") from e
finally: finally:
self._unload(conv_res) self._unload(conv_res)

View File

@@ -30,7 +30,13 @@ from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
from docling.backend.abstract_backend import AbstractDocumentBackend from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend from docling.backend.pdf_backend import PdfDocumentBackend
from docling.datamodel.base_models import AssembledUnit, ConversionStatus, Page from docling.datamodel.base_models import (
AssembledUnit,
ConversionStatus,
DoclingComponentType,
ErrorItem,
Page,
)
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import ThreadedPdfPipelineOptions from docling.datamodel.pipeline_options import ThreadedPdfPipelineOptions
from docling.datamodel.settings import settings from docling.datamodel.settings import settings
@@ -265,7 +271,9 @@ class ThreadedPipelineStage:
) )
) )
except Exception as exc: except Exception as exc:
_log.error("Stage %s failed for run %d: %s", self.name, rid, exc) _log.error(
"Stage %s failed for run %d: %s", self.name, rid, exc, exc_info=True
)
for it in items: for it in items:
it.is_failed = True it.is_failed = True
it.error = exc it.error = exc
@@ -598,6 +606,16 @@ class StandardPdfPipeline(ConvertPipeline):
if p.page_no in page_map if p.page_no in page_map
or not any(fp == p.page_no for fp, _ in proc.failed_pages) or not any(fp == p.page_no for fp, _ in proc.failed_pages)
] ]
# Add error details from failed pages
for page_no, error in proc.failed_pages:
page_label = f"Page {page_no + 1}" if page_no >= 0 else "Unknown page"
error_msg = str(error) if error else ""
error_item = ErrorItem(
component_type=DoclingComponentType.PIPELINE,
module_name=self.__class__.__name__,
error_message=f"{page_label}: {error_msg}" if error_msg else page_label,
)
conv_res.errors.append(error_item)
if proc.is_complete_failure: if proc.is_complete_failure:
conv_res.status = ConversionStatus.FAILURE conv_res.status = ConversionStatus.FAILURE
elif proc.is_partial_success: elif proc.is_partial_success:

View File

@@ -0,0 +1,218 @@
from io import BytesIO
from pathlib import Path
import pytest
from docling_core.types.doc import BoundingBox, CoordOrigin
from PIL import Image
from docling.backend.image_backend import ImageDocumentBackend, _ImagePageBackend
from docling.datamodel.base_models import DocumentStream, InputFormat
from docling.datamodel.document import InputDocument, _DocumentConversionInput
from docling.document_converter import DocumentConverter, ImageFormatOption
from docling.document_extractor import DocumentExtractor
def _make_png_stream(
    width: int = 64, height: int = 48, color=(123, 45, 67)
) -> DocumentStream:
    """Return a DocumentStream named ``test.png`` with a solid-colour RGB PNG."""
    payload = BytesIO()
    Image.new("RGB", (width, height), color).save(payload, format="PNG")
    # Rewind so consumers read the encoded image from its first byte.
    payload.seek(0)
    return DocumentStream(name="test.png", stream=payload)
def _make_multipage_tiff_stream(num_pages: int = 3, size=(32, 32)) -> DocumentStream:
    """Return a DocumentStream named ``test.tiff`` with *num_pages* RGB frames.

    Each frame is a solid colour derived from its index.
    """
    pages = []
    for idx in range(num_pages):
        shade = (idx * 10 % 255, idx * 20 % 255, idx * 30 % 255)
        pages.append(Image.new("RGB", size, shade))

    payload = BytesIO()
    # The first frame is the base image; the rest are appended as extra pages.
    pages[0].save(payload, format="TIFF", save_all=True, append_images=pages[1:])
    payload.seek(0)
    return DocumentStream(name="test.tiff", stream=payload)
def test_docs_builder_uses_image_backend_for_image_stream():
    """A PNG stream built with ImageFormatOption yields one single-page image doc."""
    conversion_input = _DocumentConversionInput(
        path_or_stream_iterator=[_make_png_stream()]
    )
    # Map IMAGE -> ImageFormatOption (which carries ImageDocumentBackend)
    options_by_format = {InputFormat.IMAGE: ImageFormatOption()}

    built = list(conversion_input.docs(options_by_format))
    assert len(built) == 1

    document = built[0]
    assert document.format == InputFormat.IMAGE
    assert isinstance(document._backend, ImageDocumentBackend)
    assert document.page_count == 1
def test_docs_builder_multipage_tiff_counts_frames():
    """A four-frame TIFF stream is reported as a four-page image document."""
    conversion_input = _DocumentConversionInput(
        path_or_stream_iterator=[_make_multipage_tiff_stream(num_pages=4)]
    )
    options_by_format = {InputFormat.IMAGE: ImageFormatOption()}

    # Only the first (and only) document produced by the builder is inspected.
    document = next(conversion_input.docs(options_by_format))
    assert isinstance(document._backend, ImageDocumentBackend)
    assert document.page_count == 4
def test_converter_default_maps_image_to_image_backend():
    """DocumentConverter's default IMAGE option uses ImageDocumentBackend."""
    image_option = DocumentConverter(
        allowed_formats=[InputFormat.IMAGE]
    ).format_to_options[InputFormat.IMAGE]
    assert image_option.backend is ImageDocumentBackend
def test_extractor_default_maps_image_to_image_backend():
    """DocumentExtractor's default IMAGE option uses ImageDocumentBackend."""
    image_option = DocumentExtractor(
        allowed_formats=[InputFormat.IMAGE]
    ).extraction_format_to_options[InputFormat.IMAGE]
    assert image_option.backend is ImageDocumentBackend
def _get_backend_from_stream(stream: DocumentStream):
    """Build an IMAGE InputDocument over *stream* and return its backend.

    The document is created with ImageDocumentBackend as its backend class
    and the stream's name as its filename.
    """
    document = InputDocument(
        path_or_stream=stream.stream,
        format=InputFormat.IMAGE,
        backend=ImageDocumentBackend,
        filename=stream.name,
    )
    return document._backend
def test_num_pages_single():
    """A single-frame PNG is reported as exactly one page."""
    backend = _get_backend_from_stream(_make_png_stream(width=100, height=80))
    assert backend.page_count() == 1
def test_num_pages_multipage():
    """A five-frame TIFF is reported as five pages."""
    backend = _get_backend_from_stream(
        _make_multipage_tiff_stream(num_pages=5, size=(64, 64))
    )
    assert backend.page_count() == 5
def test_get_size():
    """The page size matches the pixel dimensions of the source image."""
    expected_w, expected_h = 120, 90
    backend = _get_backend_from_stream(
        _make_png_stream(width=expected_w, height=expected_h)
    )
    page: _ImagePageBackend = backend.load_page(0)

    reported = page.get_size()
    assert reported.width == expected_w
    assert reported.height == expected_h
def test_get_page_image_full():
    """With default arguments the page image keeps the source dimensions."""
    expected_w, expected_h = 100, 80
    backend = _get_backend_from_stream(
        _make_png_stream(width=expected_w, height=expected_h)
    )
    page: _ImagePageBackend = backend.load_page(0)

    rendered = page.get_page_image()
    assert rendered.width == expected_w
    assert rendered.height == expected_h
def test_get_page_image_scaled():
    """Scaling by 2.0 doubles both dimensions of the page image."""
    src_w, src_h = 100, 80
    factor = 2.0
    backend = _get_backend_from_stream(_make_png_stream(width=src_w, height=src_h))
    page: _ImagePageBackend = backend.load_page(0)

    rendered = page.get_page_image(scale=factor)
    assert rendered.width == round(src_w * factor)
    assert rendered.height == round(src_h * factor)
def test_crop_page_image():
    """Cropping with a TOPLEFT box returns an image of the box's size."""
    src_w, src_h = 200, 150
    backend = _get_backend_from_stream(_make_png_stream(width=src_w, height=src_h))
    page: _ImagePageBackend = backend.load_page(0)

    # Region taken from the middle of the page
    region = BoundingBox(l=50, t=30, r=150, b=120, coord_origin=CoordOrigin.TOPLEFT)
    cropped = page.get_page_image(cropbox=region)

    assert cropped.width == 100  # 150 - 50
    assert cropped.height == 90  # 120 - 30
def test_crop_page_image_scaled():
    """Cropping and scaling together yields the cropped size times the scale."""
    src_w, src_h = 200, 150
    factor = 0.5
    backend = _get_backend_from_stream(_make_png_stream(width=src_w, height=src_h))
    page: _ImagePageBackend = backend.load_page(0)

    region = BoundingBox(l=50, t=30, r=150, b=120, coord_origin=CoordOrigin.TOPLEFT)
    result = page.get_page_image(scale=factor, cropbox=region)

    assert result.width == round(100 * factor)  # cropped width * scale
    assert result.height == round(90 * factor)  # cropped height * scale
def test_get_bitmap_rects():
    """get_bitmap_rects yields exactly one TOPLEFT box covering the full page."""
    src_w, src_h = 100, 80
    backend = _get_backend_from_stream(_make_png_stream(width=src_w, height=src_h))
    page: _ImagePageBackend = backend.load_page(0)

    # Materialise the iterable so its length can be checked.
    boxes = list(page.get_bitmap_rects())
    assert len(boxes) == 1

    full_page = boxes[0]
    assert full_page.l == 0.0
    assert full_page.t == 0.0
    assert full_page.r == float(src_w)
    assert full_page.b == float(src_h)
    assert full_page.coord_origin == CoordOrigin.TOPLEFT
def test_get_bitmap_rects_scaled():
    """With a scale, the single bitmap box spans the scaled page dimensions."""
    src_w, src_h = 100, 80
    factor = 2.0
    backend = _get_backend_from_stream(_make_png_stream(width=src_w, height=src_h))
    page: _ImagePageBackend = backend.load_page(0)

    boxes = list(page.get_bitmap_rects(scale=factor))
    assert len(boxes) == 1

    full_page = boxes[0]
    assert full_page.l == 0.0
    assert full_page.t == 0.0
    assert full_page.r == float(src_w * factor)
    assert full_page.b == float(src_h * factor)
    assert full_page.coord_origin == CoordOrigin.TOPLEFT
def test_get_text_in_rect():
    """get_text_in_rect returns an empty string for images (no OCR)."""
    backend = _get_backend_from_stream(_make_png_stream())
    page: _ImagePageBackend = backend.load_page(0)

    query = BoundingBox(l=10, t=10, r=50, b=50, coord_origin=CoordOrigin.TOPLEFT)
    assert page.get_text_in_rect(query) == ""
def test_multipage_access():
    """Every page of a multi-frame TIFF loads, is valid and has the frame size."""
    expected_pages = 4
    backend = _get_backend_from_stream(
        _make_multipage_tiff_stream(num_pages=expected_pages, size=(64, 64))
    )
    assert backend.page_count() == expected_pages

    # Visit the pages in order and check each one individually.
    for page_index in range(expected_pages):
        page = backend.load_page(page_index)
        assert page.is_valid()
        dims = page.get_size()
        assert dims.width == 64
        assert dims.height == 64

View File

@@ -15,7 +15,7 @@ def test_doc_path():
def _get_backend(pdf_doc): def _get_backend(pdf_doc):
in_doc = InputDocument( in_doc = InputDocument(
path_or_stream=pdf_doc, path_or_stream=pdf_doc,
format=InputFormat.PDF, format=InputFormat.METS_GBS,
backend=MetsGbsDocumentBackend, backend=MetsGbsDocumentBackend,
) )

View File

@@ -2,6 +2,8 @@ import sys
from pathlib import Path from pathlib import Path
from typing import List from typing import List
from pydantic.type_adapter import R
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult, DoclingDocument from docling.datamodel.document import ConversionResult, DoclingDocument
from docling.datamodel.pipeline_options import ( from docling.datamodel.pipeline_options import (
@@ -72,7 +74,9 @@ def test_e2e_webp_conversions():
for webp_path in webp_paths: for webp_path in webp_paths:
print(f"converting {webp_path}") print(f"converting {webp_path}")
doc_result: ConversionResult = converter.convert(webp_path) doc_result: ConversionResult = converter.convert(
webp_path, raises_on_error=True
)
verify_conversion_result_v2( verify_conversion_result_v2(
input_path=webp_path, input_path=webp_path,

View File

@@ -4,9 +4,6 @@ from pathlib import Path
import pytest import pytest
from pydantic import ValidationError from pydantic import ValidationError
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.backend.html_backend import HTMLDocumentBackend from docling.backend.html_backend import HTMLDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.backend_options import ( from docling.datamodel.backend_options import (
@@ -17,7 +14,7 @@ from docling.datamodel.backend_options import (
from docling.datamodel.base_models import DocumentStream, InputFormat from docling.datamodel.base_models import DocumentStream, InputFormat
from docling.datamodel.document import InputDocument, _DocumentConversionInput from docling.datamodel.document import InputDocument, _DocumentConversionInput
from docling.datamodel.settings import DocumentLimits from docling.datamodel.settings import DocumentLimits
from docling.document_converter import PdfFormatOption from docling.document_converter import ImageFormatOption, PdfFormatOption
def test_in_doc_from_valid_path(): def test_in_doc_from_valid_path():
@@ -51,36 +48,6 @@ def test_in_doc_from_invalid_buf():
assert doc.valid is False assert doc.valid is False
def test_image_in_pdf_backend():
in_doc = InputDocument(
path_or_stream=Path("tests/data/2305.03393v1-pg9-img.png"),
format=InputFormat.IMAGE,
backend=PyPdfiumDocumentBackend,
)
assert in_doc.valid
in_doc = InputDocument(
path_or_stream=Path("tests/data/2305.03393v1-pg9-img.png"),
format=InputFormat.IMAGE,
backend=DoclingParseDocumentBackend,
)
assert in_doc.valid
in_doc = InputDocument(
path_or_stream=Path("tests/data/2305.03393v1-pg9-img.png"),
format=InputFormat.IMAGE,
backend=DoclingParseV2DocumentBackend,
)
assert in_doc.valid
in_doc = InputDocument(
path_or_stream=Path("tests/data/2305.03393v1-pg9-img.png"),
format=InputFormat.IMAGE,
backend=DoclingParseV4DocumentBackend,
)
assert in_doc.valid
def test_in_doc_with_page_range(): def test_in_doc_with_page_range():
test_doc_path = Path("./tests/data/pdf/2206.01062.pdf") test_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
limits = DocumentLimits() limits = DocumentLimits()
@@ -297,7 +264,7 @@ def test_tiff_two_pages():
doc = InputDocument( doc = InputDocument(
path_or_stream=tiff_path, path_or_stream=tiff_path,
format=InputFormat.IMAGE, format=InputFormat.IMAGE,
backend=PdfFormatOption().backend, # use default backend backend=ImageFormatOption().backend, # use default backend
) )
assert doc.valid is True assert doc.valid is True
assert doc.page_count == 2 assert doc.page_count == 2