feat: add the Image backend (#2627)

* feat: add the Image backend

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* fixed the pre-commit

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* Fixed single- versus multi-frame image formats

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* fix: Proper usage of ImageDocumentBackend in the pipeline, deprecate old code.

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* fix: Adapt tests

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* fix: correct mets_gbs backend test

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* fix: Make ImagePageBackend.get_bitmap_rects() yield

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Peter W. J. Staar
2025-11-17 11:37:22 +01:00
committed by GitHub
parent ae30373ee7
commit 3495b73de8
12 changed files with 494 additions and 82 deletions

View File

@@ -0,0 +1,188 @@
import logging
from io import BytesIO
from pathlib import Path
from typing import Iterable, List, Optional, Union
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import (
BoundingRectangle,
PdfPageBoundaryType,
PdfPageGeometry,
SegmentedPdfPage,
TextCell,
)
from PIL import Image
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.backend_options import PdfBackendOptions
from docling.datamodel.base_models import InputFormat, Size
from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
class _ImagePageBackend(PdfPageBackend):
    """Page backend that exposes one in-memory PIL image as a single page.

    Raw images have no text layer, so every text query returns an empty
    result and the whole page is reported as one bitmap region.
    """

    def __init__(self, image: Image.Image):
        self._image: Optional[Image.Image] = image
        # Computed once here; `unload()` does not update this flag.
        self.valid: bool = self._image is not None

    def is_valid(self) -> bool:
        """Return the validity flag that was computed at construction time."""
        return self.valid

    def get_text_in_rect(self, bbox: BoundingBox) -> str:
        """Return an empty string: raw images yield no text without OCR."""
        return ""

    def _whole_page_bbox(self, origin: CoordOrigin) -> BoundingBox:
        """Build a box from (0, 0) to (width, height) tagged with `origin`."""
        assert self._image is not None
        size = self.get_size()
        return BoundingBox(
            l=0.0,
            t=0.0,
            r=float(size.width),
            b=float(size.height),
            coord_origin=origin,
        )

    def get_segmented_page(self) -> SegmentedPdfPage:
        """Return a segmented page that has the image's geometry and no cells.

        All PDF boundary boxes (art, bleed, crop, media, trim) are set to the
        same full-page box.
        """
        # NOTE(review): the box is tagged BOTTOMLEFT but uses t=0 / b=height;
        # confirm that downstream consumers are fine with that orientation.
        page_box = self._whole_page_bbox(CoordOrigin.BOTTOMLEFT)
        geometry = PdfPageGeometry(
            angle=0.0,
            rect=BoundingRectangle.from_bounding_box(page_box),
            boundary_type=PdfPageBoundaryType.CROP_BOX,
            art_bbox=page_box,
            bleed_bbox=page_box,
            crop_bbox=page_box,
            media_bbox=page_box,
            trim_bbox=page_box,
        )
        return SegmentedPdfPage(
            dimension=geometry,
            char_cells=[],
            word_cells=[],
            textline_cells=[],
            has_chars=False,
            has_words=False,
            has_lines=False,
        )

    def get_text_cells(self) -> Iterable[TextCell]:
        """Return no text cells: a raw image has none."""
        return []

    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
        """Yield one TOPLEFT box covering the full page, scaled by `scale`."""
        page_box = self._whole_page_bbox(CoordOrigin.TOPLEFT)
        yield page_box if scale == 1 else page_box.scaled(scale=scale)

    def get_page_image(
        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
    ) -> Image.Image:
        """Return the page image, optionally cropped and then rescaled.

        Args:
            scale: Resize factor applied after cropping; 1 means no resize.
            cropbox: Region to cut out, in unscaled image coordinates. It is
                converted to a TOPLEFT origin when given in another origin.

        Returns:
            The resulting image. When neither cropping nor scaling applies,
            this is the stored image object itself, not a copy.
        """
        assert self._image is not None
        picture = self._image

        if cropbox is not None:
            # The pipeline is expected to pass TOPLEFT boxes; anything else is
            # flipped relative to the current image height.
            if cropbox.coord_origin != CoordOrigin.TOPLEFT:
                cropbox = cropbox.to_top_left_origin(picture.height)
            x0, y0, x1, y1 = cropbox.as_tuple()
            # Round to whole pixels and clamp to the image bounds.
            region = (
                max(0, round(x0)),
                max(0, round(y0)),
                min(picture.width, round(x1)),
                min(picture.height, round(y1)),
            )
            picture = picture.crop(region)

        if scale == 1:
            return picture

        # Never resize to a zero-sized dimension.
        target_size = (
            max(1, round(picture.width * scale)),
            max(1, round(picture.height * scale)),
        )
        return picture.resize(target_size)

    def get_size(self) -> Size:
        """Return the pixel width and height of the stored image."""
        assert self._image is not None
        return Size(width=self._image.width, height=self._image.height)

    def unload(self):
        """Drop the image reference so the garbage collector can free it."""
        self._image = None
class ImageDocumentBackend(PdfDocumentBackend):
    """Image-native backend that bypasses pypdfium2.

    Each frame of the input image becomes one page, served by
    `_ImagePageBackend`.

    Notes:
        - Subclasses PdfDocumentBackend to satisfy pipeline type checks.
        - Intentionally avoids calling PdfDocumentBackend.__init__ to skip
          the image→PDF conversion and any pypdfium2 usage.
        - Handles multi-page TIFF by extracting frames eagerly to separate
          Image objects to keep thread-safety when pages process in parallel.
    """

    def __init__(
        self,
        in_doc: InputDocument,
        path_or_stream: Union[BytesIO, Path],
        options: PdfBackendOptions = PdfBackendOptions(),
    ):
        """Load every frame of the image into memory as an RGB image.

        Args:
            in_doc: The input document this backend belongs to.
            path_or_stream: File path or in-memory stream holding the image.
            options: Backend options. The default instance is created once at
                definition time and shared by all calls that omit it.

        Raises:
            RuntimeError: If the input format is not IMAGE, or if the image
                cannot be opened or decoded.
        """
        # Bypass PdfDocumentBackend.__init__ to avoid image→PDF conversion
        AbstractDocumentBackend.__init__(self, in_doc, path_or_stream, options)
        self.options: PdfBackendOptions = options

        if self.input_format not in {InputFormat.IMAGE}:
            raise RuntimeError(
                f"Incompatible file format {self.input_format} was passed to ImageDocumentBackend."
            )

        # Load frames eagerly for thread-safety across pages
        self._frames: List[Image.Image] = []
        try:
            # The context manager releases the file handle that Pillow opens
            # for a path (it stays open for multi-frame images otherwise).
            # A stream supplied by the caller is not closed by it.
            with Image.open(self.path_or_stream) as img:  # type: ignore[arg-type]
                # Handle multi-frame and single-frame images
                # - multiframe formats: TIFF, GIF, ICO
                # - singleframe formats: JPEG (.jpg, .jpeg), PNG (.png), BMP,
                #   WEBP (unless animated), HEIC
                frame_count = getattr(img, "n_frames", 1)
                if frame_count > 1:
                    for i in range(frame_count):
                        img.seek(i)
                        # copy() detaches the frame from the seekable source
                        self._frames.append(img.copy().convert("RGB"))
                else:
                    self._frames.append(img.convert("RGB"))
        except Exception as e:
            raise RuntimeError(f"Could not load image for document {self.file}") from e

    def is_valid(self) -> bool:
        """Return True while at least one frame is loaded."""
        return len(self._frames) > 0

    def page_count(self) -> int:
        """Return the number of loaded frames (one page per frame)."""
        return len(self._frames)

    def load_page(self, page_no: int) -> _ImagePageBackend:
        """Return a page backend for the zero-based frame index `page_no`.

        Raises:
            IndexError: If `page_no` is negative or not below the page count.
        """
        if not (0 <= page_no < len(self._frames)):
            raise IndexError(f"Page index out of range: {page_no}")
        return _ImagePageBackend(self._frames[page_no])

    @classmethod
    def supported_formats(cls) -> set[InputFormat]:
        """Return the formats this backend accepts."""
        # Only IMAGE here; PDF handling remains in PDF-oriented backends
        return {InputFormat.IMAGE}

    @classmethod
    def supports_pagination(cls) -> bool:
        """Return True: multi-frame images are exposed as multiple pages."""
        return True

    def unload(self):
        """Run the parent unload and drop all loaded frames."""
        super().unload()
        self._frames = []

View File

@@ -60,41 +60,10 @@ class PdfDocumentBackend(PaginatedDocumentBackend):
super().__init__(in_doc, path_or_stream, options)
self.options: PdfBackendOptions
if self.input_format is not InputFormat.PDF:
if self.input_format is InputFormat.IMAGE:
buf = BytesIO()
img = Image.open(self.path_or_stream)
# Handle multi-page TIFF images
if hasattr(img, "n_frames") and img.n_frames > 1:
# Extract all frames from multi-page image
frames = []
try:
for i in range(img.n_frames):
img.seek(i)
frame = img.copy().convert("RGB")
frames.append(frame)
except EOFError:
pass
# Save as multi-page PDF
if frames:
frames[0].save(
buf, "PDF", save_all=True, append_images=frames[1:]
)
else:
# Fallback to single page if frame extraction fails
img.convert("RGB").save(buf, "PDF")
else:
# Single page image - convert to RGB and save
img.convert("RGB").save(buf, "PDF")
buf.seek(0)
self.path_or_stream = buf
elif self.input_format not in self.supported_formats():
raise RuntimeError(
f"Incompatible file format {self.input_format} was passed to a PdfDocumentBackend. Valid format are {','.join(self.supported_formats())}."
)
if self.input_format not in self.supported_formats():
raise RuntimeError(
f"Incompatible file format {self.input_format} was passed to a PdfDocumentBackend. Valid format are {','.join(self.supported_formats())}."
)
@abstractmethod
def load_page(self, page_no: int) -> PdfPageBackend:
@@ -106,7 +75,7 @@ class PdfDocumentBackend(PaginatedDocumentBackend):
@classmethod
def supported_formats(cls) -> Set[InputFormat]:
return {InputFormat.PDF, InputFormat.IMAGE}
return {InputFormat.PDF}
@classmethod
def supports_pagination(cls) -> bool:

View File

@@ -26,6 +26,7 @@ from rich.console import Console
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.backend.image_backend import ImageDocumentBackend
from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
@@ -698,9 +699,16 @@ def convert( # noqa: C901
if artifacts_path is not None:
simple_format_option.artifacts_path = artifacts_path
# Use image-native backend for IMAGE to avoid pypdfium2 locking
image_format_option = PdfFormatOption(
pipeline_options=pipeline_options,
backend=ImageDocumentBackend,
backend_options=pdf_backend_options,
)
format_options = {
InputFormat.PDF: pdf_format_option,
InputFormat.IMAGE: pdf_format_option,
InputFormat.IMAGE: image_format_option,
InputFormat.METS_GBS: mets_gbs_format_option,
InputFormat.DOCX: WordFormatOption(
pipeline_options=simple_format_option

View File

@@ -164,6 +164,7 @@ class DoclingComponentType(str, Enum):
MODEL = "model"
DOC_ASSEMBLER = "doc_assembler"
USER_INPUT = "user_input"
PIPELINE = "pipeline"
class VlmStopReason(str, Enum):

View File

@@ -3,6 +3,7 @@ import logging
import sys
import threading
import time
import warnings
from collections.abc import Iterable, Iterator
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
@@ -21,6 +22,7 @@ from docling.backend.asciidoc_backend import AsciiDocBackend
from docling.backend.csv_backend import CsvDocumentBackend
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.backend.html_backend import HTMLDocumentBackend
from docling.backend.image_backend import ImageDocumentBackend
from docling.backend.json.docling_json_backend import DoclingJSONBackend
from docling.backend.md_backend import MarkdownDocumentBackend
from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend
@@ -129,7 +131,7 @@ class XMLJatsFormatOption(FormatOption):
class ImageFormatOption(FormatOption):
pipeline_cls: Type = StandardPdfPipeline
backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend
backend: Type[AbstractDocumentBackend] = ImageDocumentBackend
class PdfFormatOption(FormatOption):
@@ -184,10 +186,35 @@ class DocumentConverter:
self.allowed_formats = (
allowed_formats if allowed_formats is not None else list(InputFormat)
)
# Normalize format options: ensure IMAGE format uses ImageDocumentBackend
# for backward compatibility (old code might use PdfFormatOption or other backends for images)
normalized_format_options: dict[InputFormat, FormatOption] = {}
if format_options:
for format, option in format_options.items():
if (
format == InputFormat.IMAGE
and option.backend is not ImageDocumentBackend
):
warnings.warn(
f"Using {option.backend.__name__} for InputFormat.IMAGE is deprecated. "
"Images should use ImageDocumentBackend via ImageFormatOption. "
"Automatically correcting the backend, please update your code to avoid this warning.",
DeprecationWarning,
stacklevel=2,
)
# Convert to ImageFormatOption while preserving pipeline and backend options
normalized_format_options[format] = ImageFormatOption(
pipeline_options=option.pipeline_options,
backend_options=option.backend_options,
)
else:
normalized_format_options[format] = option
self.format_to_options: dict[InputFormat, FormatOption] = {
format: (
_get_default_option(format=format)
if (custom_option := (format_options or {}).get(format)) is None
if (custom_option := normalized_format_options.get(format)) is None
else custom_option
)
for format in self.allowed_formats
@@ -263,8 +290,12 @@ class DocumentConverter:
ConversionStatus.SUCCESS,
ConversionStatus.PARTIAL_SUCCESS,
}:
error_details = ""
if conv_res.errors:
error_messages = [err.error_message for err in conv_res.errors]
error_details = f" Errors: {'; '.join(error_messages)}"
raise ConversionError(
f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}"
f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}.{error_details}"
)
else:
yield conv_res

View File

@@ -14,6 +14,7 @@ from pydantic import ConfigDict, model_validator, validate_call
from typing_extensions import Self
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.image_backend import ImageDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import (
BaseFormatOption,
@@ -72,7 +73,7 @@ def _get_default_extraction_option(fmt: InputFormat) -> ExtractionFormatOption:
the format registry between convert/extract.
"""
format_to_default_backend: dict[InputFormat, Type[AbstractDocumentBackend]] = {
InputFormat.IMAGE: PyPdfiumDocumentBackend,
InputFormat.IMAGE: ImageDocumentBackend,
InputFormat.PDF: PyPdfiumDocumentBackend,
}

View File

@@ -76,8 +76,15 @@ class BasePipeline(ABC):
conv_res.status = self._determine_status(conv_res)
except Exception as e:
conv_res.status = ConversionStatus.FAILURE
if raises_on_error:
raise e
if not raises_on_error:
error_item = ErrorItem(
component_type=DoclingComponentType.PIPELINE,
module_name=self.__class__.__name__,
error_message=str(e),
)
conv_res.errors.append(error_item)
else:
raise RuntimeError(f"Pipeline {self.__class__.__name__} failed") from e
finally:
self._unload(conv_res)

View File

@@ -30,7 +30,13 @@ from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend
from docling.datamodel.base_models import AssembledUnit, ConversionStatus, Page
from docling.datamodel.base_models import (
AssembledUnit,
ConversionStatus,
DoclingComponentType,
ErrorItem,
Page,
)
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import ThreadedPdfPipelineOptions
from docling.datamodel.settings import settings
@@ -265,7 +271,9 @@ class ThreadedPipelineStage:
)
)
except Exception as exc:
_log.error("Stage %s failed for run %d: %s", self.name, rid, exc)
_log.error(
"Stage %s failed for run %d: %s", self.name, rid, exc, exc_info=True
)
for it in items:
it.is_failed = True
it.error = exc
@@ -598,6 +606,16 @@ class StandardPdfPipeline(ConvertPipeline):
if p.page_no in page_map
or not any(fp == p.page_no for fp, _ in proc.failed_pages)
]
# Add error details from failed pages
for page_no, error in proc.failed_pages:
page_label = f"Page {page_no + 1}" if page_no >= 0 else "Unknown page"
error_msg = str(error) if error else ""
error_item = ErrorItem(
component_type=DoclingComponentType.PIPELINE,
module_name=self.__class__.__name__,
error_message=f"{page_label}: {error_msg}" if error_msg else page_label,
)
conv_res.errors.append(error_item)
if proc.is_complete_failure:
conv_res.status = ConversionStatus.FAILURE
elif proc.is_partial_success: