mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 12:48:28 +00:00
feat: add the Image backend (#2627)
* feat: add the Image backend Signed-off-by: Peter Staar <taa@zurich.ibm.com> * fixed the pre-commit Signed-off-by: Peter Staar <taa@zurich.ibm.com> * Fixed single- versus multi-frame image formats Signed-off-by: Peter Staar <taa@zurich.ibm.com> * fix: Proper usage of ImageDocumentBackend in the pipeline, deprecate old code. Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * fix: Adapt tests Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * fix: correct mets_gbs backend test Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * fix: Make ImagePageBackend.get_bitmap_rects() yield Signed-off-by: Christoph Auer <cau@zurich.ibm.com> --------- Signed-off-by: Peter Staar <taa@zurich.ibm.com> Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
committed by
GitHub
parent
ae30373ee7
commit
3495b73de8
188
docling/backend/image_backend.py
Normal file
188
docling/backend/image_backend.py
Normal file
@@ -0,0 +1,188 @@
|
|||||||
|
import logging
|
||||||
|
from io import BytesIO
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Iterable, List, Optional, Union
|
||||||
|
|
||||||
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
|
from docling_core.types.doc.page import (
|
||||||
|
BoundingRectangle,
|
||||||
|
PdfPageBoundaryType,
|
||||||
|
PdfPageGeometry,
|
||||||
|
SegmentedPdfPage,
|
||||||
|
TextCell,
|
||||||
|
)
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||||
|
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
||||||
|
from docling.datamodel.backend_options import PdfBackendOptions
|
||||||
|
from docling.datamodel.base_models import InputFormat, Size
|
||||||
|
from docling.datamodel.document import InputDocument
|
||||||
|
|
||||||
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class _ImagePageBackend(PdfPageBackend):
    """Page backend that serves one already-decoded PIL image as a page.

    The page exposes no text of its own: the text accessors return empty
    results and the whole page is reported as a single bitmap region.
    ``valid`` is computed once in ``__init__``; ``unload()`` drops the image
    reference but does not touch that flag.
    """

    def __init__(self, image: Image.Image):
        self._image: Optional[Image.Image] = image
        self.valid: bool = image is not None

    def is_valid(self) -> bool:
        """Return the validity flag recorded at construction time."""
        return self.valid

    def get_text_in_rect(self, bbox: BoundingBox) -> str:
        """Return an empty string: raw images carry no text without OCR."""
        return ""

    def get_segmented_page(self) -> SegmentedPdfPage:
        """Return an empty segmented page whose geometry matches the image size."""
        assert self._image is not None
        size = self.get_size()
        page_box = BoundingBox(
            l=0.0,
            t=0.0,
            r=float(size.width),
            b=float(size.height),
            coord_origin=CoordOrigin.BOTTOMLEFT,
        )
        # All PDF boundary boxes are set to the same full-image box.
        geometry = PdfPageGeometry(
            angle=0.0,
            rect=BoundingRectangle.from_bounding_box(page_box),
            boundary_type=PdfPageBoundaryType.CROP_BOX,
            art_bbox=page_box,
            bleed_bbox=page_box,
            crop_bbox=page_box,
            media_bbox=page_box,
            trim_bbox=page_box,
        )
        return SegmentedPdfPage(
            dimension=geometry,
            char_cells=[],
            word_cells=[],
            textline_cells=[],
            has_chars=False,
            has_words=False,
            has_lines=False,
        )

    def get_text_cells(self) -> Iterable[TextCell]:
        """Return an empty list: there are no text cells on a raw image."""
        return []

    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
        """Yield exactly one TOPLEFT box covering the whole page.

        Args:
            scale: Factor applied to the box through ``BoundingBox.scaled``
                when it differs from 1.
        """
        assert self._image is not None
        size = self.get_size()
        whole_page = BoundingBox(
            l=0.0,
            t=0.0,
            r=float(size.width),
            b=float(size.height),
            coord_origin=CoordOrigin.TOPLEFT,
        )
        yield whole_page if scale == 1 else whole_page.scaled(scale=scale)

    def get_page_image(
        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
    ) -> Image.Image:
        """Return the page image, optionally cropped and then resized.

        Args:
            scale: Resize factor applied after cropping; 1 leaves the size as is.
            cropbox: Region to cut out, in unscaled image pixels. Boxes that are
                not TOPLEFT are converted relative to the image height first.

        Returns:
            The stored image itself when neither cropping nor scaling applies,
            otherwise a new cropped and/or resized image.
        """
        assert self._image is not None
        result = self._image

        if cropbox is not None:
            # The pipeline is expected to pass TOPLEFT boxes; convert otherwise.
            if cropbox.coord_origin != CoordOrigin.TOPLEFT:
                cropbox = cropbox.to_top_left_origin(result.height)
            x0, y0, x1, y1 = cropbox.as_tuple()
            # Round to whole pixels and clamp each edge to the image bounds.
            pixel_box = (
                max(0, round(x0)),
                max(0, round(y0)),
                min(result.width, round(x1)),
                min(result.height, round(y1)),
            )
            result = result.crop(pixel_box)

        if scale != 1:
            # Never request a dimension below one pixel.
            target_size = (
                max(1, round(result.width * scale)),
                max(1, round(result.height * scale)),
            )
            result = result.resize(target_size)

        return result

    def get_size(self) -> Size:
        """Return the image dimensions in pixels as a ``Size``."""
        assert self._image is not None
        width, height = self._image.size
        return Size(width=width, height=height)

    def unload(self):
        """Drop the image reference so its memory can be reclaimed."""
        self._image = None
|
||||||
|
|
||||||
|
|
||||||
|
class ImageDocumentBackend(PdfDocumentBackend):
    """Image-native backend that bypasses pypdfium2.

    Notes:
        - Subclasses PdfDocumentBackend to satisfy pipeline type checks.
        - Intentionally avoids calling PdfDocumentBackend.__init__ to skip
          the image→PDF conversion and any pypdfium2 usage.
        - Handles multi-page TIFF by extracting frames eagerly to separate
          Image objects to keep thread-safety when pages process in parallel.
    """

    def __init__(
        self,
        in_doc: InputDocument,
        path_or_stream: Union[BytesIO, Path],
        options: PdfBackendOptions = PdfBackendOptions(),
    ):
        """Decode every frame of the input image into an RGB page.

        Args:
            in_doc: The input document this backend belongs to.
            path_or_stream: Path or in-memory stream holding the image data.
            options: Backend options, stored on ``self.options``.

        Raises:
            RuntimeError: If the input format is not ``InputFormat.IMAGE`` or
                the image cannot be opened or decoded.
        """
        # Bypass PdfDocumentBackend.__init__ to avoid image→PDF conversion
        AbstractDocumentBackend.__init__(self, in_doc, path_or_stream, options)
        self.options: PdfBackendOptions = options

        if self.input_format not in {InputFormat.IMAGE}:
            raise RuntimeError(
                f"Incompatible file format {self.input_format} was passed to ImageDocumentBackend."
            )

        # Load frames eagerly for thread-safety across pages
        self._frames: List[Image.Image] = []
        try:
            # The context manager releases the file handle that Pillow opens
            # itself for path inputs (it otherwise stays open for multi-frame
            # formats). Every stored frame is an independent converted copy,
            # so nothing refers back to the source image after this block.
            with Image.open(self.path_or_stream) as img:  # type: ignore[arg-type]
                # Handle multi-frame and single-frame images
                # - multiframe formats: TIFF, GIF, ICO
                # - singleframe formats: JPEG (.jpg, .jpeg), PNG (.png), BMP, WEBP (unless animated), HEIC
                frame_count = getattr(img, "n_frames", 1)

                if frame_count > 1:
                    for i in range(frame_count):
                        img.seek(i)
                        self._frames.append(img.copy().convert("RGB"))
                else:
                    self._frames.append(img.convert("RGB"))
        except Exception as e:
            raise RuntimeError(f"Could not load image for document {self.file}") from e

    def is_valid(self) -> bool:
        """Return True while at least one decoded frame is held."""
        return len(self._frames) > 0

    def page_count(self) -> int:
        """Return the number of decoded frames, one page per frame."""
        return len(self._frames)

    def load_page(self, page_no: int) -> _ImagePageBackend:
        """Return a page backend for the zero-based frame index ``page_no``.

        Raises:
            IndexError: If ``page_no`` is outside ``[0, page_count())``.
        """
        if not (0 <= page_no < len(self._frames)):
            raise IndexError(f"Page index out of range: {page_no}")
        return _ImagePageBackend(self._frames[page_no])

    @classmethod
    def supported_formats(cls) -> set[InputFormat]:
        """Return the formats handled by this backend."""
        # Only IMAGE here; PDF handling remains in PDF-oriented backends
        return {InputFormat.IMAGE}

    @classmethod
    def supports_pagination(cls) -> bool:
        """Report that this backend exposes its content page by page."""
        return True

    def unload(self):
        """Run the parent unload and drop all decoded frames."""
        super().unload()
        self._frames = []
|
||||||
@@ -60,38 +60,7 @@ class PdfDocumentBackend(PaginatedDocumentBackend):
|
|||||||
super().__init__(in_doc, path_or_stream, options)
|
super().__init__(in_doc, path_or_stream, options)
|
||||||
self.options: PdfBackendOptions
|
self.options: PdfBackendOptions
|
||||||
|
|
||||||
if self.input_format is not InputFormat.PDF:
|
if self.input_format not in self.supported_formats():
|
||||||
if self.input_format is InputFormat.IMAGE:
|
|
||||||
buf = BytesIO()
|
|
||||||
img = Image.open(self.path_or_stream)
|
|
||||||
|
|
||||||
# Handle multi-page TIFF images
|
|
||||||
if hasattr(img, "n_frames") and img.n_frames > 1:
|
|
||||||
# Extract all frames from multi-page image
|
|
||||||
frames = []
|
|
||||||
try:
|
|
||||||
for i in range(img.n_frames):
|
|
||||||
img.seek(i)
|
|
||||||
frame = img.copy().convert("RGB")
|
|
||||||
frames.append(frame)
|
|
||||||
except EOFError:
|
|
||||||
pass
|
|
||||||
|
|
||||||
# Save as multi-page PDF
|
|
||||||
if frames:
|
|
||||||
frames[0].save(
|
|
||||||
buf, "PDF", save_all=True, append_images=frames[1:]
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
# Fallback to single page if frame extraction fails
|
|
||||||
img.convert("RGB").save(buf, "PDF")
|
|
||||||
else:
|
|
||||||
# Single page image - convert to RGB and save
|
|
||||||
img.convert("RGB").save(buf, "PDF")
|
|
||||||
|
|
||||||
buf.seek(0)
|
|
||||||
self.path_or_stream = buf
|
|
||||||
elif self.input_format not in self.supported_formats():
|
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
f"Incompatible file format {self.input_format} was passed to a PdfDocumentBackend. Valid format are {','.join(self.supported_formats())}."
|
f"Incompatible file format {self.input_format} was passed to a PdfDocumentBackend. Valid format are {','.join(self.supported_formats())}."
|
||||||
)
|
)
|
||||||
@@ -106,7 +75,7 @@ class PdfDocumentBackend(PaginatedDocumentBackend):
|
|||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def supported_formats(cls) -> Set[InputFormat]:
|
def supported_formats(cls) -> Set[InputFormat]:
|
||||||
return {InputFormat.PDF, InputFormat.IMAGE}
|
return {InputFormat.PDF}
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def supports_pagination(cls) -> bool:
|
def supports_pagination(cls) -> bool:
|
||||||
|
|||||||
@@ -26,6 +26,7 @@ from rich.console import Console
|
|||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||||
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
||||||
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
|
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
|
||||||
|
from docling.backend.image_backend import ImageDocumentBackend
|
||||||
from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend
|
from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend
|
||||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
from docling.backend.pdf_backend import PdfDocumentBackend
|
||||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||||
@@ -698,9 +699,16 @@ def convert( # noqa: C901
|
|||||||
if artifacts_path is not None:
|
if artifacts_path is not None:
|
||||||
simple_format_option.artifacts_path = artifacts_path
|
simple_format_option.artifacts_path = artifacts_path
|
||||||
|
|
||||||
|
# Use image-native backend for IMAGE to avoid pypdfium2 locking
|
||||||
|
image_format_option = PdfFormatOption(
|
||||||
|
pipeline_options=pipeline_options,
|
||||||
|
backend=ImageDocumentBackend,
|
||||||
|
backend_options=pdf_backend_options,
|
||||||
|
)
|
||||||
|
|
||||||
format_options = {
|
format_options = {
|
||||||
InputFormat.PDF: pdf_format_option,
|
InputFormat.PDF: pdf_format_option,
|
||||||
InputFormat.IMAGE: pdf_format_option,
|
InputFormat.IMAGE: image_format_option,
|
||||||
InputFormat.METS_GBS: mets_gbs_format_option,
|
InputFormat.METS_GBS: mets_gbs_format_option,
|
||||||
InputFormat.DOCX: WordFormatOption(
|
InputFormat.DOCX: WordFormatOption(
|
||||||
pipeline_options=simple_format_option
|
pipeline_options=simple_format_option
|
||||||
|
|||||||
@@ -164,6 +164,7 @@ class DoclingComponentType(str, Enum):
|
|||||||
MODEL = "model"
|
MODEL = "model"
|
||||||
DOC_ASSEMBLER = "doc_assembler"
|
DOC_ASSEMBLER = "doc_assembler"
|
||||||
USER_INPUT = "user_input"
|
USER_INPUT = "user_input"
|
||||||
|
PIPELINE = "pipeline"
|
||||||
|
|
||||||
|
|
||||||
class VlmStopReason(str, Enum):
|
class VlmStopReason(str, Enum):
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ import logging
|
|||||||
import sys
|
import sys
|
||||||
import threading
|
import threading
|
||||||
import time
|
import time
|
||||||
|
import warnings
|
||||||
from collections.abc import Iterable, Iterator
|
from collections.abc import Iterable, Iterator
|
||||||
from concurrent.futures import ThreadPoolExecutor
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
@@ -21,6 +22,7 @@ from docling.backend.asciidoc_backend import AsciiDocBackend
|
|||||||
from docling.backend.csv_backend import CsvDocumentBackend
|
from docling.backend.csv_backend import CsvDocumentBackend
|
||||||
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
|
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
|
||||||
from docling.backend.html_backend import HTMLDocumentBackend
|
from docling.backend.html_backend import HTMLDocumentBackend
|
||||||
|
from docling.backend.image_backend import ImageDocumentBackend
|
||||||
from docling.backend.json.docling_json_backend import DoclingJSONBackend
|
from docling.backend.json.docling_json_backend import DoclingJSONBackend
|
||||||
from docling.backend.md_backend import MarkdownDocumentBackend
|
from docling.backend.md_backend import MarkdownDocumentBackend
|
||||||
from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend
|
from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend
|
||||||
@@ -129,7 +131,7 @@ class XMLJatsFormatOption(FormatOption):
|
|||||||
|
|
||||||
class ImageFormatOption(FormatOption):
|
class ImageFormatOption(FormatOption):
|
||||||
pipeline_cls: Type = StandardPdfPipeline
|
pipeline_cls: Type = StandardPdfPipeline
|
||||||
backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend
|
backend: Type[AbstractDocumentBackend] = ImageDocumentBackend
|
||||||
|
|
||||||
|
|
||||||
class PdfFormatOption(FormatOption):
|
class PdfFormatOption(FormatOption):
|
||||||
@@ -184,10 +186,35 @@ class DocumentConverter:
|
|||||||
self.allowed_formats = (
|
self.allowed_formats = (
|
||||||
allowed_formats if allowed_formats is not None else list(InputFormat)
|
allowed_formats if allowed_formats is not None else list(InputFormat)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Normalize format options: ensure IMAGE format uses ImageDocumentBackend
|
||||||
|
# for backward compatibility (old code might use PdfFormatOption or other backends for images)
|
||||||
|
normalized_format_options: dict[InputFormat, FormatOption] = {}
|
||||||
|
if format_options:
|
||||||
|
for format, option in format_options.items():
|
||||||
|
if (
|
||||||
|
format == InputFormat.IMAGE
|
||||||
|
and option.backend is not ImageDocumentBackend
|
||||||
|
):
|
||||||
|
warnings.warn(
|
||||||
|
f"Using {option.backend.__name__} for InputFormat.IMAGE is deprecated. "
|
||||||
|
"Images should use ImageDocumentBackend via ImageFormatOption. "
|
||||||
|
"Automatically correcting the backend, please update your code to avoid this warning.",
|
||||||
|
DeprecationWarning,
|
||||||
|
stacklevel=2,
|
||||||
|
)
|
||||||
|
# Convert to ImageFormatOption while preserving pipeline and backend options
|
||||||
|
normalized_format_options[format] = ImageFormatOption(
|
||||||
|
pipeline_options=option.pipeline_options,
|
||||||
|
backend_options=option.backend_options,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
normalized_format_options[format] = option
|
||||||
|
|
||||||
self.format_to_options: dict[InputFormat, FormatOption] = {
|
self.format_to_options: dict[InputFormat, FormatOption] = {
|
||||||
format: (
|
format: (
|
||||||
_get_default_option(format=format)
|
_get_default_option(format=format)
|
||||||
if (custom_option := (format_options or {}).get(format)) is None
|
if (custom_option := normalized_format_options.get(format)) is None
|
||||||
else custom_option
|
else custom_option
|
||||||
)
|
)
|
||||||
for format in self.allowed_formats
|
for format in self.allowed_formats
|
||||||
@@ -263,8 +290,12 @@ class DocumentConverter:
|
|||||||
ConversionStatus.SUCCESS,
|
ConversionStatus.SUCCESS,
|
||||||
ConversionStatus.PARTIAL_SUCCESS,
|
ConversionStatus.PARTIAL_SUCCESS,
|
||||||
}:
|
}:
|
||||||
|
error_details = ""
|
||||||
|
if conv_res.errors:
|
||||||
|
error_messages = [err.error_message for err in conv_res.errors]
|
||||||
|
error_details = f" Errors: {'; '.join(error_messages)}"
|
||||||
raise ConversionError(
|
raise ConversionError(
|
||||||
f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}"
|
f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}.{error_details}"
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
yield conv_res
|
yield conv_res
|
||||||
|
|||||||
@@ -14,6 +14,7 @@ from pydantic import ConfigDict, model_validator, validate_call
|
|||||||
from typing_extensions import Self
|
from typing_extensions import Self
|
||||||
|
|
||||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||||
|
from docling.backend.image_backend import ImageDocumentBackend
|
||||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||||
from docling.datamodel.base_models import (
|
from docling.datamodel.base_models import (
|
||||||
BaseFormatOption,
|
BaseFormatOption,
|
||||||
@@ -72,7 +73,7 @@ def _get_default_extraction_option(fmt: InputFormat) -> ExtractionFormatOption:
|
|||||||
the format registry between convert/extract.
|
the format registry between convert/extract.
|
||||||
"""
|
"""
|
||||||
format_to_default_backend: dict[InputFormat, Type[AbstractDocumentBackend]] = {
|
format_to_default_backend: dict[InputFormat, Type[AbstractDocumentBackend]] = {
|
||||||
InputFormat.IMAGE: PyPdfiumDocumentBackend,
|
InputFormat.IMAGE: ImageDocumentBackend,
|
||||||
InputFormat.PDF: PyPdfiumDocumentBackend,
|
InputFormat.PDF: PyPdfiumDocumentBackend,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -76,8 +76,15 @@ class BasePipeline(ABC):
|
|||||||
conv_res.status = self._determine_status(conv_res)
|
conv_res.status = self._determine_status(conv_res)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
conv_res.status = ConversionStatus.FAILURE
|
conv_res.status = ConversionStatus.FAILURE
|
||||||
if raises_on_error:
|
if not raises_on_error:
|
||||||
raise e
|
error_item = ErrorItem(
|
||||||
|
component_type=DoclingComponentType.PIPELINE,
|
||||||
|
module_name=self.__class__.__name__,
|
||||||
|
error_message=str(e),
|
||||||
|
)
|
||||||
|
conv_res.errors.append(error_item)
|
||||||
|
else:
|
||||||
|
raise RuntimeError(f"Pipeline {self.__class__.__name__} failed") from e
|
||||||
finally:
|
finally:
|
||||||
self._unload(conv_res)
|
self._unload(conv_res)
|
||||||
|
|
||||||
|
|||||||
@@ -30,7 +30,13 @@ from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
|
|||||||
|
|
||||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
from docling.backend.pdf_backend import PdfDocumentBackend
|
||||||
from docling.datamodel.base_models import AssembledUnit, ConversionStatus, Page
|
from docling.datamodel.base_models import (
|
||||||
|
AssembledUnit,
|
||||||
|
ConversionStatus,
|
||||||
|
DoclingComponentType,
|
||||||
|
ErrorItem,
|
||||||
|
Page,
|
||||||
|
)
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import ThreadedPdfPipelineOptions
|
from docling.datamodel.pipeline_options import ThreadedPdfPipelineOptions
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
@@ -265,7 +271,9 @@ class ThreadedPipelineStage:
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
_log.error("Stage %s failed for run %d: %s", self.name, rid, exc)
|
_log.error(
|
||||||
|
"Stage %s failed for run %d: %s", self.name, rid, exc, exc_info=True
|
||||||
|
)
|
||||||
for it in items:
|
for it in items:
|
||||||
it.is_failed = True
|
it.is_failed = True
|
||||||
it.error = exc
|
it.error = exc
|
||||||
@@ -598,6 +606,16 @@ class StandardPdfPipeline(ConvertPipeline):
|
|||||||
if p.page_no in page_map
|
if p.page_no in page_map
|
||||||
or not any(fp == p.page_no for fp, _ in proc.failed_pages)
|
or not any(fp == p.page_no for fp, _ in proc.failed_pages)
|
||||||
]
|
]
|
||||||
|
# Add error details from failed pages
|
||||||
|
for page_no, error in proc.failed_pages:
|
||||||
|
page_label = f"Page {page_no + 1}" if page_no >= 0 else "Unknown page"
|
||||||
|
error_msg = str(error) if error else ""
|
||||||
|
error_item = ErrorItem(
|
||||||
|
component_type=DoclingComponentType.PIPELINE,
|
||||||
|
module_name=self.__class__.__name__,
|
||||||
|
error_message=f"{page_label}: {error_msg}" if error_msg else page_label,
|
||||||
|
)
|
||||||
|
conv_res.errors.append(error_item)
|
||||||
if proc.is_complete_failure:
|
if proc.is_complete_failure:
|
||||||
conv_res.status = ConversionStatus.FAILURE
|
conv_res.status = ConversionStatus.FAILURE
|
||||||
elif proc.is_partial_success:
|
elif proc.is_partial_success:
|
||||||
|
|||||||
218
tests/test_backend_image_native.py
Normal file
218
tests/test_backend_image_native.py
Normal file
@@ -0,0 +1,218 @@
|
|||||||
|
from io import BytesIO
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
from docling.backend.image_backend import ImageDocumentBackend, _ImagePageBackend
|
||||||
|
from docling.datamodel.base_models import DocumentStream, InputFormat
|
||||||
|
from docling.datamodel.document import InputDocument, _DocumentConversionInput
|
||||||
|
from docling.document_converter import DocumentConverter, ImageFormatOption
|
||||||
|
from docling.document_extractor import DocumentExtractor
|
||||||
|
|
||||||
|
|
||||||
|
def _make_png_stream(
    width: int = 64, height: int = 48, color=(123, 45, 67)
) -> DocumentStream:
    """Build an in-memory, single-colour RGB PNG wrapped as a ``DocumentStream``."""
    payload = BytesIO()
    Image.new("RGB", (width, height), color).save(payload, format="PNG")
    # Rewind so the consumer reads the encoded bytes from the start.
    payload.seek(0)
    return DocumentStream(name="test.png", stream=payload)
|
||||||
|
|
||||||
|
|
||||||
|
def _make_multipage_tiff_stream(num_pages: int = 3, size=(32, 32)) -> DocumentStream:
    """Build an in-memory multi-frame TIFF with ``num_pages`` RGB frames."""
    pages = []
    for idx in range(num_pages):
        # Give each frame its own colour derived from its index.
        shade = (idx * 10 % 255, idx * 20 % 255, idx * 30 % 255)
        pages.append(Image.new("RGB", size, shade))

    payload = BytesIO()
    pages[0].save(payload, format="TIFF", save_all=True, append_images=pages[1:])
    payload.seek(0)
    return DocumentStream(name="test.tiff", stream=payload)
|
||||||
|
|
||||||
|
|
||||||
|
def test_docs_builder_uses_image_backend_for_image_stream():
    """A PNG stream built with ImageFormatOption is backed by ImageDocumentBackend."""
    conv_input = _DocumentConversionInput(
        path_or_stream_iterator=[_make_png_stream()]
    )
    # IMAGE -> ImageFormatOption, which carries ImageDocumentBackend.
    options_by_format = {InputFormat.IMAGE: ImageFormatOption()}

    built = list(conv_input.docs(options_by_format))
    assert len(built) == 1

    in_doc = built[0]
    assert in_doc.format == InputFormat.IMAGE
    assert isinstance(in_doc._backend, ImageDocumentBackend)
    assert in_doc.page_count == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_docs_builder_multipage_tiff_counts_frames():
    """A four-frame TIFF stream yields an input document with four pages."""
    tiff_stream = _make_multipage_tiff_stream(num_pages=4)
    conv_input = _DocumentConversionInput(path_or_stream_iterator=[tiff_stream])
    options_by_format = {InputFormat.IMAGE: ImageFormatOption()}

    # Only the first (and only) document produced by the builder is inspected.
    in_doc = next(conv_input.docs(options_by_format))
    assert isinstance(in_doc._backend, ImageDocumentBackend)
    assert in_doc.page_count == 4
|
||||||
|
|
||||||
|
|
||||||
|
def test_converter_default_maps_image_to_image_backend():
    """DocumentConverter's default IMAGE option uses ImageDocumentBackend."""
    converter = DocumentConverter(allowed_formats=[InputFormat.IMAGE])
    image_option = converter.format_to_options[InputFormat.IMAGE]
    assert image_option.backend is ImageDocumentBackend
|
||||||
|
|
||||||
|
|
||||||
|
def test_extractor_default_maps_image_to_image_backend():
    """DocumentExtractor's default IMAGE option uses ImageDocumentBackend."""
    extractor = DocumentExtractor(allowed_formats=[InputFormat.IMAGE])
    image_option = extractor.extraction_format_to_options[InputFormat.IMAGE]
    assert image_option.backend is ImageDocumentBackend
|
||||||
|
|
||||||
|
|
||||||
|
def _get_backend_from_stream(stream: DocumentStream):
    """Wrap ``stream`` in an IMAGE ``InputDocument`` and return its backend."""
    document = InputDocument(
        filename=stream.name,
        path_or_stream=stream.stream,
        format=InputFormat.IMAGE,
        backend=ImageDocumentBackend,
    )
    return document._backend
|
||||||
|
|
||||||
|
|
||||||
|
def test_num_pages_single():
    """A single-frame PNG is reported as exactly one page."""
    backend = _get_backend_from_stream(_make_png_stream(width=100, height=80))
    assert backend.page_count() == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_num_pages_multipage():
    """A five-frame TIFF is reported as five pages."""
    tiff_stream = _make_multipage_tiff_stream(num_pages=5, size=(64, 64))
    backend = _get_backend_from_stream(tiff_stream)
    assert backend.page_count() == 5
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_size():
    """The page size equals the pixel dimensions of the source image."""
    expected_w, expected_h = 120, 90
    backend = _get_backend_from_stream(
        _make_png_stream(width=expected_w, height=expected_h)
    )
    page: _ImagePageBackend = backend.load_page(0)

    reported = page.get_size()
    assert reported.width == expected_w
    assert reported.height == expected_h
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_page_image_full():
    """Without crop or scale the page image keeps the source dimensions."""
    expected_w, expected_h = 100, 80
    backend = _get_backend_from_stream(
        _make_png_stream(width=expected_w, height=expected_h)
    )
    page: _ImagePageBackend = backend.load_page(0)

    rendered = page.get_page_image()
    assert rendered.width == expected_w
    assert rendered.height == expected_h
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_page_image_scaled():
    """Scaling by 2.0 multiplies both page image dimensions by the factor."""
    base_w, base_h = 100, 80
    factor = 2.0
    backend = _get_backend_from_stream(_make_png_stream(width=base_w, height=base_h))
    page: _ImagePageBackend = backend.load_page(0)

    rendered = page.get_page_image(scale=factor)
    assert rendered.width == round(base_w * factor)
    assert rendered.height == round(base_h * factor)
|
||||||
|
|
||||||
|
|
||||||
|
def test_crop_page_image():
    """Cropping returns an image with the crop box's width and height."""
    backend = _get_backend_from_stream(_make_png_stream(width=200, height=150))
    page: _ImagePageBackend = backend.load_page(0)

    # Region taken from the middle of the 200x150 page.
    region = BoundingBox(l=50, t=30, r=150, b=120, coord_origin=CoordOrigin.TOPLEFT)
    cropped = page.get_page_image(cropbox=region)
    assert cropped.width == 100  # 150 - 50
    assert cropped.height == 90  # 120 - 30
|
||||||
|
|
||||||
|
|
||||||
|
def test_crop_page_image_scaled():
    """Crop and scale together give the cropped size multiplied by the factor."""
    factor = 0.5
    backend = _get_backend_from_stream(_make_png_stream(width=200, height=150))
    page: _ImagePageBackend = backend.load_page(0)

    region = BoundingBox(l=50, t=30, r=150, b=120, coord_origin=CoordOrigin.TOPLEFT)
    rendered = page.get_page_image(scale=factor, cropbox=region)
    assert rendered.width == round(100 * factor)  # cropped width * scale
    assert rendered.height == round(90 * factor)  # cropped height * scale
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_bitmap_rects():
    """The bitmap rects consist of a single TOPLEFT box spanning the page."""
    page_w, page_h = 100, 80
    backend = _get_backend_from_stream(_make_png_stream(width=page_w, height=page_h))
    page: _ImagePageBackend = backend.load_page(0)

    boxes = list(page.get_bitmap_rects())
    assert len(boxes) == 1

    only_box = boxes[0]
    assert only_box.l == 0.0
    assert only_box.t == 0.0
    assert only_box.r == float(page_w)
    assert only_box.b == float(page_h)
    assert only_box.coord_origin == CoordOrigin.TOPLEFT
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_bitmap_rects_scaled():
    """With a scale factor the single bitmap box grows by that factor."""
    page_w, page_h = 100, 80
    factor = 2.0
    backend = _get_backend_from_stream(_make_png_stream(width=page_w, height=page_h))
    page: _ImagePageBackend = backend.load_page(0)

    boxes = list(page.get_bitmap_rects(scale=factor))
    assert len(boxes) == 1

    only_box = boxes[0]
    assert only_box.l == 0.0
    assert only_box.t == 0.0
    assert only_box.r == float(page_w * factor)
    assert only_box.b == float(page_h * factor)
    assert only_box.coord_origin == CoordOrigin.TOPLEFT
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_text_in_rect():
    """get_text_in_rect returns an empty string for images (no OCR)."""
    backend = _get_backend_from_stream(_make_png_stream())
    page: _ImagePageBackend = backend.load_page(0)

    probe = BoundingBox(l=10, t=10, r=50, b=50, coord_origin=CoordOrigin.TOPLEFT)
    assert page.get_text_in_rect(probe) == ""
|
||||||
|
|
||||||
|
|
||||||
|
def test_multipage_access():
    """Each page of a multi-page image can be loaded, is valid, and is 64x64."""
    num_pages = 4
    tiff_stream = _make_multipage_tiff_stream(num_pages=num_pages, size=(64, 64))
    doc_backend = _get_backend_from_stream(tiff_stream)
    assert doc_backend.page_count() == num_pages

    # Walk every page index and verify validity and reported size.
    for page_no in range(num_pages):
        page = doc_backend.load_page(page_no)
        assert page.is_valid()
        dims = page.get_size()
        assert (dims.width, dims.height) == (64, 64)
|
||||||
@@ -15,7 +15,7 @@ def test_doc_path():
|
|||||||
def _get_backend(pdf_doc):
|
def _get_backend(pdf_doc):
|
||||||
in_doc = InputDocument(
|
in_doc = InputDocument(
|
||||||
path_or_stream=pdf_doc,
|
path_or_stream=pdf_doc,
|
||||||
format=InputFormat.PDF,
|
format=InputFormat.METS_GBS,
|
||||||
backend=MetsGbsDocumentBackend,
|
backend=MetsGbsDocumentBackend,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -2,6 +2,8 @@ import sys
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
|
from pydantic.type_adapter import R
|
||||||
|
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import ConversionResult, DoclingDocument
|
from docling.datamodel.document import ConversionResult, DoclingDocument
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
@@ -72,7 +74,9 @@ def test_e2e_webp_conversions():
|
|||||||
for webp_path in webp_paths:
|
for webp_path in webp_paths:
|
||||||
print(f"converting {webp_path}")
|
print(f"converting {webp_path}")
|
||||||
|
|
||||||
doc_result: ConversionResult = converter.convert(webp_path)
|
doc_result: ConversionResult = converter.convert(
|
||||||
|
webp_path, raises_on_error=True
|
||||||
|
)
|
||||||
|
|
||||||
verify_conversion_result_v2(
|
verify_conversion_result_v2(
|
||||||
input_path=webp_path,
|
input_path=webp_path,
|
||||||
|
|||||||
@@ -4,9 +4,6 @@ from pathlib import Path
|
|||||||
import pytest
|
import pytest
|
||||||
from pydantic import ValidationError
|
from pydantic import ValidationError
|
||||||
|
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
|
||||||
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
|
||||||
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
|
|
||||||
from docling.backend.html_backend import HTMLDocumentBackend
|
from docling.backend.html_backend import HTMLDocumentBackend
|
||||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||||
from docling.datamodel.backend_options import (
|
from docling.datamodel.backend_options import (
|
||||||
@@ -17,7 +14,7 @@ from docling.datamodel.backend_options import (
|
|||||||
from docling.datamodel.base_models import DocumentStream, InputFormat
|
from docling.datamodel.base_models import DocumentStream, InputFormat
|
||||||
from docling.datamodel.document import InputDocument, _DocumentConversionInput
|
from docling.datamodel.document import InputDocument, _DocumentConversionInput
|
||||||
from docling.datamodel.settings import DocumentLimits
|
from docling.datamodel.settings import DocumentLimits
|
||||||
from docling.document_converter import PdfFormatOption
|
from docling.document_converter import ImageFormatOption, PdfFormatOption
|
||||||
|
|
||||||
|
|
||||||
def test_in_doc_from_valid_path():
|
def test_in_doc_from_valid_path():
|
||||||
@@ -51,36 +48,6 @@ def test_in_doc_from_invalid_buf():
|
|||||||
assert doc.valid is False
|
assert doc.valid is False
|
||||||
|
|
||||||
|
|
||||||
def test_image_in_pdf_backend():
    """Check that a PNG input document is valid with each PDF backend in turn."""
    image_path = Path("tests/data/2305.03393v1-pg9-img.png")
    # Same order as the original sequence of checks.
    pdf_backends = (
        PyPdfiumDocumentBackend,
        DoclingParseDocumentBackend,
        DoclingParseV2DocumentBackend,
        DoclingParseV4DocumentBackend,
    )
    for backend_cls in pdf_backends:
        in_doc = InputDocument(
            path_or_stream=image_path,
            format=InputFormat.IMAGE,
            backend=backend_cls,
        )
        assert in_doc.valid
|
|
||||||
|
|
||||||
|
|
||||||
def test_in_doc_with_page_range():
|
def test_in_doc_with_page_range():
|
||||||
test_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
|
test_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
|
||||||
limits = DocumentLimits()
|
limits = DocumentLimits()
|
||||||
@@ -297,7 +264,7 @@ def test_tiff_two_pages():
|
|||||||
doc = InputDocument(
|
doc = InputDocument(
|
||||||
path_or_stream=tiff_path,
|
path_or_stream=tiff_path,
|
||||||
format=InputFormat.IMAGE,
|
format=InputFormat.IMAGE,
|
||||||
backend=PdfFormatOption().backend, # use default backend
|
backend=ImageFormatOption().backend, # use default backend
|
||||||
)
|
)
|
||||||
assert doc.valid is True
|
assert doc.valid is True
|
||||||
assert doc.page_count == 2
|
assert doc.page_count == 2
|
||||||
|
|||||||
Reference in New Issue
Block a user