mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-25 03:24:59 +00:00
Merge branch 'main' of github.com:DS4SD/docling into nli/layout_heron
This commit is contained in:
commit
69e0123213
@ -57,7 +57,31 @@ class PdfDocumentBackend(PaginatedDocumentBackend):
|
||||
if self.input_format is InputFormat.IMAGE:
|
||||
buf = BytesIO()
|
||||
img = Image.open(self.path_or_stream)
|
||||
img.save(buf, "PDF")
|
||||
|
||||
# Handle multi-page TIFF images
|
||||
if hasattr(img, "n_frames") and img.n_frames > 1:
|
||||
# Extract all frames from multi-page image
|
||||
frames = []
|
||||
try:
|
||||
for i in range(img.n_frames):
|
||||
img.seek(i)
|
||||
frame = img.copy().convert("RGB")
|
||||
frames.append(frame)
|
||||
except EOFError:
|
||||
pass
|
||||
|
||||
# Save as multi-page PDF
|
||||
if frames:
|
||||
frames[0].save(
|
||||
buf, "PDF", save_all=True, append_images=frames[1:]
|
||||
)
|
||||
else:
|
||||
# Fallback to single page if frame extraction fails
|
||||
img.convert("RGB").save(buf, "PDF")
|
||||
else:
|
||||
# Single page image - convert to RGB and save
|
||||
img.convert("RGB").save(buf, "PDF")
|
||||
|
||||
buf.seek(0)
|
||||
self.path_or_stream = buf
|
||||
else:
|
||||
|
@ -217,7 +217,13 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
||||
return conv_res
|
||||
|
||||
def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
|
||||
status = ConversionStatus.SUCCESS
|
||||
status = conv_res.status
|
||||
if status in [
|
||||
ConversionStatus.PENDING,
|
||||
ConversionStatus.STARTED,
|
||||
]: # preserves ConversionStatus.PARTIAL_SUCCESS
|
||||
status = ConversionStatus.SUCCESS
|
||||
|
||||
for page in conv_res.pages:
|
||||
if page._backend is None or not page._backend.is_valid():
|
||||
conv_res.errors.append(
|
||||
|
BIN
tests/data/tiff/2206.01062.tif
vendored
Normal file
BIN
tests/data/tiff/2206.01062.tif
vendored
Normal file
Binary file not shown.
@ -243,3 +243,26 @@ def _make_input_doc_from_stream(doc_stream):
|
||||
backend=PdfFormatOption().backend, # use default
|
||||
)
|
||||
return in_doc
|
||||
|
||||
|
||||
def test_tiff_two_pages():
    """Verify that a two-page TIFF is ingested as a two-page document.

    The image is opened through the default PDF backend; each resulting page
    is expected to expose a bitmap rectangle spanning (0, 0) to (612, 792).
    """
    source = Path("./tests/data/tiff/2206.01062.tif")
    doc = InputDocument(
        path_or_stream=source,
        format=InputFormat.IMAGE,
        backend=PdfFormatOption().backend,  # use default backend
    )
    assert doc.valid is True
    assert doc.page_count == 2

    # Expect two full-page rectangles. Both iterators are created before
    # either one is advanced, then the first rectangle of each is taken.
    rect_iters = [
        doc._backend.load_page(page_no).get_bitmap_rects() for page_no in (0, 1)
    ]
    first_rect, second_rect = (next(rects) for rects in rect_iters)

    # Each edge must agree across the two pages and equal the expected value.
    expected_edges = (("t", 0), ("l", 0), ("r", 612.0), ("b", 792.0))
    for edge, expected in expected_edges:
        assert getattr(first_rect, edge) == getattr(second_rect, edge) == expected
|
||||
|
Loading…
Reference in New Issue
Block a user