mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-25 03:24:59 +00:00
fix: multi-page image support (tiff) (#1928)
* Initial plan * Fix multi-page TIFF image support Co-authored-by: cau-git <60343111+cau-git@users.noreply.github.com> * add RGB conversion Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Remove pointless test Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add multi-page TIFF test data and verification tests Co-authored-by: cau-git <60343111+cau-git@users.noreply.github.com> * Revert "Add multi-page TIFF test data and verification tests" This reverts commit 130a10e2d9.
* Proper test for 2 page tiff file Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * DCO Remediation Commit for copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> I, copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>, hereby add my Signed-off-by to this commit: 420df478f3
I, copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>, hereby add my Signed-off-by to this commit: c1d722725f
I, Christoph Auer <cau@zurich.ibm.com>, hereby add my Signed-off-by to this commit: 6aa85cc933
I, copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>, hereby add my Signed-off-by to this commit: 130a10e2d9
I, Christoph Auer <cau@zurich.ibm.com>, hereby add my Signed-off-by to this commit: d571f36299
I, Christoph Auer <cau@zurich.ibm.com>, hereby add my Signed-off-by to this commit: 2aab66288b
Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Proper test for 2 page tiff file (2) Signed-off-by: Christoph Auer <cau@zurich.ibm.com> --------- Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: cau-git <60343111+cau-git@users.noreply.github.com> Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
ec971bbe68
commit
8d50a59d48
@@ -57,7 +57,31 @@ class PdfDocumentBackend(PaginatedDocumentBackend):
|
|||||||
if self.input_format is InputFormat.IMAGE:
|
if self.input_format is InputFormat.IMAGE:
|
||||||
buf = BytesIO()
|
buf = BytesIO()
|
||||||
img = Image.open(self.path_or_stream)
|
img = Image.open(self.path_or_stream)
|
||||||
img.save(buf, "PDF")
|
|
||||||
|
# Handle multi-page TIFF images
|
||||||
|
if hasattr(img, "n_frames") and img.n_frames > 1:
|
||||||
|
# Extract all frames from multi-page image
|
||||||
|
frames = []
|
||||||
|
try:
|
||||||
|
for i in range(img.n_frames):
|
||||||
|
img.seek(i)
|
||||||
|
frame = img.copy().convert("RGB")
|
||||||
|
frames.append(frame)
|
||||||
|
except EOFError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Save as multi-page PDF
|
||||||
|
if frames:
|
||||||
|
frames[0].save(
|
||||||
|
buf, "PDF", save_all=True, append_images=frames[1:]
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Fallback to single page if frame extraction fails
|
||||||
|
img.convert("RGB").save(buf, "PDF")
|
||||||
|
else:
|
||||||
|
# Single page image - convert to RGB and save
|
||||||
|
img.convert("RGB").save(buf, "PDF")
|
||||||
|
|
||||||
buf.seek(0)
|
buf.seek(0)
|
||||||
self.path_or_stream = buf
|
self.path_or_stream = buf
|
||||||
else:
|
else:
|
||||||
|
BIN
tests/data/tiff/2206.01062.tif
vendored
Normal file
BIN
tests/data/tiff/2206.01062.tif
vendored
Normal file
Binary file not shown.
@@ -243,3 +243,26 @@ def _make_input_doc_from_stream(doc_stream):
|
|||||||
backend=PdfFormatOption().backend, # use default
|
backend=PdfFormatOption().backend, # use default
|
||||||
)
|
)
|
||||||
return in_doc
|
return in_doc
|
||||||
|
|
||||||
|
|
||||||
|
def test_tiff_two_pages():
|
||||||
|
tiff_path = Path("./tests/data/tiff/2206.01062.tif")
|
||||||
|
doc = InputDocument(
|
||||||
|
path_or_stream=tiff_path,
|
||||||
|
format=InputFormat.IMAGE,
|
||||||
|
backend=PdfFormatOption().backend, # use default backend
|
||||||
|
)
|
||||||
|
assert doc.valid is True
|
||||||
|
assert doc.page_count == 2
|
||||||
|
|
||||||
|
# Expect two full-page rectangles
|
||||||
|
rects_page1 = doc._backend.load_page(0).get_bitmap_rects()
|
||||||
|
rects_page2 = doc._backend.load_page(1).get_bitmap_rects()
|
||||||
|
|
||||||
|
page1_rect = next(rects_page1)
|
||||||
|
page2_rect = next(rects_page2)
|
||||||
|
|
||||||
|
assert page1_rect.t == page2_rect.t == 0
|
||||||
|
assert page1_rect.l == page2_rect.l == 0
|
||||||
|
assert page1_rect.r == page2_rect.r == 612.0
|
||||||
|
assert page1_rect.b == page2_rect.b == 792.0
|
||||||
|
Loading…
Reference in New Issue
Block a user