mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-23 18:45:00 +00:00
Fix multi-page TIFF image support
Co-authored-by: cau-git <60343111+cau-git@users.noreply.github.com>
This commit is contained in:
parent
420df478f3
commit
c1d722725f
@ -57,7 +57,28 @@ class PdfDocumentBackend(PaginatedDocumentBackend):
|
||||
if self.input_format is InputFormat.IMAGE:
|
||||
buf = BytesIO()
|
||||
img = Image.open(self.path_or_stream)
|
||||
img.save(buf, "PDF")
|
||||
|
||||
# Handle multi-page TIFF images
|
||||
if hasattr(img, 'n_frames') and img.n_frames > 1:
|
||||
# Extract all frames from multi-page image
|
||||
frames = []
|
||||
try:
|
||||
for i in range(img.n_frames):
|
||||
img.seek(i)
|
||||
frames.append(img.copy())
|
||||
except EOFError:
|
||||
pass
|
||||
|
||||
# Save as multi-page PDF
|
||||
if frames:
|
||||
frames[0].save(buf, "PDF", save_all=True, append_images=frames[1:])
|
||||
else:
|
||||
# Fallback to single page if frame extraction fails
|
||||
img.save(buf, "PDF")
|
||||
else:
|
||||
# Single page image - use existing behavior
|
||||
img.save(buf, "PDF")
|
||||
|
||||
buf.seek(0)
|
||||
self.path_or_stream = buf
|
||||
else:
|
||||
|
139
tests/test_multipage_tiff.py
Normal file
139
tests/test_multipage_tiff.py
Normal file
@ -0,0 +1,139 @@
|
||||
import os
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from io import BytesIO
|
||||
|
||||
import pypdfium2 as pdfium
|
||||
from PIL import Image, ImageDraw, ImageFont
|
||||
|
||||
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
|
||||
|
||||
def create_test_multipage_image(format_name, num_pages=3):
    """Create a temporary multi-page image file and return its path.

    Every page is a 200x150 white RGB canvas carrying the text ``Page <n>``
    and a rectangle outline whose colour cycles through a fixed palette, so
    that consecutive frames differ from each other.

    Args:
        format_name: Format name handed to ``Image.save`` (the callers in this
            file use ``'TIFF'``, ``'GIF'`` and ``'WEBP'``). It also selects
            the file suffix: ``.tif`` for ``'TIFF'``, otherwise the
            lower-cased format name.
        num_pages: Number of frames to write. Must be at least 1.

    Returns:
        The path of the written file as a string. The file is created with
        ``delete=False``, so the caller is responsible for removing it.

    Raises:
        ValueError: If ``num_pages`` is smaller than 1. Raised before any
            temporary file is created, so nothing is left behind on disk.
    """
    if num_pages < 1:
        raise ValueError(f"num_pages must be >= 1, got {num_pages}")

    # Loop-invariant palette; the outline colour cycles when there are more
    # pages than colours.
    outline_colors = ('red', 'green', 'blue', 'orange', 'purple')

    images = []
    for i in range(num_pages):
        # Create a unique image for each page
        img = Image.new('RGB', (200, 150), color='white')
        draw = ImageDraw.Draw(img)

        # Draw page identifier
        draw.text((10, 10), f"Page {i+1}", fill='black')

        # Draw some unique content for each page
        draw.rectangle(
            [10, 30, 190, 140],
            outline=outline_colors[i % len(outline_colors)],
            width=2,
        )

        images.append(img)

    suffix = '.tif' if format_name == 'TIFF' else f'.{format_name.lower()}'

    # Only reserve a unique path here: the handle is closed immediately and
    # the image is then written to that path by name.
    temp_file = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
    temp_file.close()

    try:
        images[0].save(
            temp_file.name,
            format_name,
            save_all=True,
            append_images=images[1:],
        )
    except Exception:
        # Clean up on error, then re-raise with the original traceback.
        if os.path.exists(temp_file.name):
            os.unlink(temp_file.name)
        raise

    return temp_file.name
|
||||
|
||||
|
||||
def test_multipage_image_support():
    """Test that multi-page image files are properly handled.

    For every ``(format, page count)`` case a temporary image is generated
    and the following is checked:

    1. Pillow reports the requested number of frames for the fixture.
    2. ``DoclingParseV4DocumentBackend.page_count()`` reports the same number.
    3. If ``backend.path_or_stream`` is a ``BytesIO``, the PDF it holds has
       the same number of pages when opened with pypdfium2.

    The temporary file is removed after each case, whether or not the
    assertions pass.
    """
    # Test with different image formats and numbers of pages
    test_cases = [
        ('TIFF', 1, "single page TIFF"),
        ('TIFF', 2, "two-page TIFF"),
        ('TIFF', 3, "three-page TIFF"),
        ('GIF', 2, "two-page GIF"),
        ('WEBP', 2, "two-page WEBP"),
    ]

    for format_name, num_pages, description in test_cases:
        print(f"Testing {description}...")

        # Create test file
        test_file = create_test_multipage_image(format_name, num_pages)

        try:
            # Verify the fixture itself first; the Pillow handle is released
            # before the backend opens the same path.
            with Image.open(test_file) as img:
                expected_pages = getattr(img, 'n_frames', 1)
            assert expected_pages == num_pages, f"Test {format_name} should have {num_pages} pages, got {expected_pages}"

            # Test with docling backend
            input_doc = InputDocument(
                path_or_stream=Path(test_file),
                format=InputFormat.IMAGE,
                backend=DoclingParseV4DocumentBackend
            )

            backend = DoclingParseV4DocumentBackend(input_doc, Path(test_file))

            # Check the page count in the backend
            actual_pages = backend.page_count()
            assert actual_pages == expected_pages, f"Backend should report {expected_pages} pages, got {actual_pages}"

            # Also verify the PDF was created correctly
            if isinstance(backend.path_or_stream, BytesIO):
                backend.path_or_stream.seek(0)
                pdf_doc = pdfium.PdfDocument(backend.path_or_stream)
                try:
                    pdf_pages = len(pdf_doc)
                finally:
                    # Release the pdfium handle even if counting pages fails.
                    pdf_doc.close()
                assert pdf_pages == expected_pages, f"PDF should have {expected_pages} pages, got {pdf_pages}"

            print(f"✅ {description} passed")

        finally:
            # Clean up
            if os.path.exists(test_file):
                os.unlink(test_file)
|
||||
|
||||
|
||||
def test_single_page_image_unchanged():
    """Check that single-frame images are still reported as one page.

    A one-page file is generated for each of TIFF, GIF and WEBP and handed to
    ``DoclingParseV4DocumentBackend``, whose ``page_count()`` must be 1. The
    generated file is deleted afterwards regardless of the outcome.
    """
    for format_name in ('TIFF', 'GIF', 'WEBP'):
        print(f"Testing single-page {format_name}...")

        # One-frame fixture for this format.
        test_file = create_test_multipage_image(format_name, 1)

        try:
            image_path = Path(test_file)
            document = InputDocument(
                path_or_stream=image_path,
                format=InputFormat.IMAGE,
                backend=DoclingParseV4DocumentBackend
            )
            backend = DoclingParseV4DocumentBackend(document, Path(test_file))

            # Exactly one page is expected for a single-frame image.
            assert backend.page_count() == 1, f"Single-page {format_name} should have 1 page, got {backend.page_count()}"

            print(f"✅ Single-page {format_name} test passed")
        finally:
            # Remove the fixture whether or not the assertion held.
            if os.path.exists(test_file):
                os.unlink(test_file)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Allow running this module directly as a script: execute both tests in
    # definition order and report success once neither has raised.
    for _run_test in (test_multipage_image_support, test_single_page_image_unchanged):
        _run_test()
    print("All tests passed!")
|
Loading…
Reference in New Issue
Block a user