mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 12:48:28 +00:00
Apply ruff formatting to test file
Co-authored-by: cau-git <60343111+cau-git@users.noreply.github.com>
This commit is contained in:
@@ -12,24 +12,24 @@ from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
|||||||
def test_page_range_beyond_32():
|
def test_page_range_beyond_32():
|
||||||
"""
|
"""
|
||||||
Test that page_range works correctly when requesting pages beyond index 32.
|
Test that page_range works correctly when requesting pages beyond index 32.
|
||||||
|
|
||||||
This test verifies the fix for the bug where page_range would stop at page 32
|
This test verifies the fix for the bug where page_range would stop at page 32
|
||||||
when the range started from page 30 or higher.
|
when the range started from page 30 or higher.
|
||||||
|
|
||||||
Bug scenario:
|
Bug scenario:
|
||||||
- page_range=(30, 35) would only extract pages 30-32 instead of 30-35
|
- page_range=(30, 35) would only extract pages 30-32 instead of 30-35
|
||||||
- Pages with page_no >= 32 (0-indexed) were not being processed
|
- Pages with page_no >= 32 (0-indexed) were not being processed
|
||||||
|
|
||||||
Root cause was a hardcoded batch_size=32 in the drain loop.
|
Root cause was a hardcoded batch_size=32 in the drain loop.
|
||||||
"""
|
"""
|
||||||
# Use a multi-page PDF for testing
|
# Use a multi-page PDF for testing
|
||||||
# Note: 2206.01062.pdf is a research paper that should have enough pages
|
# Note: 2206.01062.pdf is a research paper that should have enough pages
|
||||||
test_pdf = Path("tests/data/pdf/2206.01062.pdf")
|
test_pdf = Path("tests/data/pdf/2206.01062.pdf")
|
||||||
|
|
||||||
# Skip test if PDF doesn't exist
|
# Skip test if PDF doesn't exist
|
||||||
if not test_pdf.exists():
|
if not test_pdf.exists():
|
||||||
pytest.skip(f"Test PDF not found: {test_pdf}")
|
pytest.skip(f"Test PDF not found: {test_pdf}")
|
||||||
|
|
||||||
# Create converter with StandardPdfPipeline
|
# Create converter with StandardPdfPipeline
|
||||||
converter = DocumentConverter(
|
converter = DocumentConverter(
|
||||||
allowed_formats=[InputFormat.PDF],
|
allowed_formats=[InputFormat.PDF],
|
||||||
@@ -37,15 +37,15 @@ def test_page_range_beyond_32():
|
|||||||
InputFormat.PDF: PdfFormatOption(pipeline_cls=StandardPdfPipeline),
|
InputFormat.PDF: PdfFormatOption(pipeline_cls=StandardPdfPipeline),
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
# First, convert without page_range to get total page count
|
# First, convert without page_range to get total page count
|
||||||
result_full = converter.convert(test_pdf)
|
result_full = converter.convert(test_pdf)
|
||||||
total_pages = len(result_full.pages)
|
total_pages = len(result_full.pages)
|
||||||
|
|
||||||
# Skip if PDF doesn't have enough pages for this test
|
# Skip if PDF doesn't have enough pages for this test
|
||||||
if total_pages < 40:
|
if total_pages < 40:
|
||||||
pytest.skip(f"PDF has only {total_pages} pages, need at least 40 for this test")
|
pytest.skip(f"PDF has only {total_pages} pages, need at least 40 for this test")
|
||||||
|
|
||||||
# Test case 1: page_range=(1, 45) should work
|
# Test case 1: page_range=(1, 45) should work
|
||||||
result1 = converter.convert(test_pdf, page_range=(1, min(45, total_pages)))
|
result1 = converter.convert(test_pdf, page_range=(1, min(45, total_pages)))
|
||||||
expected_pages1 = min(45, total_pages)
|
expected_pages1 = min(45, total_pages)
|
||||||
@@ -53,7 +53,7 @@ def test_page_range_beyond_32():
|
|||||||
f"Expected {expected_pages1} pages for range (1, {min(45, total_pages)}), "
|
f"Expected {expected_pages1} pages for range (1, {min(45, total_pages)}), "
|
||||||
f"got {len(result1.pages)}"
|
f"got {len(result1.pages)}"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Test case 2: page_range=(30, 35) should extract pages 30-35 (not just 30-32)
|
# Test case 2: page_range=(30, 35) should extract pages 30-35 (not just 30-32)
|
||||||
# This was the failing case in the bug report
|
# This was the failing case in the bug report
|
||||||
result2 = converter.convert(test_pdf, page_range=(30, min(35, total_pages)))
|
result2 = converter.convert(test_pdf, page_range=(30, min(35, total_pages)))
|
||||||
@@ -62,14 +62,16 @@ def test_page_range_beyond_32():
|
|||||||
f"Expected {expected_pages2} pages for range (30, {min(35, total_pages)}), "
|
f"Expected {expected_pages2} pages for range (30, {min(35, total_pages)}), "
|
||||||
f"got {len(result2.pages)}. This is the bug: conversion stopped prematurely!"
|
f"got {len(result2.pages)}. This is the bug: conversion stopped prematurely!"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Verify that the page numbers are correct
|
# Verify that the page numbers are correct
|
||||||
page_numbers2 = [p.page_no for p in result2.pages]
|
page_numbers2 = [p.page_no for p in result2.pages]
|
||||||
expected_page_nos = list(range(29, 29 + expected_pages2)) # 0-indexed: 29, 30, 31, 32, 33, 34
|
expected_page_nos = list(
|
||||||
|
range(29, 29 + expected_pages2)
|
||||||
|
) # 0-indexed: 29, 30, 31, 32, 33, 34
|
||||||
assert page_numbers2 == expected_page_nos, (
|
assert page_numbers2 == expected_page_nos, (
|
||||||
f"Expected page numbers {expected_page_nos}, got {page_numbers2}"
|
f"Expected page numbers {expected_page_nos}, got {page_numbers2}"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Test case 3: page_range=(30, 45) should extract pages 30-45 (not just 30-32)
|
# Test case 3: page_range=(30, 45) should extract pages 30-45 (not just 30-32)
|
||||||
result3 = converter.convert(test_pdf, page_range=(30, min(45, total_pages)))
|
result3 = converter.convert(test_pdf, page_range=(30, min(45, total_pages)))
|
||||||
expected_pages3 = min(16, total_pages - 29) # pages 30-45 is 16 pages
|
expected_pages3 = min(16, total_pages - 29) # pages 30-45 is 16 pages
|
||||||
@@ -77,7 +79,7 @@ def test_page_range_beyond_32():
|
|||||||
f"Expected {expected_pages3} pages for range (30, {min(45, total_pages)}), "
|
f"Expected {expected_pages3} pages for range (30, {min(45, total_pages)}), "
|
||||||
f"got {len(result3.pages)}. This is the bug: conversion stopped prematurely!"
|
f"got {len(result3.pages)}. This is the bug: conversion stopped prematurely!"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Test case 4: page_range with pages entirely beyond 32
|
# Test case 4: page_range with pages entirely beyond 32
|
||||||
if total_pages >= 40:
|
if total_pages >= 40:
|
||||||
result4 = converter.convert(test_pdf, page_range=(35, min(40, total_pages)))
|
result4 = converter.convert(test_pdf, page_range=(35, min(40, total_pages)))
|
||||||
@@ -86,7 +88,7 @@ def test_page_range_beyond_32():
|
|||||||
f"Expected {expected_pages4} pages for range (35, {min(40, total_pages)}), "
|
f"Expected {expected_pages4} pages for range (35, {min(40, total_pages)}), "
|
||||||
f"got {len(result4.pages)}"
|
f"got {len(result4.pages)}"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Verify that pages with page_no >= 32 are actually processed
|
# Verify that pages with page_no >= 32 are actually processed
|
||||||
page_numbers4 = [p.page_no for p in result4.pages]
|
page_numbers4 = [p.page_no for p in result4.pages]
|
||||||
assert all(page_no >= 34 for page_no in page_numbers4), (
|
assert all(page_no >= 34 for page_no in page_numbers4), (
|
||||||
@@ -97,36 +99,40 @@ def test_page_range_beyond_32():
|
|||||||
def test_page_range_edge_cases():
|
def test_page_range_edge_cases():
|
||||||
"""Test edge cases for page_range parameter."""
|
"""Test edge cases for page_range parameter."""
|
||||||
test_pdf = Path("tests/data/pdf/2206.01062.pdf")
|
test_pdf = Path("tests/data/pdf/2206.01062.pdf")
|
||||||
|
|
||||||
if not test_pdf.exists():
|
if not test_pdf.exists():
|
||||||
pytest.skip(f"Test PDF not found: {test_pdf}")
|
pytest.skip(f"Test PDF not found: {test_pdf}")
|
||||||
|
|
||||||
converter = DocumentConverter(
|
converter = DocumentConverter(
|
||||||
allowed_formats=[InputFormat.PDF],
|
allowed_formats=[InputFormat.PDF],
|
||||||
format_options={
|
format_options={
|
||||||
InputFormat.PDF: PdfFormatOption(pipeline_cls=StandardPdfPipeline),
|
InputFormat.PDF: PdfFormatOption(pipeline_cls=StandardPdfPipeline),
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
# Get total page count
|
# Get total page count
|
||||||
result_full = converter.convert(test_pdf)
|
result_full = converter.convert(test_pdf)
|
||||||
total_pages = len(result_full.pages)
|
total_pages = len(result_full.pages)
|
||||||
|
|
||||||
if total_pages < 5:
|
if total_pages < 5:
|
||||||
pytest.skip(f"PDF has only {total_pages} pages, need at least 5 for this test")
|
pytest.skip(f"PDF has only {total_pages} pages, need at least 5 for this test")
|
||||||
|
|
||||||
# Edge case 1: Single page at boundary (page 32)
|
# Edge case 1: Single page at boundary (page 32)
|
||||||
if total_pages >= 32:
|
if total_pages >= 32:
|
||||||
result = converter.convert(test_pdf, page_range=(32, 32))
|
result = converter.convert(test_pdf, page_range=(32, 32))
|
||||||
assert len(result.pages) == 1, f"Expected 1 page, got {len(result.pages)}"
|
assert len(result.pages) == 1, f"Expected 1 page, got {len(result.pages)}"
|
||||||
assert result.pages[0].page_no == 31, f"Expected page_no=31, got {result.pages[0].page_no}"
|
assert result.pages[0].page_no == 31, (
|
||||||
|
f"Expected page_no=31, got {result.pages[0].page_no}"
|
||||||
|
)
|
||||||
|
|
||||||
# Edge case 2: Single page after boundary (page 33)
|
# Edge case 2: Single page after boundary (page 33)
|
||||||
if total_pages >= 33:
|
if total_pages >= 33:
|
||||||
result = converter.convert(test_pdf, page_range=(33, 33))
|
result = converter.convert(test_pdf, page_range=(33, 33))
|
||||||
assert len(result.pages) == 1, f"Expected 1 page, got {len(result.pages)}"
|
assert len(result.pages) == 1, f"Expected 1 page, got {len(result.pages)}"
|
||||||
assert result.pages[0].page_no == 32, f"Expected page_no=32, got {result.pages[0].page_no}"
|
assert result.pages[0].page_no == 32, (
|
||||||
|
f"Expected page_no=32, got {result.pages[0].page_no}"
|
||||||
|
)
|
||||||
|
|
||||||
# Edge case 3: Range crossing the boundary (31-34)
|
# Edge case 3: Range crossing the boundary (31-34)
|
||||||
if total_pages >= 34:
|
if total_pages >= 34:
|
||||||
result = converter.convert(test_pdf, page_range=(31, 34))
|
result = converter.convert(test_pdf, page_range=(31, 34))
|
||||||
|
|||||||
Reference in New Issue
Block a user