feat: Add option to define page range (#852)

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer
2025-01-31 15:23:00 +01:00
committed by GitHub
parent d727b04ad0
commit 70d68b6164
6 changed files with 82 additions and 4 deletions

View File

@@ -4,6 +4,7 @@ from pathlib import Path
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import DocumentStream, InputFormat
from docling.datamodel.document import InputDocument, _DocumentConversionInput
from docling.datamodel.settings import DocumentLimits
def test_in_doc_from_valid_path():
@@ -39,6 +40,40 @@ def test_in_doc_from_invalid_buf():
assert doc.valid == False
def test_in_doc_with_page_range():
    """Check that ``InputDocument.valid`` follows the configured page range.

    A single ``DocumentLimits`` instance is reused; its ``page_range`` is
    reassigned before each ``InputDocument`` is constructed.
    """
    pdf_path = Path("./tests/data/2206.01062.pdf")
    limits = DocumentLimits()

    # Each entry pairs a page_range with the value doc.valid must equal.
    cases = (
        ((1, 10), True),
        ((9, 9), True),
        ((11, 12), False),
    )

    for page_range, expected_valid in cases:
        limits.page_range = page_range
        doc = InputDocument(
            path_or_stream=pdf_path,
            format=InputFormat.PDF,
            backend=PyPdfiumDocumentBackend,
            limits=limits,
        )
        assert doc.valid == expected_valid
def test_guess_format(tmp_path):
"""Test docling.datamodel.document._DocumentConversionInput.__guess_format"""
dci = _DocumentConversionInput(path_or_stream_iterator=[])

View File

@@ -105,6 +105,20 @@ def test_e2e_conversions(test_doc_path):
assert doc_result.status == ConversionStatus.SUCCESS
def test_page_range(test_doc_path):
    """Convert ``test_doc_path`` with explicit page ranges and check the outcome.

    The range (9, 9) must succeed and produce a one-page document; the range
    (10, 10) must report FAILURE when errors are not raised.
    """
    converter = DocumentConverter()

    # Single-page range: conversion succeeds, the input reports 9 pages and
    # the converted document holds exactly one page.
    single_page_result: ConversionResult = converter.convert(
        test_doc_path,
        page_range=(9, 9),
    )
    assert single_page_result.status == ConversionStatus.SUCCESS
    assert single_page_result.input.page_count == 9
    assert single_page_result.document.num_pages() == 1

    # raises_on_error=False so the failure surfaces as a status rather than
    # as an exception.
    failed_result: ConversionResult = converter.convert(
        test_doc_path,
        page_range=(10, 10),
        raises_on_error=False,
    )
    assert failed_result.status == ConversionStatus.FAILURE
def test_ocr_coverage_threshold(test_doc_path):
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True