mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 12:48:28 +00:00
feat: Add option to define page range (#852)
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
@@ -4,6 +4,7 @@ from pathlib import Path
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.base_models import DocumentStream, InputFormat
|
||||
from docling.datamodel.document import InputDocument, _DocumentConversionInput
|
||||
from docling.datamodel.settings import DocumentLimits
|
||||
|
||||
|
||||
def test_in_doc_from_valid_path():
|
||||
@@ -39,6 +40,40 @@ def test_in_doc_from_invalid_buf():
|
||||
assert doc.valid == False
|
||||
|
||||
|
||||
def test_in_doc_with_page_range():
    """Check how ``InputDocument.valid`` responds to different page ranges.

    A single ``DocumentLimits`` instance is reused; its ``page_range`` is
    reassigned before each ``InputDocument`` is built from the sample PDF.
    """
    pdf_path = Path("./tests/data/2206.01062.pdf")
    limits = DocumentLimits()

    # Each entry pairs a page_range with the value doc.valid must equal.
    cases = (
        ((1, 10), True),
        ((9, 9), True),
        ((11, 12), False),
    )

    for page_range, expected_valid in cases:
        limits.page_range = page_range
        doc = InputDocument(
            path_or_stream=pdf_path,
            format=InputFormat.PDF,
            backend=PyPdfiumDocumentBackend,
            limits=limits,
        )
        assert doc.valid == expected_valid
|
||||
|
||||
|
||||
def test_guess_format(tmp_path):
|
||||
"""Test docling.datamodel.document._DocumentConversionInput.__guess_format"""
|
||||
dci = _DocumentConversionInput(path_or_stream_iterator=[])
|
||||
|
||||
@@ -105,6 +105,20 @@ def test_e2e_conversions(test_doc_path):
|
||||
assert doc_result.status == ConversionStatus.SUCCESS
|
||||
|
||||
|
||||
def test_page_range(test_doc_path):
    """Exercise the ``page_range`` argument of ``DocumentConverter.convert``.

    Converting with ``page_range=(9, 9)`` must succeed, report a page count
    of 9 on the input and yield a one-page document. Converting with
    ``page_range=(10, 10)`` (errors not raised) must end in ``FAILURE``.
    """
    converter = DocumentConverter()

    # Range accepted by the converter: only page 9 is requested.
    single_page: ConversionResult = converter.convert(
        test_doc_path, page_range=(9, 9)
    )
    assert single_page.status == ConversionStatus.SUCCESS
    assert single_page.input.page_count == 9
    assert single_page.document.num_pages() == 1

    # Range expected to fail; raises_on_error=False so the status can be
    # inspected instead of an exception propagating.
    rejected: ConversionResult = converter.convert(
        test_doc_path, page_range=(10, 10), raises_on_error=False
    )
    assert rejected.status == ConversionStatus.FAILURE
|
||||
|
||||
|
||||
def test_ocr_coverage_threshold(test_doc_path):
|
||||
pipeline_options = PdfPipelineOptions()
|
||||
pipeline_options.do_ocr = True
|
||||
|
||||
Reference in New Issue
Block a user