mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-10 13:48:13 +00:00
feat: Add option to define page range (#852)
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
@@ -157,6 +157,8 @@ class InputDocument(BaseModel):
|
||||
self.page_count = self._backend.page_count()
|
||||
if not self.page_count <= self.limits.max_num_pages:
|
||||
self.valid = False
|
||||
elif self.page_count < self.limits.page_range[0]:
|
||||
self.valid = False
|
||||
|
||||
except (FileNotFoundError, OSError) as e:
|
||||
self.valid = False
|
||||
|
||||
@@ -1,13 +1,28 @@
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Annotated, Tuple
|
||||
|
||||
from pydantic import BaseModel
|
||||
from pydantic import BaseModel, PlainValidator
|
||||
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||
|
||||
|
||||
def _validate_page_range(v: Tuple[int, int]) -> Tuple[int, int]:
|
||||
if v[0] < 1 or v[1] < v[0]:
|
||||
raise ValueError(
|
||||
"Invalid page range: start must be ≥ 1 and end must be ≥ start."
|
||||
)
|
||||
return v
|
||||
|
||||
|
||||
PageRange = Annotated[Tuple[int, int], PlainValidator(_validate_page_range)]
|
||||
|
||||
DEFAULT_PAGE_RANGE: PageRange = (1, sys.maxsize)
|
||||
|
||||
|
||||
class DocumentLimits(BaseModel):
    """Limits applied to an input document before and during conversion.

    Every field defaults to a value that imposes no restriction.
    """

    # Upper bound on the document's page count; defaults to sys.maxsize,
    # i.e. effectively unlimited.
    max_num_pages: int = sys.maxsize
    # Upper bound on the input file size; defaults to sys.maxsize.
    # NOTE(review): presumably measured in bytes — confirm against the caller.
    max_file_size: int = sys.maxsize
    # 1-based, inclusive (start, end) range of pages to process; checked by
    # _validate_page_range through the PageRange annotation.
    page_range: PageRange = DEFAULT_PAGE_RANGE
|
||||
|
||||
|
||||
class BatchConcurrencySettings(BaseModel):
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
import logging
|
||||
import math
|
||||
import sys
|
||||
import time
|
||||
from functools import partial
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable, Iterator, List, Optional, Type, Union
|
||||
from typing import Dict, Iterable, Iterator, List, Optional, Tuple, Type, Union
|
||||
|
||||
from pydantic import BaseModel, ConfigDict, model_validator, validate_call
|
||||
|
||||
@@ -31,7 +32,12 @@ from docling.datamodel.document import (
|
||||
_DocumentConversionInput,
|
||||
)
|
||||
from docling.datamodel.pipeline_options import PipelineOptions
|
||||
from docling.datamodel.settings import DocumentLimits, settings
|
||||
from docling.datamodel.settings import (
|
||||
DEFAULT_PAGE_RANGE,
|
||||
DocumentLimits,
|
||||
PageRange,
|
||||
settings,
|
||||
)
|
||||
from docling.exceptions import ConversionError
|
||||
from docling.pipeline.base_pipeline import BasePipeline
|
||||
from docling.pipeline.simple_pipeline import SimplePipeline
|
||||
@@ -184,6 +190,7 @@ class DocumentConverter:
|
||||
raises_on_error: bool = True,
|
||||
max_num_pages: int = sys.maxsize,
|
||||
max_file_size: int = sys.maxsize,
|
||||
page_range: PageRange = DEFAULT_PAGE_RANGE,
|
||||
) -> ConversionResult:
|
||||
all_res = self.convert_all(
|
||||
source=[source],
|
||||
@@ -191,6 +198,7 @@ class DocumentConverter:
|
||||
max_num_pages=max_num_pages,
|
||||
max_file_size=max_file_size,
|
||||
headers=headers,
|
||||
page_range=page_range,
|
||||
)
|
||||
return next(all_res)
|
||||
|
||||
@@ -202,10 +210,12 @@ class DocumentConverter:
|
||||
raises_on_error: bool = True, # True: raises on first conversion error; False: does not raise on conv error
|
||||
max_num_pages: int = sys.maxsize,
|
||||
max_file_size: int = sys.maxsize,
|
||||
page_range: PageRange = DEFAULT_PAGE_RANGE,
|
||||
) -> Iterator[ConversionResult]:
|
||||
limits = DocumentLimits(
|
||||
max_num_pages=max_num_pages,
|
||||
max_file_size=max_file_size,
|
||||
page_range=page_range,
|
||||
)
|
||||
conv_input = _DocumentConversionInput(
|
||||
path_or_stream_iterator=source, limits=limits, headers=headers
|
||||
|
||||
@@ -141,7 +141,9 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
||||
with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
|
||||
|
||||
for i in range(0, conv_res.input.page_count):
|
||||
conv_res.pages.append(Page(page_no=i))
|
||||
start_page, end_page = conv_res.input.limits.page_range
|
||||
if (start_page - 1) <= i <= (end_page - 1):
|
||||
conv_res.pages.append(Page(page_no=i))
|
||||
|
||||
try:
|
||||
# Iterate batches of pages (page_batch_size) in the doc
|
||||
|
||||
Reference in New Issue
Block a user