docs: add Pydantic field documentation for PipelineOptions (#2771)

* Add Pydantic field descriptions for class OcrOptions

Signed-off-by: Nikolaos Georgantopoulos <niko.geor@outlook.com>

* Add Pydantic field descriptions for class OcrAutoOptions

Signed-off-by: Nikolaos Georgantopoulos <niko.geor@outlook.com>

* Add Pydantic field documentation for class PipelineOptions

Signed-off-by: Nikolaos Georgantopoulos <niko.geor@outlook.com>

* update docstrings

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* import from typing

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Nikolaos Georgantopoulos <niko.geor@outlook.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Nikolaos Georgantopoulos
2025-12-11 13:30:41 +01:00
committed by GitHub
parent 807303e33e
commit 7c24b014f6

View File

@@ -2,7 +2,7 @@ import logging
from datetime import datetime from datetime import datetime
from enum import Enum from enum import Enum
from pathlib import Path from pathlib import Path
from typing import Any, ClassVar, Dict, List, Literal, Optional, Union from typing import Annotated, Any, ClassVar, Dict, List, Literal, Optional, Union
from pydantic import ( from pydantic import (
AnyUrl, AnyUrl,
@@ -79,18 +79,41 @@ class TableStructureOptions(BaseTableStructureOptions):
class OcrOptions(BaseOptions): class OcrOptions(BaseOptions):
"""OCR options.""" """OCR options."""
lang: List[str] lang: Annotated[
force_full_page_ocr: bool = False # If enabled a full page OCR is always applied List[str],
bitmap_area_threshold: float = ( Field(
0.05 # percentage of the area for a bitmap to processed with OCR description="List of OCR languages to use. The format must match the values of the OCR engine of choice.",
) examples=[["deu", "eng"]],
),
]
force_full_page_ocr: Annotated[
bool,
Field(
description="If enabled, a full-page OCR is always applied.",
examples=[False],
),
] = False
bitmap_area_threshold: Annotated[
float,
Field(
description="Percentage of the page area for a bitmap to be processed with OCR.",
examples=[0.05, 0.1],
),
] = 0.05
class OcrAutoOptions(OcrOptions): class OcrAutoOptions(OcrOptions):
"""Options for pick OCR engine automatically.""" """Options for pick OCR engine automatically."""
kind: ClassVar[Literal["auto"]] = "auto" kind: ClassVar[Literal["auto"]] = "auto"
lang: List[str] = [] lang: Annotated[
List[str],
Field(
description="The automatic OCR engine will use the default values of the engine. Please specify the engine explicitly to change the language selection.",
),
] = []
class RapidOcrOptions(OcrOptions): class RapidOcrOptions(OcrOptions):
@@ -278,11 +301,44 @@ class OcrEngine(str, Enum):
class PipelineOptions(BaseOptions): class PipelineOptions(BaseOptions):
"""Base pipeline options.""" """Base pipeline options."""
document_timeout: Optional[float] = None document_timeout: Annotated[
accelerator_options: AcceleratorOptions = AcceleratorOptions() Optional[float],
enable_remote_services: bool = False Field(
allow_external_plugins: bool = False description="Maximum allowed processing time for a document before timing out. If None, no timeout is enforced.",
artifacts_path: Optional[Union[Path, str]] = None examples=[10.0, 20.0],
),
] = None
accelerator_options: Annotated[
AcceleratorOptions,
Field(
description="Configuration options for hardware acceleration (e.g., GPU or optimized execution settings).",
),
] = AcceleratorOptions()
enable_remote_services: Annotated[
bool,
Field(
description="Enable calling external APIs or cloud services during pipeline execution.",
examples=[False],
),
] = False
allow_external_plugins: Annotated[
bool,
Field(
description="Allow loading external third-party plugins or modules. Disabled by default for safety.",
examples=[False],
),
] = False
artifacts_path: Annotated[
Optional[Union[Path, str]],
Field(
description="Filesystem path where pipeline artifacts should be stored. If None, artifacts will be fetched. You can use the utility `docling-tools models download` to pre-fetch the model artifacts.",
examples=["./artifacts", "/tmp/docling_outputs"],
),
] = None
class ConvertPipelineOptions(PipelineOptions): class ConvertPipelineOptions(PipelineOptions):