scaffolding in place

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
2025-07-27 04:24:45 +00:00 · 2025-06-12 17:57:29 +02:00 · 2025-06-12 17:57:29 +02:00 · 5c606c2574
commit 5c606c2574
parent 0432a31b2f
6 changed files with 186 additions and 6 deletions
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@ -29,6 +29,10 @@ from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBacke
 from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
 from docling.datamodel.asr_model_specs import (
    WHISPER_TINY,
    AsrModelType,
 )
 from docling.datamodel.base_models import (
    ConversionStatus,
    FormatToExtensions,
@ -37,12 +41,13 @@ from docling.datamodel.base_models import (
 )
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
    AsrPipelineOptions,
    EasyOcrOptions,
    OcrOptions,
    PaginatedPipelineOptions,
    PdfBackend,
    PdfPipeline,
    PdfPipelineOptions,
    ProcessingPipeline,
    TableFormerMode,
    VlmPipelineOptions,
 )
@ -56,6 +61,7 @@ from docling.datamodel.vlm_model_specs import (
 )
 from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
 from docling.models.factories import get_ocr_factory
 from docling.pipeline.asr_pipeline import AsrPipeline
 from docling.pipeline.vlm_pipeline import VlmPipeline
 warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
@ -296,13 +302,17 @@ def convert(  # noqa: C901
        ),
    ] = ImageRefMode.EMBEDDED,
    pipeline: Annotated[
-        PdfPipeline,
+        ProcessingPipeline,
        typer.Option(..., help="Choose the pipeline to process PDF or image files."),
-    ] = PdfPipeline.STANDARD,
+    ] = ProcessingPipeline.STANDARD,
    vlm_model: Annotated[
        VlmModelType,
        typer.Option(..., help="Choose the VLM model to use with PDF or image files."),
    ] = VlmModelType.SMOLDOCLING,
    asr_model: Annotated[
        AsrModelType,
        typer.Option(..., help="Choose the ASR model to use with audio/video files."),
    ] = AsrModelType.WHISPER_TINY,
    ocr: Annotated[
        bool,
        typer.Option(
@ -532,7 +542,7 @@ def convert(  # noqa: C901
        accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
        pipeline_options: PaginatedPipelineOptions
-        if pipeline == PdfPipeline.STANDARD:
+        if pipeline == ProcessingPipeline.STANDARD:
            pipeline_options = PdfPipelineOptions(
                allow_external_plugins=allow_external_plugins,
                enable_remote_services=enable_remote_services,
@ -574,7 +584,7 @@ def convert(  # noqa: C901
                pipeline_options=pipeline_options,
                backend=backend,  # pdf_backend
            )
-        elif pipeline == PdfPipeline.VLM:
+        elif pipeline == ProcessingPipeline.VLM:
            pipeline_options = VlmPipelineOptions(
                enable_remote_services=enable_remote_services,
            )
@ -599,13 +609,28 @@ def convert(  # noqa: C901
            pdf_format_option = PdfFormatOption(
                pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
            )
        elif pipeline == ProcessingPipeline.ASR:
            audio_pipeline_options = AsrPipelineOptions(
                # enable_remote_services=enable_remote_services,
            )
            audio_format_option = PdfFormatOption(
                pipeline_cls=AsrPipeline, pipeline_options=audio_pipeline_options
            )
            """
            if asr_model == AsrModelType.WHISPER_TINY:
                pipeline_options.asr_options = WHISPER_TINY:
            """
        if artifacts_path is not None:
            pipeline_options.artifacts_path = artifacts_path
            # audio_pipeline_options.artifacts_path = artifacts_path
        format_options: Dict[InputFormat, FormatOption] = {
            InputFormat.PDF: pdf_format_option,
            InputFormat.IMAGE: pdf_format_option,
            InputFormat.AUDIO: audio_format_option,
        }
        doc_converter = DocumentConverter(
            allowed_formats=from_formats,
--- a/docling/datamodel/asr_model_specs.py
+++ b/docling/datamodel/asr_model_specs.py
@ -0,0 +1,27 @@
 import logging
 from enum import Enum
 from pydantic import (
    AnyUrl,
 )
 from docling.datamodel.accelerator_options import AcceleratorDevice
 from docling.datamodel.pipeline_options_asr_model import (
    # ApiAsrOptions,
    InferenceFramework,
    InlineAsrOptions,
    AsrResponseFormat,
    TransformersModelType,
 )
 # Module-level logger named after this module.
 _log = logging.getLogger(__name__)
 # Whisper tiny preset: inline ASR options with repo_id "openai/whisper-tiny",
 # the TRANSFORMERS inference framework and the WHISPER response format.
 # NOTE(review): repo_id is presumably a Hugging Face Hub repository id -- confirm.
 WHISPER_TINY = InlineAsrOptions(
    repo_id="openai/whisper-tiny",
    inference_framework=InferenceFramework.TRANSFORMERS,
    response_format = AsrResponseFormat.WHISPER,
 )
 class AsrModelType(str, Enum):
    """Names of the ASR model presets that can be selected by string value."""
    # String value "whisper_tiny"; presumably selects the WHISPER_TINY preset
    # defined in this module -- the mapping itself is not wired up here.
    WHISPER_TINY = "whisper_tiny"
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@ -49,6 +49,7 @@ class InputFormat(str, Enum):
    XML_USPTO = "xml_uspto"
    XML_JATS = "xml_jats"
    JSON_DOCLING = "json_docling"
    AUDIO = "audio"
 class OutputFormat(str, Enum):
@ -73,6 +74,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
    InputFormat.XLSX: ["xlsx", "xlsm"],
    InputFormat.XML_USPTO: ["xml", "txt"],
    InputFormat.JSON_DOCLING: ["json"],
    InputFormat.AUDIO: ["wav", "mp3"],
 }
 FormatToMimeType: Dict[InputFormat, List[str]] = {
@ -104,6 +106,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
    ],
    InputFormat.XML_USPTO: ["application/xml", "text/plain"],
    InputFormat.JSON_DOCLING: ["application/json"],
    InputFormat.AUDIO: ["audio/wav"],
 }
 MimeTypeToFormat: dict[str, list[InputFormat]] = {
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@ -13,6 +13,13 @@ from typing_extensions import deprecated
 # Import the following for backwards compatibility
 from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
 from docling.datamodel.asr_model_specs import (
    WHISPER_TINY as whisper_tiny,
    AsrModelType,
 )
 from docling.datamodel.pipeline_options_asr_model import (
    InlineAsrOptions,
 )
 from docling.datamodel.pipeline_options_vlm_model import (
    ApiVlmOptions,
    InferenceFramework,
@ -260,6 +267,11 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
    )
 class AsrPipelineOptions(PipelineOptions):
    """Options for the ASR pipeline."""
    # ASR model configuration; defaults to the Whisper tiny preset imported
    # above as `whisper_tiny`.
    # NOTE(review): a single-member Union is equivalent to the bare type;
    # presumably a placeholder for further option kinds -- confirm.
    asr_options: Union[InlineAsrOptions] = whisper_tiny
    # Optional location of model artifacts, given as a Path or string;
    # None means no explicit location is configured on these options.
    artifacts_path: Optional[Union[Path, str]] = None
 class PdfPipelineOptions(PaginatedPipelineOptions):
    """Options for the PDF pipeline."""
@ -295,6 +307,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
    generate_parsed_pages: bool = False
-class PdfPipeline(str, Enum):
+class ProcessingPipeline(str, Enum):
    # String-valued choices of processing pipeline.
    # NOTE(review): this enum was previously named PdfPipeline; code still
    # importing the old name will need updating -- confirm there are no
    # remaining users.
    STANDARD = "standard"
    VLM = "vlm"
    ASR = "asr"
--- a/docling/datamodel/pipeline_options_asr_model.py
+++ b/docling/datamodel/pipeline_options_asr_model.py
@ -0,0 +1,50 @@
 from enum import Enum
 from typing import Any, Dict, List, Literal, Optional, Union
 from pydantic import AnyUrl, BaseModel
 from typing_extensions import deprecated
 from docling.datamodel.accelerator_options import AcceleratorDevice
 from docling.datamodel.pipeline_options_vlm_model import InferenceFramework, TransformersModelType
 class BaseAsrOptions(BaseModel):
    """Common base model for ASR option classes."""
    # String tag naming the kind of options; InlineAsrOptions fixes it to
    # the literal "inline_model_options".
    kind: str
    # prompt: str
 class AsrResponseFormat(str, Enum):
    """String-valued identifiers for the response format of an ASR model."""
    WHISPER = "whisper"
 class InlineAsrOptions(BaseAsrOptions):
    """Options describing an ASR model identified by ``repo_id``.

    ``repo_id``, ``inference_framework`` and ``response_format`` have no
    default and must be supplied; every other field has a default.
    """
    # Fixed tag for this option kind.
    kind: Literal["inline_model_options"] = "inline_model_options"
    # Model repository identifier, e.g. "openai/whisper-tiny".
    repo_id: str
    # NOTE(review): the loading/quantization flags below look carried over
    # from the VLM options -- confirm which of them the ASR code will honour.
    trust_remote_code: bool = False
    load_in_8bit: bool = True
    llm_int8_threshold: float = 6.0
    quantized: bool = False
    # Framework used to run the model (enum shared with the VLM options module).
    inference_framework: InferenceFramework
    transformers_model_type: TransformersModelType = TransformersModelType.AUTOMODEL
    # Format of the model's response.
    response_format: AsrResponseFormat
    # Optional dtype name as a string; None leaves it unspecified.
    torch_dtype: Optional[str] = None
    # Accelerator devices this model is declared to support.
    supported_devices: List[AcceleratorDevice] = [
        AcceleratorDevice.CPU,
        AcceleratorDevice.CUDA,
        AcceleratorDevice.MPS,
    ]
    # NOTE(review): generation-related settings; presumably forwarded to the
    # model's generate call -- no consumer is visible here, confirm.
    temperature: float = 0.0
    stop_strings: List[str] = []
    extra_generation_config: Dict[str, Any] = {}
    use_kv_cache: bool = True
    max_new_tokens: int = 4096
    @property
    def repo_cache_folder(self) -> str:
        """Return ``repo_id`` with every "/" replaced by "--"."""
        return self.repo_id.replace("/", "--")
--- a/docling/pipeline/asr_pipeline.py
+++ b/docling/pipeline/asr_pipeline.py
@ -0,0 +1,62 @@
 import logging
 import re
 from io import BytesIO
 from pathlib import Path
 from typing import List, Optional, Union, cast
 from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.datamodel.document import ConversionResult, InputDocument
 from docling.datamodel.pipeline_options import (
    AsrPipelineOptions,
 )
 from docling.datamodel.pipeline_options_vlm_model import (
    InferenceFramework,
 )
 from docling.datamodel.pipeline_options_asr_model import (
    InlineAsrOptions,
    AsrResponseFormat,
 )
 from docling.datamodel.settings import settings
 from docling.pipeline.base_pipeline import BasePipeline
 from docling.utils.profiling import ProfilingScope, TimeRecorder
 from docling.datamodel.document import ConversionResult, InputDocument
 _log = logging.getLogger(__name__)
 class AsrPipeline(BasePipeline):
    """Scaffolding for the automatic speech recognition (ASR) pipeline.

    Only the structure is in place: the constructor validates the configured
    artifacts folder and the document build step is a placeholder.
    """

    def __init__(self, pipeline_options: AsrPipelineOptions):
        """Initialise the pipeline and validate the artifacts location.

        Args:
            pipeline_options: ASR pipeline options. Its ``artifacts_path`` is
                consulted first; ``settings.artifacts_path`` is the fallback.

        Raises:
            RuntimeError: If an artifacts path is configured (in either
                place) but does not point to an existing directory.
        """
        super().__init__(pipeline_options)
        self.keep_backend = True
        # Annotation only (no assignment): declares the expected attribute
        # type for type checkers.
        self.pipeline_options: AsrPipelineOptions

        # The explicit pipeline option takes precedence over global settings.
        artifacts_path: Optional[Path] = None
        if pipeline_options.artifacts_path is not None:
            artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
        elif settings.artifacts_path is not None:
            artifacts_path = Path(settings.artifacts_path).expanduser()

        # Fail early on a misconfigured path; None (nothing configured) is fine.
        if artifacts_path is not None and not artifacts_path.is_dir():
            raise RuntimeError(
                f"The value of {artifacts_path=} is not valid. "
                "When defined, it must point to a folder containing all models required by the pipeline."
            )

    def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
        """Run the (placeholder) document build step.

        Args:
            conv_res: Conversion result being processed.

        Returns:
            The same ``conv_res`` object, unmodified by this scaffolding.
        """
        with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
            # TODO: placeholder output until the transcription step exists.
            print("do something")

        return conv_res

    # TODO: implement status determination. Draft kept from the scaffolding
    # (ConversionStatus is not imported in this module yet):
    #
    # def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
    #     status = ConversionStatus()
    #     return status

    @classmethod
    def is_backend_supported(cls, backend: AbstractDocumentBackend) -> bool:
        """Report whether ``backend`` is usable; currently every backend is accepted."""
        return True