From 5c606c25740fa38c472ef87f9a98f97fce1e8e33 Mon Sep 17 00:00:00 2001
From: Peter Staar
Date: Thu, 12 Jun 2025 17:57:29 +0200
Subject: [PATCH] scaffolding in place

Signed-off-by: Peter Staar
---
 docling/cli/main.py                          | 35 +++++++++--
 docling/datamodel/asr_model_specs.py         | 27 ++++++++
 docling/datamodel/base_models.py             |  3 +
 docling/datamodel/pipeline_options.py        | 15 ++++-
 .../datamodel/pipeline_options_asr_model.py  | 50 +++++++++++++++
 docling/pipeline/asr_pipeline.py             | 62 +++++++++++++++++++
 6 files changed, 186 insertions(+), 6 deletions(-)
 create mode 100644 docling/datamodel/asr_model_specs.py
 create mode 100644 docling/datamodel/pipeline_options_asr_model.py
 create mode 100644 docling/pipeline/asr_pipeline.py

diff --git a/docling/cli/main.py b/docling/cli/main.py
index 083f53b2..3e3796cf 100644
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -29,6 +29,10 @@ from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBacke
 from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
+from docling.datamodel.asr_model_specs import (
+    WHISPER_TINY,
+    AsrModelType,
+)
 from docling.datamodel.base_models import (
     ConversionStatus,
     FormatToExtensions,
@@ -37,12 +41,13 @@ from docling.datamodel.base_models import (
 )
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
+    AsrPipelineOptions,
     EasyOcrOptions,
     OcrOptions,
     PaginatedPipelineOptions,
     PdfBackend,
-    PdfPipeline,
     PdfPipelineOptions,
+    ProcessingPipeline,
     TableFormerMode,
     VlmPipelineOptions,
 )
@@ -56,6 +61,7 @@ from docling.datamodel.vlm_model_specs import (
 )
 from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
 from docling.models.factories import get_ocr_factory
+from docling.pipeline.asr_pipeline import AsrPipeline
 from docling.pipeline.vlm_pipeline import VlmPipeline

 warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
@@ -296,13 +302,17 @@ def convert(  # noqa: C901
         ),
     ] = ImageRefMode.EMBEDDED,
     pipeline: Annotated[
-        PdfPipeline,
+        ProcessingPipeline,
         typer.Option(..., help="Choose the pipeline to process PDF or image files."),
-    ] = PdfPipeline.STANDARD,
+    ] = ProcessingPipeline.STANDARD,
     vlm_model: Annotated[
         VlmModelType,
         typer.Option(..., help="Choose the VLM model to use with PDF or image files."),
     ] = VlmModelType.SMOLDOCLING,
+    asr_model: Annotated[
+        AsrModelType,
+        typer.Option(..., help="Choose the ASR model to use with audio/video files."),
+    ] = AsrModelType.WHISPER_TINY,
     ocr: Annotated[
         bool,
         typer.Option(
@@ -532,7 +542,7 @@ def convert(  # noqa: C901
     accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
     pipeline_options: PaginatedPipelineOptions

-    if pipeline == PdfPipeline.STANDARD:
+    if pipeline == ProcessingPipeline.STANDARD:
         pipeline_options = PdfPipelineOptions(
             allow_external_plugins=allow_external_plugins,
             enable_remote_services=enable_remote_services,
@@ -574,7 +584,7 @@ def convert(  # noqa: C901
             pipeline_options=pipeline_options,
             backend=backend,  # pdf_backend
         )
-    elif pipeline == PdfPipeline.VLM:
+    elif pipeline == ProcessingPipeline.VLM:
         pipeline_options = VlmPipelineOptions(
             enable_remote_services=enable_remote_services,
         )
@@ -599,13 +609,28 @@ def convert(  # noqa: C901
         pdf_format_option = PdfFormatOption(
             pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
         )
+    elif pipeline == ProcessingPipeline.ASR:
+        audio_pipeline_options = AsrPipelineOptions(
+            # enable_remote_services=enable_remote_services,
+        )
+
+        audio_format_option = PdfFormatOption(
+            pipeline_cls=AsrPipeline, pipeline_options=audio_pipeline_options
+        )
+
+        """
+        if asr_model == AsrModelType.WHISPER_TINY:
+            pipeline_options.asr_options = WHISPER_TINY
+        """

     if artifacts_path is not None:
         pipeline_options.artifacts_path = artifacts_path
+        # audio_pipeline_options.artifacts_path = artifacts_path

     format_options: Dict[InputFormat, FormatOption] = {
         InputFormat.PDF: pdf_format_option,
         InputFormat.IMAGE: pdf_format_option,
+        InputFormat.AUDIO: audio_format_option,
     }
     doc_converter = DocumentConverter(
         allowed_formats=from_formats,
diff --git a/docling/datamodel/asr_model_specs.py b/docling/datamodel/asr_model_specs.py
new file mode 100644
index 00000000..71daa8a8
--- /dev/null
+++ b/docling/datamodel/asr_model_specs.py
@@ -0,0 +1,27 @@
+import logging
+from enum import Enum
+
+from pydantic import (
+    AnyUrl,
+)
+
+from docling.datamodel.accelerator_options import AcceleratorDevice
+from docling.datamodel.pipeline_options_asr_model import (
+    # ApiAsrOptions,
+    InferenceFramework,
+    InlineAsrOptions,
+    AsrResponseFormat,
+    TransformersModelType,
+)
+
+_log = logging.getLogger(__name__)
+
+# Whisper tiny
+WHISPER_TINY = InlineAsrOptions(
+    repo_id="openai/whisper-tiny",
+    inference_framework=InferenceFramework.TRANSFORMERS,
+    response_format=AsrResponseFormat.WHISPER,
+)
+
+class AsrModelType(str, Enum):
+    WHISPER_TINY = "whisper_tiny"
diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py
index 2cd25150..3eb88548 100644
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@@ -49,6 +49,7 @@ class InputFormat(str, Enum):
     XML_USPTO = "xml_uspto"
     XML_JATS = "xml_jats"
     JSON_DOCLING = "json_docling"
+    AUDIO = "audio"


 class OutputFormat(str, Enum):
@@ -73,6 +74,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
     InputFormat.XLSX: ["xlsx", "xlsm"],
     InputFormat.XML_USPTO: ["xml", "txt"],
     InputFormat.JSON_DOCLING: ["json"],
+    InputFormat.AUDIO: ["wav", "mp3"],
 }

 FormatToMimeType: Dict[InputFormat, List[str]] = {
@@ -104,6 +106,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
     ],
     InputFormat.XML_USPTO: ["application/xml", "text/plain"],
     InputFormat.JSON_DOCLING: ["application/json"],
+    InputFormat.AUDIO: ["audio/wav"],
 }

 MimeTypeToFormat: dict[str, list[InputFormat]] = {
diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
index cde5cd33..302afcdf 100644
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -13,6 +13,13 @@ from typing_extensions import deprecated

 # Import the following for backwards compatibility
 from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
+from docling.datamodel.asr_model_specs import (
+    WHISPER_TINY as whisper_tiny,
+    AsrModelType,
+)
+from docling.datamodel.pipeline_options_asr_model import (
+    InlineAsrOptions,
+)
 from docling.datamodel.pipeline_options_vlm_model import (
     ApiVlmOptions,
     InferenceFramework,
@@ -260,6 +267,11 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
     )


+class AsrPipelineOptions(PipelineOptions):
+    asr_options: Union[InlineAsrOptions] = whisper_tiny
+    artifacts_path: Optional[Union[Path, str]] = None
+
+
 class PdfPipelineOptions(PaginatedPipelineOptions):
     """Options for the PDF pipeline."""

@@ -295,6 +307,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
     generate_parsed_pages: bool = False


-class PdfPipeline(str, Enum):
+class ProcessingPipeline(str, Enum):
     STANDARD = "standard"
     VLM = "vlm"
+    ASR = "asr"
diff --git a/docling/datamodel/pipeline_options_asr_model.py b/docling/datamodel/pipeline_options_asr_model.py
new file mode 100644
index 00000000..25d0dd15
--- /dev/null
+++ b/docling/datamodel/pipeline_options_asr_model.py
@@ -0,0 +1,50 @@
+from enum import Enum
+from typing import Any, Dict, List, Literal, Optional, Union
+
+from pydantic import AnyUrl, BaseModel
+from typing_extensions import deprecated
+
+from docling.datamodel.accelerator_options import AcceleratorDevice
+from docling.datamodel.pipeline_options_vlm_model import InferenceFramework, TransformersModelType
+
+class BaseAsrOptions(BaseModel):
+    kind: str
+    # prompt: str
+
+
+class AsrResponseFormat(str, Enum):
+    WHISPER = "whisper"
+
+
+class InlineAsrOptions(BaseAsrOptions):
+    kind: Literal["inline_model_options"] = "inline_model_options"
+
+    repo_id: str
+    trust_remote_code: bool = False
+    load_in_8bit: bool = True
+    llm_int8_threshold: float = 6.0
+    quantized: bool = False
+
+    inference_framework: InferenceFramework
+    transformers_model_type: TransformersModelType = TransformersModelType.AUTOMODEL
+    response_format: AsrResponseFormat
+
+    torch_dtype: Optional[str] = None
+    supported_devices: List[AcceleratorDevice] = [
+        AcceleratorDevice.CPU,
+        AcceleratorDevice.CUDA,
+        AcceleratorDevice.MPS,
+    ]
+
+    temperature: float = 0.0
+    stop_strings: List[str] = []
+    extra_generation_config: Dict[str, Any] = {}
+
+    use_kv_cache: bool = True
+    max_new_tokens: int = 4096
+
+    @property
+    def repo_cache_folder(self) -> str:
+        return self.repo_id.replace("/", "--")
+
+
diff --git a/docling/pipeline/asr_pipeline.py b/docling/pipeline/asr_pipeline.py
new file mode 100644
index 00000000..70e52537
--- /dev/null
+++ b/docling/pipeline/asr_pipeline.py
@@ -0,0 +1,62 @@
+import logging
+import re
+from io import BytesIO
+from pathlib import Path
+from typing import List, Optional, Union, cast
+
+from docling.backend.abstract_backend import AbstractDocumentBackend
+
+from docling.datamodel.document import ConversionResult, InputDocument
+from docling.datamodel.pipeline_options import (
+    AsrPipelineOptions,
+)
+from docling.datamodel.pipeline_options_vlm_model import (
+    InferenceFramework,
+)
+from docling.datamodel.pipeline_options_asr_model import (
+    InlineAsrOptions,
+    AsrResponseFormat,
+)
+from docling.datamodel.settings import settings
+from docling.pipeline.base_pipeline import BasePipeline
+from docling.utils.profiling import ProfilingScope, TimeRecorder
+from docling.datamodel.document import ConversionResult, InputDocument
+
+_log = logging.getLogger(__name__)
+
+
+class AsrPipeline(BasePipeline):
+    def __init__(self, pipeline_options: AsrPipelineOptions):
+        super().__init__(pipeline_options)
+        self.keep_backend = True
+
+        self.pipeline_options: AsrPipelineOptions
+
+        artifacts_path: Optional[Path] = None
+        if pipeline_options.artifacts_path is not None:
+            artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
+        elif settings.artifacts_path is not None:
+            artifacts_path = Path(settings.artifacts_path).expanduser()
+
+        if artifacts_path is not None and not artifacts_path.is_dir():
+            raise RuntimeError(
+                f"The value of {artifacts_path=} is not valid. "
+                "When defined, it must point to a folder containing all models required by the pipeline."
+            )
+
+    def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
+        total_elapsed_time = 0.0
+        with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
+            print("do something")
+
+        return conv_res
+
+    """
+    def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
+        status = ConversionStatus()
+        return status
+    """
+
+    @classmethod
+    def is_backend_supported(cls, backend: AbstractDocumentBackend):
+        return True
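
Note (not part of the patch): a minimal sketch of how the new ASR scaffolding could be exercised
from Python, mirroring the FormatOption wiring this patch adds to the CLI. The input file name
sample.wav is hypothetical, and with this commit _build_document is still a stub, so no transcript
is produced yet; the snippet only shows the intended plumbing. The equivalent CLI invocation should
be along the lines of "docling --pipeline asr --asr-model whisper_tiny sample.wav", assuming typer
derives the dashed flag names from the new parameters.

    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import AsrPipelineOptions
    from docling.document_converter import DocumentConverter, PdfFormatOption
    from docling.pipeline.asr_pipeline import AsrPipeline

    # AsrPipelineOptions defaults asr_options to the WHISPER_TINY spec defined in this patch.
    pipeline_options = AsrPipelineOptions()

    converter = DocumentConverter(
        allowed_formats=[InputFormat.AUDIO],
        format_options={
            # The CLI branch above reuses PdfFormatOption to carry the pipeline class
            # and options for audio input, so the same is done here.
            InputFormat.AUDIO: PdfFormatOption(
                pipeline_cls=AsrPipeline, pipeline_options=pipeline_options
            )
        },
    )

    result = converter.convert("sample.wav")  # hypothetical input; "wav" maps to InputFormat.AUDIO
    print(result.status)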