Mirror of https://github.com/DS4SD/docling.git
feat: Support audio input (#1763)
* scaffolding in place
* doing scaffolding for audio pipeline
* WIP: got first transcription working
* all working, time to start cleaning up
* first working ASR pipeline
* added openai-whisper as a first transcription model
* updating with asr_options
* finalised the first working ASR pipeline with Whisper
* use whisper from the latest git commit
* Update docling/datamodel/pipeline_options.py
* Update docling/datamodel/pipeline_options.py
* updated comment
* AudioBackend -> DummyBackend
* file rename
* Rename to NoOpBackend, add test for ASR pipeline
* Support every format in NoOpBackend
* Add missing audio file and test
* Install ffmpeg system dependency for ASR test

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Signed-off-by: Peter W. J. Staar <91719829+PeterStaar-IBM@users.noreply.github.com>
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
parent d26dac61a8 · commit 1557e7ce3e · committed by GitHub
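A minimal end-to-end sketch of the API this commit adds (the input file name is hypothetical; it requires the openai-whisper dependency and ffmpeg, per the commit message above):

from docling.datamodel import asr_model_specs
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import AsrPipelineOptions
from docling.document_converter import AudioFormatOption, DocumentConverter
from docling.pipeline.asr_pipeline import AsrPipeline

converter = DocumentConverter(
    format_options={
        InputFormat.AUDIO: AudioFormatOption(
            pipeline_cls=AsrPipeline,
            pipeline_options=AsrPipelineOptions(
                asr_options=asr_model_specs.WHISPER_TINY
            ),
        )
    }
)
result = converter.convert("recording.mp3")  # hypothetical input file
print(result.document.export_to_markdown())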
docling/backend/noop_backend.py (new file, 51 lines)
@@ -0,0 +1,51 @@
import logging
from io import BytesIO
from pathlib import Path
from typing import Set, Union

from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument

_log = logging.getLogger(__name__)


class NoOpBackend(AbstractDocumentBackend):
    """
    A no-op backend that only validates input existence.
    Used e.g. for audio files where actual processing is handled by the ASR pipeline.
    """

    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        super().__init__(in_doc, path_or_stream)

        _log.debug(f"NoOpBackend initialized for: {path_or_stream}")

        # Validate input
        try:
            if isinstance(self.path_or_stream, BytesIO):
                # Check if stream has content
                self.valid = len(self.path_or_stream.getvalue()) > 0
                _log.debug(
                    f"BytesIO stream length: {len(self.path_or_stream.getvalue())}"
                )
            elif isinstance(self.path_or_stream, Path):
                # Check if file exists
                self.valid = self.path_or_stream.exists()
                _log.debug(f"File exists: {self.valid}")
            else:
                self.valid = False
        except Exception as e:
            _log.error(f"NoOpBackend validation failed: {e}")
            self.valid = False

    def is_valid(self) -> bool:
        return self.valid

    @classmethod
    def supports_pagination(cls) -> bool:
        return False

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        return set(InputFormat)
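Since supported_formats() returns every InputFormat, this backend can be paired with any format whose real processing lives in the pipeline, as the ASR pipeline does for audio. A quick sketch of that contract:

from docling.backend.noop_backend import NoOpBackend
from docling.datamodel.base_models import InputFormat

# NoOpBackend accepts every input format and never paginates;
# validity is just "the file exists" or "the stream is non-empty".
assert NoOpBackend.supported_formats() == set(InputFormat)
assert NoOpBackend.supports_pagination() is False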
docling/cli/main.py
@@ -29,6 +29,15 @@ from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
+from docling.datamodel.asr_model_specs import (
+    WHISPER_BASE,
+    WHISPER_LARGE,
+    WHISPER_MEDIUM,
+    WHISPER_SMALL,
+    WHISPER_TINY,
+    WHISPER_TURBO,
+    AsrModelType,
+)
 from docling.datamodel.base_models import (
     ConversionStatus,
     FormatToExtensions,
@@ -37,12 +46,14 @@ from docling.datamodel.base_models import (
 )
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
+    AsrPipelineOptions,
     EasyOcrOptions,
     OcrOptions,
     PaginatedPipelineOptions,
     PdfBackend,
-    PdfPipeline,
     PdfPipelineOptions,
+    PipelineOptions,
+    ProcessingPipeline,
     TableFormerMode,
     VlmPipelineOptions,
 )
@@ -54,8 +65,14 @@ from docling.datamodel.vlm_model_specs import (
     SMOLDOCLING_TRANSFORMERS,
     VlmModelType,
 )
-from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
+from docling.document_converter import (
+    AudioFormatOption,
+    DocumentConverter,
+    FormatOption,
+    PdfFormatOption,
+)
 from docling.models.factories import get_ocr_factory
+from docling.pipeline.asr_pipeline import AsrPipeline
 from docling.pipeline.vlm_pipeline import VlmPipeline

 warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
@@ -296,13 +313,17 @@ def convert( # noqa: C901
         ),
     ] = ImageRefMode.EMBEDDED,
     pipeline: Annotated[
-        PdfPipeline,
+        ProcessingPipeline,
         typer.Option(..., help="Choose the pipeline to process PDF or image files."),
-    ] = PdfPipeline.STANDARD,
+    ] = ProcessingPipeline.STANDARD,
     vlm_model: Annotated[
         VlmModelType,
         typer.Option(..., help="Choose the VLM model to use with PDF or image files."),
     ] = VlmModelType.SMOLDOCLING,
+    asr_model: Annotated[
+        AsrModelType,
+        typer.Option(..., help="Choose the ASR model to use with audio/video files."),
+    ] = AsrModelType.WHISPER_TINY,
     ocr: Annotated[
         bool,
         typer.Option(
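With the option above in place, an ASR run from the CLI would look like `docling --pipeline asr --asr-model whisper_turbo recording.mp3` (file name hypothetical; Typer derives the flag names from the parameter names above).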
@@ -450,12 +471,14 @@ def convert( # noqa: C901
         ),
     ] = None,
 ):
+    log_format = "%(asctime)s\t%(levelname)s\t%(name)s: %(message)s"
+
     if verbose == 0:
-        logging.basicConfig(level=logging.WARNING)
+        logging.basicConfig(level=logging.WARNING, format=log_format)
     elif verbose == 1:
-        logging.basicConfig(level=logging.INFO)
+        logging.basicConfig(level=logging.INFO, format=log_format)
     else:
-        logging.basicConfig(level=logging.DEBUG)
+        logging.basicConfig(level=logging.DEBUG, format=log_format)

     settings.debug.visualize_cells = debug_visualize_cells
     settings.debug.visualize_layout = debug_visualize_layout
@@ -530,9 +553,12 @@ def convert( # noqa: C901
         ocr_options.lang = ocr_lang_list

     accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
-    pipeline_options: PaginatedPipelineOptions
+    # pipeline_options: PaginatedPipelineOptions
+    pipeline_options: PipelineOptions
+
+    format_options: Dict[InputFormat, FormatOption] = {}

-    if pipeline == PdfPipeline.STANDARD:
+    if pipeline == ProcessingPipeline.STANDARD:
         pipeline_options = PdfPipelineOptions(
             allow_external_plugins=allow_external_plugins,
             enable_remote_services=enable_remote_services,
@@ -574,7 +600,13 @@ def convert( # noqa: C901
             pipeline_options=pipeline_options,
             backend=backend,  # pdf_backend
         )
-    elif pipeline == PdfPipeline.VLM:
+
+        format_options = {
+            InputFormat.PDF: pdf_format_option,
+            InputFormat.IMAGE: pdf_format_option,
+        }
+
+    elif pipeline == ProcessingPipeline.VLM:
         pipeline_options = VlmPipelineOptions(
             enable_remote_services=enable_remote_services,
         )
@@ -600,13 +632,48 @@ def convert( # noqa: C901
             pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
         )

+        format_options = {
+            InputFormat.PDF: pdf_format_option,
+            InputFormat.IMAGE: pdf_format_option,
+        }
+
+    elif pipeline == ProcessingPipeline.ASR:
+        pipeline_options = AsrPipelineOptions(
+            # enable_remote_services=enable_remote_services,
+            # artifacts_path = artifacts_path
+        )
+
+        if asr_model == AsrModelType.WHISPER_TINY:
+            pipeline_options.asr_options = WHISPER_TINY
+        elif asr_model == AsrModelType.WHISPER_SMALL:
+            pipeline_options.asr_options = WHISPER_SMALL
+        elif asr_model == AsrModelType.WHISPER_MEDIUM:
+            pipeline_options.asr_options = WHISPER_MEDIUM
+        elif asr_model == AsrModelType.WHISPER_BASE:
+            pipeline_options.asr_options = WHISPER_BASE
+        elif asr_model == AsrModelType.WHISPER_LARGE:
+            pipeline_options.asr_options = WHISPER_LARGE
+        elif asr_model == AsrModelType.WHISPER_TURBO:
+            pipeline_options.asr_options = WHISPER_TURBO
+        else:
+            _log.error(f"{asr_model} is not known")
+            raise ValueError(f"{asr_model} is not known")
+
+        _log.info(f"pipeline_options: {pipeline_options}")
+
+        audio_format_option = AudioFormatOption(
+            pipeline_cls=AsrPipeline,
+            pipeline_options=pipeline_options,
+        )
+
+        format_options = {
+            InputFormat.AUDIO: audio_format_option,
+        }
+
     if artifacts_path is not None:
         pipeline_options.artifacts_path = artifacts_path
+        # audio_pipeline_options.artifacts_path = artifacts_path

-    format_options: Dict[InputFormat, FormatOption] = {
-        InputFormat.PDF: pdf_format_option,
-        InputFormat.IMAGE: pdf_format_option,
-    }
     doc_converter = DocumentConverter(
         allowed_formats=from_formats,
         format_options=format_options,
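The if/elif ladder above maps each AsrModelType member to its preset one case at a time; a table-driven equivalent (a hypothetical refactor, not part of this diff) would keep the mapping in one place:

# Hypothetical alternative to the chain above: one dict lookup.
_ASR_SPECS = {
    AsrModelType.WHISPER_TINY: WHISPER_TINY,
    AsrModelType.WHISPER_SMALL: WHISPER_SMALL,
    AsrModelType.WHISPER_MEDIUM: WHISPER_MEDIUM,
    AsrModelType.WHISPER_BASE: WHISPER_BASE,
    AsrModelType.WHISPER_LARGE: WHISPER_LARGE,
    AsrModelType.WHISPER_TURBO: WHISPER_TURBO,
}
if asr_model not in _ASR_SPECS:
    raise ValueError(f"{asr_model} is not known")
pipeline_options.asr_options = _ASR_SPECS[asr_model]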
@@ -614,6 +681,7 @@ def convert( # noqa: C901

     start_time = time.time()

+    _log.info(f"paths: {input_doc_paths}")
     conv_results = doc_converter.convert_all(
         input_doc_paths, headers=parsed_headers, raises_on_error=abort_on_error
     )
docling/datamodel/asr_model_specs.py (new file, 92 lines)
@@ -0,0 +1,92 @@
import logging
from enum import Enum

from pydantic import (
    AnyUrl,
)

from docling.datamodel.accelerator_options import AcceleratorDevice
from docling.datamodel.pipeline_options_asr_model import (
    # AsrResponseFormat,
    # ApiAsrOptions,
    InferenceAsrFramework,
    InlineAsrNativeWhisperOptions,
    TransformersModelType,
)

_log = logging.getLogger(__name__)

WHISPER_TINY = InlineAsrNativeWhisperOptions(
    repo_id="tiny",
    inference_framework=InferenceAsrFramework.WHISPER,
    verbose=True,
    timestamps=True,
    word_timestamps=True,
    temperature=0.0,
    max_new_tokens=256,
    max_time_chunk=30.0,
)

WHISPER_SMALL = InlineAsrNativeWhisperOptions(
    repo_id="small",
    inference_framework=InferenceAsrFramework.WHISPER,
    verbose=True,
    timestamps=True,
    word_timestamps=True,
    temperature=0.0,
    max_new_tokens=256,
    max_time_chunk=30.0,
)

WHISPER_MEDIUM = InlineAsrNativeWhisperOptions(
    repo_id="medium",
    inference_framework=InferenceAsrFramework.WHISPER,
    verbose=True,
    timestamps=True,
    word_timestamps=True,
    temperature=0.0,
    max_new_tokens=256,
    max_time_chunk=30.0,
)

WHISPER_BASE = InlineAsrNativeWhisperOptions(
    repo_id="base",
    inference_framework=InferenceAsrFramework.WHISPER,
    verbose=True,
    timestamps=True,
    word_timestamps=True,
    temperature=0.0,
    max_new_tokens=256,
    max_time_chunk=30.0,
)

WHISPER_LARGE = InlineAsrNativeWhisperOptions(
    repo_id="large",
    inference_framework=InferenceAsrFramework.WHISPER,
    verbose=True,
    timestamps=True,
    word_timestamps=True,
    temperature=0.0,
    max_new_tokens=256,
    max_time_chunk=30.0,
)

WHISPER_TURBO = InlineAsrNativeWhisperOptions(
    repo_id="turbo",
    inference_framework=InferenceAsrFramework.WHISPER,
    verbose=True,
    timestamps=True,
    word_timestamps=True,
    temperature=0.0,
    max_new_tokens=256,
    max_time_chunk=30.0,
)


class AsrModelType(str, Enum):
    WHISPER_TINY = "whisper_tiny"
    WHISPER_SMALL = "whisper_small"
    WHISPER_MEDIUM = "whisper_medium"
    WHISPER_BASE = "whisper_base"
    WHISPER_LARGE = "whisper_large"
    WHISPER_TURBO = "whisper_turbo"
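The six presets differ only in repo_id; every other knob is shared. A custom preset along the same lines (hypothetical, not part of this diff) can also override the language field that InlineAsrNativeWhisperOptions defines further down:

# Hypothetical preset: Whisper "base" forced to German transcription.
WHISPER_BASE_DE = InlineAsrNativeWhisperOptions(
    repo_id="base",
    inference_framework=InferenceAsrFramework.WHISPER,
    language="de",
    verbose=True,
    timestamps=True,
    word_timestamps=True,
    temperature=0.0,
    max_new_tokens=256,
    max_time_chunk=30.0,
)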
docling/datamodel/base_models.py
@@ -49,6 +49,7 @@ class InputFormat(str, Enum):
     XML_USPTO = "xml_uspto"
     XML_JATS = "xml_jats"
     JSON_DOCLING = "json_docling"
+    AUDIO = "audio"


 class OutputFormat(str, Enum):
@@ -73,6 +74,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
     InputFormat.XLSX: ["xlsx", "xlsm"],
     InputFormat.XML_USPTO: ["xml", "txt"],
     InputFormat.JSON_DOCLING: ["json"],
+    InputFormat.AUDIO: ["wav", "mp3"],
 }

 FormatToMimeType: Dict[InputFormat, List[str]] = {
@@ -104,6 +106,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
     ],
     InputFormat.XML_USPTO: ["application/xml", "text/plain"],
     InputFormat.JSON_DOCLING: ["application/json"],
+    InputFormat.AUDIO: ["audio/x-wav", "audio/mpeg", "audio/wav", "audio/mp3"],
 }

 MimeTypeToFormat: dict[str, list[InputFormat]] = {
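These three registrations are what let format detection route audio files: both the extension and the MIME type resolve to InputFormat.AUDIO. A quick sketch of the lookup:

from docling.datamodel.base_models import (
    FormatToExtensions,
    FormatToMimeType,
    InputFormat,
)

assert "mp3" in FormatToExtensions[InputFormat.AUDIO]
assert "audio/mpeg" in FormatToMimeType[InputFormat.AUDIO]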
docling/datamodel/document.py
@@ -249,7 +249,7 @@ class _DocumentConversionInput(BaseModel):
         backend: Type[AbstractDocumentBackend]
         if format not in format_options.keys():
             _log.error(
-                f"Input document {obj.name} does not match any allowed format."
+                f"Input document {obj.name} with format {format} does not match any allowed format: ({format_options.keys()})"
             )
             backend = _DummyBackend
         else:
@@ -318,6 +318,8 @@ class _DocumentConversionInput(BaseModel):
         mime = mime or _DocumentConversionInput._detect_csv(content)
         mime = mime or "text/plain"
         formats = MimeTypeToFormat.get(mime, [])
+        _log.info(f"detected formats: {formats}")
+
         if formats:
             if len(formats) == 1 and mime not in ("text/plain"):
                 return formats[0]
docling/datamodel/pipeline_options.py
@@ -11,8 +11,13 @@ from pydantic import (
 )
 from typing_extensions import deprecated

+from docling.datamodel import asr_model_specs
+
 # Import the following for backwards compatibility
 from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
+from docling.datamodel.pipeline_options_asr_model import (
+    InlineAsrOptions,
+)
 from docling.datamodel.pipeline_options_vlm_model import (
     ApiVlmOptions,
     InferenceFramework,
@@ -260,6 +265,11 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
     )


+class AsrPipelineOptions(PipelineOptions):
+    asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY
+    artifacts_path: Optional[Union[Path, str]] = None
+
+
 class PdfPipelineOptions(PaginatedPipelineOptions):
     """Options for the PDF pipeline."""

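AsrPipelineOptions defaults to WHISPER_TINY, so choosing another preset is a one-field override. A minimal sketch:

from docling.datamodel import asr_model_specs
from docling.datamodel.pipeline_options import AsrPipelineOptions

opts = AsrPipelineOptions(asr_options=asr_model_specs.WHISPER_TURBO)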
@@ -297,6 +307,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
     )


-class PdfPipeline(str, Enum):
+class ProcessingPipeline(str, Enum):
     STANDARD = "standard"
     VLM = "vlm"
+    ASR = "asr"
docling/datamodel/pipeline_options_asr_model.py (new file, 57 lines)
@@ -0,0 +1,57 @@
from enum import Enum
from typing import Any, Dict, List, Literal, Optional, Union

from pydantic import AnyUrl, BaseModel
from typing_extensions import deprecated

from docling.datamodel.accelerator_options import AcceleratorDevice
from docling.datamodel.pipeline_options_vlm_model import (
    # InferenceFramework,
    TransformersModelType,
)


class BaseAsrOptions(BaseModel):
    kind: str
    # prompt: str


class InferenceAsrFramework(str, Enum):
    # MLX = "mlx"  # disabled for now
    # TRANSFORMERS = "transformers"  # disabled for now
    WHISPER = "whisper"


class InlineAsrOptions(BaseAsrOptions):
    kind: Literal["inline_model_options"] = "inline_model_options"

    repo_id: str

    verbose: bool = False
    timestamps: bool = True

    temperature: float = 0.0
    max_new_tokens: int = 256
    max_time_chunk: float = 30.0

    torch_dtype: Optional[str] = None
    supported_devices: List[AcceleratorDevice] = [
        AcceleratorDevice.CPU,
        AcceleratorDevice.CUDA,
        AcceleratorDevice.MPS,
    ]

    @property
    def repo_cache_folder(self) -> str:
        return self.repo_id.replace("/", "--")


class InlineAsrNativeWhisperOptions(InlineAsrOptions):
    inference_framework: InferenceAsrFramework = InferenceAsrFramework.WHISPER

    language: str = "en"
    supported_devices: List[AcceleratorDevice] = [
        AcceleratorDevice.CPU,
        AcceleratorDevice.CUDA,
    ]
    word_timestamps: bool = True
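Note how the native Whisper subclass narrows supported_devices (no MPS) and enables word-level timestamps on top of the generic inline options. A quick sketch of those defaults:

from docling.datamodel.accelerator_options import AcceleratorDevice
from docling.datamodel.pipeline_options_asr_model import InlineAsrNativeWhisperOptions

opts = InlineAsrNativeWhisperOptions(repo_id="tiny")
assert AcceleratorDevice.MPS not in opts.supported_devices
assert opts.word_timestamps is True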
docling/document_converter.py
@@ -19,6 +19,7 @@ from docling.backend.md_backend import MarkdownDocumentBackend
 from docling.backend.msexcel_backend import MsExcelDocumentBackend
 from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
 from docling.backend.msword_backend import MsWordDocumentBackend
+from docling.backend.noop_backend import NoOpBackend
 from docling.backend.xml.jats_backend import JatsDocumentBackend
 from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
 from docling.datamodel.base_models import (
@@ -41,6 +42,7 @@ from docling.datamodel.settings import (
     settings,
 )
 from docling.exceptions import ConversionError
+from docling.pipeline.asr_pipeline import AsrPipeline
 from docling.pipeline.base_pipeline import BasePipeline
 from docling.pipeline.simple_pipeline import SimplePipeline
 from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
@@ -118,6 +120,11 @@ class PdfFormatOption(FormatOption):
     backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend


+class AudioFormatOption(FormatOption):
+    pipeline_cls: Type = AsrPipeline
+    backend: Type[AbstractDocumentBackend] = NoOpBackend
+
+
 def _get_default_option(format: InputFormat) -> FormatOption:
     format_to_default_options = {
         InputFormat.CSV: FormatOption(
@@ -156,6 +163,7 @@ def _get_default_option(format: InputFormat) -> FormatOption:
         InputFormat.JSON_DOCLING: FormatOption(
             pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
         ),
+        InputFormat.AUDIO: FormatOption(pipeline_cls=AsrPipeline, backend=NoOpBackend),
     }
     if (options := format_to_default_options.get(format)) is not None:
         return options
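Because _get_default_option now registers AsrPipeline with NoOpBackend for InputFormat.AUDIO, audio conversion also works without an explicit AudioFormatOption. A sketch relying on that default (file name hypothetical):

from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter

converter = DocumentConverter(allowed_formats=[InputFormat.AUDIO])
result = converter.convert("recording.wav")  # transcribed via the default AsrPipeline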
docling/pipeline/asr_pipeline.py (new file, 253 lines)
@@ -0,0 +1,253 @@
import logging
import os
import re
from io import BytesIO
from pathlib import Path
from typing import List, Optional, Union, cast

from docling_core.types.doc import DoclingDocument, DocumentOrigin

# import whisper  # type: ignore
# import librosa
# import numpy as np
# import soundfile as sf  # type: ignore
from docling_core.types.doc.labels import DocItemLabel
from pydantic import BaseModel, Field, validator

from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.noop_backend import NoOpBackend

# from pydub import AudioSegment  # type: ignore
# from transformers import WhisperForConditionalGeneration, WhisperProcessor, pipeline
from docling.datamodel.accelerator_options import (
    AcceleratorOptions,
)
from docling.datamodel.base_models import (
    ConversionStatus,
    FormatToMimeType,
)
from docling.datamodel.document import ConversionResult, InputDocument
from docling.datamodel.pipeline_options import (
    AsrPipelineOptions,
)
from docling.datamodel.pipeline_options_asr_model import (
    InlineAsrNativeWhisperOptions,
    # AsrResponseFormat,
    InlineAsrOptions,
)
from docling.datamodel.pipeline_options_vlm_model import (
    InferenceFramework,
)
from docling.datamodel.settings import settings
from docling.pipeline.base_pipeline import BasePipeline
from docling.utils.accelerator_utils import decide_device
from docling.utils.profiling import ProfilingScope, TimeRecorder

_log = logging.getLogger(__name__)


class _ConversationWord(BaseModel):
    text: str
    start_time: Optional[float] = Field(
        None, description="Start time in seconds from video start"
    )
    end_time: Optional[float] = Field(
        None, ge=0, description="End time in seconds from video start"
    )


class _ConversationItem(BaseModel):
    text: str
    start_time: Optional[float] = Field(
        None, description="Start time in seconds from video start"
    )
    end_time: Optional[float] = Field(
        None, ge=0, description="End time in seconds from video start"
    )
    speaker_id: Optional[int] = Field(None, description="Numeric speaker identifier")
    speaker: Optional[str] = Field(
        None, description="Speaker name, defaults to speaker-{speaker_id}"
    )
    words: Optional[list[_ConversationWord]] = Field(
        None, description="Individual words with time-stamps"
    )

    def __lt__(self, other):
        if not isinstance(other, _ConversationItem):
            return NotImplemented
        return self.start_time < other.start_time

    def __eq__(self, other):
        if not isinstance(other, _ConversationItem):
            return NotImplemented
        return self.start_time == other.start_time

    def to_string(self) -> str:
        """Format the conversation entry as a string"""
        result = ""
        if (self.start_time is not None) and (self.end_time is not None):
            result += f"[time: {self.start_time}-{self.end_time}] "

        if self.speaker is not None:
            result += f"[speaker:{self.speaker}] "

        result += self.text
        return result


class _NativeWhisperModel:
    def __init__(
        self,
        enabled: bool,
        artifacts_path: Optional[Path],
        accelerator_options: AcceleratorOptions,
        asr_options: InlineAsrNativeWhisperOptions,
    ):
        """
        Transcriber using native Whisper.
        """
        self.enabled = enabled

        _log.info(f"artifacts-path: {artifacts_path}")
        _log.info(f"accelerator_options: {accelerator_options}")

        if self.enabled:
            try:
                import whisper  # type: ignore
            except ImportError:
                raise ImportError(
                    "whisper is not installed. Please install it via `pip install openai-whisper` or do `uv sync --extra asr`."
                )
            self.asr_options = asr_options
            self.max_tokens = asr_options.max_new_tokens
            self.temperature = asr_options.temperature

            self.device = decide_device(
                accelerator_options.device,
                supported_devices=asr_options.supported_devices,
            )
            _log.info(f"Available device for Whisper: {self.device}")

            self.model_name = asr_options.repo_id
            _log.info(f"loading _NativeWhisperModel({self.model_name})")
            if artifacts_path is not None:
                _log.info(f"loading {self.model_name} from {artifacts_path}")
                self.model = whisper.load_model(
                    name=self.model_name,
                    device=self.device,
                    download_root=str(artifacts_path),
                )
            else:
                self.model = whisper.load_model(
                    name=self.model_name, device=self.device
                )

            self.verbose = asr_options.verbose
            self.timestamps = asr_options.timestamps
            self.word_timestamps = asr_options.word_timestamps

    def run(self, conv_res: ConversionResult) -> ConversionResult:
        audio_path: Path = Path(conv_res.input.file).resolve()

        try:
            conversation = self.transcribe(audio_path)

            # Ensure we have a proper DoclingDocument
            origin = DocumentOrigin(
                filename=conv_res.input.file.name or "audio.wav",
                mimetype="audio/x-wav",
                binary_hash=conv_res.input.document_hash,
            )
            conv_res.document = DoclingDocument(
                name=conv_res.input.file.stem or "audio.wav", origin=origin
            )

            for citem in conversation:
                conv_res.document.add_text(
                    label=DocItemLabel.TEXT, text=citem.to_string()
                )

            conv_res.status = ConversionStatus.SUCCESS
            return conv_res

        except Exception as exc:
            _log.error(f"Audio transcription has an error: {exc}")

            conv_res.status = ConversionStatus.FAILURE
            return conv_res

    def transcribe(self, fpath: Path) -> list[_ConversationItem]:
        result = self.model.transcribe(
            str(fpath), verbose=self.verbose, word_timestamps=self.word_timestamps
        )

        convo: list[_ConversationItem] = []
        for _ in result["segments"]:
            item = _ConversationItem(
                start_time=_["start"], end_time=_["end"], text=_["text"], words=[]
            )
            if "words" in _ and self.word_timestamps:
                item.words = []
                for __ in _["words"]:
                    item.words.append(
                        _ConversationWord(
                            start_time=__["start"],
                            end_time=__["end"],
                            text=__["word"],
                        )
                    )
            convo.append(item)

        return convo


class AsrPipeline(BasePipeline):
    def __init__(self, pipeline_options: AsrPipelineOptions):
        super().__init__(pipeline_options)
        self.keep_backend = True

        self.pipeline_options: AsrPipelineOptions = pipeline_options

        artifacts_path: Optional[Path] = None
        if pipeline_options.artifacts_path is not None:
            artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
        elif settings.artifacts_path is not None:
            artifacts_path = Path(settings.artifacts_path).expanduser()

        if artifacts_path is not None and not artifacts_path.is_dir():
            raise RuntimeError(
                f"The value of {artifacts_path=} is not valid. "
                "When defined, it must point to a folder containing all models required by the pipeline."
            )

        if isinstance(self.pipeline_options.asr_options, InlineAsrNativeWhisperOptions):
            asr_options: InlineAsrNativeWhisperOptions = (
                self.pipeline_options.asr_options
            )
            self._model = _NativeWhisperModel(
                enabled=True,  # must be always enabled for this pipeline to make sense.
                artifacts_path=artifacts_path,
                accelerator_options=pipeline_options.accelerator_options,
                asr_options=asr_options,
            )
        else:
            _log.error(f"No model support for {self.pipeline_options.asr_options}")

    def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
        status = ConversionStatus.SUCCESS
        return status

    @classmethod
    def get_default_options(cls) -> AsrPipelineOptions:
        return AsrPipelineOptions()

    def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
        _log.info(f"start _build_document in AsrPipeline: {conv_res.input.file}")
        with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
            self._model.run(conv_res=conv_res)

        return conv_res

    @classmethod
    def is_backend_supported(cls, backend: AbstractDocumentBackend):
        return isinstance(backend, NoOpBackend)
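_ConversationItem.to_string() is what shapes each transcript line added to the document. A sketch of the format it produces (values hypothetical; the class is module-private, imported here for illustration only):

from docling.pipeline.asr_pipeline import _ConversationItem  # private, illustration only

item = _ConversationItem(
    text="Hello world.", start_time=0.0, end_time=2.5, speaker="speaker-0"
)
assert item.to_string() == "[time: 0.0-2.5] [speaker:speaker-0] Hello world."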