feat: Support audio input (#1763)

* scaffolding in place

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* doing scaffolding for audio pipeline

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* WIP: got first transcription working

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* all working, time to start cleaning up

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* first working ASR pipeline

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* added openai-whisper as a first transcription model

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* updating with asr_options

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* finalised the first working ASR pipeline with Whisper

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* use whisper from the latest git commit

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* Update docling/datamodel/pipeline_options.py

Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Signed-off-by: Peter W. J. Staar <91719829+PeterStaar-IBM@users.noreply.github.com>

* Update docling/datamodel/pipeline_options.py

Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Signed-off-by: Peter W. J. Staar <91719829+PeterStaar-IBM@users.noreply.github.com>

* updated comment

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* AudioBackend -> DummyBackend

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* file rename

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Rename to NoOpBackend, add test for ASR pipeline

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Support every format in NoOpBackend

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Add missing audio file and test

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Install ffmpeg system dependency for ASR test

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Signed-off-by: Peter W. J. Staar <91719829+PeterStaar-IBM@users.noreply.github.com>
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
Commit: 1557e7ce3e (parent d26dac61a8)
Author: Peter W. J. Staar
Date: 2025-06-23 14:47:26 +02:00
Committed by: GitHub
14 changed files with 941 additions and 62 deletions

File: docling/backend/noop_backend.py

@@ -0,0 +1,51 @@
import logging
from io import BytesIO
from pathlib import Path
from typing import Set, Union
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
class NoOpBackend(AbstractDocumentBackend):
"""
A no-op backend that only validates input existence.
Used e.g. for audio files where actual processing is handled by the ASR pipeline.
"""
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream)
_log.debug(f"NoOpBackend initialized for: {path_or_stream}")
# Validate input
try:
if isinstance(self.path_or_stream, BytesIO):
# Check if stream has content
self.valid = len(self.path_or_stream.getvalue()) > 0
_log.debug(
f"BytesIO stream length: {len(self.path_or_stream.getvalue())}"
)
elif isinstance(self.path_or_stream, Path):
# Check if file exists
self.valid = self.path_or_stream.exists()
_log.debug(f"File exists: {self.valid}")
else:
self.valid = False
except Exception as e:
_log.error(f"NoOpBackend validation failed: {e}")
self.valid = False
def is_valid(self) -> bool:
return self.valid
@classmethod
def supports_pagination(cls) -> bool:
return False
@classmethod
def supported_formats(cls) -> Set[InputFormat]:
return set(InputFormat)
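The validation rule above is deliberately minimal. A hedged, standalone restatement of it (the helper name _noop_is_valid is illustrative and not part of this diff):

from io import BytesIO
from pathlib import Path
from typing import Union

def _noop_is_valid(path_or_stream: Union[BytesIO, Path]) -> bool:
    # Mirrors NoOpBackend.__init__: a stream is valid when non-empty,
    # a path is valid when the file exists; anything else is invalid.
    if isinstance(path_or_stream, BytesIO):
        return len(path_or_stream.getvalue()) > 0
    if isinstance(path_or_stream, Path):
        return path_or_stream.exists()
    return False

assert _noop_is_valid(BytesIO(b"RIFF"))
assert not _noop_is_valid(BytesIO(b""))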

File: docling/cli/main.py

@@ -29,6 +29,15 @@ from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.asr_model_specs import (
WHISPER_BASE,
WHISPER_LARGE,
WHISPER_MEDIUM,
WHISPER_SMALL,
WHISPER_TINY,
WHISPER_TURBO,
AsrModelType,
)
from docling.datamodel.base_models import (
ConversionStatus,
FormatToExtensions,
@@ -37,12 +46,14 @@ from docling.datamodel.base_models import (
)
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
AsrPipelineOptions,
EasyOcrOptions,
OcrOptions,
PaginatedPipelineOptions,
PdfBackend,
PdfPipeline,
PdfPipelineOptions,
PipelineOptions,
ProcessingPipeline,
TableFormerMode,
VlmPipelineOptions,
)
@@ -54,8 +65,14 @@ from docling.datamodel.vlm_model_specs import (
SMOLDOCLING_TRANSFORMERS,
VlmModelType,
)
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
from docling.document_converter import (
AudioFormatOption,
DocumentConverter,
FormatOption,
PdfFormatOption,
)
from docling.models.factories import get_ocr_factory
from docling.pipeline.asr_pipeline import AsrPipeline
from docling.pipeline.vlm_pipeline import VlmPipeline
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
@@ -296,13 +313,17 @@ def convert( # noqa: C901
),
] = ImageRefMode.EMBEDDED,
pipeline: Annotated[
PdfPipeline,
ProcessingPipeline,
typer.Option(..., help="Choose the pipeline to process PDF or image files."),
] = PdfPipeline.STANDARD,
] = ProcessingPipeline.STANDARD,
vlm_model: Annotated[
VlmModelType,
typer.Option(..., help="Choose the VLM model to use with PDF or image files."),
] = VlmModelType.SMOLDOCLING,
asr_model: Annotated[
AsrModelType,
typer.Option(..., help="Choose the ASR model to use with audio/video files."),
] = AsrModelType.WHISPER_TINY,
ocr: Annotated[
bool,
typer.Option(
@@ -450,12 +471,14 @@ def convert( # noqa: C901
),
] = None,
):
log_format = "%(asctime)s\t%(levelname)s\t%(name)s: %(message)s"
if verbose == 0:
logging.basicConfig(level=logging.WARNING)
logging.basicConfig(level=logging.WARNING, format=log_format)
elif verbose == 1:
logging.basicConfig(level=logging.INFO)
logging.basicConfig(level=logging.INFO, format=log_format)
else:
logging.basicConfig(level=logging.DEBUG)
logging.basicConfig(level=logging.DEBUG, format=log_format)
settings.debug.visualize_cells = debug_visualize_cells
settings.debug.visualize_layout = debug_visualize_layout
@@ -530,9 +553,12 @@ def convert( # noqa: C901
ocr_options.lang = ocr_lang_list
accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
pipeline_options: PaginatedPipelineOptions
# pipeline_options: PaginatedPipelineOptions
pipeline_options: PipelineOptions
if pipeline == PdfPipeline.STANDARD:
format_options: Dict[InputFormat, FormatOption] = {}
if pipeline == ProcessingPipeline.STANDARD:
pipeline_options = PdfPipelineOptions(
allow_external_plugins=allow_external_plugins,
enable_remote_services=enable_remote_services,
@@ -574,7 +600,13 @@ def convert( # noqa: C901
pipeline_options=pipeline_options,
backend=backend, # pdf_backend
)
elif pipeline == PdfPipeline.VLM:
format_options = {
InputFormat.PDF: pdf_format_option,
InputFormat.IMAGE: pdf_format_option,
}
elif pipeline == ProcessingPipeline.VLM:
pipeline_options = VlmPipelineOptions(
enable_remote_services=enable_remote_services,
)
@@ -600,13 +632,48 @@ def convert( # noqa: C901
pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
)
format_options = {
InputFormat.PDF: pdf_format_option,
InputFormat.IMAGE: pdf_format_option,
}
elif pipeline == ProcessingPipeline.ASR:
pipeline_options = AsrPipelineOptions(
# enable_remote_services=enable_remote_services,
# artifacts_path = artifacts_path
)
if asr_model == AsrModelType.WHISPER_TINY:
pipeline_options.asr_options = WHISPER_TINY
elif asr_model == AsrModelType.WHISPER_SMALL:
pipeline_options.asr_options = WHISPER_SMALL
elif asr_model == AsrModelType.WHISPER_MEDIUM:
pipeline_options.asr_options = WHISPER_MEDIUM
elif asr_model == AsrModelType.WHISPER_BASE:
pipeline_options.asr_options = WHISPER_BASE
elif asr_model == AsrModelType.WHISPER_LARGE:
pipeline_options.asr_options = WHISPER_LARGE
elif asr_model == AsrModelType.WHISPER_TURBO:
pipeline_options.asr_options = WHISPER_TURBO
else:
_log.error(f"{asr_model} is not known")
raise ValueError(f"{asr_model} is not known")
_log.info(f"pipeline_options: {pipeline_options}")
audio_format_option = AudioFormatOption(
pipeline_cls=AsrPipeline,
pipeline_options=pipeline_options,
)
format_options = {
InputFormat.AUDIO: audio_format_option,
}
if artifacts_path is not None:
pipeline_options.artifacts_path = artifacts_path
# audio_pipeline_options.artifacts_path = artifacts_path
format_options: Dict[InputFormat, FormatOption] = {
InputFormat.PDF: pdf_format_option,
InputFormat.IMAGE: pdf_format_option,
}
doc_converter = DocumentConverter(
allowed_formats=from_formats,
format_options=format_options,
@@ -614,6 +681,7 @@ def convert( # noqa: C901
start_time = time.time()
_log.info(f"paths: {input_doc_paths}")
conv_results = doc_converter.convert_all(
input_doc_paths, headers=parsed_headers, raises_on_error=abort_on_error
)
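The asr_model if/elif chain earlier in this file could equally be a lookup table; a sketch under the same imports (the name resolve_asr_spec is illustrative, not part of the diff):

from docling.datamodel.asr_model_specs import (
    WHISPER_BASE,
    WHISPER_LARGE,
    WHISPER_MEDIUM,
    WHISPER_SMALL,
    WHISPER_TINY,
    WHISPER_TURBO,
    AsrModelType,
)

# One entry per AsrModelType member, mirroring the branch chain above.
_ASR_SPECS = {
    AsrModelType.WHISPER_TINY: WHISPER_TINY,
    AsrModelType.WHISPER_SMALL: WHISPER_SMALL,
    AsrModelType.WHISPER_MEDIUM: WHISPER_MEDIUM,
    AsrModelType.WHISPER_BASE: WHISPER_BASE,
    AsrModelType.WHISPER_LARGE: WHISPER_LARGE,
    AsrModelType.WHISPER_TURBO: WHISPER_TURBO,
}

def resolve_asr_spec(asr_model: AsrModelType):
    if asr_model not in _ASR_SPECS:
        raise ValueError(f"{asr_model} is not known")
    return _ASR_SPECS[asr_model]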

File: docling/datamodel/asr_model_specs.py

@@ -0,0 +1,92 @@
import logging
from enum import Enum
from pydantic import (
AnyUrl,
)
from docling.datamodel.accelerator_options import AcceleratorDevice
from docling.datamodel.pipeline_options_asr_model import (
# AsrResponseFormat,
# ApiAsrOptions,
InferenceAsrFramework,
InlineAsrNativeWhisperOptions,
TransformersModelType,
)
_log = logging.getLogger(__name__)
WHISPER_TINY = InlineAsrNativeWhisperOptions(
repo_id="tiny",
inference_framework=InferenceAsrFramework.WHISPER,
verbose=True,
timestamps=True,
word_timestamps=True,
temperature=0.0,
max_new_tokens=256,
max_time_chunk=30.0,
)
WHISPER_SMALL = InlineAsrNativeWhisperOptions(
repo_id="small",
inference_framework=InferenceAsrFramework.WHISPER,
verbose=True,
timestamps=True,
word_timestamps=True,
temperature=0.0,
max_new_tokens=256,
max_time_chunk=30.0,
)
WHISPER_MEDIUM = InlineAsrNativeWhisperOptions(
repo_id="medium",
inference_framework=InferenceAsrFramework.WHISPER,
verbose=True,
timestamps=True,
word_timestamps=True,
temperature=0.0,
max_new_tokens=256,
max_time_chunk=30.0,
)
WHISPER_BASE = InlineAsrNativeWhisperOptions(
repo_id="base",
inference_framework=InferenceAsrFramework.WHISPER,
verbose=True,
timestamps=True,
word_timestamps=True,
temperature=0.0,
max_new_tokens=256,
max_time_chunk=30.0,
)
WHISPER_LARGE = InlineAsrNativeWhisperOptions(
repo_id="large",
inference_framework=InferenceAsrFramework.WHISPER,
verbose=True,
timestamps=True,
word_timestamps=True,
temperature=0.0,
max_new_tokens=256,
max_time_chunk=30.0,
)
WHISPER_TURBO = InlineAsrNativeWhisperOptions(
repo_id="turbo",
inference_framework=InferenceAsrFramework.WHISPER,
verbose=True,
timestamps=True,
word_timestamps=True,
temperature=0.0,
max_new_tokens=256,
max_time_chunk=30.0,
)
class AsrModelType(str, Enum):
WHISPER_TINY = "whisper_tiny"
WHISPER_SMALL = "whisper_small"
WHISPER_MEDIUM = "whisper_medium"
WHISPER_BASE = "whisper_base"
WHISPER_LARGE = "whisper_large"
WHISPER_TURBO = "whisper_turbo"
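All six presets above differ only in repo_id, so a small factory would express them more compactly. A sketch of that refactor (illustrative only; the diff keeps the presets spelled out):

from docling.datamodel.pipeline_options_asr_model import (
    InferenceAsrFramework,
    InlineAsrNativeWhisperOptions,
)

def _whisper_preset(repo_id: str) -> InlineAsrNativeWhisperOptions:
    # Only the Whisper checkpoint name varies across presets.
    return InlineAsrNativeWhisperOptions(
        repo_id=repo_id,
        inference_framework=InferenceAsrFramework.WHISPER,
        verbose=True,
        timestamps=True,
        word_timestamps=True,
        temperature=0.0,
        max_new_tokens=256,
        max_time_chunk=30.0,
    )

WHISPER_TINY = _whisper_preset("tiny")
WHISPER_TURBO = _whisper_preset("turbo")  # ...and likewise for small/medium/base/large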

File: docling/datamodel/base_models.py

@@ -49,6 +49,7 @@ class InputFormat(str, Enum):
XML_USPTO = "xml_uspto"
XML_JATS = "xml_jats"
JSON_DOCLING = "json_docling"
AUDIO = "audio"
class OutputFormat(str, Enum):
@@ -73,6 +74,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
InputFormat.XLSX: ["xlsx", "xlsm"],
InputFormat.XML_USPTO: ["xml", "txt"],
InputFormat.JSON_DOCLING: ["json"],
InputFormat.AUDIO: ["wav", "mp3"],
}
FormatToMimeType: Dict[InputFormat, List[str]] = {
@@ -104,6 +106,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
],
InputFormat.XML_USPTO: ["application/xml", "text/plain"],
InputFormat.JSON_DOCLING: ["application/json"],
InputFormat.AUDIO: ["audio/x-wav", "audio/mpeg", "audio/wav", "audio/mp3"],
}
MimeTypeToFormat: dict[str, list[InputFormat]] = {
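A quick sanity check of the new audio routing, assuming the mappings are imported as above:

from docling.datamodel.base_models import (
    FormatToExtensions,
    FormatToMimeType,
    InputFormat,
)

# .wav/.mp3 extensions and the matching MIME types now resolve to AUDIO.
assert "wav" in FormatToExtensions[InputFormat.AUDIO]
assert "mp3" in FormatToExtensions[InputFormat.AUDIO]
assert "audio/mpeg" in FormatToMimeType[InputFormat.AUDIO]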

File: docling/datamodel/document.py

@@ -249,7 +249,7 @@ class _DocumentConversionInput(BaseModel):
backend: Type[AbstractDocumentBackend]
if format not in format_options.keys():
_log.error(
f"Input document {obj.name} does not match any allowed format."
f"Input document {obj.name} with format {format} does not match any allowed format: ({format_options.keys()})"
)
backend = _DummyBackend
else:
@@ -318,6 +318,8 @@ class _DocumentConversionInput(BaseModel):
mime = mime or _DocumentConversionInput._detect_csv(content)
mime = mime or "text/plain"
formats = MimeTypeToFormat.get(mime, [])
_log.info(f"detected formats: {formats}")
if formats:
if len(formats) == 1 and mime not in ("text/plain",):
return formats[0]

File: docling/datamodel/pipeline_options.py

@@ -11,8 +11,13 @@ from pydantic import (
)
from typing_extensions import deprecated
from docling.datamodel import asr_model_specs
# Import the following for backwards compatibility
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.pipeline_options_asr_model import (
InlineAsrOptions,
)
from docling.datamodel.pipeline_options_vlm_model import (
ApiVlmOptions,
InferenceFramework,
@@ -260,6 +265,11 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
)
class AsrPipelineOptions(PipelineOptions):
asr_options: InlineAsrOptions = asr_model_specs.WHISPER_TINY
artifacts_path: Optional[Union[Path, str]] = None
class PdfPipelineOptions(PaginatedPipelineOptions):
"""Options for the PDF pipeline."""
@@ -297,6 +307,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
)
class PdfPipeline(str, Enum):
class ProcessingPipeline(str, Enum):
STANDARD = "standard"
VLM = "vlm"
ASR = "asr"
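Typical construction of the new options object; asr_options defaults to WHISPER_TINY, and any preset can be swapped in (a minimal sketch):

from docling.datamodel import asr_model_specs
from docling.datamodel.pipeline_options import AsrPipelineOptions

opts = AsrPipelineOptions()  # uses asr_model_specs.WHISPER_TINY
opts_turbo = AsrPipelineOptions(asr_options=asr_model_specs.WHISPER_TURBO)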

File: docling/datamodel/pipeline_options_asr_model.py

@@ -0,0 +1,57 @@
from enum import Enum
from typing import Any, Dict, List, Literal, Optional, Union
from pydantic import AnyUrl, BaseModel
from typing_extensions import deprecated
from docling.datamodel.accelerator_options import AcceleratorDevice
from docling.datamodel.pipeline_options_vlm_model import (
# InferenceFramework,
TransformersModelType,
)
class BaseAsrOptions(BaseModel):
kind: str
# prompt: str
class InferenceAsrFramework(str, Enum):
# MLX = "mlx" # disabled for now
# TRANSFORMERS = "transformers" # disabled for now
WHISPER = "whisper"
class InlineAsrOptions(BaseAsrOptions):
kind: Literal["inline_model_options"] = "inline_model_options"
repo_id: str
verbose: bool = False
timestamps: bool = True
temperature: float = 0.0
max_new_tokens: int = 256
max_time_chunk: float = 30.0
torch_dtype: Optional[str] = None
supported_devices: List[AcceleratorDevice] = [
AcceleratorDevice.CPU,
AcceleratorDevice.CUDA,
AcceleratorDevice.MPS,
]
@property
def repo_cache_folder(self) -> str:
return self.repo_id.replace("/", "--")
class InlineAsrNativeWhisperOptions(InlineAsrOptions):
inference_framework: InferenceAsrFramework = InferenceAsrFramework.WHISPER
language: str = "en"
supported_devices: List[AcceleratorDevice] = [
AcceleratorDevice.CPU,
AcceleratorDevice.CUDA,
]
word_timestamps: bool = True
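The repo_cache_folder property flattens a repo id into a directory-safe name. A sketch (the HF-style id below is illustrative; the native Whisper presets use bare names like "tiny"):

from docling.datamodel.pipeline_options_asr_model import (
    InlineAsrNativeWhisperOptions,
)

opts = InlineAsrNativeWhisperOptions(repo_id="openai/whisper-tiny")
# "/" is replaced with "--" so the id can serve as a cache folder name.
assert opts.repo_cache_folder == "openai--whisper-tiny"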

File: docling/document_converter.py

@@ -19,6 +19,7 @@ from docling.backend.md_backend import MarkdownDocumentBackend
from docling.backend.msexcel_backend import MsExcelDocumentBackend
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.backend.noop_backend import NoOpBackend
from docling.backend.xml.jats_backend import JatsDocumentBackend
from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
from docling.datamodel.base_models import (
@@ -41,6 +42,7 @@ from docling.datamodel.settings import (
settings,
)
from docling.exceptions import ConversionError
from docling.pipeline.asr_pipeline import AsrPipeline
from docling.pipeline.base_pipeline import BasePipeline
from docling.pipeline.simple_pipeline import SimplePipeline
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
@@ -118,6 +120,11 @@ class PdfFormatOption(FormatOption):
backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend
class AudioFormatOption(FormatOption):
pipeline_cls: Type = AsrPipeline
backend: Type[AbstractDocumentBackend] = NoOpBackend
def _get_default_option(format: InputFormat) -> FormatOption:
format_to_default_options = {
InputFormat.CSV: FormatOption(
@@ -156,6 +163,7 @@ def _get_default_option(format: InputFormat) -> FormatOption:
InputFormat.JSON_DOCLING: FormatOption(
pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
),
InputFormat.AUDIO: FormatOption(pipeline_cls=AsrPipeline, backend=NoOpBackend),
}
if (options := format_to_default_options.get(format)) is not None:
return options
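Because AUDIO now ships with a default FormatOption, an explicit AudioFormatOption is only needed to override the pipeline options; a hedged sketch:

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import AsrPipelineOptions
from docling.document_converter import AudioFormatOption, DocumentConverter
from docling.pipeline.asr_pipeline import AsrPipeline

converter = DocumentConverter(
    format_options={
        InputFormat.AUDIO: AudioFormatOption(
            pipeline_cls=AsrPipeline,
            pipeline_options=AsrPipelineOptions(),
        )
    }
)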

File: docling/pipeline/asr_pipeline.py

@@ -0,0 +1,253 @@
import logging
import os
import re
from io import BytesIO
from pathlib import Path
from typing import List, Optional, Union, cast
from docling_core.types.doc import DoclingDocument, DocumentOrigin
# import whisper # type: ignore
# import librosa
# import numpy as np
# import soundfile as sf # type: ignore
from docling_core.types.doc.labels import DocItemLabel
from pydantic import BaseModel, Field, validator
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.noop_backend import NoOpBackend
# from pydub import AudioSegment # type: ignore
# from transformers import WhisperForConditionalGeneration, WhisperProcessor, pipeline
from docling.datamodel.accelerator_options import (
AcceleratorOptions,
)
from docling.datamodel.base_models import (
ConversionStatus,
FormatToMimeType,
)
from docling.datamodel.document import ConversionResult, InputDocument
from docling.datamodel.pipeline_options import (
AsrPipelineOptions,
)
from docling.datamodel.pipeline_options_asr_model import (
InlineAsrNativeWhisperOptions,
# AsrResponseFormat,
InlineAsrOptions,
)
from docling.datamodel.pipeline_options_vlm_model import (
InferenceFramework,
)
from docling.datamodel.settings import settings
from docling.pipeline.base_pipeline import BasePipeline
from docling.utils.accelerator_utils import decide_device
from docling.utils.profiling import ProfilingScope, TimeRecorder
_log = logging.getLogger(__name__)
class _ConversationWord(BaseModel):
text: str
start_time: Optional[float] = Field(
None, description="Start time in seconds from video start"
)
end_time: Optional[float] = Field(
None, ge=0, description="End time in seconds from video start"
)
class _ConversationItem(BaseModel):
text: str
start_time: Optional[float] = Field(
None, description="Start time in seconds from video start"
)
end_time: Optional[float] = Field(
None, ge=0, description="End time in seconds from video start"
)
speaker_id: Optional[int] = Field(None, description="Numeric speaker identifier")
speaker: Optional[str] = Field(
None, description="Speaker name, defaults to speaker-{speaker_id}"
)
words: Optional[list[_ConversationWord]] = Field(
None, description="Individual words with time-stamps"
)
def __lt__(self, other):
if not isinstance(other, _ConversationItem):
return NotImplemented
return self.start_time < other.start_time
def __eq__(self, other):
if not isinstance(other, _ConversationItem):
return NotImplemented
return self.start_time == other.start_time
def to_string(self) -> str:
"""Format the conversation entry as a string"""
result = ""
if (self.start_time is not None) and (self.end_time is not None):
result += f"[time: {self.start_time}-{self.end_time}] "
if self.speaker is not None:
result += f"[speaker:{self.speaker}] "
result += self.text
return result
class _NativeWhisperModel:
def __init__(
self,
enabled: bool,
artifacts_path: Optional[Path],
accelerator_options: AcceleratorOptions,
asr_options: InlineAsrNativeWhisperOptions,
):
"""
Transcriber using native Whisper.
"""
self.enabled = enabled
_log.info(f"artifacts-path: {artifacts_path}")
_log.info(f"accelerator_options: {accelerator_options}")
if self.enabled:
try:
import whisper # type: ignore
except ImportError:
raise ImportError(
"whisper is not installed. Please install it via `pip install openai-whisper` or do `uv sync --extra asr`."
)
self.asr_options = asr_options
self.max_tokens = asr_options.max_new_tokens
self.temperature = asr_options.temperature
self.device = decide_device(
accelerator_options.device,
supported_devices=asr_options.supported_devices,
)
_log.info(f"Available device for Whisper: {self.device}")
self.model_name = asr_options.repo_id
_log.info(f"loading _NativeWhisperModel({self.model_name})")
if artifacts_path is not None:
_log.info(f"loading {self.model_name} from {artifacts_path}")
self.model = whisper.load_model(
name=self.model_name,
device=self.device,
download_root=str(artifacts_path),
)
else:
self.model = whisper.load_model(
name=self.model_name, device=self.device
)
self.verbose = asr_options.verbose
self.timestamps = asr_options.timestamps
self.word_timestamps = asr_options.word_timestamps
def run(self, conv_res: ConversionResult) -> ConversionResult:
audio_path: Path = Path(conv_res.input.file).resolve()
try:
conversation = self.transcribe(audio_path)
# Ensure we have a proper DoclingDocument
origin = DocumentOrigin(
filename=conv_res.input.file.name or "audio.wav",
mimetype="audio/x-wav",
binary_hash=conv_res.input.document_hash,
)
conv_res.document = DoclingDocument(
name=conv_res.input.file.stem or "audio.wav", origin=origin
)
for citem in conversation:
conv_res.document.add_text(
label=DocItemLabel.TEXT, text=citem.to_string()
)
conv_res.status = ConversionStatus.SUCCESS
return conv_res
except Exception as exc:
_log.error(f"Audio tranciption has an error: {exc}")
conv_res.status = ConversionStatus.FAILURE
return conv_res
def transcribe(self, fpath: Path) -> list[_ConversationItem]:
result = self.model.transcribe(
str(fpath), verbose=self.verbose, word_timestamps=self.word_timestamps
)
convo: list[_ConversationItem] = []
for segment in result["segments"]:
item = _ConversationItem(
start_time=segment["start"], end_time=segment["end"], text=segment["text"], words=[]
)
if "words" in segment and self.word_timestamps:
item.words = []
for word in segment["words"]:
item.words.append(
_ConversationWord(
start_time=word["start"],
end_time=word["end"],
text=word["word"],
)
)
convo.append(item)
return convo
class AsrPipeline(BasePipeline):
def __init__(self, pipeline_options: AsrPipelineOptions):
super().__init__(pipeline_options)
self.keep_backend = True
self.pipeline_options: AsrPipelineOptions = pipeline_options
artifacts_path: Optional[Path] = None
if pipeline_options.artifacts_path is not None:
artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
elif settings.artifacts_path is not None:
artifacts_path = Path(settings.artifacts_path).expanduser()
if artifacts_path is not None and not artifacts_path.is_dir():
raise RuntimeError(
f"The value of {artifacts_path=} is not valid. "
"When defined, it must point to a folder containing all models required by the pipeline."
)
if isinstance(self.pipeline_options.asr_options, InlineAsrNativeWhisperOptions):
asr_options: InlineAsrNativeWhisperOptions = (
self.pipeline_options.asr_options
)
self._model = _NativeWhisperModel(
enabled=True, # must be always enabled for this pipeline to make sense.
artifacts_path=artifacts_path,
accelerator_options=pipeline_options.accelerator_options,
asr_options=asr_options,
)
else:
_log.error(f"No model support for {self.pipeline_options.asr_options}")
def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
status = ConversionStatus.SUCCESS
return status
@classmethod
def get_default_options(cls) -> AsrPipelineOptions:
return AsrPipelineOptions()
def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
_log.info(f"start _build_document in AsrPipeline: {conv_res.input.file}")
with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
self._model.run(conv_res=conv_res)
return conv_res
@classmethod
def is_backend_supported(cls, backend: AbstractDocumentBackend):
return isinstance(backend, NoOpBackend)
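End to end, the new pipeline is driven like any other format; a hedged sketch (the input path is illustrative):

from pathlib import Path
from docling.document_converter import DocumentConverter

converter = DocumentConverter()
# Hypothetical input file; any .wav/.mp3 routes to AsrPipeline via NoOpBackend.
result = converter.convert(Path("sample.mp3"))
print(result.document.export_to_markdown())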