Mirror of https://github.com/DS4SD/docling.git
feat: Support audio input (#1763)
* scaffolding in place
* doing scaffolding for audio pipeline
* WIP: got first transcription working
* all working, time to start cleaning up
* first working ASR pipeline
* added openai-whisper as a first transcription model
* updating with asr_options
* finalised the first working ASR pipeline with Whisper
* use whisper from the latest git commit
* Update docling/datamodel/pipeline_options.py
* Update docling/datamodel/pipeline_options.py
* updated comment
* AudioBackend -> DummyBackend
* file rename
* Rename to NoOpBackend, add test for ASR pipeline
* Support every format in NoOpBackend
* Add missing audio file and test
* Install ffmpeg system dependency for ASR test

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Signed-off-by: Peter W. J. Staar <91719829+PeterStaar-IBM@users.noreply.github.com>
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
parent d26dac61a8 · commit 1557e7ce3e · committed by GitHub
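A minimal end-to-end sketch of the API this commit adds (the input file name is hypothetical; it requires the openai-whisper dependency and ffmpeg, per the commit message above):

from docling.datamodel import asr_model_specs
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import AsrPipelineOptions
from docling.document_converter import AudioFormatOption, DocumentConverter
from docling.pipeline.asr_pipeline import AsrPipeline

converter = DocumentConverter(
    format_options={
        InputFormat.AUDIO: AudioFormatOption(
            pipeline_cls=AsrPipeline,
            pipeline_options=AsrPipelineOptions(
                asr_options=asr_model_specs.WHISPER_TINY
            ),
        )
    }
)
result = converter.convert("recording.mp3")  # hypothetical input file
print(result.document.export_to_markdown())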
docling/backend/noop_backend.py (new file, 51 lines)
@@ -0,0 +1,51 @@
import logging
from io import BytesIO
from pathlib import Path
from typing import Set, Union

from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument

_log = logging.getLogger(__name__)


class NoOpBackend(AbstractDocumentBackend):
    """
    A no-op backend that only validates input existence.
    Used e.g. for audio files where actual processing is handled by the ASR pipeline.
    """

    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        super().__init__(in_doc, path_or_stream)

        _log.debug(f"NoOpBackend initialized for: {path_or_stream}")

        # Validate input
        try:
            if isinstance(self.path_or_stream, BytesIO):
                # Check if stream has content
                self.valid = len(self.path_or_stream.getvalue()) > 0
                _log.debug(
                    f"BytesIO stream length: {len(self.path_or_stream.getvalue())}"
                )
            elif isinstance(self.path_or_stream, Path):
                # Check if file exists
                self.valid = self.path_or_stream.exists()
                _log.debug(f"File exists: {self.valid}")
            else:
                self.valid = False
        except Exception as e:
            _log.error(f"NoOpBackend validation failed: {e}")
            self.valid = False

    def is_valid(self) -> bool:
        return self.valid

    @classmethod
    def supports_pagination(cls) -> bool:
        return False

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        return set(InputFormat)
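Since supported_formats() returns every InputFormat, this backend can be paired with any format whose real processing lives in the pipeline, as the ASR pipeline does for audio. A quick sketch of that contract:

from docling.backend.noop_backend import NoOpBackend
from docling.datamodel.base_models import InputFormat

# NoOpBackend accepts every input format and never paginates;
# validity is just "the file exists" or "the stream is non-empty".
assert NoOpBackend.supported_formats() == set(InputFormat)
assert NoOpBackend.supports_pagination() is False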
docling/cli/main.py
@@ -29,6 +29,15 @@ from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
+from docling.datamodel.asr_model_specs import (
+    WHISPER_BASE,
+    WHISPER_LARGE,
+    WHISPER_MEDIUM,
+    WHISPER_SMALL,
+    WHISPER_TINY,
+    WHISPER_TURBO,
+    AsrModelType,
+)
 from docling.datamodel.base_models import (
     ConversionStatus,
     FormatToExtensions,
@@ -37,12 +46,14 @@ from docling.datamodel.base_models import (
 )
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
+    AsrPipelineOptions,
     EasyOcrOptions,
     OcrOptions,
     PaginatedPipelineOptions,
     PdfBackend,
-    PdfPipeline,
     PdfPipelineOptions,
+    PipelineOptions,
+    ProcessingPipeline,
     TableFormerMode,
     VlmPipelineOptions,
 )
@@ -54,8 +65,14 @@ from docling.datamodel.vlm_model_specs import (
     SMOLDOCLING_TRANSFORMERS,
     VlmModelType,
 )
-from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
+from docling.document_converter import (
+    AudioFormatOption,
+    DocumentConverter,
+    FormatOption,
+    PdfFormatOption,
+)
 from docling.models.factories import get_ocr_factory
+from docling.pipeline.asr_pipeline import AsrPipeline
 from docling.pipeline.vlm_pipeline import VlmPipeline

 warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
@@ -296,13 +313,17 @@ def convert( # noqa: C901
         ),
     ] = ImageRefMode.EMBEDDED,
     pipeline: Annotated[
-        PdfPipeline,
+        ProcessingPipeline,
         typer.Option(..., help="Choose the pipeline to process PDF or image files."),
-    ] = PdfPipeline.STANDARD,
+    ] = ProcessingPipeline.STANDARD,
     vlm_model: Annotated[
         VlmModelType,
         typer.Option(..., help="Choose the VLM model to use with PDF or image files."),
     ] = VlmModelType.SMOLDOCLING,
+    asr_model: Annotated[
+        AsrModelType,
+        typer.Option(..., help="Choose the ASR model to use with audio/video files."),
+    ] = AsrModelType.WHISPER_TINY,
     ocr: Annotated[
         bool,
         typer.Option(
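With the option above in place, an ASR run from the CLI would look like `docling --pipeline asr --asr-model whisper_turbo recording.mp3` (file name hypothetical; Typer derives the flag names from the parameter names above).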
@@ -450,12 +471,14 @@ def convert( # noqa: C901
         ),
     ] = None,
 ):
+    log_format = "%(asctime)s\t%(levelname)s\t%(name)s: %(message)s"
+
     if verbose == 0:
-        logging.basicConfig(level=logging.WARNING)
+        logging.basicConfig(level=logging.WARNING, format=log_format)
     elif verbose == 1:
-        logging.basicConfig(level=logging.INFO)
+        logging.basicConfig(level=logging.INFO, format=log_format)
     else:
-        logging.basicConfig(level=logging.DEBUG)
+        logging.basicConfig(level=logging.DEBUG, format=log_format)

     settings.debug.visualize_cells = debug_visualize_cells
     settings.debug.visualize_layout = debug_visualize_layout
@@ -530,9 +553,12 @@ def convert( # noqa: C901
         ocr_options.lang = ocr_lang_list

     accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
-    pipeline_options: PaginatedPipelineOptions
+    # pipeline_options: PaginatedPipelineOptions
+    pipeline_options: PipelineOptions
+
+    format_options: Dict[InputFormat, FormatOption] = {}

-    if pipeline == PdfPipeline.STANDARD:
+    if pipeline == ProcessingPipeline.STANDARD:
         pipeline_options = PdfPipelineOptions(
             allow_external_plugins=allow_external_plugins,
             enable_remote_services=enable_remote_services,
@@ -574,7 +600,13 @@ def convert( # noqa: C901
             pipeline_options=pipeline_options,
             backend=backend,  # pdf_backend
         )
-    elif pipeline == PdfPipeline.VLM:
+
+        format_options = {
+            InputFormat.PDF: pdf_format_option,
+            InputFormat.IMAGE: pdf_format_option,
+        }
+
+    elif pipeline == ProcessingPipeline.VLM:
         pipeline_options = VlmPipelineOptions(
             enable_remote_services=enable_remote_services,
         )
@@ -600,13 +632,48 @@ def convert( # noqa: C901
             pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
         )

+        format_options = {
+            InputFormat.PDF: pdf_format_option,
+            InputFormat.IMAGE: pdf_format_option,
+        }
+
+    elif pipeline == ProcessingPipeline.ASR:
+        pipeline_options = AsrPipelineOptions(
+            # enable_remote_services=enable_remote_services,
+            # artifacts_path = artifacts_path
+        )
+
+        if asr_model == AsrModelType.WHISPER_TINY:
+            pipeline_options.asr_options = WHISPER_TINY
+        elif asr_model == AsrModelType.WHISPER_SMALL:
+            pipeline_options.asr_options = WHISPER_SMALL
+        elif asr_model == AsrModelType.WHISPER_MEDIUM:
+            pipeline_options.asr_options = WHISPER_MEDIUM
+        elif asr_model == AsrModelType.WHISPER_BASE:
+            pipeline_options.asr_options = WHISPER_BASE
+        elif asr_model == AsrModelType.WHISPER_LARGE:
+            pipeline_options.asr_options = WHISPER_LARGE
+        elif asr_model == AsrModelType.WHISPER_TURBO:
+            pipeline_options.asr_options = WHISPER_TURBO
+        else:
+            _log.error(f"{asr_model} is not known")
+            raise ValueError(f"{asr_model} is not known")
+
+        _log.info(f"pipeline_options: {pipeline_options}")
+
+        audio_format_option = AudioFormatOption(
+            pipeline_cls=AsrPipeline,
+            pipeline_options=pipeline_options,
+        )
+
+        format_options = {
+            InputFormat.AUDIO: audio_format_option,
+        }
+
     if artifacts_path is not None:
         pipeline_options.artifacts_path = artifacts_path
+        # audio_pipeline_options.artifacts_path = artifacts_path

-    format_options: Dict[InputFormat, FormatOption] = {
-        InputFormat.PDF: pdf_format_option,
-        InputFormat.IMAGE: pdf_format_option,
-    }
     doc_converter = DocumentConverter(
         allowed_formats=from_formats,
         format_options=format_options,
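The if/elif ladder above maps each AsrModelType member to its preset one case at a time; a table-driven equivalent (a hypothetical refactor, not part of this diff) would keep the mapping in one place:

# Hypothetical alternative to the chain above: one dict lookup.
_ASR_SPECS = {
    AsrModelType.WHISPER_TINY: WHISPER_TINY,
    AsrModelType.WHISPER_SMALL: WHISPER_SMALL,
    AsrModelType.WHISPER_MEDIUM: WHISPER_MEDIUM,
    AsrModelType.WHISPER_BASE: WHISPER_BASE,
    AsrModelType.WHISPER_LARGE: WHISPER_LARGE,
    AsrModelType.WHISPER_TURBO: WHISPER_TURBO,
}
if asr_model not in _ASR_SPECS:
    raise ValueError(f"{asr_model} is not known")
pipeline_options.asr_options = _ASR_SPECS[asr_model]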
@@ -614,6 +681,7 @@ def convert( # noqa: C901

     start_time = time.time()

+    _log.info(f"paths: {input_doc_paths}")
     conv_results = doc_converter.convert_all(
         input_doc_paths, headers=parsed_headers, raises_on_error=abort_on_error
     )
docling/datamodel/asr_model_specs.py (new file, 92 lines)
@@ -0,0 +1,92 @@
import logging
from enum import Enum

from pydantic import (
    AnyUrl,
)

from docling.datamodel.accelerator_options import AcceleratorDevice
from docling.datamodel.pipeline_options_asr_model import (
    # AsrResponseFormat,
    # ApiAsrOptions,
    InferenceAsrFramework,
    InlineAsrNativeWhisperOptions,
    TransformersModelType,
)

_log = logging.getLogger(__name__)

WHISPER_TINY = InlineAsrNativeWhisperOptions(
    repo_id="tiny",
    inference_framework=InferenceAsrFramework.WHISPER,
    verbose=True,
    timestamps=True,
    word_timestamps=True,
    temperature=0.0,
    max_new_tokens=256,
    max_time_chunk=30.0,
)

WHISPER_SMALL = InlineAsrNativeWhisperOptions(
    repo_id="small",
    inference_framework=InferenceAsrFramework.WHISPER,
    verbose=True,
    timestamps=True,
    word_timestamps=True,
    temperature=0.0,
    max_new_tokens=256,
    max_time_chunk=30.0,
)

WHISPER_MEDIUM = InlineAsrNativeWhisperOptions(
    repo_id="medium",
    inference_framework=InferenceAsrFramework.WHISPER,
    verbose=True,
    timestamps=True,
    word_timestamps=True,
    temperature=0.0,
    max_new_tokens=256,
    max_time_chunk=30.0,
)

WHISPER_BASE = InlineAsrNativeWhisperOptions(
    repo_id="base",
    inference_framework=InferenceAsrFramework.WHISPER,
    verbose=True,
    timestamps=True,
    word_timestamps=True,
    temperature=0.0,
    max_new_tokens=256,
    max_time_chunk=30.0,
)

WHISPER_LARGE = InlineAsrNativeWhisperOptions(
    repo_id="large",
    inference_framework=InferenceAsrFramework.WHISPER,
    verbose=True,
    timestamps=True,
    word_timestamps=True,
    temperature=0.0,
    max_new_tokens=256,
    max_time_chunk=30.0,
)

WHISPER_TURBO = InlineAsrNativeWhisperOptions(
    repo_id="turbo",
    inference_framework=InferenceAsrFramework.WHISPER,
    verbose=True,
    timestamps=True,
    word_timestamps=True,
    temperature=0.0,
    max_new_tokens=256,
    max_time_chunk=30.0,
)


class AsrModelType(str, Enum):
    WHISPER_TINY = "whisper_tiny"
    WHISPER_SMALL = "whisper_small"
    WHISPER_MEDIUM = "whisper_medium"
    WHISPER_BASE = "whisper_base"
    WHISPER_LARGE = "whisper_large"
    WHISPER_TURBO = "whisper_turbo"
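The six presets differ only in repo_id; every other knob is shared. A custom preset along the same lines (hypothetical, not part of this diff) can also override the language field that InlineAsrNativeWhisperOptions defines further down:

# Hypothetical preset: Whisper "base" forced to German transcription.
WHISPER_BASE_DE = InlineAsrNativeWhisperOptions(
    repo_id="base",
    inference_framework=InferenceAsrFramework.WHISPER,
    language="de",
    verbose=True,
    timestamps=True,
    word_timestamps=True,
    temperature=0.0,
    max_new_tokens=256,
    max_time_chunk=30.0,
)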
docling/datamodel/base_models.py
@@ -49,6 +49,7 @@ class InputFormat(str, Enum):
     XML_USPTO = "xml_uspto"
     XML_JATS = "xml_jats"
     JSON_DOCLING = "json_docling"
+    AUDIO = "audio"


 class OutputFormat(str, Enum):
@@ -73,6 +74,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
     InputFormat.XLSX: ["xlsx", "xlsm"],
     InputFormat.XML_USPTO: ["xml", "txt"],
     InputFormat.JSON_DOCLING: ["json"],
+    InputFormat.AUDIO: ["wav", "mp3"],
 }

 FormatToMimeType: Dict[InputFormat, List[str]] = {
@@ -104,6 +106,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
     ],
     InputFormat.XML_USPTO: ["application/xml", "text/plain"],
     InputFormat.JSON_DOCLING: ["application/json"],
+    InputFormat.AUDIO: ["audio/x-wav", "audio/mpeg", "audio/wav", "audio/mp3"],
 }

 MimeTypeToFormat: dict[str, list[InputFormat]] = {
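These three registrations are what let format detection route audio files: both the extension and the MIME type resolve to InputFormat.AUDIO. A quick sketch of the lookup:

from docling.datamodel.base_models import (
    FormatToExtensions,
    FormatToMimeType,
    InputFormat,
)

assert "mp3" in FormatToExtensions[InputFormat.AUDIO]
assert "audio/mpeg" in FormatToMimeType[InputFormat.AUDIO]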
docling/datamodel/document.py
@@ -249,7 +249,7 @@ class _DocumentConversionInput(BaseModel):
         backend: Type[AbstractDocumentBackend]
         if format not in format_options.keys():
             _log.error(
-                f"Input document {obj.name} does not match any allowed format."
+                f"Input document {obj.name} with format {format} does not match any allowed format: ({format_options.keys()})"
             )
             backend = _DummyBackend
         else:
@@ -318,6 +318,8 @@ class _DocumentConversionInput(BaseModel):
         mime = mime or _DocumentConversionInput._detect_csv(content)
         mime = mime or "text/plain"
         formats = MimeTypeToFormat.get(mime, [])
+        _log.info(f"detected formats: {formats}")
+
         if formats:
             if len(formats) == 1 and mime not in ("text/plain"):
                 return formats[0]
docling/datamodel/pipeline_options.py
@@ -11,8 +11,13 @@ from pydantic import (
 )
 from typing_extensions import deprecated

+from docling.datamodel import asr_model_specs
+
 # Import the following for backwards compatibility
 from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
+from docling.datamodel.pipeline_options_asr_model import (
+    InlineAsrOptions,
+)
 from docling.datamodel.pipeline_options_vlm_model import (
     ApiVlmOptions,
     InferenceFramework,
@@ -260,6 +265,11 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
     )


+class AsrPipelineOptions(PipelineOptions):
+    asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY
+    artifacts_path: Optional[Union[Path, str]] = None
+
+
 class PdfPipelineOptions(PaginatedPipelineOptions):
     """Options for the PDF pipeline."""

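AsrPipelineOptions defaults to WHISPER_TINY, so choosing another preset is a one-field override. A minimal sketch:

from docling.datamodel import asr_model_specs
from docling.datamodel.pipeline_options import AsrPipelineOptions

opts = AsrPipelineOptions(asr_options=asr_model_specs.WHISPER_TURBO)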
@@ -297,6 +307,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
     )


-class PdfPipeline(str, Enum):
+class ProcessingPipeline(str, Enum):
     STANDARD = "standard"
     VLM = "vlm"
+    ASR = "asr"
docling/datamodel/pipeline_options_asr_model.py (new file, 57 lines)
@@ -0,0 +1,57 @@
from enum import Enum
from typing import Any, Dict, List, Literal, Optional, Union

from pydantic import AnyUrl, BaseModel
from typing_extensions import deprecated

from docling.datamodel.accelerator_options import AcceleratorDevice
from docling.datamodel.pipeline_options_vlm_model import (
    # InferenceFramework,
    TransformersModelType,
)


class BaseAsrOptions(BaseModel):
    kind: str
    # prompt: str


class InferenceAsrFramework(str, Enum):
    # MLX = "mlx"  # disabled for now
    # TRANSFORMERS = "transformers"  # disabled for now
    WHISPER = "whisper"


class InlineAsrOptions(BaseAsrOptions):
    kind: Literal["inline_model_options"] = "inline_model_options"

    repo_id: str

    verbose: bool = False
    timestamps: bool = True

    temperature: float = 0.0
    max_new_tokens: int = 256
    max_time_chunk: float = 30.0

    torch_dtype: Optional[str] = None
    supported_devices: List[AcceleratorDevice] = [
        AcceleratorDevice.CPU,
        AcceleratorDevice.CUDA,
        AcceleratorDevice.MPS,
    ]

    @property
    def repo_cache_folder(self) -> str:
        return self.repo_id.replace("/", "--")


class InlineAsrNativeWhisperOptions(InlineAsrOptions):
    inference_framework: InferenceAsrFramework = InferenceAsrFramework.WHISPER

    language: str = "en"
    supported_devices: List[AcceleratorDevice] = [
        AcceleratorDevice.CPU,
        AcceleratorDevice.CUDA,
    ]
    word_timestamps: bool = True
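Note how the native Whisper subclass narrows supported_devices (no MPS) and enables word-level timestamps on top of the generic inline options. A quick sketch of those defaults:

from docling.datamodel.accelerator_options import AcceleratorDevice
from docling.datamodel.pipeline_options_asr_model import InlineAsrNativeWhisperOptions

opts = InlineAsrNativeWhisperOptions(repo_id="tiny")
assert AcceleratorDevice.MPS not in opts.supported_devices
assert opts.word_timestamps is True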
docling/document_converter.py
@@ -19,6 +19,7 @@ from docling.backend.md_backend import MarkdownDocumentBackend
 from docling.backend.msexcel_backend import MsExcelDocumentBackend
 from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
 from docling.backend.msword_backend import MsWordDocumentBackend
+from docling.backend.noop_backend import NoOpBackend
 from docling.backend.xml.jats_backend import JatsDocumentBackend
 from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
 from docling.datamodel.base_models import (
@@ -41,6 +42,7 @@ from docling.datamodel.settings import (
     settings,
 )
 from docling.exceptions import ConversionError
+from docling.pipeline.asr_pipeline import AsrPipeline
 from docling.pipeline.base_pipeline import BasePipeline
 from docling.pipeline.simple_pipeline import SimplePipeline
 from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
@@ -118,6 +120,11 @@ class PdfFormatOption(FormatOption):
     backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend


+class AudioFormatOption(FormatOption):
+    pipeline_cls: Type = AsrPipeline
+    backend: Type[AbstractDocumentBackend] = NoOpBackend
+
+
 def _get_default_option(format: InputFormat) -> FormatOption:
     format_to_default_options = {
         InputFormat.CSV: FormatOption(
@@ -156,6 +163,7 @@ def _get_default_option(format: InputFormat) -> FormatOption:
         InputFormat.JSON_DOCLING: FormatOption(
             pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
         ),
+        InputFormat.AUDIO: FormatOption(pipeline_cls=AsrPipeline, backend=NoOpBackend),
     }
     if (options := format_to_default_options.get(format)) is not None:
         return options
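Because _get_default_option now registers AsrPipeline with NoOpBackend for InputFormat.AUDIO, audio conversion also works without an explicit AudioFormatOption. A sketch relying on that default (file name hypothetical):

from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter

converter = DocumentConverter(allowed_formats=[InputFormat.AUDIO])
result = converter.convert("recording.wav")  # transcribed via the default AsrPipeline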
docling/pipeline/asr_pipeline.py (new file, 253 lines)
@@ -0,0 +1,253 @@
import logging
import os
import re
from io import BytesIO
from pathlib import Path
from typing import List, Optional, Union, cast

from docling_core.types.doc import DoclingDocument, DocumentOrigin

# import whisper  # type: ignore
# import librosa
# import numpy as np
# import soundfile as sf  # type: ignore
from docling_core.types.doc.labels import DocItemLabel
from pydantic import BaseModel, Field, validator

from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.noop_backend import NoOpBackend

# from pydub import AudioSegment  # type: ignore
# from transformers import WhisperForConditionalGeneration, WhisperProcessor, pipeline
from docling.datamodel.accelerator_options import (
    AcceleratorOptions,
)
from docling.datamodel.base_models import (
    ConversionStatus,
    FormatToMimeType,
)
from docling.datamodel.document import ConversionResult, InputDocument
from docling.datamodel.pipeline_options import (
    AsrPipelineOptions,
)
from docling.datamodel.pipeline_options_asr_model import (
    InlineAsrNativeWhisperOptions,
    # AsrResponseFormat,
    InlineAsrOptions,
)
from docling.datamodel.pipeline_options_vlm_model import (
    InferenceFramework,
)
from docling.datamodel.settings import settings
from docling.pipeline.base_pipeline import BasePipeline
from docling.utils.accelerator_utils import decide_device
from docling.utils.profiling import ProfilingScope, TimeRecorder

_log = logging.getLogger(__name__)


class _ConversationWord(BaseModel):
    text: str
    start_time: Optional[float] = Field(
        None, description="Start time in seconds from video start"
    )
    end_time: Optional[float] = Field(
        None, ge=0, description="End time in seconds from video start"
    )


class _ConversationItem(BaseModel):
    text: str
    start_time: Optional[float] = Field(
        None, description="Start time in seconds from video start"
    )
    end_time: Optional[float] = Field(
        None, ge=0, description="End time in seconds from video start"
    )
    speaker_id: Optional[int] = Field(None, description="Numeric speaker identifier")
    speaker: Optional[str] = Field(
        None, description="Speaker name, defaults to speaker-{speaker_id}"
    )
    words: Optional[list[_ConversationWord]] = Field(
        None, description="Individual words with time-stamps"
    )

    def __lt__(self, other):
        if not isinstance(other, _ConversationItem):
            return NotImplemented
        return self.start_time < other.start_time

    def __eq__(self, other):
        if not isinstance(other, _ConversationItem):
            return NotImplemented
        return self.start_time == other.start_time

    def to_string(self) -> str:
        """Format the conversation entry as a string"""
        result = ""
        if (self.start_time is not None) and (self.end_time is not None):
            result += f"[time: {self.start_time}-{self.end_time}] "

        if self.speaker is not None:
            result += f"[speaker:{self.speaker}] "

        result += self.text
        return result


class _NativeWhisperModel:
    def __init__(
        self,
        enabled: bool,
        artifacts_path: Optional[Path],
        accelerator_options: AcceleratorOptions,
        asr_options: InlineAsrNativeWhisperOptions,
    ):
        """
        Transcriber using native Whisper.
        """
        self.enabled = enabled

        _log.info(f"artifacts-path: {artifacts_path}")
        _log.info(f"accelerator_options: {accelerator_options}")

        if self.enabled:
            try:
                import whisper  # type: ignore
            except ImportError:
                raise ImportError(
                    "whisper is not installed. Please install it via `pip install openai-whisper` or do `uv sync --extra asr`."
                )
            self.asr_options = asr_options
            self.max_tokens = asr_options.max_new_tokens
            self.temperature = asr_options.temperature

            self.device = decide_device(
                accelerator_options.device,
                supported_devices=asr_options.supported_devices,
            )
            _log.info(f"Available device for Whisper: {self.device}")

            self.model_name = asr_options.repo_id
            _log.info(f"loading _NativeWhisperModel({self.model_name})")
            if artifacts_path is not None:
                _log.info(f"loading {self.model_name} from {artifacts_path}")
                self.model = whisper.load_model(
                    name=self.model_name,
                    device=self.device,
                    download_root=str(artifacts_path),
                )
            else:
                self.model = whisper.load_model(
                    name=self.model_name, device=self.device
                )

            self.verbose = asr_options.verbose
            self.timestamps = asr_options.timestamps
            self.word_timestamps = asr_options.word_timestamps

    def run(self, conv_res: ConversionResult) -> ConversionResult:
        audio_path: Path = Path(conv_res.input.file).resolve()

        try:
            conversation = self.transcribe(audio_path)

            # Ensure we have a proper DoclingDocument
            origin = DocumentOrigin(
                filename=conv_res.input.file.name or "audio.wav",
                mimetype="audio/x-wav",
                binary_hash=conv_res.input.document_hash,
            )
            conv_res.document = DoclingDocument(
                name=conv_res.input.file.stem or "audio.wav", origin=origin
            )

            for citem in conversation:
                conv_res.document.add_text(
                    label=DocItemLabel.TEXT, text=citem.to_string()
                )

            conv_res.status = ConversionStatus.SUCCESS
            return conv_res

        except Exception as exc:
            _log.error(f"Audio transcription has an error: {exc}")

            conv_res.status = ConversionStatus.FAILURE
            return conv_res

    def transcribe(self, fpath: Path) -> list[_ConversationItem]:
        result = self.model.transcribe(
            str(fpath), verbose=self.verbose, word_timestamps=self.word_timestamps
        )

        convo: list[_ConversationItem] = []
        for _ in result["segments"]:
            item = _ConversationItem(
                start_time=_["start"], end_time=_["end"], text=_["text"], words=[]
            )
            if "words" in _ and self.word_timestamps:
                item.words = []
                for __ in _["words"]:
                    item.words.append(
                        _ConversationWord(
                            start_time=__["start"],
                            end_time=__["end"],
                            text=__["word"],
                        )
                    )
            convo.append(item)

        return convo


class AsrPipeline(BasePipeline):
    def __init__(self, pipeline_options: AsrPipelineOptions):
        super().__init__(pipeline_options)
        self.keep_backend = True

        self.pipeline_options: AsrPipelineOptions = pipeline_options

        artifacts_path: Optional[Path] = None
        if pipeline_options.artifacts_path is not None:
            artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
        elif settings.artifacts_path is not None:
            artifacts_path = Path(settings.artifacts_path).expanduser()

        if artifacts_path is not None and not artifacts_path.is_dir():
            raise RuntimeError(
                f"The value of {artifacts_path=} is not valid. "
                "When defined, it must point to a folder containing all models required by the pipeline."
            )

        if isinstance(self.pipeline_options.asr_options, InlineAsrNativeWhisperOptions):
            asr_options: InlineAsrNativeWhisperOptions = (
                self.pipeline_options.asr_options
            )
            self._model = _NativeWhisperModel(
                enabled=True,  # must be always enabled for this pipeline to make sense.
                artifacts_path=artifacts_path,
                accelerator_options=pipeline_options.accelerator_options,
                asr_options=asr_options,
            )
        else:
            _log.error(f"No model support for {self.pipeline_options.asr_options}")

    def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
        status = ConversionStatus.SUCCESS
        return status

    @classmethod
    def get_default_options(cls) -> AsrPipelineOptions:
        return AsrPipelineOptions()

    def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
        _log.info(f"start _build_document in AsrPipeline: {conv_res.input.file}")
        with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
            self._model.run(conv_res=conv_res)

        return conv_res

    @classmethod
    def is_backend_supported(cls, backend: AbstractDocumentBackend):
        return isinstance(backend, NoOpBackend)
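_ConversationItem.to_string() is what shapes each transcript line added to the document. A sketch of the format it produces (values hypothetical; the class is module-private, imported here for illustration only):

from docling.pipeline.asr_pipeline import _ConversationItem  # private, illustration only

item = _ConversationItem(
    text="Hello world.", start_time=0.0, end_time=2.5, speaker="speaker-0"
)
assert item.to_string() == "[time: 0.0-2.5] [speaker:speaker-0] Hello world."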