WIP: got first transcription working

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
2025-07-26 20:14:47 +00:00 · 2025-06-13 10:43:23 +02:00 · 2025-06-13 10:43:23 +02:00 · 6dead88464
commit 6dead88464
parent 1d4008ac7c
9 changed files with 352 additions and 67 deletions
--- a/docling/backend/audio_backend.py
+++ b/docling/backend/audio_backend.py
@ -0,0 +1,80 @@
 import logging
 import warnings
 from io import BytesIO, StringIO
 from pathlib import Path
 from typing import Set, Union
 from docling_core.types.doc import (
    DoclingDocument,
    DocumentOrigin,
 )
 from docling.backend.abstract_backend import DeclarativeDocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import InputDocument
 _log = logging.getLogger(__name__)
 class AudioBackend(DeclarativeDocumentBackend):
    # content: StringIO
    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        super().__init__(in_doc, path_or_stream)
        _log.info(f"path: {path_or_stream}")
        # Load content
        try:
            if isinstance(self.path_or_stream, BytesIO):
                _log.info(f"reading streaming: {self.path_or_stream}")
                # self.content = StringIO(self.path_or_stream.getvalue().decode("utf-8"))
            elif isinstance(self.path_or_stream, Path):
                _log.info(f"reading file: {self.path_or_stream}")
                # self.content = StringIO(self.path_or_stream.read())
            self.valid = True
        except Exception as e:
            raise RuntimeError(
                f"AudioBackend could not load document with hash {self.document_hash}"
            ) from e
        return
    def is_valid(self) -> bool:
        return self.valid
    @classmethod
    def supports_pagination(cls) -> bool:
        return False
    def unload(self):
        if isinstance(self.path_or_stream, BytesIO):
            self.path_or_stream.close()
        self.path_or_stream = None
    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        return {InputFormat.AUDIO_WAV}
    def convert(self) -> DoclingDocument:
        """
        Parses the audio file into a structured document model.
        """
        # Parse the CSV into a structured document model
        origin = DocumentOrigin(
            filename=self.file.name or "audio.wav",
            mimetype="audio/wav",
            binary_hash=self.document_hash,
        )
        _log.info(f"origin: {origin}")
        doc = DoclingDocument(name=self.file.stem or "audio.wav", origin=origin)
        if self.is_valid():
            _log.error("time to get going ...")
        else:
            raise RuntimeError(
                f"Cannot convert doc with {self.document_hash} because the audio backend failed to init."
            )
        return doc
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@ -23,6 +23,7 @@ from docling_core.utils.file import resolve_source_to_path
 from pydantic import TypeAdapter
 from rich.console import Console
 from docling.backend.audio_backend import AudioBackend
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
 from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
@ -59,7 +60,12 @@ from docling.datamodel.vlm_model_specs import (
    SMOLDOCLING_TRANSFORMERS,
    VlmModelType,
 )
-from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
+from docling.document_converter import (
    AudioFormatOption,
    DocumentConverter,
    FormatOption,
    PdfFormatOption,
 )
 from docling.models.factories import get_ocr_factory
 from docling.pipeline.asr_pipeline import AsrPipeline
 from docling.pipeline.vlm_pipeline import VlmPipeline
@ -544,39 +550,7 @@ def convert(  # noqa: C901
        format_options: Dict[InputFormat, FormatOption] = {}
-        if pipeline == ProcessingPipeline.VLM:
+        if pipeline == ProcessingPipeline.STANDARD:
            pipeline_options = VlmPipelineOptions(
                enable_remote_services=enable_remote_services,
            )
            if vlm_model == VlmModelType.GRANITE_VISION:
                pipeline_options.vlm_options = GRANITE_VISION_TRANSFORMERS
            elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA:
                pipeline_options.vlm_options = GRANITE_VISION_OLLAMA
            elif vlm_model == VlmModelType.SMOLDOCLING:
                pipeline_options.vlm_options = SMOLDOCLING_TRANSFORMERS
                if sys.platform == "darwin":
                    try:
                        import mlx_vlm
                        pipeline_options.vlm_options = SMOLDOCLING_MLX
                    except ImportError:
                        _log.warning(
                            "To run SmolDocling faster, please install mlx-vlm:\n"
                            "pip install mlx-vlm"
                        )
            pdf_format_option = PdfFormatOption(
                pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
            )
            format_options: Dict[InputFormat, FormatOption] = {
                InputFormat.PDF: pdf_format_option,
                InputFormat.IMAGE: pdf_format_option,
            }
        elif pipeline == ProcessingPipeline.STANDARD:
            pipeline_options = PdfPipelineOptions(
                allow_external_plugins=allow_external_plugins,
                enable_remote_services=enable_remote_services,
@ -624,16 +598,52 @@ def convert(  # noqa: C901
                InputFormat.IMAGE: pdf_format_option,
            }
-        elif pipeline == ProcessingPipeline.ASR:
+        elif pipeline == ProcessingPipeline.VLM:
-            audio_pipeline_options = AsrPipelineOptions(
+            pipeline_options = VlmPipelineOptions(
-                # enable_remote_services=enable_remote_services,
+                enable_remote_services=enable_remote_services,
                artifacts_path = artifacts_path
            )
-            audio_format_option = PdfFormatOption(
+            if vlm_model == VlmModelType.GRANITE_VISION:
                pipeline_options.vlm_options = GRANITE_VISION_TRANSFORMERS
            elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA:
                pipeline_options.vlm_options = GRANITE_VISION_OLLAMA
            elif vlm_model == VlmModelType.SMOLDOCLING:
                pipeline_options.vlm_options = SMOLDOCLING_TRANSFORMERS
                if sys.platform == "darwin":
                    try:
                        import mlx_vlm
                        pipeline_options.vlm_options = SMOLDOCLING_MLX
                    except ImportError:
                        _log.warning(
                            "To run SmolDocling faster, please install mlx-vlm:\n"
                            "pip install mlx-vlm"
                        )
            pdf_format_option = PdfFormatOption(
                pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
            )
            format_options: Dict[InputFormat, FormatOption] = {
                InputFormat.PDF: pdf_format_option,
                InputFormat.IMAGE: pdf_format_option,
            }
        elif pipeline == ProcessingPipeline.ASR:
            pipeline_options = AsrPipelineOptions(
                # enable_remote_services=enable_remote_services,
                # artifacts_path = artifacts_path
            )
            if asr_model == AsrModelType.WHISPER_TINY:
                pipeline_options.asr_options = WHISPER_TINY
            else:
                pipeline_options.asr_options = WHISPER_TINY
            audio_format_option = AudioFormatOption(
                pipeline_cls=AsrPipeline,
-                pipeline_options=audio_pipeline_options,
+                pipeline_options=pipeline_options,
-                # backend = FIXME
+                backend=AudioBackend,
            )
            format_options: Dict[InputFormat, FormatOption] = {
@ -656,6 +666,7 @@ def convert(  # noqa: C901
        start_time = time.time()
        _log.info(f"paths: {input_doc_paths}")
        conv_results = doc_converter.convert_all(
            input_doc_paths, headers=parsed_headers, raises_on_error=abort_on_error
        )
--- a/docling/datamodel/asr_model_specs.py
+++ b/docling/datamodel/asr_model_specs.py
@ -7,10 +7,10 @@ from pydantic import (
 from docling.datamodel.accelerator_options import AcceleratorDevice
 from docling.datamodel.pipeline_options_asr_model import (
    AsrResponseFormat,
    # ApiAsrOptions,
    InferenceFramework,
    InlineAsrOptions,
    AsrResponseFormat,
    TransformersModelType,
 )
@ -23,5 +23,6 @@ WHISPER_TINY = InlineAsrOptions(
    response_format=AsrResponseFormat.WHISPER,
 )
 class AsrModelType(str, Enum):
    WHISPER_TINY = "whisper_tiny"
--- a/docling/datamodel/pipeline_options_asr_model.py
+++ b/docling/datamodel/pipeline_options_asr_model.py
@ -5,7 +5,11 @@ from pydantic import AnyUrl, BaseModel
 from typing_extensions import deprecated
 from docling.datamodel.accelerator_options import AcceleratorDevice
-from docling.datamodel.pipeline_options_vlm_model import InferenceFramework, TransformersModelType
+from docling.datamodel.pipeline_options_vlm_model import (
    InferenceFramework,
    TransformersModelType,
 )
 class BaseAsrOptions(BaseModel):
    kind: str
@ -46,5 +50,3 @@ class InlineAsrOptions(BaseAsrOptions):
    @property
    def repo_cache_folder(self) -> str:
        return self.repo_id.replace("/", "--")
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@ -11,6 +11,7 @@ from pydantic import BaseModel, ConfigDict, model_validator, validate_call
 from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.asciidoc_backend import AsciiDocBackend
 from docling.backend.audio_backend import AudioBackend
 from docling.backend.csv_backend import CsvDocumentBackend
 from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
 from docling.backend.html_backend import HTMLDocumentBackend
@ -41,6 +42,7 @@ from docling.datamodel.settings import (
    settings,
 )
 from docling.exceptions import ConversionError
 from docling.pipeline.asr_pipeline import AsrPipeline
 from docling.pipeline.base_pipeline import BasePipeline
 from docling.pipeline.simple_pipeline import SimplePipeline
 from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
@ -118,6 +120,11 @@ class PdfFormatOption(FormatOption):
    backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend
 class AudioFormatOption(FormatOption):
    pipeline_cls: Type = AsrPipeline
    backend: Type[AbstractDocumentBackend] = AudioBackend
 def _get_default_option(format: InputFormat) -> FormatOption:
    format_to_default_options = {
        InputFormat.CSV: FormatOption(
@ -156,6 +163,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
        InputFormat.JSON_DOCLING: FormatOption(
            pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
        ),
        InputFormat.AUDIO_WAV: FormatOption(
            pipeline_cls=AsrPipeline, backend=AudioBackend
        ),
    }
    if (options := format_to_default_options.get(format)) is not None:
        return options
--- a/docling/pipeline/asr_pipeline.py
+++ b/docling/pipeline/asr_pipeline.py
@ -1,30 +1,179 @@
 import logging
 import os
 import re
 from io import BytesIO
 from pathlib import Path
 from typing import List, Optional, Union, cast
-from docling.backend.abstract_backend import AbstractDocumentBackend
+import soundfile as sf
 from docling_core.types.doc.labels import DocItemLabel
 from pydantic import BaseModel, Field, validator
 from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.audio_backend import AudioBackend
 from docling.datamodel.base_models import (
    ConversionStatus,
 )
 from docling.datamodel.document import ConversionResult, InputDocument
 from docling.datamodel.pipeline_options import (
    AsrPipelineOptions,
 )
 from docling.datamodel.pipeline_options_asr_model import (
    AsrResponseFormat,
    InlineAsrOptions,
 )
 from docling.datamodel.pipeline_options_vlm_model import (
    InferenceFramework,
 )
 from docling.datamodel.pipeline_options_asr_model import (
    InlineAsrOptions,
    AsrResponseFormat,
 )
 from docling.datamodel.settings import settings
 from docling.pipeline.base_pipeline import BasePipeline
 from docling.utils.profiling import ProfilingScope, TimeRecorder
 from docling.datamodel.document import ConversionResult, InputDocument
 _log = logging.getLogger(__name__)
 class ConversationEntry(BaseModel):
    text: str
    start_time: float = Field(
        ..., ge=0, description="Start time in seconds from video start"
    )
    end_time: float = Field(
        ..., ge=0, description="End time in seconds from video start"
    )
    speaker_id: int = Field(..., ge=0, description="Numeric speaker identifier")
    speaker: Optional[str] = Field(
        None, description="Speaker name, defaults to speaker-{speaker_id}"
    )
    @validator("end_time")
    def end_time_must_be_after_start(cls, v, values):
        if "start_time" in values and v <= values["start_time"]:
            raise ValueError("end_time must be greater than start_time")
        return v
    @validator("speaker", always=True)
    def set_default_speaker_name(cls, v, values):
        if v is None and "speaker_id" in values:
            return f"speaker-{values['speaker_id']}"
        return v
    def __lt__(self, other):
        if not isinstance(other, ConversationEntry):
            return NotImplemented
        return self.start_time < other.start_time
    def __eq__(self, other):
        if not isinstance(other, ConversationEntry):
            return NotImplemented
        return self.start_time == other.start_time
    def to_string(self) -> str:
        """Format the conversation entry as a string"""
        return f"[time: {self.start_time}-{self.end_time}] [speaker:{self.speaker}] {self.text}"
 class _WhisperModel:
    def __init__(self):
        _log.info("initialisation `_WhisperModel`")
        from transformers import WhisperForConditionalGeneration, WhisperProcessor
        self.model_repo = "openai/whisper-tiny"
        self.processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
        self.model = WhisperForConditionalGeneration.from_pretrained(
            "openai/whisper-tiny"
        )
        _log.info(f"model is loaded: {self.model_repo}")
    def run(self, conv_res: ConversionResult):
        # fpath = Path(conv_res.input.file)
        # _log.info(f"`_WhisperModel::run: {conv_res}`")
        _log.info(f"`_WhisperModel::run: {conv_res.input}`")
        _log.info(f"`_WhisperModel::run: {conv_res.input.file}`")
        if os.path.exists(str(conv_res.input.file)):
            print("file exists")
        else:
            print("file does not exist")
        #
        _log.info(f"sampling-rate: {self.processor.feature_extractor.sampling_rate}")
        try:
            fpath = conv_res.input.file
            # array, sampling_rate = sf.read(fpath)#, samplerate=processor.feature_extractor.sampling_rate)
            array, sampling_rate = sf.read(
                fpath
            )  # , samplerate=self.processor.feature_extractor.sampling_rate)
            _log.info(
                f"read the file .. (sampling-rate: {sampling_rate}, array: {array.shape})"
            )
            processed_input = self.processor(
                array,
                sampling_rate=self.processor.feature_extractor.sampling_rate,  # sampling_rate,
                return_tensors="pt",
            )
            print(processed_input)
            # pre-process to get the input features
            input_features = self.processor(
                array, sampling_rate=sampling_rate, return_tensors="pt"
            ).input_features
            _log.info(f"got input-features: {input_features.shape}")
            # generate token ids by running model forward sequentially
            predicted_ids = self.model.generate(
                input_features, max_new_tokens=256, return_timestamps=True
            )
            _log.info("ran model ..")
            """
            transcription = self.processor.batch_decode(predicted_ids,
                                                        skip_special_tokens=False,
                                                        decode_with_timestamps=True)
            _log.info("decoded output ..")
            print(f"Transcription: {transcription}")
            """
            conversation = []
            print("Timestamp info:")
            for pidi, pid in enumerate(predicted_ids):
                # timestamps = processor.tokenizer.decode(pid, decode_with_timestamps=True)
                timestamps = self.processor.tokenizer.decode(pid, output_offsets=True)
                print(f"Predicted id [{pidi}]: {timestamps['text']}")
                for offset in timestamps["offsets"]:
                    print(f" => {offset['timestamp']}: {offset['text']}")
                    item = ConversationEntry(
                        text=offset["text"],
                        speaker_id=pidi,
                        start_time=offset["timestamp"][0],
                        end_time=offset["timestamp"][1],
                    )
                    conv_res.document.add_text(
                        label=DocItemLabel.TEXT, text=item.to_string()
                    )
            conv_res.status = ConversionStatus.SUCCESS
            print("document: \n\n", conv_res.document.export_to_markdown())
        except Exception as exc:
            conv_res.status = ConversionStatus.FAILED
            print(exc)
        return conv_res
 class AsrPipeline(BasePipeline):
    def __init__(self, pipeline_options: AsrPipelineOptions):
        super().__init__(pipeline_options)
@ -44,19 +193,24 @@ class AsrPipeline(BasePipeline):
                "When defined, it must point to a folder containing all models required by the pipeline."
            )
        self._model = _WhisperModel()
    def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
        status = ConversionStatus.SUCCESS
        return status
    @classmethod
    def get_default_options(cls) -> AsrPipelineOptions:
        return AsrPipelineOptions()
    def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
        total_elapsed_time = 0.0
        with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
-            print("do something")
+            _log.info(f"do something: {conv_res.input.file}")
            self._model.run(conv_res=conv_res)
            _log.info(f"finished doing something: {conv_res.input.file}")
        return conv_res
    """
    def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
        status = ConversionStatus()        
        return status
    """
    @classmethod
    def is_backend_supported(cls, backend: AbstractDocumentBackend):
-        return True
+        return isinstance(backend, AudioBackend)
--- a/pyproject.toml
+++ b/pyproject.toml
@ -99,6 +99,9 @@ rapidocr = [
  # 'onnxruntime (>=1.7.0,<2.0.0) ; python_version >= "3.10"',
  # 'onnxruntime (>=1.7.0,<1.20.0) ; python_version < "3.10"',
 ]
 asr = [
    "soundfile>=0.13.1",
 ]
 [dependency-groups]
 dev = [
--- a/uv.lock
+++ b/uv.lock
@ -844,6 +844,9 @@ dependencies = [
 ]
 [package.optional-dependencies]
 asr = [
    { name = "soundfile" },
 ]
 ocrmac = [
    { name = "ocrmac", marker = "sys_platform == 'darwin'" },
 ]
@ -932,12 +935,13 @@ requires-dist = [
    { name = "requests", specifier = ">=2.32.2,<3.0.0" },
    { name = "rtree", specifier = ">=1.3.0,<2.0.0" },
    { name = "scipy", specifier = ">=1.6.0,<2.0.0" },
    { name = "soundfile", marker = "extra == 'asr'", specifier = ">=0.13.1" },
    { name = "tesserocr", marker = "extra == 'tesserocr'", specifier = ">=2.7.1,<3.0.0" },
    { name = "tqdm", specifier = ">=4.65.0,<5.0.0" },
    { name = "transformers", marker = "extra == 'vlm'", specifier = ">=4.46.0,<5.0.0" },
    { name = "typer", specifier = ">=0.12.5,<0.17.0" },
 ]
-provides-extras = ["tesserocr", "ocrmac", "vlm", "rapidocr"]
+provides-extras = ["tesserocr", "ocrmac", "vlm", "rapidocr", "asr"]
 [package.metadata.requires-dev]
 constraints = [
@ -5764,6 +5768,26 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" },
 ]
 [[package]]
 name = "soundfile"
 version = "0.13.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
    { name = "cffi" },
    { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/e1/41/9b873a8c055582859b239be17902a85339bec6a30ad162f98c9b0288a2cc/soundfile-0.13.1.tar.gz", hash = "sha256:b2c68dab1e30297317080a5b43df57e302584c49e2942defdde0acccc53f0e5b", size = 46156, upload-time = "2025-01-25T09:17:04.831Z" }
 wheels = [
    { url = "https://files.pythonhosted.org/packages/64/28/e2a36573ccbcf3d57c00626a21fe51989380636e821b341d36ccca0c1c3a/soundfile-0.13.1-py2.py3-none-any.whl", hash = "sha256:a23c717560da2cf4c7b5ae1142514e0fd82d6bbd9dfc93a50423447142f2c445", size = 25751, upload-time = "2025-01-25T09:16:44.235Z" },
    { url = "https://files.pythonhosted.org/packages/ea/ab/73e97a5b3cc46bba7ff8650a1504348fa1863a6f9d57d7001c6b67c5f20e/soundfile-0.13.1-py2.py3-none-macosx_10_9_x86_64.whl", hash = "sha256:82dc664d19831933fe59adad199bf3945ad06d84bc111a5b4c0d3089a5b9ec33", size = 1142250, upload-time = "2025-01-25T09:16:47.583Z" },
    { url = "https://files.pythonhosted.org/packages/a0/e5/58fd1a8d7b26fc113af244f966ee3aecf03cb9293cb935daaddc1e455e18/soundfile-0.13.1-py2.py3-none-macosx_11_0_arm64.whl", hash = "sha256:743f12c12c4054921e15736c6be09ac26b3b3d603aef6fd69f9dde68748f2593", size = 1101406, upload-time = "2025-01-25T09:16:49.662Z" },
    { url = "https://files.pythonhosted.org/packages/58/ae/c0e4a53d77cf6e9a04179535766b3321b0b9ced5f70522e4caf9329f0046/soundfile-0.13.1-py2.py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:9c9e855f5a4d06ce4213f31918653ab7de0c5a8d8107cd2427e44b42df547deb", size = 1235729, upload-time = "2025-01-25T09:16:53.018Z" },
    { url = "https://files.pythonhosted.org/packages/57/5e/70bdd9579b35003a489fc850b5047beeda26328053ebadc1fb60f320f7db/soundfile-0.13.1-py2.py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:03267c4e493315294834a0870f31dbb3b28a95561b80b134f0bd3cf2d5f0e618", size = 1313646, upload-time = "2025-01-25T09:16:54.872Z" },
    { url = "https://files.pythonhosted.org/packages/fe/df/8c11dc4dfceda14e3003bb81a0d0edcaaf0796dd7b4f826ea3e532146bba/soundfile-0.13.1-py2.py3-none-win32.whl", hash = "sha256:c734564fab7c5ddf8e9be5bf70bab68042cd17e9c214c06e365e20d64f9a69d5", size = 899881, upload-time = "2025-01-25T09:16:56.663Z" },
    { url = "https://files.pythonhosted.org/packages/14/e9/6b761de83277f2f02ded7e7ea6f07828ec78e4b229b80e4ca55dd205b9dc/soundfile-0.13.1-py2.py3-none-win_amd64.whl", hash = "sha256:1e70a05a0626524a69e9f0f4dd2ec174b4e9567f4d8b6c11d38b5c289be36ee9", size = 1019162, upload-time = "2025-01-25T09:16:59.573Z" },
 ]
 [[package]]
 name = "soupsieve"
 version = "2.7"