Replace AudioBackend with DummyBackend

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2025-07-26 20:14:47 +00:00 · 2025-06-23 09:55:59 +02:00 · 2025-06-23 09:55:59 +02:00 · b43aef2eb5
commit b43aef2eb5
parent caf18e634b
4 changed files with 39 additions and 57 deletions
--- a/docling/backend/audio_backend.py
+++ b/docling/backend/audio_backend.py
@ -1,43 +1,43 @@
 import logging
-import warnings
+from io import BytesIO
 from io import BytesIO, StringIO
 from pathlib import Path
 from typing import Set, Union
-from docling_core.types.doc import (
+from docling.backend.abstract_backend import AbstractDocumentBackend
    DoclingDocument,
    DocumentOrigin,
 )
 from docling.backend.abstract_backend import DeclarativeDocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import InputDocument
 _log = logging.getLogger(__name__)
-class AudioBackend(DeclarativeDocumentBackend):
+class DummyBackend(AbstractDocumentBackend):
-    # content: StringIO
+    """
    A dummy backend that only validates input existence.
    Used e.g. for audio files where actual processing is handled by the ASR pipeline.
    """
    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        super().__init__(in_doc, path_or_stream)
-        _log.info(f"path: {path_or_stream}")
+        _log.debug(f"DummyBackend initialized for: {path_or_stream}")
-        # Load content
+        # Validate input
        try:
            if isinstance(self.path_or_stream, BytesIO):
-                _log.info(f"reading streaming: {self.path_or_stream}")
+                # Check if stream has content
-                # self.content = StringIO(self.path_or_stream.getvalue().decode("utf-8"))
+                self.valid = len(self.path_or_stream.getvalue()) > 0
                _log.debug(
                    f"BytesIO stream length: {len(self.path_or_stream.getvalue())}"
                )
            elif isinstance(self.path_or_stream, Path):
-                _log.info(f"reading file: {self.path_or_stream}")
+                # Check if file exists
-                # self.content = StringIO(self.path_or_stream.read())
+                self.valid = self.path_or_stream.exists()
-            self.valid = True
+                _log.debug(f"File exists: {self.valid}")
            else:
                self.valid = False
        except Exception as e:
-            raise RuntimeError(
+            _log.error(f"DummyBackend validation failed: {e}")
-                f"AudioBackend could not load document with hash {self.document_hash}"
+            self.valid = False
            ) from e
        return
    def is_valid(self) -> bool:
        return self.valid
@ -46,35 +46,6 @@ class AudioBackend(DeclarativeDocumentBackend):
    def supports_pagination(cls) -> bool:
        return False
    def unload(self):
        if isinstance(self.path_or_stream, BytesIO):
            self.path_or_stream.close()
        self.path_or_stream = None
    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        return {InputFormat.AUDIO}
    def convert(self) -> DoclingDocument:
        """
        Parses the audio file into a structured document model.
        """
        # Parse the CSV into a structured document model
        origin = DocumentOrigin(
            filename=self.file.name or "audio.wav",
            mimetype="audio/wav",
            binary_hash=self.document_hash,
        )
        _log.info(f"origin: {origin}")
        doc = DoclingDocument(name=self.file.stem or "audio.wav", origin=origin)
        if self.is_valid():
            _log.error("time to get going ...")
        else:
            raise RuntimeError(
                f"Cannot convert doc with {self.document_hash} because the audio backend failed to init."
            )
        return doc
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@ -23,7 +23,7 @@ from docling_core.utils.file import resolve_source_to_path
 from pydantic import TypeAdapter
 from rich.console import Console
-from docling.backend.audio_backend import AudioBackend
+from docling.backend.audio_backend import DummyBackend
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
 from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
@ -665,7 +665,6 @@ def convert(  # noqa: C901
            audio_format_option = AudioFormatOption(
                pipeline_cls=AsrPipeline,
                pipeline_options=pipeline_options,
                backend=AudioBackend,
            )
            format_options = {
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@ -11,7 +11,7 @@ from pydantic import BaseModel, ConfigDict, model_validator, validate_call
 from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.asciidoc_backend import AsciiDocBackend
-from docling.backend.audio_backend import AudioBackend
+from docling.backend.audio_backend import DummyBackend
 from docling.backend.csv_backend import CsvDocumentBackend
 from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
 from docling.backend.html_backend import HTMLDocumentBackend
@ -122,7 +122,7 @@ class PdfFormatOption(FormatOption):
 class AudioFormatOption(FormatOption):
    pipeline_cls: Type = AsrPipeline
-    backend: Type[AbstractDocumentBackend] = AudioBackend
+    backend: Type[AbstractDocumentBackend] = DummyBackend
 def _get_default_option(format: InputFormat) -> FormatOption:
@ -163,7 +163,7 @@ def _get_default_option(format: InputFormat) -> FormatOption:
        InputFormat.JSON_DOCLING: FormatOption(
            pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
        ),
-        InputFormat.AUDIO: FormatOption(pipeline_cls=AsrPipeline, backend=AudioBackend),
+        InputFormat.AUDIO: FormatOption(pipeline_cls=AsrPipeline, backend=DummyBackend),
    }
    if (options := format_to_default_options.get(format)) is not None:
        return options
--- a/docling/pipeline/asr_pipeline.py
+++ b/docling/pipeline/asr_pipeline.py
@ -5,6 +5,8 @@ from io import BytesIO
 from pathlib import Path
 from typing import List, Optional, Union, cast
 from docling_core.types.doc import DoclingDocument, DocumentOrigin
 # import whisper  # type: ignore
 # import librosa
 # import numpy as np
@ -13,7 +15,7 @@ from docling_core.types.doc.labels import DocItemLabel
 from pydantic import BaseModel, Field, validator
 from docling.backend.abstract_backend import AbstractDocumentBackend
-from docling.backend.audio_backend import AudioBackend
+from docling.backend.audio_backend import DummyBackend
 # from pydub import AudioSegment  # type: ignore
 # from transformers import WhisperForConditionalGeneration, WhisperProcessor, pipeline
@ -149,6 +151,16 @@ class _NativeWhisperModel:
        try:
            conversation = self.transcribe(audio_path)
            # Ensure we have a proper DoclingDocument
            origin = DocumentOrigin(
                filename=conv_res.input.file.name or "audio.wav",
                mimetype="audio/wav",
                binary_hash=conv_res.input.document_hash,
            )
            conv_res.document = DoclingDocument(
                name=conv_res.input.file.stem or "audio.wav", origin=origin
            )
            for _ in conversation:
                conv_res.document.add_text(label=DocItemLabel.TEXT, text=_.to_string())
@ -235,4 +247,4 @@ class AsrPipeline(BasePipeline):
    @classmethod
    def is_backend_supported(cls, backend: AbstractDocumentBackend):
-        return isinstance(backend, AudioBackend)
+        return isinstance(backend, DummyBackend)