AudioBackend -> DummyBackend

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2025-07-26 20:14:47 +00:00 · 2025-06-23 09:55:59 +02:00 · 2025-06-23 09:55:59 +02:00 · b43aef2eb5
commit b43aef2eb5
parent caf18e634b
4 changed files with 39 additions and 57 deletions
--- a/docling/backend/audio_backend.py
+++ b/docling/backend/audio_backend.py
@ -1,43 +1,43 @@
 import logging
-import warnings
-from io import BytesIO, StringIO
+from io import BytesIO
 from pathlib import Path
 from typing import Set, Union

-from docling_core.types.doc import (
-    DoclingDocument,
-    DocumentOrigin,
-)
-
-from docling.backend.abstract_backend import DeclarativeDocumentBackend
+from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import InputDocument

 _log = logging.getLogger(__name__)


-class AudioBackend(DeclarativeDocumentBackend):
-    # content: StringIO
+class DummyBackend(AbstractDocumentBackend):
+    """
+    A dummy backend that only checks the input (path exists / stream is non-empty).
+    Used e.g. for audio files where actual processing is handled by the ASR pipeline.
+    """

    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        super().__init__(in_doc, path_or_stream)

-        _log.info(f"path: {path_or_stream}")
+        _log.debug(f"DummyBackend initialized for: {path_or_stream}")

-        # Load content
+        # Validate input
        try:
            if isinstance(self.path_or_stream, BytesIO):
-                _log.info(f"reading streaming: {self.path_or_stream}")
-                # self.content = StringIO(self.path_or_stream.getvalue().decode("utf-8"))
+                # Check if stream has content
+                self.valid = len(self.path_or_stream.getvalue()) > 0
+                _log.debug(
+                    f"BytesIO stream length: {len(self.path_or_stream.getvalue())}"
+                )
            elif isinstance(self.path_or_stream, Path):
-                _log.info(f"reading file: {self.path_or_stream}")
-                # self.content = StringIO(self.path_or_stream.read())
-            self.valid = True
+                # Check if file exists
+                self.valid = self.path_or_stream.exists()
+                _log.debug(f"File exists: {self.valid}")
+            else:
+                self.valid = False
        except Exception as e:
-            raise RuntimeError(
-                f"AudioBackend could not load document with hash {self.document_hash}"
-            ) from e
-        return
+            _log.error(f"DummyBackend validation failed: {e}")
+            self.valid = False

    def is_valid(self) -> bool:
        return self.valid
@ -46,35 +46,6 @@ class AudioBackend(DeclarativeDocumentBackend):
    def supports_pagination(cls) -> bool:
        return False

-    def unload(self):
-        if isinstance(self.path_or_stream, BytesIO):
-            self.path_or_stream.close()
-        self.path_or_stream = None
-
    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        return {InputFormat.AUDIO}
-
-    def convert(self) -> DoclingDocument:
-        """
-        Parses the audio file into a structured document model.
-        """
-
-        # Parse the CSV into a structured document model
-        origin = DocumentOrigin(
-            filename=self.file.name or "audio.wav",
-            mimetype="audio/wav",
-            binary_hash=self.document_hash,
-        )
-        _log.info(f"origin: {origin}")
-
-        doc = DoclingDocument(name=self.file.stem or "audio.wav", origin=origin)
-
-        if self.is_valid():
-            _log.error("time to get going ...")
-        else:
-            raise RuntimeError(
-                f"Cannot convert doc with {self.document_hash} because the audio backend failed to init."
-            )
-
-        return doc
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@ -23,7 +23,7 @@ from docling_core.utils.file import resolve_source_to_path
 from pydantic import TypeAdapter
 from rich.console import Console

-from docling.backend.audio_backend import AudioBackend
+from docling.backend.audio_backend import DummyBackend
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
 from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
@ -665,7 +665,6 @@ def convert(  # noqa: C901
            audio_format_option = AudioFormatOption(
                pipeline_cls=AsrPipeline,
                pipeline_options=pipeline_options,
-                backend=AudioBackend,
            )

            format_options = {
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@ -11,7 +11,7 @@ from pydantic import BaseModel, ConfigDict, model_validator, validate_call

 from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.asciidoc_backend import AsciiDocBackend
-from docling.backend.audio_backend import AudioBackend
+from docling.backend.audio_backend import DummyBackend
 from docling.backend.csv_backend import CsvDocumentBackend
 from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
 from docling.backend.html_backend import HTMLDocumentBackend
@ -122,7 +122,7 @@ class PdfFormatOption(FormatOption):

 class AudioFormatOption(FormatOption):
    pipeline_cls: Type = AsrPipeline
-    backend: Type[AbstractDocumentBackend] = AudioBackend
+    backend: Type[AbstractDocumentBackend] = DummyBackend


 def _get_default_option(format: InputFormat) -> FormatOption:
@ -163,7 +163,7 @@ def _get_default_option(format: InputFormat) -> FormatOption:
        InputFormat.JSON_DOCLING: FormatOption(
            pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
        ),
-        InputFormat.AUDIO: FormatOption(pipeline_cls=AsrPipeline, backend=AudioBackend),
+        InputFormat.AUDIO: FormatOption(pipeline_cls=AsrPipeline, backend=DummyBackend),
    }
    if (options := format_to_default_options.get(format)) is not None:
        return options
--- a/docling/pipeline/asr_pipeline.py
+++ b/docling/pipeline/asr_pipeline.py
@ -5,6 +5,8 @@ from io import BytesIO
 from pathlib import Path
 from typing import List, Optional, Union, cast

+from docling_core.types.doc import DoclingDocument, DocumentOrigin
+
 # import whisper  # type: ignore
 # import librosa
 # import numpy as np
@ -13,7 +15,7 @@ from docling_core.types.doc.labels import DocItemLabel
 from pydantic import BaseModel, Field, validator

 from docling.backend.abstract_backend import AbstractDocumentBackend
-from docling.backend.audio_backend import AudioBackend
+from docling.backend.audio_backend import DummyBackend

 # from pydub import AudioSegment  # type: ignore
 # from transformers import WhisperForConditionalGeneration, WhisperProcessor, pipeline
@ -149,6 +151,16 @@ class _NativeWhisperModel:
        try:
            conversation = self.transcribe(audio_path)

+            # Create a fresh DoclingDocument to hold the transcription
+            origin = DocumentOrigin(
+                filename=conv_res.input.file.name or "audio.wav",
+                mimetype="audio/wav",
+                binary_hash=conv_res.input.document_hash,
+            )
+            conv_res.document = DoclingDocument(
+                name=conv_res.input.file.stem or "audio.wav", origin=origin
+            )
+
            for _ in conversation:
                conv_res.document.add_text(label=DocItemLabel.TEXT, text=_.to_string())

@ -235,4 +247,4 @@ class AsrPipeline(BasePipeline):

    @classmethod
    def is_backend_supported(cls, backend: AbstractDocumentBackend):
-        return isinstance(backend, AudioBackend)
+        return isinstance(backend, DummyBackend)