AudioBackend -> DummyBackend

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2025-06-23 09:55:59 +02:00
parent caf18e634b
commit b43aef2eb5
4 changed files with 39 additions and 57 deletions

View File

@ -1,43 +1,43 @@
import logging
import warnings
from io import BytesIO, StringIO
from io import BytesIO
from pathlib import Path
from typing import Set, Union
from docling_core.types.doc import (
DoclingDocument,
DocumentOrigin,
)
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
class AudioBackend(DeclarativeDocumentBackend):
# content: StringIO
class DummyBackend(AbstractDocumentBackend):
"""
A dummy backend that only validates input existence.
Used e.g. for audio files where actual processing is handled by the ASR pipeline.
"""
# Constructor: forwards in_doc / path_or_stream to the base class, then records
# in self.valid whether the input looks usable.
# NOTE(review): this listing appears to be a rendered diff (see the
# "@ -1,43 +1,43 @" hunk header) with the +/- markers stripped, so several
# adjacent lines look like before/after versions of the same statement rather
# than code that runs in sequence — confirm against the actual commit.
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream)
_log.info(f"path: {path_or_stream}")
_log.debug(f"DummyBackend initialized for: {path_or_stream}")
# Load content
# Validate input
try:
if isinstance(self.path_or_stream, BytesIO):
_log.info(f"reading streaming: {self.path_or_stream}")
# self.content = StringIO(self.path_or_stream.getvalue().decode("utf-8"))
# Check if stream has content
# An in-memory stream counts as valid only when it holds at least one byte.
self.valid = len(self.path_or_stream.getvalue()) > 0
_log.debug(
f"BytesIO stream length: {len(self.path_or_stream.getvalue())}"
)
elif isinstance(self.path_or_stream, Path):
_log.info(f"reading file: {self.path_or_stream}")
# self.content = StringIO(self.path_or_stream.read())
# NOTE(review): an unconditional True followed by an exists() check —
# presumably the old and new versions of the same assignment; confirm.
self.valid = True
# Check if file exists
self.valid = self.path_or_stream.exists()
_log.debug(f"File exists: {self.valid}")
else:
# Anything that is neither a BytesIO nor a Path is treated as invalid.
self.valid = False
except Exception as e:
# NOTE(review): two handlers are shown here — one re-raises as RuntimeError,
# the other logs the failure and marks the backend invalid. Presumably one
# replaces the other; confirm which is current.
raise RuntimeError(
f"AudioBackend could not load document with hash {self.document_hash}"
) from e
return
_log.error(f"DummyBackend validation failed: {e}")
self.valid = False
# Report the validity flag stored in self.valid.
def is_valid(self) -> bool:
return self.valid
@ -46,35 +46,6 @@ class AudioBackend(DeclarativeDocumentBackend):
def supports_pagination(cls) -> bool:
return False
# Release the input held by this backend.
def unload(self):
if isinstance(self.path_or_stream, BytesIO):
# An in-memory stream is closed explicitly.
self.path_or_stream.close()
# NOTE(review): indentation is lost in this listing; presumably the reference
# is cleared for both streams and paths, not only inside the branch above —
# confirm.
self.path_or_stream = None
# Declares the set of input formats this backend accepts: audio only.
@classmethod
def supported_formats(cls) -> Set[InputFormat]:
return {InputFormat.AUDIO}
def convert(self) -> DoclingDocument:
"""
Parses the audio file into a structured document model.
"""
# Describe where the document came from. The filename falls back to
# "audio.wav" and the mimetype is hard-coded to "audio/wav".
origin = DocumentOrigin(
filename=self.file.name or "audio.wav",
mimetype="audio/wav",
binary_hash=self.document_hash,
)
_log.info(f"origin: {origin}")
# Only an empty DoclingDocument shell is created here; nothing in this method
# reads or transcribes the audio itself.
doc = DoclingDocument(name=self.file.stem or "audio.wav", origin=origin)
if self.is_valid():
# NOTE(review): this is the success path, yet it logs at error level —
# looks like leftover debug output; confirm.
_log.error("time to get going ...")
else:
# Conversion is refused when is_valid() reports the input as unusable.
raise RuntimeError(
f"Cannot convert doc with {self.document_hash} because the audio backend failed to init."
)
return doc

View File

@ -23,7 +23,7 @@ from docling_core.utils.file import resolve_source_to_path
from pydantic import TypeAdapter
from rich.console import Console
from docling.backend.audio_backend import AudioBackend
from docling.backend.audio_backend import DummyBackend
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
@ -665,7 +665,6 @@ def convert( # noqa: C901
audio_format_option = AudioFormatOption(
pipeline_cls=AsrPipeline,
pipeline_options=pipeline_options,
backend=AudioBackend,
)
format_options = {

View File

@ -11,7 +11,7 @@ from pydantic import BaseModel, ConfigDict, model_validator, validate_call
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.asciidoc_backend import AsciiDocBackend
from docling.backend.audio_backend import AudioBackend
from docling.backend.audio_backend import DummyBackend
from docling.backend.csv_backend import CsvDocumentBackend
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.backend.html_backend import HTMLDocumentBackend
@ -122,7 +122,7 @@ class PdfFormatOption(FormatOption):
# Format option for audio inputs: pairs the AsrPipeline with a default backend.
# NOTE(review): two `backend` defaults are shown; in a class body the later
# assignment would win. Given the "AudioBackend -> DummyBackend" rename these
# are presumably the old and new versions of one line — confirm.
class AudioFormatOption(FormatOption):
pipeline_cls: Type = AsrPipeline
backend: Type[AbstractDocumentBackend] = AudioBackend
backend: Type[AbstractDocumentBackend] = DummyBackend
def _get_default_option(format: InputFormat) -> FormatOption:
@ -163,7 +163,7 @@ def _get_default_option(format: InputFormat) -> FormatOption:
InputFormat.JSON_DOCLING: FormatOption(
pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
),
InputFormat.AUDIO: FormatOption(pipeline_cls=AsrPipeline, backend=AudioBackend),
InputFormat.AUDIO: FormatOption(pipeline_cls=AsrPipeline, backend=DummyBackend),
}
if (options := format_to_default_options.get(format)) is not None:
return options

View File

@ -5,6 +5,8 @@ from io import BytesIO
from pathlib import Path
from typing import List, Optional, Union, cast
from docling_core.types.doc import DoclingDocument, DocumentOrigin
# import whisper # type: ignore
# import librosa
# import numpy as np
@ -13,7 +15,7 @@ from docling_core.types.doc.labels import DocItemLabel
from pydantic import BaseModel, Field, validator
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.audio_backend import AudioBackend
from docling.backend.audio_backend import DummyBackend
# from pydub import AudioSegment # type: ignore
# from transformers import WhisperForConditionalGeneration, WhisperProcessor, pipeline
@ -149,6 +151,16 @@ class _NativeWhisperModel:
try:
conversation = self.transcribe(audio_path)
# Ensure we have a proper DoclingDocument
origin = DocumentOrigin(
filename=conv_res.input.file.name or "audio.wav",
mimetype="audio/wav",
binary_hash=conv_res.input.document_hash,
)
conv_res.document = DoclingDocument(
name=conv_res.input.file.stem or "audio.wav", origin=origin
)
for _ in conversation:
conv_res.document.add_text(label=DocItemLabel.TEXT, text=_.to_string())
@ -235,4 +247,4 @@ class AsrPipeline(BasePipeline):
# Tells whether the given backend instance may be used with this pipeline,
# decided by an isinstance check on the backend class.
# NOTE(review): two return statements are shown; as written only the first can
# execute. Given the "AudioBackend -> DummyBackend" rename, the second is
# presumably the intended replacement — confirm.
@classmethod
def is_backend_supported(cls, backend: AbstractDocumentBackend):
return isinstance(backend, AudioBackend)
return isinstance(backend, DummyBackend)