AudioBackend -> DummyBackend

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2025-06-23 09:55:59 +02:00
parent caf18e634b
commit b43aef2eb5
4 changed files with 39 additions and 57 deletions

View File

@ -1,43 +1,43 @@
import logging import logging
import warnings from io import BytesIO
from io import BytesIO, StringIO
from pathlib import Path from pathlib import Path
from typing import Set, Union from typing import Set, Union
from docling_core.types.doc import ( from docling.backend.abstract_backend import AbstractDocumentBackend
DoclingDocument,
DocumentOrigin,
)
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
class AudioBackend(DeclarativeDocumentBackend): class DummyBackend(AbstractDocumentBackend):
# content: StringIO """
A dummy backend that only validates input existence.
Used e.g. for audio files where actual processing is handled by the ASR pipeline.
"""
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream) super().__init__(in_doc, path_or_stream)
_log.info(f"path: {path_or_stream}") _log.debug(f"DummyBackend initialized for: {path_or_stream}")
# Load content # Validate input
try: try:
if isinstance(self.path_or_stream, BytesIO): if isinstance(self.path_or_stream, BytesIO):
_log.info(f"reading streaming: {self.path_or_stream}") # Check if stream has content
# self.content = StringIO(self.path_or_stream.getvalue().decode("utf-8")) self.valid = len(self.path_or_stream.getvalue()) > 0
_log.debug(
f"BytesIO stream length: {len(self.path_or_stream.getvalue())}"
)
elif isinstance(self.path_or_stream, Path): elif isinstance(self.path_or_stream, Path):
_log.info(f"reading file: {self.path_or_stream}") # Check if file exists
# self.content = StringIO(self.path_or_stream.read()) self.valid = self.path_or_stream.exists()
self.valid = True _log.debug(f"File exists: {self.valid}")
else:
self.valid = False
except Exception as e: except Exception as e:
raise RuntimeError( _log.error(f"DummyBackend validation failed: {e}")
f"AudioBackend could not load document with hash {self.document_hash}" self.valid = False
) from e
return
def is_valid(self) -> bool: def is_valid(self) -> bool:
return self.valid return self.valid
@ -46,35 +46,6 @@ class AudioBackend(DeclarativeDocumentBackend):
def supports_pagination(cls) -> bool: def supports_pagination(cls) -> bool:
return False return False
def unload(self):
if isinstance(self.path_or_stream, BytesIO):
self.path_or_stream.close()
self.path_or_stream = None
@classmethod @classmethod
def supported_formats(cls) -> Set[InputFormat]: def supported_formats(cls) -> Set[InputFormat]:
return {InputFormat.AUDIO} return {InputFormat.AUDIO}
def convert(self) -> DoclingDocument:
"""
Parses the audio file into a structured document model.
"""
# Parse the CSV into a structured document model
origin = DocumentOrigin(
filename=self.file.name or "audio.wav",
mimetype="audio/wav",
binary_hash=self.document_hash,
)
_log.info(f"origin: {origin}")
doc = DoclingDocument(name=self.file.stem or "audio.wav", origin=origin)
if self.is_valid():
_log.error("time to get going ...")
else:
raise RuntimeError(
f"Cannot convert doc with {self.document_hash} because the audio backend failed to init."
)
return doc

View File

@ -23,7 +23,7 @@ from docling_core.utils.file import resolve_source_to_path
from pydantic import TypeAdapter from pydantic import TypeAdapter
from rich.console import Console from rich.console import Console
from docling.backend.audio_backend import AudioBackend from docling.backend.audio_backend import DummyBackend
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
@ -665,7 +665,6 @@ def convert( # noqa: C901
audio_format_option = AudioFormatOption( audio_format_option = AudioFormatOption(
pipeline_cls=AsrPipeline, pipeline_cls=AsrPipeline,
pipeline_options=pipeline_options, pipeline_options=pipeline_options,
backend=AudioBackend,
) )
format_options = { format_options = {

View File

@ -11,7 +11,7 @@ from pydantic import BaseModel, ConfigDict, model_validator, validate_call
from docling.backend.abstract_backend import AbstractDocumentBackend from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.asciidoc_backend import AsciiDocBackend from docling.backend.asciidoc_backend import AsciiDocBackend
from docling.backend.audio_backend import AudioBackend from docling.backend.audio_backend import DummyBackend
from docling.backend.csv_backend import CsvDocumentBackend from docling.backend.csv_backend import CsvDocumentBackend
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.backend.html_backend import HTMLDocumentBackend from docling.backend.html_backend import HTMLDocumentBackend
@ -122,7 +122,7 @@ class PdfFormatOption(FormatOption):
class AudioFormatOption(FormatOption): class AudioFormatOption(FormatOption):
pipeline_cls: Type = AsrPipeline pipeline_cls: Type = AsrPipeline
backend: Type[AbstractDocumentBackend] = AudioBackend backend: Type[AbstractDocumentBackend] = DummyBackend
def _get_default_option(format: InputFormat) -> FormatOption: def _get_default_option(format: InputFormat) -> FormatOption:
@ -163,7 +163,7 @@ def _get_default_option(format: InputFormat) -> FormatOption:
InputFormat.JSON_DOCLING: FormatOption( InputFormat.JSON_DOCLING: FormatOption(
pipeline_cls=SimplePipeline, backend=DoclingJSONBackend pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
), ),
InputFormat.AUDIO: FormatOption(pipeline_cls=AsrPipeline, backend=AudioBackend), InputFormat.AUDIO: FormatOption(pipeline_cls=AsrPipeline, backend=DummyBackend),
} }
if (options := format_to_default_options.get(format)) is not None: if (options := format_to_default_options.get(format)) is not None:
return options return options

View File

@ -5,6 +5,8 @@ from io import BytesIO
from pathlib import Path from pathlib import Path
from typing import List, Optional, Union, cast from typing import List, Optional, Union, cast
from docling_core.types.doc import DoclingDocument, DocumentOrigin
# import whisper # type: ignore # import whisper # type: ignore
# import librosa # import librosa
# import numpy as np # import numpy as np
@ -13,7 +15,7 @@ from docling_core.types.doc.labels import DocItemLabel
from pydantic import BaseModel, Field, validator from pydantic import BaseModel, Field, validator
from docling.backend.abstract_backend import AbstractDocumentBackend from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.audio_backend import AudioBackend from docling.backend.audio_backend import DummyBackend
# from pydub import AudioSegment # type: ignore # from pydub import AudioSegment # type: ignore
# from transformers import WhisperForConditionalGeneration, WhisperProcessor, pipeline # from transformers import WhisperForConditionalGeneration, WhisperProcessor, pipeline
@ -149,6 +151,16 @@ class _NativeWhisperModel:
try: try:
conversation = self.transcribe(audio_path) conversation = self.transcribe(audio_path)
# Ensure we have a proper DoclingDocument
origin = DocumentOrigin(
filename=conv_res.input.file.name or "audio.wav",
mimetype="audio/wav",
binary_hash=conv_res.input.document_hash,
)
conv_res.document = DoclingDocument(
name=conv_res.input.file.stem or "audio.wav", origin=origin
)
for _ in conversation: for _ in conversation:
conv_res.document.add_text(label=DocItemLabel.TEXT, text=_.to_string()) conv_res.document.add_text(label=DocItemLabel.TEXT, text=_.to_string())
@ -235,4 +247,4 @@ class AsrPipeline(BasePipeline):
@classmethod @classmethod
def is_backend_supported(cls, backend: AbstractDocumentBackend): def is_backend_supported(cls, backend: AbstractDocumentBackend):
return isinstance(backend, AudioBackend) return isinstance(backend, DummyBackend)