From b43aef2eb58caabc2f882e512d8f2b0fb07a26ab Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Mon, 23 Jun 2025 09:55:59 +0200 Subject: [PATCH] AudioBackend -> DummyBackend Signed-off-by: Christoph Auer --- docling/backend/audio_backend.py | 71 ++++++++++---------------------- docling/cli/main.py | 3 +- docling/document_converter.py | 6 +-- docling/pipeline/asr_pipeline.py | 16 ++++++- 4 files changed, 39 insertions(+), 57 deletions(-) diff --git a/docling/backend/audio_backend.py b/docling/backend/audio_backend.py index d0958276..87552aed 100644 --- a/docling/backend/audio_backend.py +++ b/docling/backend/audio_backend.py @@ -1,43 +1,43 @@ import logging -import warnings -from io import BytesIO, StringIO +from io import BytesIO from pathlib import Path from typing import Set, Union -from docling_core.types.doc import ( - DoclingDocument, - DocumentOrigin, -) - -from docling.backend.abstract_backend import DeclarativeDocumentBackend +from docling.backend.abstract_backend import AbstractDocumentBackend from docling.datamodel.base_models import InputFormat from docling.datamodel.document import InputDocument _log = logging.getLogger(__name__) -class AudioBackend(DeclarativeDocumentBackend): - # content: StringIO +class DummyBackend(AbstractDocumentBackend): + """ + A dummy backend that only validates input existence. + Used e.g. for audio files where actual processing is handled by the ASR pipeline. + """ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): super().__init__(in_doc, path_or_stream) - _log.info(f"path: {path_or_stream}") + _log.debug(f"DummyBackend initialized for: {path_or_stream}") - # Load content + # Validate input try: if isinstance(self.path_or_stream, BytesIO): - _log.info(f"reading streaming: {self.path_or_stream}") - # self.content = StringIO(self.path_or_stream.getvalue().decode("utf-8")) + # Check if stream has content + self.valid = len(self.path_or_stream.getvalue()) > 0 + _log.debug( + f"BytesIO stream length: {len(self.path_or_stream.getvalue())}" + ) elif isinstance(self.path_or_stream, Path): - _log.info(f"reading file: {self.path_or_stream}") - # self.content = StringIO(self.path_or_stream.read()) - self.valid = True + # Check if file exists + self.valid = self.path_or_stream.exists() + _log.debug(f"File exists: {self.valid}") + else: + self.valid = False except Exception as e: - raise RuntimeError( - f"AudioBackend could not load document with hash {self.document_hash}" - ) from e - return + _log.error(f"DummyBackend validation failed: {e}") + self.valid = False def is_valid(self) -> bool: return self.valid @@ -46,35 +46,6 @@ class AudioBackend(DeclarativeDocumentBackend): def supports_pagination(cls) -> bool: return False - def unload(self): - if isinstance(self.path_or_stream, BytesIO): - self.path_or_stream.close() - self.path_or_stream = None - @classmethod def supported_formats(cls) -> Set[InputFormat]: return {InputFormat.AUDIO} - - def convert(self) -> DoclingDocument: - """ - Parses the audio file into a structured document model. - """ - - # Parse the CSV into a structured document model - origin = DocumentOrigin( - filename=self.file.name or "audio.wav", - mimetype="audio/wav", - binary_hash=self.document_hash, - ) - _log.info(f"origin: {origin}") - - doc = DoclingDocument(name=self.file.stem or "audio.wav", origin=origin) - - if self.is_valid(): - _log.error("time to get going ...") - else: - raise RuntimeError( - f"Cannot convert doc with {self.document_hash} because the audio backend failed to init." - ) - - return doc diff --git a/docling/cli/main.py b/docling/cli/main.py index 34b6a14d..dfc8069e 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -23,7 +23,7 @@ from docling_core.utils.file import resolve_source_to_path from pydantic import TypeAdapter from rich.console import Console -from docling.backend.audio_backend import AudioBackend +from docling.backend.audio_backend import DummyBackend from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend @@ -665,7 +665,6 @@ def convert( # noqa: C901 audio_format_option = AudioFormatOption( pipeline_cls=AsrPipeline, pipeline_options=pipeline_options, - backend=AudioBackend, ) format_options = { diff --git a/docling/document_converter.py b/docling/document_converter.py index 4336c9bf..2f3ba062 100644 --- a/docling/document_converter.py +++ b/docling/document_converter.py @@ -11,7 +11,7 @@ from pydantic import BaseModel, ConfigDict, model_validator, validate_call from docling.backend.abstract_backend import AbstractDocumentBackend from docling.backend.asciidoc_backend import AsciiDocBackend -from docling.backend.audio_backend import AudioBackend +from docling.backend.audio_backend import DummyBackend from docling.backend.csv_backend import CsvDocumentBackend from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend from docling.backend.html_backend import HTMLDocumentBackend @@ -122,7 +122,7 @@ class PdfFormatOption(FormatOption): class AudioFormatOption(FormatOption): pipeline_cls: Type = AsrPipeline - backend: Type[AbstractDocumentBackend] = AudioBackend + backend: Type[AbstractDocumentBackend] = DummyBackend def _get_default_option(format: InputFormat) -> FormatOption: @@ -163,7 +163,7 @@ def _get_default_option(format: InputFormat) -> FormatOption: InputFormat.JSON_DOCLING: FormatOption( pipeline_cls=SimplePipeline, backend=DoclingJSONBackend ), - InputFormat.AUDIO: FormatOption(pipeline_cls=AsrPipeline, backend=AudioBackend), + InputFormat.AUDIO: FormatOption(pipeline_cls=AsrPipeline, backend=DummyBackend), } if (options := format_to_default_options.get(format)) is not None: return options diff --git a/docling/pipeline/asr_pipeline.py b/docling/pipeline/asr_pipeline.py index c23c4754..8d444daa 100644 --- a/docling/pipeline/asr_pipeline.py +++ b/docling/pipeline/asr_pipeline.py @@ -5,6 +5,8 @@ from io import BytesIO from pathlib import Path from typing import List, Optional, Union, cast +from docling_core.types.doc import DoclingDocument, DocumentOrigin + # import whisper # type: ignore # import librosa # import numpy as np @@ -13,7 +15,7 @@ from docling_core.types.doc.labels import DocItemLabel from pydantic import BaseModel, Field, validator from docling.backend.abstract_backend import AbstractDocumentBackend -from docling.backend.audio_backend import AudioBackend +from docling.backend.audio_backend import DummyBackend # from pydub import AudioSegment # type: ignore # from transformers import WhisperForConditionalGeneration, WhisperProcessor, pipeline @@ -149,6 +151,16 @@ class _NativeWhisperModel: try: conversation = self.transcribe(audio_path) + # Ensure we have a proper DoclingDocument + origin = DocumentOrigin( + filename=conv_res.input.file.name or "audio.wav", + mimetype="audio/wav", + binary_hash=conv_res.input.document_hash, + ) + conv_res.document = DoclingDocument( + name=conv_res.input.file.stem or "audio.wav", origin=origin + ) + for _ in conversation: conv_res.document.add_text(label=DocItemLabel.TEXT, text=_.to_string()) @@ -235,4 +247,4 @@ class AsrPipeline(BasePipeline): @classmethod def is_backend_supported(cls, backend: AbstractDocumentBackend): - return isinstance(backend, AudioBackend) + return isinstance(backend, DummyBackend)