mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
AudioBackend -> DummyBackend
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
caf18e634b
commit
b43aef2eb5
@ -1,43 +1,43 @@
|
||||
import logging
|
||||
import warnings
|
||||
from io import BytesIO, StringIO
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Set, Union
|
||||
|
||||
from docling_core.types.doc import (
|
||||
DoclingDocument,
|
||||
DocumentOrigin,
|
||||
)
|
||||
|
||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class AudioBackend(DeclarativeDocumentBackend):
|
||||
# content: StringIO
|
||||
class DummyBackend(AbstractDocumentBackend):
|
||||
"""
|
||||
A dummy backend that only validates input existence.
|
||||
Used e.g. for audio files where actual processing is handled by the ASR pipeline.
|
||||
"""
|
||||
|
||||
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
||||
super().__init__(in_doc, path_or_stream)
|
||||
|
||||
_log.info(f"path: {path_or_stream}")
|
||||
_log.debug(f"DummyBackend initialized for: {path_or_stream}")
|
||||
|
||||
# Load content
|
||||
# Validate input
|
||||
try:
|
||||
if isinstance(self.path_or_stream, BytesIO):
|
||||
_log.info(f"reading streaming: {self.path_or_stream}")
|
||||
# self.content = StringIO(self.path_or_stream.getvalue().decode("utf-8"))
|
||||
# Check if stream has content
|
||||
self.valid = len(self.path_or_stream.getvalue()) > 0
|
||||
_log.debug(
|
||||
f"BytesIO stream length: {len(self.path_or_stream.getvalue())}"
|
||||
)
|
||||
elif isinstance(self.path_or_stream, Path):
|
||||
_log.info(f"reading file: {self.path_or_stream}")
|
||||
# self.content = StringIO(self.path_or_stream.read())
|
||||
self.valid = True
|
||||
# Check if file exists
|
||||
self.valid = self.path_or_stream.exists()
|
||||
_log.debug(f"File exists: {self.valid}")
|
||||
else:
|
||||
self.valid = False
|
||||
except Exception as e:
|
||||
raise RuntimeError(
|
||||
f"AudioBackend could not load document with hash {self.document_hash}"
|
||||
) from e
|
||||
return
|
||||
_log.error(f"DummyBackend validation failed: {e}")
|
||||
self.valid = False
|
||||
|
||||
def is_valid(self) -> bool:
|
||||
return self.valid
|
||||
@ -46,35 +46,6 @@ class AudioBackend(DeclarativeDocumentBackend):
|
||||
def supports_pagination(cls) -> bool:
|
||||
return False
|
||||
|
||||
def unload(self):
|
||||
if isinstance(self.path_or_stream, BytesIO):
|
||||
self.path_or_stream.close()
|
||||
self.path_or_stream = None
|
||||
|
||||
@classmethod
|
||||
def supported_formats(cls) -> Set[InputFormat]:
|
||||
return {InputFormat.AUDIO}
|
||||
|
||||
def convert(self) -> DoclingDocument:
|
||||
"""
|
||||
Parses the audio file into a structured document model.
|
||||
"""
|
||||
|
||||
# Parse the CSV into a structured document model
|
||||
origin = DocumentOrigin(
|
||||
filename=self.file.name or "audio.wav",
|
||||
mimetype="audio/wav",
|
||||
binary_hash=self.document_hash,
|
||||
)
|
||||
_log.info(f"origin: {origin}")
|
||||
|
||||
doc = DoclingDocument(name=self.file.stem or "audio.wav", origin=origin)
|
||||
|
||||
if self.is_valid():
|
||||
_log.error("time to get going ...")
|
||||
else:
|
||||
raise RuntimeError(
|
||||
f"Cannot convert doc with {self.document_hash} because the audio backend failed to init."
|
||||
)
|
||||
|
||||
return doc
|
||||
|
@ -23,7 +23,7 @@ from docling_core.utils.file import resolve_source_to_path
|
||||
from pydantic import TypeAdapter
|
||||
from rich.console import Console
|
||||
|
||||
from docling.backend.audio_backend import AudioBackend
|
||||
from docling.backend.audio_backend import DummyBackend
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
||||
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
|
||||
@ -665,7 +665,6 @@ def convert( # noqa: C901
|
||||
audio_format_option = AudioFormatOption(
|
||||
pipeline_cls=AsrPipeline,
|
||||
pipeline_options=pipeline_options,
|
||||
backend=AudioBackend,
|
||||
)
|
||||
|
||||
format_options = {
|
||||
|
@ -11,7 +11,7 @@ from pydantic import BaseModel, ConfigDict, model_validator, validate_call
|
||||
|
||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||
from docling.backend.asciidoc_backend import AsciiDocBackend
|
||||
from docling.backend.audio_backend import AudioBackend
|
||||
from docling.backend.audio_backend import DummyBackend
|
||||
from docling.backend.csv_backend import CsvDocumentBackend
|
||||
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
|
||||
from docling.backend.html_backend import HTMLDocumentBackend
|
||||
@ -122,7 +122,7 @@ class PdfFormatOption(FormatOption):
|
||||
|
||||
class AudioFormatOption(FormatOption):
|
||||
pipeline_cls: Type = AsrPipeline
|
||||
backend: Type[AbstractDocumentBackend] = AudioBackend
|
||||
backend: Type[AbstractDocumentBackend] = DummyBackend
|
||||
|
||||
|
||||
def _get_default_option(format: InputFormat) -> FormatOption:
|
||||
@ -163,7 +163,7 @@ def _get_default_option(format: InputFormat) -> FormatOption:
|
||||
InputFormat.JSON_DOCLING: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
|
||||
),
|
||||
InputFormat.AUDIO: FormatOption(pipeline_cls=AsrPipeline, backend=AudioBackend),
|
||||
InputFormat.AUDIO: FormatOption(pipeline_cls=AsrPipeline, backend=DummyBackend),
|
||||
}
|
||||
if (options := format_to_default_options.get(format)) is not None:
|
||||
return options
|
||||
|
@ -5,6 +5,8 @@ from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, Union, cast
|
||||
|
||||
from docling_core.types.doc import DoclingDocument, DocumentOrigin
|
||||
|
||||
# import whisper # type: ignore
|
||||
# import librosa
|
||||
# import numpy as np
|
||||
@ -13,7 +15,7 @@ from docling_core.types.doc.labels import DocItemLabel
|
||||
from pydantic import BaseModel, Field, validator
|
||||
|
||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||
from docling.backend.audio_backend import AudioBackend
|
||||
from docling.backend.audio_backend import DummyBackend
|
||||
|
||||
# from pydub import AudioSegment # type: ignore
|
||||
# from transformers import WhisperForConditionalGeneration, WhisperProcessor, pipeline
|
||||
@ -149,6 +151,16 @@ class _NativeWhisperModel:
|
||||
try:
|
||||
conversation = self.transcribe(audio_path)
|
||||
|
||||
# Ensure we have a proper DoclingDocument
|
||||
origin = DocumentOrigin(
|
||||
filename=conv_res.input.file.name or "audio.wav",
|
||||
mimetype="audio/wav",
|
||||
binary_hash=conv_res.input.document_hash,
|
||||
)
|
||||
conv_res.document = DoclingDocument(
|
||||
name=conv_res.input.file.stem or "audio.wav", origin=origin
|
||||
)
|
||||
|
||||
for _ in conversation:
|
||||
conv_res.document.add_text(label=DocItemLabel.TEXT, text=_.to_string())
|
||||
|
||||
@ -235,4 +247,4 @@ class AsrPipeline(BasePipeline):
|
||||
|
||||
@classmethod
|
||||
def is_backend_supported(cls, backend: AbstractDocumentBackend):
|
||||
return isinstance(backend, AudioBackend)
|
||||
return isinstance(backend, DummyBackend)
|
||||
|
Loading…
Reference in New Issue
Block a user