mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
AudioBackend -> DummyBackend
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
caf18e634b
commit
b43aef2eb5
@ -1,43 +1,43 @@
|
|||||||
import logging
|
import logging
|
||||||
import warnings
|
from io import BytesIO
|
||||||
from io import BytesIO, StringIO
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Set, Union
|
from typing import Set, Union
|
||||||
|
|
||||||
from docling_core.types.doc import (
|
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||||
DoclingDocument,
|
|
||||||
DocumentOrigin,
|
|
||||||
)
|
|
||||||
|
|
||||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import InputDocument
|
from docling.datamodel.document import InputDocument
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class AudioBackend(DeclarativeDocumentBackend):
|
class DummyBackend(AbstractDocumentBackend):
|
||||||
# content: StringIO
|
"""
|
||||||
|
A dummy backend that only validates input existence.
|
||||||
|
Used e.g. for audio files where actual processing is handled by the ASR pipeline.
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
||||||
super().__init__(in_doc, path_or_stream)
|
super().__init__(in_doc, path_or_stream)
|
||||||
|
|
||||||
_log.info(f"path: {path_or_stream}")
|
_log.debug(f"DummyBackend initialized for: {path_or_stream}")
|
||||||
|
|
||||||
# Load content
|
# Validate input
|
||||||
try:
|
try:
|
||||||
if isinstance(self.path_or_stream, BytesIO):
|
if isinstance(self.path_or_stream, BytesIO):
|
||||||
_log.info(f"reading streaming: {self.path_or_stream}")
|
# Check if stream has content
|
||||||
# self.content = StringIO(self.path_or_stream.getvalue().decode("utf-8"))
|
self.valid = len(self.path_or_stream.getvalue()) > 0
|
||||||
|
_log.debug(
|
||||||
|
f"BytesIO stream length: {len(self.path_or_stream.getvalue())}"
|
||||||
|
)
|
||||||
elif isinstance(self.path_or_stream, Path):
|
elif isinstance(self.path_or_stream, Path):
|
||||||
_log.info(f"reading file: {self.path_or_stream}")
|
# Check if file exists
|
||||||
# self.content = StringIO(self.path_or_stream.read())
|
self.valid = self.path_or_stream.exists()
|
||||||
self.valid = True
|
_log.debug(f"File exists: {self.valid}")
|
||||||
|
else:
|
||||||
|
self.valid = False
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise RuntimeError(
|
_log.error(f"DummyBackend validation failed: {e}")
|
||||||
f"AudioBackend could not load document with hash {self.document_hash}"
|
self.valid = False
|
||||||
) from e
|
|
||||||
return
|
|
||||||
|
|
||||||
def is_valid(self) -> bool:
|
def is_valid(self) -> bool:
|
||||||
return self.valid
|
return self.valid
|
||||||
@ -46,35 +46,6 @@ class AudioBackend(DeclarativeDocumentBackend):
|
|||||||
def supports_pagination(cls) -> bool:
|
def supports_pagination(cls) -> bool:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def unload(self):
|
|
||||||
if isinstance(self.path_or_stream, BytesIO):
|
|
||||||
self.path_or_stream.close()
|
|
||||||
self.path_or_stream = None
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def supported_formats(cls) -> Set[InputFormat]:
|
def supported_formats(cls) -> Set[InputFormat]:
|
||||||
return {InputFormat.AUDIO}
|
return {InputFormat.AUDIO}
|
||||||
|
|
||||||
def convert(self) -> DoclingDocument:
|
|
||||||
"""
|
|
||||||
Parses the audio file into a structured document model.
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Parse the CSV into a structured document model
|
|
||||||
origin = DocumentOrigin(
|
|
||||||
filename=self.file.name or "audio.wav",
|
|
||||||
mimetype="audio/wav",
|
|
||||||
binary_hash=self.document_hash,
|
|
||||||
)
|
|
||||||
_log.info(f"origin: {origin}")
|
|
||||||
|
|
||||||
doc = DoclingDocument(name=self.file.stem or "audio.wav", origin=origin)
|
|
||||||
|
|
||||||
if self.is_valid():
|
|
||||||
_log.error("time to get going ...")
|
|
||||||
else:
|
|
||||||
raise RuntimeError(
|
|
||||||
f"Cannot convert doc with {self.document_hash} because the audio backend failed to init."
|
|
||||||
)
|
|
||||||
|
|
||||||
return doc
|
|
||||||
|
@ -23,7 +23,7 @@ from docling_core.utils.file import resolve_source_to_path
|
|||||||
from pydantic import TypeAdapter
|
from pydantic import TypeAdapter
|
||||||
from rich.console import Console
|
from rich.console import Console
|
||||||
|
|
||||||
from docling.backend.audio_backend import AudioBackend
|
from docling.backend.audio_backend import DummyBackend
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||||
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
||||||
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
|
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
|
||||||
@ -665,7 +665,6 @@ def convert( # noqa: C901
|
|||||||
audio_format_option = AudioFormatOption(
|
audio_format_option = AudioFormatOption(
|
||||||
pipeline_cls=AsrPipeline,
|
pipeline_cls=AsrPipeline,
|
||||||
pipeline_options=pipeline_options,
|
pipeline_options=pipeline_options,
|
||||||
backend=AudioBackend,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
format_options = {
|
format_options = {
|
||||||
|
@ -11,7 +11,7 @@ from pydantic import BaseModel, ConfigDict, model_validator, validate_call
|
|||||||
|
|
||||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||||
from docling.backend.asciidoc_backend import AsciiDocBackend
|
from docling.backend.asciidoc_backend import AsciiDocBackend
|
||||||
from docling.backend.audio_backend import AudioBackend
|
from docling.backend.audio_backend import DummyBackend
|
||||||
from docling.backend.csv_backend import CsvDocumentBackend
|
from docling.backend.csv_backend import CsvDocumentBackend
|
||||||
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
|
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
|
||||||
from docling.backend.html_backend import HTMLDocumentBackend
|
from docling.backend.html_backend import HTMLDocumentBackend
|
||||||
@ -122,7 +122,7 @@ class PdfFormatOption(FormatOption):
|
|||||||
|
|
||||||
class AudioFormatOption(FormatOption):
|
class AudioFormatOption(FormatOption):
|
||||||
pipeline_cls: Type = AsrPipeline
|
pipeline_cls: Type = AsrPipeline
|
||||||
backend: Type[AbstractDocumentBackend] = AudioBackend
|
backend: Type[AbstractDocumentBackend] = DummyBackend
|
||||||
|
|
||||||
|
|
||||||
def _get_default_option(format: InputFormat) -> FormatOption:
|
def _get_default_option(format: InputFormat) -> FormatOption:
|
||||||
@ -163,7 +163,7 @@ def _get_default_option(format: InputFormat) -> FormatOption:
|
|||||||
InputFormat.JSON_DOCLING: FormatOption(
|
InputFormat.JSON_DOCLING: FormatOption(
|
||||||
pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
|
pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
|
||||||
),
|
),
|
||||||
InputFormat.AUDIO: FormatOption(pipeline_cls=AsrPipeline, backend=AudioBackend),
|
InputFormat.AUDIO: FormatOption(pipeline_cls=AsrPipeline, backend=DummyBackend),
|
||||||
}
|
}
|
||||||
if (options := format_to_default_options.get(format)) is not None:
|
if (options := format_to_default_options.get(format)) is not None:
|
||||||
return options
|
return options
|
||||||
|
@ -5,6 +5,8 @@ from io import BytesIO
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List, Optional, Union, cast
|
from typing import List, Optional, Union, cast
|
||||||
|
|
||||||
|
from docling_core.types.doc import DoclingDocument, DocumentOrigin
|
||||||
|
|
||||||
# import whisper # type: ignore
|
# import whisper # type: ignore
|
||||||
# import librosa
|
# import librosa
|
||||||
# import numpy as np
|
# import numpy as np
|
||||||
@ -13,7 +15,7 @@ from docling_core.types.doc.labels import DocItemLabel
|
|||||||
from pydantic import BaseModel, Field, validator
|
from pydantic import BaseModel, Field, validator
|
||||||
|
|
||||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||||
from docling.backend.audio_backend import AudioBackend
|
from docling.backend.audio_backend import DummyBackend
|
||||||
|
|
||||||
# from pydub import AudioSegment # type: ignore
|
# from pydub import AudioSegment # type: ignore
|
||||||
# from transformers import WhisperForConditionalGeneration, WhisperProcessor, pipeline
|
# from transformers import WhisperForConditionalGeneration, WhisperProcessor, pipeline
|
||||||
@ -149,6 +151,16 @@ class _NativeWhisperModel:
|
|||||||
try:
|
try:
|
||||||
conversation = self.transcribe(audio_path)
|
conversation = self.transcribe(audio_path)
|
||||||
|
|
||||||
|
# Ensure we have a proper DoclingDocument
|
||||||
|
origin = DocumentOrigin(
|
||||||
|
filename=conv_res.input.file.name or "audio.wav",
|
||||||
|
mimetype="audio/wav",
|
||||||
|
binary_hash=conv_res.input.document_hash,
|
||||||
|
)
|
||||||
|
conv_res.document = DoclingDocument(
|
||||||
|
name=conv_res.input.file.stem or "audio.wav", origin=origin
|
||||||
|
)
|
||||||
|
|
||||||
for _ in conversation:
|
for _ in conversation:
|
||||||
conv_res.document.add_text(label=DocItemLabel.TEXT, text=_.to_string())
|
conv_res.document.add_text(label=DocItemLabel.TEXT, text=_.to_string())
|
||||||
|
|
||||||
@ -235,4 +247,4 @@ class AsrPipeline(BasePipeline):
|
|||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def is_backend_supported(cls, backend: AbstractDocumentBackend):
|
def is_backend_supported(cls, backend: AbstractDocumentBackend):
|
||||||
return isinstance(backend, AudioBackend)
|
return isinstance(backend, DummyBackend)
|
||||||
|
Loading…
Reference in New Issue
Block a user