diff --git a/docling/backend/dummy_backend.py b/docling/backend/noop_backend.py similarity index 86% rename from docling/backend/dummy_backend.py rename to docling/backend/noop_backend.py index 87552aed..e4ae6d20 100644 --- a/docling/backend/dummy_backend.py +++ b/docling/backend/noop_backend.py @@ -10,16 +10,16 @@ from docling.datamodel.document import InputDocument _log = logging.getLogger(__name__) -class DummyBackend(AbstractDocumentBackend): +class NoOpBackend(AbstractDocumentBackend): """ - A dummy backend that only validates input existence. + A no-op backend that only validates input existence. Used e.g. for audio files where actual processing is handled by the ASR pipeline. """ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): super().__init__(in_doc, path_or_stream) - _log.debug(f"DummyBackend initialized for: {path_or_stream}") + _log.debug(f"NoOpBackend initialized for: {path_or_stream}") # Validate input try: @@ -36,7 +36,7 @@ class DummyBackend(AbstractDocumentBackend): else: self.valid = False except Exception as e: - _log.error(f"DummyBackend validation failed: {e}") + _log.error(f"NoOpBackend validation failed: {e}") self.valid = False def is_valid(self) -> bool: diff --git a/docling/cli/main.py b/docling/cli/main.py index ddf355f0..ae275ea9 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -26,7 +26,6 @@ from rich.console import Console from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend -from docling.backend.dummy_backend import DummyBackend from docling.backend.pdf_backend import PdfDocumentBackend from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py index 85501a5a..f76a066c 100644 --- a/docling/datamodel/base_models.py +++ b/docling/datamodel/base_models.py @@ -106,7 +106,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = { ], InputFormat.XML_USPTO: ["application/xml", "text/plain"], InputFormat.JSON_DOCLING: ["application/json"], - InputFormat.AUDIO: ["audio/wav", "audio/x-wav"], + InputFormat.AUDIO: ["audio/x-wav", "audio/mpeg", "audio/wav", "audio/mp3"], } MimeTypeToFormat: dict[str, list[InputFormat]] = { diff --git a/docling/datamodel/pipeline_options_asr_model.py b/docling/datamodel/pipeline_options_asr_model.py index e892254c..20e2e453 100644 --- a/docling/datamodel/pipeline_options_asr_model.py +++ b/docling/datamodel/pipeline_options_asr_model.py @@ -17,8 +17,8 @@ class BaseAsrOptions(BaseModel): class InferenceAsrFramework(str, Enum): - MLX = "mlx" - TRANSFORMERS = "transformers" + # MLX = "mlx" # disabled for now + # TRANSFORMERS = "transformers" # disabled for now WHISPER = "whisper" diff --git a/docling/document_converter.py b/docling/document_converter.py index 5cae12e2..1a0a9d75 100644 --- a/docling/document_converter.py +++ b/docling/document_converter.py @@ -13,13 +13,13 @@ from docling.backend.abstract_backend import AbstractDocumentBackend from docling.backend.asciidoc_backend import AsciiDocBackend from docling.backend.csv_backend import CsvDocumentBackend from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend -from docling.backend.dummy_backend import DummyBackend from docling.backend.html_backend import HTMLDocumentBackend from docling.backend.json.docling_json_backend import DoclingJSONBackend from docling.backend.md_backend import MarkdownDocumentBackend from docling.backend.msexcel_backend import MsExcelDocumentBackend from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend from docling.backend.msword_backend import MsWordDocumentBackend +from docling.backend.noop_backend import NoOpBackend from docling.backend.xml.jats_backend import JatsDocumentBackend from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend from docling.datamodel.base_models import ( @@ -122,7 +122,7 @@ class PdfFormatOption(FormatOption): class AudioFormatOption(FormatOption): pipeline_cls: Type = AsrPipeline - backend: Type[AbstractDocumentBackend] = DummyBackend + backend: Type[AbstractDocumentBackend] = NoOpBackend def _get_default_option(format: InputFormat) -> FormatOption: @@ -163,7 +163,7 @@ def _get_default_option(format: InputFormat) -> FormatOption: InputFormat.JSON_DOCLING: FormatOption( pipeline_cls=SimplePipeline, backend=DoclingJSONBackend ), - InputFormat.AUDIO: FormatOption(pipeline_cls=AsrPipeline, backend=DummyBackend), + InputFormat.AUDIO: FormatOption(pipeline_cls=AsrPipeline, backend=NoOpBackend), } if (options := format_to_default_options.get(format)) is not None: return options diff --git a/docling/pipeline/asr_pipeline.py b/docling/pipeline/asr_pipeline.py index 65b053f5..94fa6341 100644 --- a/docling/pipeline/asr_pipeline.py +++ b/docling/pipeline/asr_pipeline.py @@ -15,7 +15,7 @@ from docling_core.types.doc.labels import DocItemLabel from pydantic import BaseModel, Field, validator from docling.backend.abstract_backend import AbstractDocumentBackend -from docling.backend.dummy_backend import DummyBackend +from docling.backend.noop_backend import NoOpBackend # from pydub import AudioSegment # type: ignore # from transformers import WhisperForConditionalGeneration, WhisperProcessor, pipeline @@ -24,6 +24,7 @@ from docling.datamodel.accelerator_options import ( ) from docling.datamodel.base_models import ( ConversionStatus, + FormatToMimeType, ) from docling.datamodel.document import ConversionResult, InputDocument from docling.datamodel.pipeline_options import ( @@ -154,15 +155,17 @@ class _NativeWhisperModel: # Ensure we have a proper DoclingDocument origin = DocumentOrigin( filename=conv_res.input.file.name or "audio.wav", - mimetype="audio/wav", + mimetype="audio/x-wav", binary_hash=conv_res.input.document_hash, ) conv_res.document = DoclingDocument( name=conv_res.input.file.stem or "audio.wav", origin=origin ) - for _ in conversation: - conv_res.document.add_text(label=DocItemLabel.TEXT, text=_.to_string()) + for citem in conversation: + conv_res.document.add_text( + label=DocItemLabel.TEXT, text=citem.to_string() + ) conv_res.status = ConversionStatus.SUCCESS return conv_res @@ -247,4 +250,4 @@ class AsrPipeline(BasePipeline): @classmethod def is_backend_supported(cls, backend: AbstractDocumentBackend): - return isinstance(backend, DummyBackend) + return isinstance(backend, NoOpBackend) diff --git a/tests/data/audio/sample_10s.mp3 b/tests/data/audio/sample_10s.mp3 new file mode 100644 index 00000000..93a7ec73 Binary files /dev/null and b/tests/data/audio/sample_10s.mp3 differ