mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 12:48:28 +00:00
feat: Support audio input (#1763)
* scaffolding in place Signed-off-by: Peter Staar <taa@zurich.ibm.com> * doing scaffolding for audio pipeline Signed-off-by: Peter Staar <taa@zurich.ibm.com> * WIP: got first transcription working Signed-off-by: Peter Staar <taa@zurich.ibm.com> * all working, time to start cleaning up Signed-off-by: Peter Staar <taa@zurich.ibm.com> * first working ASR pipeline Signed-off-by: Peter Staar <taa@zurich.ibm.com> * added openai-whisper as a first transcription model Signed-off-by: Peter Staar <taa@zurich.ibm.com> * updating with asr_options Signed-off-by: Peter Staar <taa@zurich.ibm.com> * finalised the first working ASR pipeline with Whisper Signed-off-by: Peter Staar <taa@zurich.ibm.com> * use whisper from the latest git commit Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * Update docling/datamodel/pipeline_options.py Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Signed-off-by: Peter W. J. Staar <91719829+PeterStaar-IBM@users.noreply.github.com> * Update docling/datamodel/pipeline_options.py Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Signed-off-by: Peter W. J. Staar <91719829+PeterStaar-IBM@users.noreply.github.com> * updated comment Signed-off-by: Peter Staar <taa@zurich.ibm.com> * AudioBackend -> DummyBackend Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * file rename Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Rename to NoOpBackend, add test for ASR pipeline Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Support every format in NoOpBackend Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add missing audio file and test Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Install ffmpeg system dependency for ASR test Signed-off-by: Christoph Auer <cau@zurich.ibm.com> --------- Signed-off-by: Peter Staar <taa@zurich.ibm.com> Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Signed-off-by: Peter W. J. 
Staar <91719829+PeterStaar-IBM@users.noreply.github.com> Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Co-authored-by: Michele Dolfi <dol@zurich.ibm.com> Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
committed by
GitHub
parent
d26dac61a8
commit
1557e7ce3e
59
tests/test_asr_pipeline.py
Normal file
59
tests/test_asr_pipeline.py
Normal file
@@ -0,0 +1,59 @@
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from docling.datamodel import asr_model_specs
|
||||
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import AsrPipelineOptions
|
||||
from docling.document_converter import AudioFormatOption, DocumentConverter
|
||||
from docling.pipeline.asr_pipeline import AsrPipeline
|
||||
|
||||
|
||||
@pytest.fixture
def test_audio_path():
    """Provide the location of the sample MP3 file used by the ASR test.

    The path is relative, so it is resolved against the current working
    directory at the time it is used.
    """
    sample_path = Path("./tests/data/audio/sample_10s.mp3")
    return sample_path
|
||||
|
||||
|
||||
def get_asr_converter():
    """Create a DocumentConverter that sends audio input through the ASR pipeline.

    The pipeline options are set to ``asr_model_specs.WHISPER_TINY``.

    Returns:
        A ``DocumentConverter`` whose ``InputFormat.AUDIO`` format option uses
        ``AsrPipeline`` with those pipeline options.
    """
    asr_pipeline_options = AsrPipelineOptions()
    asr_pipeline_options.asr_options = asr_model_specs.WHISPER_TINY

    audio_format_option = AudioFormatOption(
        pipeline_cls=AsrPipeline,
        pipeline_options=asr_pipeline_options,
    )
    return DocumentConverter(
        format_options={InputFormat.AUDIO: audio_format_option},
    )
|
||||
|
||||
|
||||
def test_asr_pipeline_conversion(test_audio_path):
    """Convert the sample audio file with the ASR pipeline and check the result.

    Uses the converter built by ``get_asr_converter()`` and asserts that the
    conversion succeeds, that a document is produced, and that the document
    contains at least one text item.

    Args:
        test_audio_path: Path to the audio file, supplied by the fixture of
            the same name.
    """
    # Fail early with a clear message if the sample file is missing.
    assert test_audio_path.exists(), f"Test audio file not found: {test_audio_path}"

    doc_result: ConversionResult = get_asr_converter().convert(test_audio_path)

    assert doc_result.status == ConversionStatus.SUCCESS, (
        f"Conversion failed with status: {doc_result.status}"
    )
    assert doc_result.document is not None, "No document was created"

    # The text items of the document hold the transcribed audio.
    texts = doc_result.document.texts
    assert texts, "No text content found in transcribed audio"

    # Printed for manual inspection only (visible with `pytest -s`).
    print(f"Transcribed text from {test_audio_path.name}:")
    for position, text_item in enumerate(texts, start=1):
        print(f"  {position}: {text_item.text}")
|
||||
Reference in New Issue
Block a user