Add scaffolding for the audio pipeline

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
2025-07-26 20:14:47 +00:00 · 2025-06-12 18:24:13 +02:00 · 2025-06-12 18:24:13 +02:00 · 1d4008ac7c
commit 1d4008ac7c
parent 5c606c2574
3 changed files with 57 additions and 35 deletions
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -542,7 +542,41 @@ def convert(  # noqa: C901
        accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
        pipeline_options: PaginatedPipelineOptions
-        if pipeline == ProcessingPipeline.STANDARD:
+        format_options: Dict[InputFormat, FormatOption] = {}
        if pipeline == ProcessingPipeline.VLM:
            pipeline_options = VlmPipelineOptions(
                enable_remote_services=enable_remote_services,
            )
            if vlm_model == VlmModelType.GRANITE_VISION:
                pipeline_options.vlm_options = GRANITE_VISION_TRANSFORMERS
            elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA:
                pipeline_options.vlm_options = GRANITE_VISION_OLLAMA
            elif vlm_model == VlmModelType.SMOLDOCLING:
                pipeline_options.vlm_options = SMOLDOCLING_TRANSFORMERS
                if sys.platform == "darwin":
                    try:
                        import mlx_vlm
                        pipeline_options.vlm_options = SMOLDOCLING_MLX
                    except ImportError:
                        _log.warning(
                            "To run SmolDocling faster, please install mlx-vlm:\n"
                            "pip install mlx-vlm"
                        )
            pdf_format_option = PdfFormatOption(
                pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
            )
            format_options: Dict[InputFormat, FormatOption] = {
                InputFormat.PDF: pdf_format_option,
                InputFormat.IMAGE: pdf_format_option,
            }
        elif pipeline == ProcessingPipeline.STANDARD:
            pipeline_options = PdfPipelineOptions(
                allow_external_plugins=allow_external_plugins,
                enable_remote_services=enable_remote_services,
@@ -584,40 +618,28 @@ def convert(  # noqa: C901
                pipeline_options=pipeline_options,
                backend=backend,  # pdf_backend
            )
        elif pipeline == ProcessingPipeline.VLM:
            pipeline_options = VlmPipelineOptions(
                enable_remote_services=enable_remote_services,
            )
-            if vlm_model == VlmModelType.GRANITE_VISION:
+            format_options: Dict[InputFormat, FormatOption] = {
-                pipeline_options.vlm_options = GRANITE_VISION_TRANSFORMERS
+                InputFormat.PDF: pdf_format_option,
-            elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA:
+                InputFormat.IMAGE: pdf_format_option,
-                pipeline_options.vlm_options = GRANITE_VISION_OLLAMA
+            }
            elif vlm_model == VlmModelType.SMOLDOCLING:
                pipeline_options.vlm_options = SMOLDOCLING_TRANSFORMERS
                if sys.platform == "darwin":
                    try:
                        import mlx_vlm
                        pipeline_options.vlm_options = SMOLDOCLING_MLX
                    except ImportError:
                        _log.warning(
                            "To run SmolDocling faster, please install mlx-vlm:\n"
                            "pip install mlx-vlm"
                        )
            pdf_format_option = PdfFormatOption(
                pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
            )
        elif pipeline == ProcessingPipeline.ASR:
            audio_pipeline_options = AsrPipelineOptions(
                # enable_remote_services=enable_remote_services,
                artifacts_path = artifacts_path
            )
            audio_format_option = PdfFormatOption(
-                pipeline_cls=AsrPipeline, pipeline_options=audio_pipeline_options
+                pipeline_cls=AsrPipeline,
                pipeline_options=audio_pipeline_options,
                # backend = FIXME
            )
            format_options: Dict[InputFormat, FormatOption] = {
                InputFormat.AUDIO_WAV: audio_format_option,
            }
            """
            if asr_model == AsrModelType.WHISPER_TINY:
                pipeline_options.asr_options = WHISPER_TINY:
@@ -627,11 +649,6 @@ def convert(  # noqa: C901
            pipeline_options.artifacts_path = artifacts_path
            # audio_pipeline_options.artifacts_path = artifacts_path
        format_options: Dict[InputFormat, FormatOption] = {
            InputFormat.PDF: pdf_format_option,
            InputFormat.IMAGE: pdf_format_option,
            InputFormat.AUDIO: audio_format_option,
        }
        doc_converter = DocumentConverter(
            allowed_formats=from_formats,
            format_options=format_options,
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@@ -49,7 +49,7 @@ class InputFormat(str, Enum):
    XML_USPTO = "xml_uspto"
    XML_JATS = "xml_jats"
    JSON_DOCLING = "json_docling"
-    AUDIO = "audio"
+    AUDIO_WAV = "wav"
 class OutputFormat(str, Enum):
@@ -74,7 +74,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
    InputFormat.XLSX: ["xlsx", "xlsm"],
    InputFormat.XML_USPTO: ["xml", "txt"],
    InputFormat.JSON_DOCLING: ["json"],
-    InputFormat.AUDIO: ["wav", "mp3"],
+    InputFormat.AUDIO_WAV: ["wav"],
 }
 FormatToMimeType: Dict[InputFormat, List[str]] = {
@@ -106,7 +106,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
    ],
    InputFormat.XML_USPTO: ["application/xml", "text/plain"],
    InputFormat.JSON_DOCLING: ["application/json"],
-    InputFormat.AUDIO: ["audio/wav"],
+    InputFormat.AUDIO_WAV: ["audio/wav", "audio/x-wav"],
 }
 MimeTypeToFormat: dict[str, list[InputFormat]] = {
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@@ -249,7 +249,7 @@ class _DocumentConversionInput(BaseModel):
            backend: Type[AbstractDocumentBackend]
            if format not in format_options.keys():
                _log.error(
-                    f"Input document {obj.name} does not match any allowed format."
+                    f"Input document {obj.name} with format {format} does not match any allowed format: ({format_options.keys()})"
                )
                backend = _DummyBackend
            else:
@@ -280,9 +280,12 @@ class _DocumentConversionInput(BaseModel):
        if isinstance(obj, Path):
            mime = filetype.guess_mime(str(obj))
            print(f"mime: {mime}")
            if mime is None:
                ext = obj.suffix[1:]
                print(f"ext: {ext}")
                mime = _DocumentConversionInput._mime_from_extension(ext)
                print(f"mime: {mime}")
            if mime is None:  # must guess from
                with obj.open("rb") as f:
                    content = f.read(1024)  # Read first 1KB
@@ -318,6 +321,8 @@ class _DocumentConversionInput(BaseModel):
        mime = mime or _DocumentConversionInput._detect_csv(content)
        mime = mime or "text/plain"
        formats = MimeTypeToFormat.get(mime, [])
        print(formats)
        if formats:
            if len(formats) == 1 and mime not in ("text/plain"):
                return formats[0]