diff --git a/docling/cli/main.py b/docling/cli/main.py
index 3e3796cf..ebebad3f 100644
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -542,7 +542,41 @@ def convert(  # noqa: C901
         accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
         pipeline_options: PaginatedPipelineOptions
 
-        if pipeline == ProcessingPipeline.STANDARD:
+        format_options: Dict[InputFormat, FormatOption] = {}
+        
+        if pipeline == ProcessingPipeline.VLM:
+            pipeline_options = VlmPipelineOptions(
+                enable_remote_services=enable_remote_services,
+            )
+
+            if vlm_model == VlmModelType.GRANITE_VISION:
+                pipeline_options.vlm_options = GRANITE_VISION_TRANSFORMERS
+            elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA:
+                pipeline_options.vlm_options = GRANITE_VISION_OLLAMA
+            elif vlm_model == VlmModelType.SMOLDOCLING:
+                pipeline_options.vlm_options = SMOLDOCLING_TRANSFORMERS
+                if sys.platform == "darwin":
+                    try:
+                        import mlx_vlm
+
+                        pipeline_options.vlm_options = SMOLDOCLING_MLX
+                    except ImportError:
+                        _log.warning(
+                            "To run SmolDocling faster, please install mlx-vlm:\n"
+                            "pip install mlx-vlm"
+                        )
+
+            pdf_format_option = PdfFormatOption(
+                pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
+            )
+
+            format_options: Dict[InputFormat, FormatOption] = {
+                InputFormat.PDF: pdf_format_option,
+                InputFormat.IMAGE: pdf_format_option,
+            }
+            
+        elif pipeline == ProcessingPipeline.STANDARD:
+            
             pipeline_options = PdfPipelineOptions(
                 allow_external_plugins=allow_external_plugins,
                 enable_remote_services=enable_remote_services,
@@ -584,40 +618,28 @@ def convert(  # noqa: C901
                 pipeline_options=pipeline_options,
                 backend=backend,  # pdf_backend
             )
-        elif pipeline == ProcessingPipeline.VLM:
-            pipeline_options = VlmPipelineOptions(
-                enable_remote_services=enable_remote_services,
-            )
 
-            if vlm_model == VlmModelType.GRANITE_VISION:
-                pipeline_options.vlm_options = GRANITE_VISION_TRANSFORMERS
-            elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA:
-                pipeline_options.vlm_options = GRANITE_VISION_OLLAMA
-            elif vlm_model == VlmModelType.SMOLDOCLING:
-                pipeline_options.vlm_options = SMOLDOCLING_TRANSFORMERS
-                if sys.platform == "darwin":
-                    try:
-                        import mlx_vlm
-
-                        pipeline_options.vlm_options = SMOLDOCLING_MLX
-                    except ImportError:
-                        _log.warning(
-                            "To run SmolDocling faster, please install mlx-vlm:\n"
-                            "pip install mlx-vlm"
-                        )
-
-            pdf_format_option = PdfFormatOption(
-                pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
-            )
+            format_options: Dict[InputFormat, FormatOption] = {
+                InputFormat.PDF: pdf_format_option,
+                InputFormat.IMAGE: pdf_format_option,
+            }
+            
         elif pipeline == ProcessingPipeline.ASR:
             audio_pipeline_options = AsrPipelineOptions(
                 # enable_remote_services=enable_remote_services,
+                artifacts_path = artifacts_path
             )
 
             audio_format_option = PdfFormatOption(
-                pipeline_cls=AsrPipeline, pipeline_options=audio_pipeline_options
+                pipeline_cls=AsrPipeline,
+                pipeline_options=audio_pipeline_options,
+                # FIXME: set an audio backend (PdfFormatOption default is used)
             )
 
+            format_options: Dict[InputFormat, FormatOption] = {
+                InputFormat.AUDIO_WAV: audio_format_option,
+            }
+            
             """
             if asr_model == AsrModelType.WHISPER_TINY:
                 pipeline_options.asr_options = WHISPER_TINY:
@@ -627,11 +649,6 @@ def convert(  # noqa: C901
             pipeline_options.artifacts_path = artifacts_path
             # audio_pipeline_options.artifacts_path = artifacts_path
 
-        format_options: Dict[InputFormat, FormatOption] = {
-            InputFormat.PDF: pdf_format_option,
-            InputFormat.IMAGE: pdf_format_option,
-            InputFormat.AUDIO: audio_format_option,
-        }
         doc_converter = DocumentConverter(
             allowed_formats=from_formats,
             format_options=format_options,
diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py
index 3eb88548..dd6f7406 100644
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@@ -49,7 +49,7 @@ class InputFormat(str, Enum):
     XML_USPTO = "xml_uspto"
     XML_JATS = "xml_jats"
     JSON_DOCLING = "json_docling"
-    AUDIO = "audio"
+    AUDIO_WAV = "wav"
 
 
 class OutputFormat(str, Enum):
@@ -74,7 +74,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
     InputFormat.XLSX: ["xlsx", "xlsm"],
     InputFormat.XML_USPTO: ["xml", "txt"],
     InputFormat.JSON_DOCLING: ["json"],
-    InputFormat.AUDIO: ["wav", "mp3"],
+    InputFormat.AUDIO_WAV: ["wav"],
 }
 
 FormatToMimeType: Dict[InputFormat, List[str]] = {
@@ -106,7 +106,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
     ],
     InputFormat.XML_USPTO: ["application/xml", "text/plain"],
     InputFormat.JSON_DOCLING: ["application/json"],
-    InputFormat.AUDIO: ["audio/wav"],
+    InputFormat.AUDIO_WAV: ["audio/wav", "audio/x-wav"],
 }
 
 MimeTypeToFormat: dict[str, list[InputFormat]] = {
diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py
index 4c71f5c8..c5ca179f 100644
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@@ -249,7 +249,7 @@ class _DocumentConversionInput(BaseModel):
             backend: Type[AbstractDocumentBackend]
             if format not in format_options.keys():
                 _log.error(
-                    f"Input document {obj.name} does not match any allowed format."
+                    f"Input document {obj.name} with format {format} does not match any allowed format: ({format_options.keys()})"
                 )
                 backend = _DummyBackend
             else:
@@ -280,9 +280,12 @@ class _DocumentConversionInput(BaseModel):
 
         if isinstance(obj, Path):
             mime = filetype.guess_mime(str(obj))
+            print(f"mime: {mime}")
             if mime is None:
                 ext = obj.suffix[1:]
+                print(f"ext: {ext}")
                 mime = _DocumentConversionInput._mime_from_extension(ext)
+                print(f"mime: {mime}")
             if mime is None:  # must guess from
                 with obj.open("rb") as f:
                     content = f.read(1024)  # Read first 1KB
@@ -318,6 +321,8 @@ class _DocumentConversionInput(BaseModel):
         mime = mime or _DocumentConversionInput._detect_csv(content)
         mime = mime or "text/plain"
         formats = MimeTypeToFormat.get(mime, [])
+        print(formats)
+        
         if formats:
             if len(formats) == 1 and mime not in ("text/plain"):
                 return formats[0]