diff --git a/docling/cli/main.py b/docling/cli/main.py index 3e3796cf..ebebad3f 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -542,7 +542,41 @@ def convert( # noqa: C901 accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device) pipeline_options: PaginatedPipelineOptions - if pipeline == ProcessingPipeline.STANDARD: + format_options: Dict[InputFormat, FormatOption] = {} + + if pipeline == ProcessingPipeline.VLM: + pipeline_options = VlmPipelineOptions( + enable_remote_services=enable_remote_services, + ) + + if vlm_model == VlmModelType.GRANITE_VISION: + pipeline_options.vlm_options = GRANITE_VISION_TRANSFORMERS + elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA: + pipeline_options.vlm_options = GRANITE_VISION_OLLAMA + elif vlm_model == VlmModelType.SMOLDOCLING: + pipeline_options.vlm_options = SMOLDOCLING_TRANSFORMERS + if sys.platform == "darwin": + try: + import mlx_vlm + + pipeline_options.vlm_options = SMOLDOCLING_MLX + except ImportError: + _log.warning( + "To run SmolDocling faster, please install mlx-vlm:\n" + "pip install mlx-vlm" + ) + + pdf_format_option = PdfFormatOption( + pipeline_cls=VlmPipeline, pipeline_options=pipeline_options + ) + + format_options: Dict[InputFormat, FormatOption] = { + InputFormat.PDF: pdf_format_option, + InputFormat.IMAGE: pdf_format_option, + } + + elif pipeline == ProcessingPipeline.STANDARD: + pipeline_options = PdfPipelineOptions( allow_external_plugins=allow_external_plugins, enable_remote_services=enable_remote_services, @@ -584,40 +618,28 @@ def convert( # noqa: C901 pipeline_options=pipeline_options, backend=backend, # pdf_backend ) - elif pipeline == ProcessingPipeline.VLM: - pipeline_options = VlmPipelineOptions( - enable_remote_services=enable_remote_services, - ) - if vlm_model == VlmModelType.GRANITE_VISION: - pipeline_options.vlm_options = GRANITE_VISION_TRANSFORMERS - elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA: - pipeline_options.vlm_options = GRANITE_VISION_OLLAMA - elif vlm_model == VlmModelType.SMOLDOCLING: - pipeline_options.vlm_options = SMOLDOCLING_TRANSFORMERS - if sys.platform == "darwin": - try: - import mlx_vlm - - pipeline_options.vlm_options = SMOLDOCLING_MLX - except ImportError: - _log.warning( - "To run SmolDocling faster, please install mlx-vlm:\n" - "pip install mlx-vlm" - ) - - pdf_format_option = PdfFormatOption( - pipeline_cls=VlmPipeline, pipeline_options=pipeline_options - ) + format_options: Dict[InputFormat, FormatOption] = { + InputFormat.PDF: pdf_format_option, + InputFormat.IMAGE: pdf_format_option, + } + elif pipeline == ProcessingPipeline.ASR: audio_pipeline_options = AsrPipelineOptions( # enable_remote_services=enable_remote_services, + artifacts_path = artifacts_path ) audio_format_option = PdfFormatOption( - pipeline_cls=AsrPipeline, pipeline_options=audio_pipeline_options + pipeline_cls=AsrPipeline, + pipeline_options=audio_pipeline_options, + # backend = FIXME ) + format_options: Dict[InputFormat, FormatOption] = { + InputFormat.AUDIO_WAV: audio_format_option, + } + """ if asr_model == AsrModelType.WHISPER_TINY: pipeline_options.asr_options = WHISPER_TINY: @@ -627,11 +649,6 @@ def convert( # noqa: C901 pipeline_options.artifacts_path = artifacts_path # audio_pipeline_options.artifacts_path = artifacts_path - format_options: Dict[InputFormat, FormatOption] = { - InputFormat.PDF: pdf_format_option, - InputFormat.IMAGE: pdf_format_option, - InputFormat.AUDIO: audio_format_option, - } doc_converter = DocumentConverter( allowed_formats=from_formats, format_options=format_options, diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py index 3eb88548..dd6f7406 100644 --- a/docling/datamodel/base_models.py +++ b/docling/datamodel/base_models.py @@ -49,7 +49,7 @@ class InputFormat(str, Enum): XML_USPTO = "xml_uspto" XML_JATS = "xml_jats" JSON_DOCLING = "json_docling" - AUDIO = "audio" + AUDIO_WAV = "wav" class OutputFormat(str, Enum): @@ -74,7 +74,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = { InputFormat.XLSX: ["xlsx", "xlsm"], InputFormat.XML_USPTO: ["xml", "txt"], InputFormat.JSON_DOCLING: ["json"], - InputFormat.AUDIO: ["wav", "mp3"], + InputFormat.AUDIO_WAV: ["wav"], } FormatToMimeType: Dict[InputFormat, List[str]] = { @@ -106,7 +106,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = { ], InputFormat.XML_USPTO: ["application/xml", "text/plain"], InputFormat.JSON_DOCLING: ["application/json"], - InputFormat.AUDIO: ["audio/wav"], + InputFormat.AUDIO_WAV: ["audio/wav", "audio/x-wav"], } MimeTypeToFormat: dict[str, list[InputFormat]] = { diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index 4c71f5c8..c5ca179f 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -249,7 +249,7 @@ class _DocumentConversionInput(BaseModel): backend: Type[AbstractDocumentBackend] if format not in format_options.keys(): _log.error( - f"Input document {obj.name} does not match any allowed format." + f"Input document {obj.name} with format {format} does not match any allowed format: ({format_options.keys()})" ) backend = _DummyBackend else: @@ -280,9 +280,12 @@ class _DocumentConversionInput(BaseModel): if isinstance(obj, Path): mime = filetype.guess_mime(str(obj)) + print(f"mime: {mime}") if mime is None: ext = obj.suffix[1:] + print(f"ext: {ext}") mime = _DocumentConversionInput._mime_from_extension(ext) + print(f"mime: {mime}") if mime is None: # must guess from with obj.open("rb") as f: content = f.read(1024) # Read first 1KB @@ -318,6 +321,8 @@ class _DocumentConversionInput(BaseModel): mime = mime or _DocumentConversionInput._detect_csv(content) mime = mime or "text/plain" formats = MimeTypeToFormat.get(mime, []) + print(formats) + if formats: if len(formats) == 1 and mime not in ("text/plain"): return formats[0]