diff --git a/docling/datamodel/vlm_model_specs.py b/docling/datamodel/vlm_model_specs.py
index d09e0a81..18be529e 100644
--- a/docling/datamodel/vlm_model_specs.py
+++ b/docling/datamodel/vlm_model_specs.py
@@ -65,7 +65,7 @@ SMOLDOCLING_VLLM = InlineVlmOptions(
 SMOLVLM500_TRANSFORMERS = InlineVlmOptions(
     repo_id="HuggingFaceTB/SmolVLM-500M-Instruct",
     prompt="Transcribe this image to plain text.",
-    response_format=ResponseFormat.DOCTAGS,
+    response_format=ResponseFormat.PLAINTEXT,
     inference_framework=InferenceFramework.TRANSFORMERS,
     transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
     supported_devices=[
diff --git a/docling/pipeline/threaded_multistage_vlm_pipeline.py b/docling/pipeline/threaded_multistage_vlm_pipeline.py
index a9dd9e64..3a967250 100644
--- a/docling/pipeline/threaded_multistage_vlm_pipeline.py
+++ b/docling/pipeline/threaded_multistage_vlm_pipeline.py
@@ -161,7 +161,7 @@ class ThreadedMultiStageVlmPipelineOptions(PaginatedPipelineOptions):
 
     # text_opts = DOLPHIN_TRANSFORMERS.model_copy()
     # text_opts.prompt = "Read text in the image. "
-    base_model = SMOLVLM500_TRANSFORMERS
+    base_model = SMOLVLM500_MLX
     text_opts = base_model.model_copy()
     # text_opts.prompt = "Convert this page to docling."