all working, now serious refactoring necessary

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
Peter Staar
2025-05-13 18:23:55 +02:00
parent 96862bd326
commit 3407955a47
7 changed files with 202 additions and 21 deletions

View File

@@ -39,24 +39,56 @@ pipeline_options.force_backend_text = False
## Alternative VLM models:
# pipeline_options.vlm_options = granite_vision_vlm_conversion_options
# pixtral_vlm_conversion_options = HuggingFaceVlmOptions(
# repo_id="mistralai/Pixtral-12B-Base-2409",
# # prompt="OCR the full page to markdown.",
# prompt="OCR this image and export it in MarkDown.",
# response_format=ResponseFormat.MARKDOWN,
# inference_framework=InferenceFramework.TRANSFORMERS,
# )
"""
pixtral_vlm_conversion_options = HuggingFaceVlmOptions(
repo_id="mistralai/Pixtral-12B-Base-2409",
prompt="OCR this image and export it in MarkDown.",
response_format=ResponseFormat.MARKDOWN,
inference_framework=InferenceFramework.TRANSFORMERS_LlavaForConditionalGeneration,
)
vlm_conversion_options = pixtral_vlm_conversion_options
"""
pixtral_vlm_conversion_options = HuggingFaceVlmOptions(
repo_id="mistral-community/pixtral-12b",
prompt="OCR this image and export it in MarkDown.",
response_format=ResponseFormat.MARKDOWN,
inference_framework=InferenceFramework.TRANSFORMERS_LlavaForConditionalGeneration,
)
vlm_conversion_options = pixtral_vlm_conversion_options
"""
phi_vlm_conversion_options = HuggingFaceVlmOptions(
repo_id="microsoft/Phi-4-multimodal-instruct",
# prompt="OCR the full page to markdown.",
prompt="OCR this image and export it in MarkDown.",
response_format=ResponseFormat.MARKDOWN,
inference_framework=InferenceFramework.TRANSFORMERS_AutoModelForCausalLM,
)
vlm_conversion_options = phi_vlm_conversion_options
"""
"""
pixtral_vlm_conversion_options = HuggingFaceVlmOptions(
repo_id="mlx-community/pixtral-12b-bf16",
prompt="Convert this full page to markdown. Do not miss any text and only output the bare MarkDown!",
response_format=ResponseFormat.MARKDOWN,
inference_framework=InferenceFramework.MLX,
)
vlm_conversion_options = pixtral_vlm_conversion_options
"""
pipeline_options.vlm_options = pixtral_vlm_conversion_options
"""
qwen_vlm_conversion_options = HuggingFaceVlmOptions(
repo_id="mlx-community/Qwen2.5-VL-3B-Instruct-bf16",
prompt="Convert this full page to markdown. Do not miss any text and only output the bare MarkDown!",
response_format=ResponseFormat.MARKDOWN,
inference_framework=InferenceFramework.MLX,
)
vlm_conversion_options = qwen_vlm_conversion_options
"""
pipeline_options.vlm_options = vlm_conversion_options
## Set up pipeline for PDF or image inputs
converter = DocumentConverter(