diff --git a/docling/datamodel/pipeline_model_specializations.py b/docling/datamodel/pipeline_model_specializations.py
index 7082da3b..2a7350c8 100644
--- a/docling/datamodel/pipeline_model_specializations.py
+++ b/docling/datamodel/pipeline_model_specializations.py
@@ -119,16 +119,16 @@ granite_vision_vlm_ollama_conversion_options = ApiVlmOptions(
 # Pixtral
 pixtral_12b_vlm_conversion_options = HuggingFaceVlmOptions(
     repo_id="mistral-community/pixtral-12b",
-    prompt="Convert this page to markdown. Do not miss any text and only output the bare MarkDown!",
+    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
     response_format=ResponseFormat.MARKDOWN,
-    inference_framework=InferenceFramework.TRANSFORMERS_LlavaForConditionalGeneration,
+    inference_framework=InferenceFramework.TRANSFORMERS_AutoModelForVision2Seq,
     scale=2.0,
     temperature=0.0,
 )
 
 pixtral_12b_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
     repo_id="mlx-community/pixtral-12b-bf16",
-    prompt="Convert this page to markdown. Do not miss any text and only output the bare MarkDown!",
+    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
     response_format=ResponseFormat.MARKDOWN,
     inference_framework=InferenceFramework.MLX,
     scale=2.0,
@@ -138,7 +138,7 @@ pixtral_12b_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
 # Phi4
 phi_vlm_conversion_options = HuggingFaceVlmOptions(
     repo_id="microsoft/Phi-4-multimodal-instruct",
-    prompt="Convert this page to MarkDown. Do not miss any text and only output the bare MarkDown",
+    prompt="Convert this page to MarkDown. Do not miss any text and only output the bare markdown",
     response_format=ResponseFormat.MARKDOWN,
     inference_framework=InferenceFramework.TRANSFORMERS_AutoModelForCausalLM,
     scale=2.0,
@@ -148,7 +148,7 @@ phi_vlm_conversion_options = HuggingFaceVlmOptions(
 # Qwen
 qwen25_vl_3b_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
     repo_id="mlx-community/Qwen2.5-VL-3B-Instruct-bf16",
-    prompt="Convert this page to markdown. Do not miss any text and only output the bare MarkDown!",
+    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
     response_format=ResponseFormat.MARKDOWN,
     inference_framework=InferenceFramework.MLX,
     scale=2.0,
@@ -158,7 +158,7 @@ qwen25_vl_3b_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
 # Gemma-3
 gemma_3_12b_mlx_conversion_options = HuggingFaceVlmOptions(
     repo_id="mlx-community/gemma-3-12b-it-bf16",
-    prompt="Convert this page to markdown. Do not miss any text and only output the bare MarkDown!",
+    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
     response_format=ResponseFormat.MARKDOWN,
     inference_framework=InferenceFramework.MLX,
     scale=2.0,
@@ -167,7 +167,7 @@ gemma_3_12b_mlx_conversion_options = HuggingFaceVlmOptions(
 
 gemma_3_27b_mlx_conversion_options = HuggingFaceVlmOptions(
     repo_id="mlx-community/gemma-3-27b-it-bf16",
-    prompt="Convert this page to markdown. Do not miss any text and only output the bare MarkDown!",
+    prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
     response_format=ResponseFormat.MARKDOWN,
     inference_framework=InferenceFramework.MLX,
     scale=2.0,
diff --git a/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py b/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py
index e0c09c88..99dc32b1 100644
--- a/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py
+++ b/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForCausalLM.py
@@ -116,7 +116,6 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
                     assert page.size is not None
 
                     hi_res_image = page.get_image(scale=2)  # self.vlm_options.scale)
-                    print(hi_res_image)
 
                     if hi_res_image is not None:
                         im_width, im_height = hi_res_image.size
@@ -127,7 +126,7 @@ class HuggingFaceVlmModel_AutoModelForCausalLM(BasePageModel):
                     inputs = self.processor(
                         text=prompt, images=hi_res_image, return_tensors="pt"
-                    )  # .to(self.device)
+                    ).to(self.device)
 
                     # Generate response
                     start_time = time.time()
diff --git a/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForVision2Seq.py b/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForVision2Seq.py
index 69154d77..6de02808 100644
--- a/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForVision2Seq.py
+++ b/docling/models/hf_vlm_models/hf_vlm_model_AutoModelForVision2Seq.py
@@ -40,7 +40,6 @@ class HuggingFaceVlmModel_AutoModelForVision2Seq(BasePageModel):
             self.device = decide_device(accelerator_options.device)
             self.device = HuggingFaceVlmModel.map_device_to_cpu_if_mlx(self.device)
-            _log.debug(f"Available device for HuggingFace VLM: {self.device}")
 
             self.use_cache = vlm_options.use_kv_cache
@@ -73,7 +72,7 @@ class HuggingFaceVlmModel_AutoModelForVision2Seq(BasePageModel):
             self.vlm_model = AutoModelForVision2Seq.from_pretrained(
                 artifacts_path,
                 device_map=self.device,
-                torch_dtype=torch.bfloat16,
+                # torch_dtype=torch.bfloat16,
                 _attn_implementation=(
                     "flash_attention_2"
                     if self.device.startswith("cuda")
diff --git a/docs/examples/minimal_vlm_pipeline.py b/docs/examples/compare_vlm_models.py
similarity index 51%
rename from docs/examples/minimal_vlm_pipeline.py
rename to docs/examples/compare_vlm_models.py
index 32aecc6d..2d8915de 100644
--- a/docs/examples/minimal_vlm_pipeline.py
+++ b/docs/examples/compare_vlm_models.py
@@ -1,3 +1,9 @@
+# Compare VLM models
+# ==================
+#
+# This example runs the VLM pipeline with different vision-language models.
+# Their runtime as well as output quality is compared.
+
 import json
 import time
 from pathlib import Path
@@ -8,9 +14,6 @@
 from tabulate import tabulate
 
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_model_specializations import (
-    HuggingFaceVlmOptions,
-    InferenceFramework,
-    ResponseFormat,
     gemma_3_12b_mlx_conversion_options,
     granite_vision_vlm_conversion_options,
     granite_vision_vlm_ollama_conversion_options,
@@ -27,96 +30,24 @@ from docling.datamodel.pipeline_options import (
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.pipeline.vlm_pipeline import VlmPipeline
 
-## Use experimental VlmPipeline
-pipeline_options = VlmPipelineOptions()
-# If force_backend_text = True, text from backend will be used instead of generated text
-pipeline_options.force_backend_text = False
-pipeline_options.generate_page_images = True
-
-## On GPU systems, enable flash_attention_2 with CUDA:
-# pipeline_options.accelerator_options.device = AcceleratorDevice.CUDA
-# pipeline_options.accelerator_options.cuda_use_flash_attention2 = True
-
-## Pick a VLM model. We choose SmolDocling-256M by default
-# pipeline_options.vlm_options = smoldocling_vlm_conversion_options
-
-## Pick a VLM model. Fast Apple Silicon friendly implementation for SmolDocling-256M via MLX
-# pipeline_options.vlm_options = smoldocling_vlm_mlx_conversion_options
-
-## Alternative VLM models:
-# pipeline_options.vlm_options = granite_vision_vlm_conversion_options
-
-pipeline_options.vlm_options = phi_vlm_conversion_options
-# pipeline_options.vlm_options = qwen25_vl_3b_vlm_mlx_conversion_options
-
-"""
-pixtral_vlm_conversion_options = HuggingFaceVlmOptions(
-    repo_id="mistralai/Pixtral-12B-Base-2409",
-    prompt="OCR this image and export it in MarkDown.",
-    response_format=ResponseFormat.MARKDOWN,
-    inference_framework=InferenceFramework.TRANSFORMERS_LlavaForConditionalGeneration,
-)
-pipeline_options.vlm_options = pixtral_vlm_conversion_options
-"""
-
-"""
-pixtral_vlm_conversion_options = HuggingFaceVlmOptions(
-    repo_id="mistral-community/pixtral-12b",
-    prompt="OCR this image and export it in MarkDown.",
-    response_format=ResponseFormat.MARKDOWN,
-    inference_framework=InferenceFramework.TRANSFORMERS_LlavaForConditionalGeneration,
-)
-pipeline_options.vlm_options = pixtral_vlm_conversion_options
-"""
-
-"""
-phi_vlm_conversion_options = HuggingFaceVlmOptions(
-    repo_id="microsoft/Phi-4-multimodal-instruct",
-    # prompt="OCR the full page to markdown.",
-    prompt="OCR this image and export it in MarkDown.",
-    response_format=ResponseFormat.MARKDOWN,
-    inference_framework=InferenceFramework.TRANSFORMERS_AutoModelForCausalLM,
-)
-pipeline_options.vlm_options = phi_vlm_conversion_options
-"""
-
-"""
-pixtral_vlm_conversion_options = HuggingFaceVlmOptions(
-    repo_id="mlx-community/pixtral-12b-bf16",
-    prompt="Convert this page to markdown. Do not miss any text and only output the bare MarkDown!",
-    response_format=ResponseFormat.MARKDOWN,
-    inference_framework=InferenceFramework.MLX,
-    scale=1.0,
-)
-pipeline_options.vlm_options = pixtral_vlm_conversion_options
-"""
-
-"""
-qwen_vlm_conversion_options = HuggingFaceVlmOptions(
-    repo_id="mlx-community/Qwen2.5-VL-3B-Instruct-bf16",
-    prompt="Convert this full page to markdown. Do not miss any text and only output the bare MarkDown!",
-    response_format=ResponseFormat.MARKDOWN,
-    inference_framework=InferenceFramework.MLX,
-)
-pipeline_options.vlm_options = qwen_vlm_conversion_options
-"""
-
-
-def convert(sources: list[Path], converter):
+def convert(sources: list[Path], converter: DocumentConverter):
+    model_id = pipeline_options.vlm_options.repo_id.replace("/", "_")
+    framework = pipeline_options.vlm_options.inference_framework
     for source in sources:
-        # start_time = time.time()
         print("================================================")
-        print(f"Processing... {source}")
+        print("Processing...")
+        print(f"Source: {source}")
+        print("---")
+        print(f"Model: {model_id}")
+        print(f"Framework: {framework}")
         print("================================================")
         print("")
 
         res = converter.convert(source)
 
         print("")
-        # print(res.document.export_to_markdown())
 
-        model_id = pipeline_options.vlm_options.repo_id.replace("/", "_")
-        framework = pipeline_options.vlm_options.inference_framework
         fname = f"{res.input.file.stem}-{model_id}-{framework}"
 
         inference_time = 0.0
@@ -161,11 +92,10 @@ def convert(sources: list[Path], converter):
         )
         print("====================================================")
 
-        # return [source, f"{out_path / fname}.html", model_id, framework, inference_time, ]
         return [
             source,
             model_id,
-            framework,
+            str(framework),
            pg_num,
             inference_time,
         ]
@@ -173,7 +103,6 @@
 
 if __name__ == "__main__":
     sources = [
-        # "tests/data/2305.03393v1-pg9-img.png",
         "tests/data/pdf/2305.03393v1-pg9.pdf",
     ]
 
@@ -182,9 +111,6 @@ if __name__ == "__main__":
     ## Use VlmPipeline
     pipeline_options = VlmPipelineOptions()
-
-    # If force_backend_text = True, text from backend will be used instead of generated text
-    pipeline_options.force_backend_text = False
     pipeline_options.generate_page_images = True
 
     ## On GPU systems, enable flash_attention_2 with CUDA:
@@ -193,14 +119,17 @@ if __name__ == "__main__":
 
     rows = []
     for vlm_options in [
-        # smoldocling_vlm_conversion_options, \
-        smoldocling_vlm_mlx_conversion_options,
-        # granite_vision_vlm_conversion_options, \
-        # phi_vlm_conversion_options, \
-        # qwen25_vl_3b_vlm_mlx_conversion_options, \
+        ## DocTags / SmolDocling models
+        smoldocling_vlm_conversion_options,
+        # smoldocling_vlm_mlx_conversion_options,
+        ## Markdown models (using MLX framework)
+        # qwen25_vl_3b_vlm_mlx_conversion_options,
         # pixtral_12b_vlm_mlx_conversion_options,
-        # pixtral_12b_vlm_conversion_options,
-        gemma_3_12b_mlx_conversion_options,
+        # gemma_3_12b_mlx_conversion_options,
+        ## Markdown models (using Transformers framework)
+        # granite_vision_vlm_conversion_options,
+        phi_vlm_conversion_options,
+        pixtral_12b_vlm_conversion_options,
     ]:
         pipeline_options.vlm_options = vlm_options
 
@@ -219,11 +148,13 @@ if __name__ == "__main__":
         )
 
         row = convert(sources=sources, converter=converter)
-        print("pipelines: \n", converter._get_initialized_pipelines())
-
         rows.append(row)
 
-        print(tabulate(rows))
+        print(
+            tabulate(
+                rows, headers=["source", "model_id", "framework", "num_pages", "time"]
+            )
+        )
 
         print("see if memory gets released ...")
         time.sleep(10)
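Review note: with the rename, docs/examples/compare_vlm_models.py becomes the single place to benchmark several VLM presets in one run. For a quick manual check of this patch outside the example, here is a minimal sketch that mirrors the converter setup the example uses. The PdfFormatOption/VlmPipeline wiring is not visible in the hunks above, so treat that part as an assumption based on the existing example rather than as part of this change; the test page is the same one the script uses.

```python
from pathlib import Path

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_model_specializations import (
    phi_vlm_conversion_options,
    smoldocling_vlm_conversion_options,
)
from docling.datamodel.pipeline_options import VlmPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

# Same test page as the example script.
source = Path("tests/data/pdf/2305.03393v1-pg9.pdf")

for vlm_options in (smoldocling_vlm_conversion_options, phi_vlm_conversion_options):
    # Fresh pipeline options per model, mirroring the loop in compare_vlm_models.py.
    pipeline_options = VlmPipelineOptions()
    pipeline_options.generate_page_images = True
    pipeline_options.vlm_options = vlm_options

    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=VlmPipeline,
                pipeline_options=pipeline_options,
            )
        }
    )

    res = converter.convert(source)
    # Print a short markdown preview per model as a sanity check.
    print(vlm_options.repo_id)
    print(res.document.export_to_markdown()[:500])
```

Running this on a CUDA machine also exercises the new `.to(self.device)` call in HuggingFaceVlmModel_AutoModelForCausalLM, which previously left the processor inputs on the CPU.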