From a07ba863c4c3dacfecaca159faa5653097662755 Mon Sep 17 00:00:00 2001
From: geoHeil <1694964+geoHeil@users.noreply.github.com>
Date: Tue, 8 Jul 2025 05:54:57 +0200
Subject: [PATCH] feat: add image-text-to-text models in transformers (#1772)

* feat(dolphin): add dolphin support

Signed-off-by: Georg Heiler

* rename

Signed-off-by: Georg Heiler

* reformat

Signed-off-by: Georg Heiler

* fix mypy

Signed-off-by: Georg Heiler

* add prompt style and examples

Signed-off-by: Michele Dolfi

---------

Signed-off-by: Georg Heiler
Signed-off-by: Michele Dolfi
Co-authored-by: Michele Dolfi
---
 .../datamodel/pipeline_options_vlm_model.py |  7 +++
 .../hf_transformers_model.py                | 49 ++++++++++++-------
 docs/examples/compare_vlm_models.py         | 39 ++++++++++++++-
 3 files changed, 77 insertions(+), 18 deletions(-)

diff --git a/docling/datamodel/pipeline_options_vlm_model.py b/docling/datamodel/pipeline_options_vlm_model.py
index fd672b1b..bcea2493 100644
--- a/docling/datamodel/pipeline_options_vlm_model.py
+++ b/docling/datamodel/pipeline_options_vlm_model.py
@@ -31,6 +31,12 @@ class TransformersModelType(str, Enum):
     AUTOMODEL = "automodel"
     AUTOMODEL_VISION2SEQ = "automodel-vision2seq"
     AUTOMODEL_CAUSALLM = "automodel-causallm"
+    AUTOMODEL_IMAGETEXTTOTEXT = "automodel-imagetexttotext"
+
+
+class TransformersPromptStyle(str, Enum):
+    CHAT = "chat"
+    RAW = "raw"
 
 
 class InlineVlmOptions(BaseVlmOptions):
@@ -44,6 +50,7 @@ class InlineVlmOptions(BaseVlmOptions):
 
     inference_framework: InferenceFramework
     transformers_model_type: TransformersModelType = TransformersModelType.AUTOMODEL
+    transformers_prompt_style: TransformersPromptStyle = TransformersPromptStyle.CHAT
     response_format: ResponseFormat
 
     torch_dtype: Optional[str] = None
diff --git a/docling/models/vlm_models_inline/hf_transformers_model.py b/docling/models/vlm_models_inline/hf_transformers_model.py
index 4e2d80b8..d84925dd 100644
--- a/docling/models/vlm_models_inline/hf_transformers_model.py
+++ b/docling/models/vlm_models_inline/hf_transformers_model.py
@@ -13,6 +13,7 @@ from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options_vlm_model import (
     InlineVlmOptions,
     TransformersModelType,
+    TransformersPromptStyle,
 )
 from docling.models.base_model import BasePageModel
 from docling.models.utils.hf_model_download import (
@@ -41,6 +42,7 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
             from transformers import (
                 AutoModel,
                 AutoModelForCausalLM,
+                AutoModelForImageTextToText,
                 AutoModelForVision2Seq,
                 AutoProcessor,
                 BitsAndBytesConfig,
@@ -91,6 +93,11 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
                 == TransformersModelType.AUTOMODEL_VISION2SEQ
             ):
                 model_cls = AutoModelForVision2Seq
+            elif (
+                self.vlm_options.transformers_model_type
+                == TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT
+            ):
+                model_cls = AutoModelForImageTextToText
 
             self.processor = AutoProcessor.from_pretrained(
                 artifacts_path,
@@ -169,7 +176,10 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
 
     def formulate_prompt(self, user_prompt: str) -> str:
         """Formulate a prompt for the VLM."""
-        if self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct":
+        if self.vlm_options.transformers_prompt_style == TransformersPromptStyle.RAW:
+            return user_prompt
+
+        elif self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct":
             _log.debug("Using specialized prompt for Phi-4")
             # more info here: https://huggingface.co/microsoft/Phi-4-multimodal-instruct#loading-the-model-locally
 
@@ -182,20 +192,25 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
             return prompt
 
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "text",
-                        "text": "This is a page from a document.",
-                    },
-                    {"type": "image"},
-                    {"type": "text", "text": user_prompt},
-                ],
-            }
-        ]
-        prompt = self.processor.apply_chat_template(
-            messages, add_generation_prompt=False
+        elif self.vlm_options.transformers_prompt_style == TransformersPromptStyle.CHAT:
+            messages = [
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": "This is a page from a document.",
+                        },
+                        {"type": "image"},
+                        {"type": "text", "text": user_prompt},
+                    ],
+                }
+            ]
+            prompt = self.processor.apply_chat_template(
+                messages, add_generation_prompt=False
+            )
+            return prompt
+
+        raise RuntimeError(
+            f"Unknown prompt style `{self.vlm_options.transformers_prompt_style}`. Valid values are {', '.join(s.value for s in TransformersPromptStyle)}."
         )
-        return prompt
 
diff --git a/docs/examples/compare_vlm_models.py b/docs/examples/compare_vlm_models.py
index f9bd2dcd..49c34387 100644
--- a/docs/examples/compare_vlm_models.py
+++ b/docs/examples/compare_vlm_models.py
@@ -14,11 +14,18 @@ from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
 from tabulate import tabulate
 
 from docling.datamodel import vlm_model_specs
+from docling.datamodel.accelerator_options import AcceleratorDevice
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import (
     VlmPipelineOptions,
 )
-from docling.datamodel.pipeline_options_vlm_model import InferenceFramework
+from docling.datamodel.pipeline_options_vlm_model import (
+    InferenceFramework,
+    InlineVlmOptions,
+    ResponseFormat,
+    TransformersModelType,
+    TransformersPromptStyle,
+)
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.pipeline.vlm_pipeline import VlmPipeline
 
@@ -101,6 +108,33 @@ if __name__ == "__main__":
     out_path = Path("scratch")
     out_path.mkdir(parents=True, exist_ok=True)
 
+    ## Definition of more inline models
+    llava_qwen = InlineVlmOptions(
+        repo_id="llava-hf/llava-interleave-qwen-0.5b-hf",
+        # prompt="Read text in the image.",
+        prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
+        # prompt="Parse the reading order of this document.",
+        response_format=ResponseFormat.MARKDOWN,
+        inference_framework=InferenceFramework.TRANSFORMERS,
+        transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
+        supported_devices=[AcceleratorDevice.CUDA, AcceleratorDevice.CPU],
+        scale=2.0,
+        temperature=0.0,
+    )
+
+    # Note that this is not the expected way of using the Dolphin model, but it shows the usage of a raw prompt.
+    dolphin_oneshot = InlineVlmOptions(
+        repo_id="ByteDance/Dolphin",
+        prompt="<s>Read text in the image. <Answer/>",
", + response_format=ResponseFormat.MARKDOWN, + inference_framework=InferenceFramework.TRANSFORMERS, + transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT, + transformers_prompt_style=TransformersPromptStyle.RAW, + supported_devices=[AcceleratorDevice.CUDA, AcceleratorDevice.CPU], + scale=2.0, + temperature=0.0, + ) + ## Use VlmPipeline pipeline_options = VlmPipelineOptions() pipeline_options.generate_page_images = True @@ -121,6 +155,9 @@ if __name__ == "__main__": vlm_model_specs.GRANITE_VISION_TRANSFORMERS, vlm_model_specs.PHI4_TRANSFORMERS, vlm_model_specs.PIXTRAL_12B_TRANSFORMERS, + ## More inline models + dolphin_oneshot, + llava_qwen, ] # Remove MLX models if not on Mac