From 1c3699eaf74b83afc792aa26b6f8605f571530e3 Mon Sep 17 00:00:00 2001
From: Georg Heiler
Date: Sat, 14 Jun 2025 09:00:11 +0200
Subject: [PATCH] feat(dolphin): add dolphin support

Signed-off-by: Georg Heiler
---
 docling/datamodel/pipeline_options_vlm_model.py        |  1 +
 .../models/vlm_models_inline/hf_transformers_model.py  | 10 ++++++++++
 2 files changed, 11 insertions(+)

diff --git a/docling/datamodel/pipeline_options_vlm_model.py b/docling/datamodel/pipeline_options_vlm_model.py
index 2289c3c7..310e9a9d 100644
--- a/docling/datamodel/pipeline_options_vlm_model.py
+++ b/docling/datamodel/pipeline_options_vlm_model.py
@@ -27,6 +27,7 @@ class TransformersModelType(str, Enum):
     AUTOMODEL = "automodel"
     AUTOMODEL_VISION2SEQ = "automodel-vision2seq"
     AUTOMODEL_CAUSALLM = "automodel-causallm"
+    AUTOMODEL_FORIMAGETEXTTOTEXT = "automodel-forimagetexttotext"
 
 
 class InlineVlmOptions(BaseVlmOptions):
diff --git a/docling/models/vlm_models_inline/hf_transformers_model.py b/docling/models/vlm_models_inline/hf_transformers_model.py
index 00fdfa58..73dd439b 100644
--- a/docling/models/vlm_models_inline/hf_transformers_model.py
+++ b/docling/models/vlm_models_inline/hf_transformers_model.py
@@ -42,6 +42,7 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
             AutoModel,
             AutoModelForCausalLM,
             AutoModelForVision2Seq,
+            AutoModelForImageTextToText,
             AutoProcessor,
             BitsAndBytesConfig,
             GenerationConfig,
@@ -91,6 +92,9 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
                 == TransformersModelType.AUTOMODEL_VISION2SEQ
             ):
                 model_cls = AutoModelForVision2Seq
+            elif (self.vlm_options.transformers_model_type
+                  == TransformersModelType.AUTOMODEL_FORIMAGETEXTTOTEXT):
+                model_cls = AutoModelForImageTextToText
 
             self.processor = AutoProcessor.from_pretrained(
                 artifacts_path,
@@ -175,6 +179,12 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
             _log.debug(f"prompt for {self.vlm_options.repo_id}: {prompt}")
 
             return prompt
+        if self.vlm_options.repo_id.lower().startswith("bytedance/dolphin"):
+            _log.debug("Using specialized prompt for dolphin")
+            # more info here https://huggingface.co/ByteDance/Dolphin
+            prompt = f"{self.vlm_options.prompt} "
+            _log.debug(f"prompt for {self.vlm_options.repo_id}: {prompt}")
+            return prompt
 
         messages = [
             {