feat(dolphin): add dolphin support

Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com>
Author: Georg Heiler
Date: 2025-06-14 09:00:11 +02:00
commit 1c3699eaf7
parent 7bae3b6c06
2 changed files with 11 additions and 0 deletions

@@ -27,6 +27,7 @@ class TransformersModelType(str, Enum):
     AUTOMODEL = "automodel"
     AUTOMODEL_VISION2SEQ = "automodel-vision2seq"
     AUTOMODEL_CAUSALLM = "automodel-causallm"
+    AUTOMODEL_FORIMAGETEXTTOTEXT = "automodel-forimagetexttotext"
 
 
 class InlineVlmOptions(BaseVlmOptions):
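
The new enum member gives the VLM options a third transformers loader to dispatch on; the model class selection that consumes it is in the second file below. As a quick sanity check of the value (the import path is an assumption and may differ across docling versions):

from docling.datamodel.pipeline_options_vlm_model import (  # module path assumed
    TransformersModelType,
)

# TransformersModelType mixes in str, so the new member compares equal to its
# string form and can be parsed back from a configuration string.
assert TransformersModelType.AUTOMODEL_FORIMAGETEXTTOTEXT == "automodel-forimagetexttotext"
assert (
    TransformersModelType("automodel-forimagetexttotext")
    is TransformersModelType.AUTOMODEL_FORIMAGETEXTTOTEXT
)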

@@ -42,6 +42,7 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
                 AutoModel,
                 AutoModelForCausalLM,
                 AutoModelForVision2Seq,
+                AutoModelForImageTextToText,
                 AutoProcessor,
                 BitsAndBytesConfig,
                 GenerationConfig,
@@ -91,6 +92,9 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
                 == TransformersModelType.AUTOMODEL_VISION2SEQ
             ):
                 model_cls = AutoModelForVision2Seq
+            elif (self.vlm_options.transformers_model_type
+                  == TransformersModelType.AUTOMODEL_FORIMAGETEXTTOTEXT):
+                model_cls = AutoModelForImageTextToText
 
             self.processor = AutoProcessor.from_pretrained(
                 artifacts_path,
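
With this branch in place, options whose transformers_model_type is AUTOMODEL_FORIMAGETEXTTOTEXT are loaded through transformers' AutoModelForImageTextToText instead of AutoModelForVision2Seq or AutoModelForCausalLM. A standalone sketch of that load path against the Hugging Face Hub, leaving out the artifact caching, device placement, dtype and quantization handling done by the surrounding docling code:

from transformers import AutoModelForImageTextToText, AutoProcessor

# Load Dolphin through the auto class selected by the new elif branch.
# Requires a transformers release that ships AutoModelForImageTextToText.
repo_id = "ByteDance/Dolphin"
processor = AutoProcessor.from_pretrained(repo_id)
model = AutoModelForImageTextToText.from_pretrained(repo_id)
model.eval()
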
@@ -175,6 +179,12 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
             _log.debug(f"prompt for {self.vlm_options.repo_id}: {prompt}")
             return prompt
+        if self.vlm_options.repo_id.lower().startswith("bytedance/dolphin"):
+            _log.debug("Using specialized prompt for dolphin")
+            # more info here https://huggingface.co/ByteDance/Dolphin
+            prompt = f"<s>{self.vlm_options.prompt} <Answer/>"
+            _log.debug(f"prompt for {self.vlm_options.repo_id}: {prompt}")
+            return prompt
 
         messages = [
             {
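
The Dolphin branch wraps the configured prompt as "<s>{prompt} <Answer/>", per the notes at https://huggingface.co/ByteDance/Dolphin. A minimal end-to-end sketch of that convention, assuming Dolphin exposes a Donut-style encoder-decoder interface behind AutoModelForImageTextToText (pixel values from the processor, prompt tokens as decoder input); the file name and generation settings are placeholders, not docling's:

from PIL import Image
from transformers import AutoModelForImageTextToText, AutoProcessor

repo_id = "ByteDance/Dolphin"
processor = AutoProcessor.from_pretrained(repo_id)
model = AutoModelForImageTextToText.from_pretrained(repo_id)

image = Image.open("page.png").convert("RGB")    # placeholder page image
prompt = "<s>Read text in the image. <Answer/>"  # "<s>{prompt} <Answer/>" as formulated above

# Donut-style call pattern (assumed): the image goes through the processor,
# the prompt is tokenized and passed as decoder input for generation.
pixel_values = processor(image, return_tensors="pt").pixel_values
prompt_ids = processor.tokenizer(
    prompt, add_special_tokens=False, return_tensors="pt"
).input_ids

output_ids = model.generate(
    pixel_values,
    decoder_input_ids=prompt_ids,
    max_new_tokens=1024,  # illustrative limit
)
print(processor.batch_decode(output_ids, skip_special_tokens=True)[0])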