From 1c3699eaf74b83afc792aa26b6f8605f571530e3 Mon Sep 17 00:00:00 2001
From: Georg Heiler
Date: Sat, 14 Jun 2025 09:00:11 +0200
Subject: [PATCH] feat(dolphin): add dolphin support

Signed-off-by: Georg Heiler
---
 docling/datamodel/pipeline_options_vlm_model.py        |  1 +
 .../models/vlm_models_inline/hf_transformers_model.py  | 10 ++++++++++
 2 files changed, 11 insertions(+)

diff --git a/docling/datamodel/pipeline_options_vlm_model.py b/docling/datamodel/pipeline_options_vlm_model.py
index 2289c3c7..310e9a9d 100644
--- a/docling/datamodel/pipeline_options_vlm_model.py
+++ b/docling/datamodel/pipeline_options_vlm_model.py
@@ -27,6 +27,7 @@ class TransformersModelType(str, Enum):
     AUTOMODEL = "automodel"
     AUTOMODEL_VISION2SEQ = "automodel-vision2seq"
     AUTOMODEL_CAUSALLM = "automodel-causallm"
+    AUTOMODEL_FORIMAGETEXTTOTEXT = "automodel-forimagetexttotext"
 
 
 class InlineVlmOptions(BaseVlmOptions):
diff --git a/docling/models/vlm_models_inline/hf_transformers_model.py b/docling/models/vlm_models_inline/hf_transformers_model.py
index 00fdfa58..73dd439b 100644
--- a/docling/models/vlm_models_inline/hf_transformers_model.py
+++ b/docling/models/vlm_models_inline/hf_transformers_model.py
@@ -42,6 +42,7 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
             AutoModel,
             AutoModelForCausalLM,
             AutoModelForVision2Seq,
+            AutoModelForImageTextToText,
             AutoProcessor,
             BitsAndBytesConfig,
             GenerationConfig,
@@ -91,6 +92,9 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
                 == TransformersModelType.AUTOMODEL_VISION2SEQ
             ):
                 model_cls = AutoModelForVision2Seq
+            elif (self.vlm_options.transformers_model_type
+                  == TransformersModelType.AUTOMODEL_FORIMAGETEXTTOTEXT):
+                model_cls = AutoModelForImageTextToText
 
             self.processor = AutoProcessor.from_pretrained(
                 artifacts_path,
@@ -175,6 +179,12 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
             _log.debug(f"prompt for {self.vlm_options.repo_id}: {prompt}")
 
             return prompt
+        if self.vlm_options.repo_id.lower().startswith("bytedance/dolphin"):
+            _log.debug("Using specialized prompt for dolphin")
+            # more info here https://huggingface.co/ByteDance/Dolphin
+            prompt = f"{self.vlm_options.prompt} "
+            _log.debug(f"prompt for {self.vlm_options.repo_id}: {prompt}")
+            return prompt
 
         messages = [
             {