mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-10 13:48:13 +00:00
feat: add image-text-to-text models in transformers (#1772)
* feat(dolphin): add dolphin support Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com> * rename Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com> * reformat Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com> * fix mypy Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com> * add prompt style and examples Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> --------- Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com> Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
@@ -31,6 +31,12 @@ class TransformersModelType(str, Enum):
|
||||
AUTOMODEL = "automodel"
|
||||
AUTOMODEL_VISION2SEQ = "automodel-vision2seq"
|
||||
AUTOMODEL_CAUSALLM = "automodel-causallm"
|
||||
AUTOMODEL_IMAGETEXTTOTEXT = "automodel-imagetexttotext"
|
||||
|
||||
|
||||
# Prompt-formatting modes read by HuggingFaceTransformersVlmModel.formulate_prompt
# via `vlm_options.transformers_prompt_style`.
class TransformersPromptStyle(str, Enum):
|
||||
# CHAT: formulate_prompt wraps the user prompt in a single "user" message
# (intro text, image placeholder, user prompt) and renders it with
# processor.apply_chat_template, unless a repo-specific branch such as the
# Phi-4 one returns its own prompt first.
CHAT = "chat"
|
||||
# RAW: formulate_prompt returns the user prompt unchanged.
RAW = "raw"
|
||||
|
||||
|
||||
class InlineVlmOptions(BaseVlmOptions):
|
||||
@@ -44,6 +50,7 @@ class InlineVlmOptions(BaseVlmOptions):
|
||||
|
||||
inference_framework: InferenceFramework
|
||||
transformers_model_type: TransformersModelType = TransformersModelType.AUTOMODEL
|
||||
transformers_prompt_style: TransformersPromptStyle = TransformersPromptStyle.CHAT
|
||||
response_format: ResponseFormat
|
||||
|
||||
torch_dtype: Optional[str] = None
|
||||
|
||||
@@ -13,6 +13,7 @@ from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options_vlm_model import (
|
||||
InlineVlmOptions,
|
||||
TransformersModelType,
|
||||
TransformersPromptStyle,
|
||||
)
|
||||
from docling.models.base_model import BasePageModel
|
||||
from docling.models.utils.hf_model_download import (
|
||||
@@ -41,6 +42,7 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
|
||||
from transformers import (
|
||||
AutoModel,
|
||||
AutoModelForCausalLM,
|
||||
AutoModelForImageTextToText,
|
||||
AutoModelForVision2Seq,
|
||||
AutoProcessor,
|
||||
BitsAndBytesConfig,
|
||||
@@ -91,6 +93,11 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
|
||||
== TransformersModelType.AUTOMODEL_VISION2SEQ
|
||||
):
|
||||
model_cls = AutoModelForVision2Seq
|
||||
elif (
|
||||
self.vlm_options.transformers_model_type
|
||||
== TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT
|
||||
):
|
||||
model_cls = AutoModelForImageTextToText
|
||||
|
||||
self.processor = AutoProcessor.from_pretrained(
|
||||
artifacts_path,
|
||||
@@ -169,7 +176,10 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
|
||||
def formulate_prompt(self, user_prompt: str) -> str:
|
||||
"""Formulate a prompt for the VLM."""
|
||||
|
||||
if self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct":
|
||||
if self.vlm_options.transformers_prompt_style == TransformersPromptStyle.RAW:
|
||||
return user_prompt
|
||||
|
||||
elif self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct":
|
||||
_log.debug("Using specialized prompt for Phi-4")
|
||||
# more info here: https://huggingface.co/microsoft/Phi-4-multimodal-instruct#loading-the-model-locally
|
||||
|
||||
@@ -182,20 +192,25 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
|
||||
|
||||
return prompt
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "This is a page from a document.",
|
||||
},
|
||||
{"type": "image"},
|
||||
{"type": "text", "text": user_prompt},
|
||||
],
|
||||
}
|
||||
]
|
||||
prompt = self.processor.apply_chat_template(
|
||||
messages, add_generation_prompt=False
|
||||
elif self.vlm_options.transformers_prompt_style == TransformersPromptStyle.CHAT:
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "This is a page from a document.",
|
||||
},
|
||||
{"type": "image"},
|
||||
{"type": "text", "text": user_prompt},
|
||||
],
|
||||
}
|
||||
]
|
||||
prompt = self.processor.apply_chat_template(
|
||||
messages, add_generation_prompt=False
|
||||
)
|
||||
return prompt
|
||||
|
||||
raise RuntimeError(
|
||||
f"Uknown prompt style `{self.vlm_options.transformers_prompt_style}`. Valid values are {', '.join(s.value for s in TransformersPromptStyle)}."
|
||||
)
|
||||
return prompt
|
||||
|
||||
Reference in New Issue
Block a user