@@ -59,6 +59,7 @@ from docling.datamodel.pipeline_options import (
 )
 from docling.datamodel.settings import settings
 from docling.datamodel.vlm_model_specs import (
+    GOT2_TRANSFORMERS,
     GRANITE_VISION_OLLAMA,
     GRANITE_VISION_TRANSFORMERS,
     SMOLDOCLING_MLX,
@@ -621,6 +622,8 @@ def convert( # noqa: C901
            pipeline_options.vlm_options = GRANITE_VISION_TRANSFORMERS
        elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA:
            pipeline_options.vlm_options = GRANITE_VISION_OLLAMA
+       elif vlm_model == VlmModelType.GOT_OCR_2:
+           pipeline_options.vlm_options = GOT2_TRANSFORMERS
        elif vlm_model == VlmModelType.SMOLDOCLING:
            pipeline_options.vlm_options = SMOLDOCLING_TRANSFORMERS
            if sys.platform == "darwin":
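With the mapping above and the got_ocr_2 value added to VlmModelType further down, the new model should be selectable from the CLI in the same way as the existing VLM specs, presumably along the lines of "docling --pipeline vlm --vlm-model got_ocr_2 <file>" (flag names taken from current docling releases, not shown in this diff).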
@@ -46,6 +46,7 @@ class TransformersModelType(str, Enum):
 class TransformersPromptStyle(str, Enum):
     CHAT = "chat"
     RAW = "raw"
+    NONE = "none"


 class InlineVlmOptions(BaseVlmOptions):
@@ -194,6 +194,26 @@ QWEN25_VL_3B_MLX = InlineVlmOptions(
     temperature=0.0,
 )

+# GoT 2.0
+GOT2_TRANSFORMERS = InlineVlmOptions(
+    repo_id="stepfun-ai/GOT-OCR-2.0-hf",
+    prompt="",
+    response_format=ResponseFormat.MARKDOWN,
+    inference_framework=InferenceFramework.TRANSFORMERS,
+    transformers_prompt_style=TransformersPromptStyle.NONE,
+    transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
+    supported_devices=[
+        AcceleratorDevice.CPU,
+        AcceleratorDevice.CUDA,
+        # AcceleratorDevice.MPS,
+    ],
+    scale=2.0,
+    temperature=0.0,
+    stop_strings=["<|im_end|>"],
+    # extra_generation_config={"format": True},
+)
+
+
 # Gemma-3
 GEMMA3_12B_MLX = InlineVlmOptions(
     repo_id="mlx-community/gemma-3-12b-it-bf16",
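For orientation, a minimal sketch of how the new spec would be wired into docling's VLM pipeline from Python. The surrounding API (VlmPipelineOptions, VlmPipeline, DocumentConverter, PdfFormatOption) is taken from current docling releases rather than from this diff, and "example.pdf" is a placeholder:

    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import VlmPipelineOptions
    from docling.datamodel.vlm_model_specs import GOT2_TRANSFORMERS
    from docling.document_converter import DocumentConverter, PdfFormatOption
    from docling.pipeline.vlm_pipeline import VlmPipeline

    # Route PDF conversion through the VLM pipeline, using the GOT-OCR-2.0 spec added above.
    pipeline_options = VlmPipelineOptions(vlm_options=GOT2_TRANSFORMERS)
    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=VlmPipeline,
                pipeline_options=pipeline_options,
            )
        }
    )
    result = converter.convert("example.pdf")  # placeholder input path
    print(result.document.export_to_markdown())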
@@ -215,6 +235,8 @@ GEMMA3_27B_MLX = InlineVlmOptions(
     temperature=0.0,
 )

+# Dolphin
+
 DOLPHIN_TRANSFORMERS = InlineVlmOptions(
     repo_id="ByteDance/Dolphin",
     prompt="<s>Read text in the image. <Answer/>",
@@ -238,3 +260,4 @@ class VlmModelType(str, Enum):
     GRANITE_VISION = "granite_vision"
     GRANITE_VISION_VLLM = "granite_vision_vllm"
     GRANITE_VISION_OLLAMA = "granite_vision_ollama"
+    GOT_OCR_2 = "got_ocr_2"
@@ -270,16 +270,24 @@ class HuggingFaceTransformersVlmModel(BaseVlmPageModel, HuggingFaceModelDownload
             user_prompts = prompt

-        # Use your prompt formatter verbatim
-        prompts: list[str] = [self.formulate_prompt(p) for p in user_prompts]
-
-        # -- Processor performs BOTH text+image preprocessing + batch padding (recommended)
-        inputs = self.processor(
-            text=prompts,
-            images=pil_images,
-            return_tensors="pt",
-            padding=True,  # pad across batch for both text and vision
-            # no truncation by default; match SmolDocling examples
-        )
+        if self.vlm_options.transformers_prompt_style == TransformersPromptStyle.NONE:
+            inputs = self.processor(
+                pil_images,
+                return_tensors="pt",
+                padding=True,  # pad across batch for both text and vision
+                # no truncation by default; match SmolDocling examples
+            )
+        else:
+            prompts: list[str] = [self.formulate_prompt(p) for p in user_prompts]
+
+            # -- Processor performs BOTH text+image preprocessing + batch padding (recommended)
+            inputs = self.processor(
+                text=prompts,
+                images=pil_images,
+                return_tensors="pt",
+                padding=True,  # pad across batch for both text and vision
+                # no truncation by default; match SmolDocling examples
+            )
         inputs = {k: v.to(self.device) for k, v in inputs.items()}

         # -- Optional stopping criteria
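The new TransformersPromptStyle.NONE branch passes images only, because the GOT-OCR-2.0 processor builds the model prompt itself instead of applying a chat template or a raw text prompt. A rough standalone sketch of that upstream behaviour, following the Hugging Face model card for stepfun-ai/GOT-OCR-2.0-hf (not part of this change; "page.png" is a placeholder):

    from transformers import AutoModelForImageTextToText, AutoProcessor

    repo_id = "stepfun-ai/GOT-OCR-2.0-hf"
    processor = AutoProcessor.from_pretrained(repo_id)
    model = AutoModelForImageTextToText.from_pretrained(repo_id)

    # Images only: the processor injects GOT's fixed OCR prompt internally.
    inputs = processor("page.png", return_tensors="pt")
    generated = model.generate(
        **inputs,
        do_sample=False,
        max_new_tokens=4096,
        tokenizer=processor.tokenizer,
        stop_strings="<|im_end|>",  # same stop string as in GOT2_TRANSFORMERS above
    )
    text = processor.decode(
        generated[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True
    )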