Add GoT OCR 2.0

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

Christoph Auer
2025-08-18 15:57:06 +02:00
parent 4a107f4f57
commit 6bbb8e6340

4 changed files with 45 additions and 9 deletions


@@ -59,6 +59,7 @@ from docling.datamodel.pipeline_options import (
 )
 from docling.datamodel.settings import settings
 from docling.datamodel.vlm_model_specs import (
+    GOT2_TRANSFORMERS,
     GRANITE_VISION_OLLAMA,
     GRANITE_VISION_TRANSFORMERS,
     SMOLDOCLING_MLX,
@@ -621,6 +622,8 @@ def convert(  # noqa: C901
             pipeline_options.vlm_options = GRANITE_VISION_TRANSFORMERS
         elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA:
             pipeline_options.vlm_options = GRANITE_VISION_OLLAMA
+        elif vlm_model == VlmModelType.GOT_OCR_2:
+            pipeline_options.vlm_options = GOT2_TRANSFORMERS
         elif vlm_model == VlmModelType.SMOLDOCLING:
             pipeline_options.vlm_options = SMOLDOCLING_TRANSFORMERS
             if sys.platform == "darwin":
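
With this CLI wiring in place, the new model should be selectable like the existing VLM choices, presumably via something like `docling --pipeline vlm --vlm-model got_ocr_2 document.pdf` (flag names taken from the current CLI options that feed `vlm_model`, not re-verified against this branch).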


@@ -46,6 +46,7 @@ class TransformersModelType(str, Enum):
 class TransformersPromptStyle(str, Enum):
     CHAT = "chat"
     RAW = "raw"
+    NONE = "none"


 class InlineVlmOptions(BaseVlmOptions):
@@ -71,6 +72,7 @@ class InlineVlmOptions(BaseVlmOptions):
     stop_strings: List[str] = []
     extra_generation_config: Dict[str, Any] = {}
+    extra_processor_kwargs: Dict[str, Any] = {}

     use_kv_cache: bool = True
     max_new_tokens: int = 4096
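
Two new knobs here: `TransformersPromptStyle.NONE` marks specs whose processor is called with images only (no text prompt at all), and `extra_processor_kwargs` is forwarded verbatim into the Hugging Face processor call. A minimal sketch of the intended consumption, mirroring the runner change in the last file of this commit (`opts` and `processor` are stand-ins for illustration):

    # opts: an InlineVlmOptions instance; processor: the model's HF processor
    if opts.transformers_prompt_style == TransformersPromptStyle.NONE:
        # No text is passed; spec-level kwargs (e.g. format=True) go straight through.
        inputs = processor(images, return_tensors="pt", **opts.extra_processor_kwargs)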


@@ -194,6 +194,26 @@ QWEN25_VL_3B_MLX = InlineVlmOptions(
     temperature=0.0,
 )

+# GoT 2.0
+GOT2_TRANSFORMERS = InlineVlmOptions(
+    repo_id="stepfun-ai/GOT-OCR-2.0-hf",
+    prompt="",
+    response_format=ResponseFormat.MARKDOWN,
+    inference_framework=InferenceFramework.TRANSFORMERS,
+    transformers_prompt_style=TransformersPromptStyle.NONE,
+    transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
+    supported_devices=[
+        AcceleratorDevice.CPU,
+        AcceleratorDevice.CUDA,
+        # AcceleratorDevice.MPS,
+    ],
+    scale=2.0,
+    temperature=0.0,
+    stop_strings=["<|im_end|>"],
+    extra_processor_kwargs={"format": True},
+)
+
 # Gemma-3
 GEMMA3_12B_MLX = InlineVlmOptions(
     repo_id="mlx-community/gemma-3-12b-it-bf16",
@@ -215,6 +235,8 @@ GEMMA3_27B_MLX = InlineVlmOptions(
     temperature=0.0,
 )

+
+# Dolphin
 DOLPHIN_TRANSFORMERS = InlineVlmOptions(
     repo_id="ByteDance/Dolphin",
     prompt="<s>Read text in the image. <Answer/>",
@@ -238,3 +260,4 @@ class VlmModelType(str, Enum):
     GRANITE_VISION = "granite_vision"
     GRANITE_VISION_VLLM = "granite_vision_vllm"
     GRANITE_VISION_OLLAMA = "granite_vision_ollama"
+    GOT_OCR_2 = "got_ocr_2"
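
For context, the spec above maps almost one-to-one onto the upstream transformers usage of GOT-OCR 2.0. A standalone sketch, assuming a transformers version that ships the GOT-OCR2 integration (API names per the Hugging Face model card, not this commit):

    from PIL import Image
    from transformers import AutoModelForImageTextToText, AutoProcessor

    repo = "stepfun-ai/GOT-OCR-2.0-hf"
    processor = AutoProcessor.from_pretrained(repo)
    model = AutoModelForImageTextToText.from_pretrained(repo)

    image = Image.open("page.png")  # hypothetical input page
    # format=True is what extra_processor_kwargs={"format": True} injects:
    # it requests formatted (markdown-style) output instead of plain OCR text.
    inputs = processor(image, return_tensors="pt", format=True)
    ids = model.generate(
        **inputs,
        do_sample=False,            # deterministic, matching temperature=0.0
        max_new_tokens=4096,
        stop_strings="<|im_end|>",  # mirrors stop_strings in the spec
        tokenizer=processor.tokenizer,  # generate() needs this for stop_strings
    )
    print(processor.decode(ids[0, inputs["input_ids"].shape[1]:], skip_special_tokens=True))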


@@ -270,16 +270,24 @@ class HuggingFaceTransformersVlmModel(BaseVlmPageModel, HuggingFaceModelDownloadMixin):
         user_prompts = prompt

         # Use your prompt formatter verbatim
-        prompts: list[str] = [self.formulate_prompt(p) for p in user_prompts]
+        if self.vlm_options.transformers_prompt_style == TransformersPromptStyle.NONE:
+            inputs = self.processor(
+                pil_images,
+                return_tensors="pt",
+                padding=True,  # pad across batch for both text and vision
+                **self.vlm_options.extra_processor_kwargs,
+            )
+        else:
+            prompts: list[str] = [self.formulate_prompt(p) for p in user_prompts]

-        # -- Processor performs BOTH text+image preprocessing + batch padding (recommended)
-        inputs = self.processor(
-            text=prompts,
-            images=pil_images,
-            return_tensors="pt",
-            padding=True,  # pad across batch for both text and vision
-            # no truncation by default; match SmolDocling examples
-        )
+            # -- Processor performs BOTH text+image preprocessing + batch padding (recommended)
+            inputs = self.processor(
+                text=prompts,
+                images=pil_images,
+                return_tensors="pt",
+                padding=True,  # pad across batch for both text and vision
+                **self.vlm_options.extra_processor_kwargs,
+            )
         inputs = {k: v.to(self.device) for k, v in inputs.items()}

         # -- Optional stopping criteria
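
Taken together, the four changes should make GOT OCR 2.0 usable through the VLM pipeline. A rough end-to-end sketch following docling's documented VLM pipeline usage (untested against this branch):

    from docling.datamodel import vlm_model_specs
    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import VlmPipelineOptions
    from docling.document_converter import DocumentConverter, PdfFormatOption
    from docling.pipeline.vlm_pipeline import VlmPipeline

    # Select the new spec added in this commit.
    pipeline_options = VlmPipelineOptions(vlm_options=vlm_model_specs.GOT2_TRANSFORMERS)

    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=VlmPipeline,
                pipeline_options=pipeline_options,
            )
        }
    )
    result = converter.convert("document.pdf")  # hypothetical input
    print(result.document.export_to_markdown())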