From b5b7e6dd5c595af61e4909aa390b474629bb9049 Mon Sep 17 00:00:00 2001
From: Christoph Auer
Date: Mon, 18 Aug 2025 15:57:06 +0200
Subject: [PATCH] Add GoT OCR 2.0

Signed-off-by: Christoph Auer
---
 docling/cli/main.py                          |  3 +++
 .../datamodel/pipeline_options_vlm_model.py  |  1 +
 docling/datamodel/vlm_model_specs.py         | 23 ++++++++++++++++
 .../hf_transformers_model.py                 | 26 ++++++++++++-------
 4 files changed, 44 insertions(+), 9 deletions(-)

diff --git a/docling/cli/main.py b/docling/cli/main.py
index c6948338..e72d91ee 100644
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -59,6 +59,7 @@ from docling.datamodel.pipeline_options import (
 )
 from docling.datamodel.settings import settings
 from docling.datamodel.vlm_model_specs import (
+    GOT2_TRANSFORMERS,
     GRANITE_VISION_OLLAMA,
     GRANITE_VISION_TRANSFORMERS,
     SMOLDOCLING_MLX,
@@ -621,6 +622,8 @@ def convert(  # noqa: C901
                 pipeline_options.vlm_options = GRANITE_VISION_TRANSFORMERS
             elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA:
                 pipeline_options.vlm_options = GRANITE_VISION_OLLAMA
+            elif vlm_model == VlmModelType.GOT_OCR_2:
+                pipeline_options.vlm_options = GOT2_TRANSFORMERS
             elif vlm_model == VlmModelType.SMOLDOCLING:
                 pipeline_options.vlm_options = SMOLDOCLING_TRANSFORMERS
                 if sys.platform == "darwin":
diff --git a/docling/datamodel/pipeline_options_vlm_model.py b/docling/datamodel/pipeline_options_vlm_model.py
index 97e9f76d..0a2a98d8 100644
--- a/docling/datamodel/pipeline_options_vlm_model.py
+++ b/docling/datamodel/pipeline_options_vlm_model.py
@@ -46,6 +46,7 @@ class TransformersModelType(str, Enum):
 class TransformersPromptStyle(str, Enum):
     CHAT = "chat"
     RAW = "raw"
+    NONE = "none"
 
 
 class InlineVlmOptions(BaseVlmOptions):
diff --git a/docling/datamodel/vlm_model_specs.py b/docling/datamodel/vlm_model_specs.py
index 377980e1..47f77ed9 100644
--- a/docling/datamodel/vlm_model_specs.py
+++ b/docling/datamodel/vlm_model_specs.py
@@ -194,6 +194,26 @@ QWEN25_VL_3B_MLX = InlineVlmOptions(
     temperature=0.0,
 )
 
+# GoT 2.0
+GOT2_TRANSFORMERS = InlineVlmOptions(
+    repo_id="stepfun-ai/GOT-OCR-2.0-hf",
+    prompt="",
+    response_format=ResponseFormat.MARKDOWN,
+    inference_framework=InferenceFramework.TRANSFORMERS,
+    transformers_prompt_style=TransformersPromptStyle.NONE,
+    transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
+    supported_devices=[
+        AcceleratorDevice.CPU,
+        AcceleratorDevice.CUDA,
+        # AcceleratorDevice.MPS,
+    ],
+    scale=2.0,
+    temperature=0.0,
+    stop_strings=["<|im_end|>"],
+    # extra_generation_config={"format": True},
+)
+
+
 # Gemma-3
 GEMMA3_12B_MLX = InlineVlmOptions(
     repo_id="mlx-community/gemma-3-12b-it-bf16",
@@ -215,6 +235,8 @@ GEMMA3_27B_MLX = InlineVlmOptions(
     temperature=0.0,
 )
 
+# Dolphin
+
 DOLPHIN_TRANSFORMERS = InlineVlmOptions(
     repo_id="ByteDance/Dolphin",
     prompt="Read text in the image. ",
@@ -238,3 +260,4 @@ class VlmModelType(str, Enum):
     GRANITE_VISION = "granite_vision"
     GRANITE_VISION_VLLM = "granite_vision_vllm"
     GRANITE_VISION_OLLAMA = "granite_vision_ollama"
+    GOT_OCR_2 = "got_ocr_2"
diff --git a/docling/models/vlm_models_inline/hf_transformers_model.py b/docling/models/vlm_models_inline/hf_transformers_model.py
index e15bc684..8583dc47 100644
--- a/docling/models/vlm_models_inline/hf_transformers_model.py
+++ b/docling/models/vlm_models_inline/hf_transformers_model.py
@@ -270,16 +270,24 @@ class HuggingFaceTransformersVlmModel(BaseVlmPageModel, HuggingFaceModelDownloadMixin):
             user_prompts = prompt
 
         # Use your prompt formatter verbatim
-        prompts: list[str] = [self.formulate_prompt(p) for p in user_prompts]
+        if self.vlm_options.transformers_prompt_style == TransformersPromptStyle.NONE:
+            inputs = self.processor(
+                pil_images,
+                return_tensors="pt",
+                padding=True,  # pad across batch for both text and vision
+                # no truncation by default; match SmolDocling examples
+            )
+        else:
+            prompts: list[str] = [self.formulate_prompt(p) for p in user_prompts]
 
-        # -- Processor performs BOTH text+image preprocessing + batch padding (recommended)
-        inputs = self.processor(
-            text=prompts,
-            images=pil_images,
-            return_tensors="pt",
-            padding=True,  # pad across batch for both text and vision
-            # no truncation by default; match SmolDocling examples
-        )
+            # -- Processor performs BOTH text+image preprocessing + batch padding (recommended)
+            inputs = self.processor(
+                text=prompts,
+                images=pil_images,
+                return_tensors="pt",
+                padding=True,  # pad across batch for both text and vision
+                # no truncation by default; match SmolDocling examples
+            )
         inputs = {k: v.to(self.device) for k, v in inputs.items()}
 
         # -- Optional stopping criteria