From 3d07f1c78e14c88a2bb485b3a3eb7b631e3f5d7c Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Mon, 18 Aug 2025 13:37:46 +0200 Subject: [PATCH] Cleanup hf_transformers_model batching impl Signed-off-by: Christoph Auer --- docling/datamodel/vlm_model_specs.py | 30 +++++++++---------- .../hf_transformers_model.py | 17 ++++------- .../threaded_multistage_vlm_pipeline.py | 22 +++++++------- 3 files changed, 30 insertions(+), 39 deletions(-) diff --git a/docling/datamodel/vlm_model_specs.py b/docling/datamodel/vlm_model_specs.py index 18be529e..8ffa7f21 100644 --- a/docling/datamodel/vlm_model_specs.py +++ b/docling/datamodel/vlm_model_specs.py @@ -35,11 +35,11 @@ SMOLDOCLING_TRANSFORMERS = InlineVlmOptions( prompt="Convert this page to docling.", response_format=ResponseFormat.DOCTAGS, inference_framework=InferenceFramework.TRANSFORMERS, - transformers_model_type=TransformersModelType.AUTOMODEL_VISION2SEQ, + transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT, supported_devices=[ AcceleratorDevice.CPU, AcceleratorDevice.CUDA, - AcceleratorDevice.MPS, + # AcceleratorDevice.MPS, ], scale=2.0, temperature=0.0, @@ -51,7 +51,7 @@ SMOLDOCLING_VLLM = InlineVlmOptions( prompt="Convert this page to docling.", response_format=ResponseFormat.DOCTAGS, inference_framework=InferenceFramework.VLLM, - transformers_model_type=TransformersModelType.AUTOMODEL_VISION2SEQ, + transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT, supported_devices=[ AcceleratorDevice.CPU, AcceleratorDevice.CUDA, @@ -61,9 +61,9 @@ SMOLDOCLING_VLLM = InlineVlmOptions( stop_strings=["", ""], ) -# SmolVLM-500-Instruct -SMOLVLM500_TRANSFORMERS = InlineVlmOptions( - repo_id="HuggingFaceTB/SmolVLM-500M-Instruct", +# SmolVLM-256M-Instruct +SMOLVLM256_TRANSFORMERS = InlineVlmOptions( + repo_id="HuggingFaceTB/SmolVLM-256M-Instruct", prompt="Transcribe this image to plain text.", response_format=ResponseFormat.PLAINTEXT, inference_framework=InferenceFramework.TRANSFORMERS, @@ 
-71,16 +71,16 @@ SMOLVLM500_TRANSFORMERS = InlineVlmOptions( supported_devices=[ AcceleratorDevice.CPU, AcceleratorDevice.CUDA, - AcceleratorDevice.MPS, + # AcceleratorDevice.MPS, ], scale=2.0, temperature=0.0, ) -# SmolVLM-500-Instruct -SMOLVLM500_MLX = InlineVlmOptions( - repo_id="moot20/SmolVLM-500M-Instruct-MLX", - prompt="Transcribe this image to plain text.", +# SmolVLM-256M-Instruct +SMOLVLM256_MLX = InlineVlmOptions( + repo_id="moot20/SmolVLM-256M-Instruct-MLX", + prompt="Extract the text.", response_format=ResponseFormat.DOCTAGS, inference_framework=InferenceFramework.MLX, transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT, @@ -91,16 +91,16 @@ SMOLVLM500_MLX = InlineVlmOptions( temperature=0.0, ) -SMOLVLM500_VLLM = InlineVlmOptions( - repo_id="HuggingFaceTB/SmolVLM-500M-Instruct", +SMOLVLM256_VLLM = InlineVlmOptions( + repo_id="HuggingFaceTB/SmolVLM-256M-Instruct", prompt="Transcribe this image to plain text.", - response_format=ResponseFormat.DOCTAGS, + response_format=ResponseFormat.PLAINTEXT, inference_framework=InferenceFramework.VLLM, transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT, supported_devices=[ AcceleratorDevice.CPU, AcceleratorDevice.CUDA, - AcceleratorDevice.MPS, + # AcceleratorDevice.MPS, ], scale=2.0, temperature=0.0, diff --git a/docling/models/vlm_models_inline/hf_transformers_model.py b/docling/models/vlm_models_inline/hf_transformers_model.py index 598d71e3..c894d4e9 100644 --- a/docling/models/vlm_models_inline/hf_transformers_model.py +++ b/docling/models/vlm_models_inline/hf_transformers_model.py @@ -107,6 +107,8 @@ class HuggingFaceTransformersVlmModel(BaseVlmPageModel, HuggingFaceModelDownload artifacts_path, trust_remote_code=vlm_options.trust_remote_code, ) + self.processor.tokenizer.padding_side = "left" + self.vlm_model = model_cls.from_pretrained( artifacts_path, device_map=self.device, @@ -215,7 +217,7 @@ class HuggingFaceTransformersVlmModel(BaseVlmPageModel, 
HuggingFaceModelDownload } ] prompt = self.processor.apply_chat_template( - messages, add_generation_prompt=False + messages, add_generation_prompt=True ) return prompt @@ -311,17 +313,8 @@ class HuggingFaceTransformersVlmModel(BaseVlmPageModel, HuggingFaceModelDownload generated_ids = self.vlm_model.generate(**gen_kwargs) generation_time = time.time() - start_time - # -- Trim per sample using attention_mask (robust for batched prompts) - if "attention_mask" not in inputs: - raise RuntimeError( - "Processor did not return 'attention_mask'. Ensure padding=True and text tokenization are enabled." - ) - input_lengths = inputs["attention_mask"].sum(dim=1).tolist() - - trimmed_sequences: list[list[int]] = [ - generated_ids[i, int(input_lengths[i]) :].tolist() - for i in range(generated_ids.shape[0]) - ] + input_len = inputs["input_ids"].shape[1] # common right-aligned prompt length + trimmed_sequences = generated_ids[:, input_len:] # only newly generated tokens # -- Decode with the processor/tokenizer (skip specials, keep DocTags as text) decode_fn = getattr(self.processor, "batch_decode", None) diff --git a/docling/pipeline/threaded_multistage_vlm_pipeline.py b/docling/pipeline/threaded_multistage_vlm_pipeline.py index 3a967250..a2e9c1bd 100644 --- a/docling/pipeline/threaded_multistage_vlm_pipeline.py +++ b/docling/pipeline/threaded_multistage_vlm_pipeline.py @@ -57,8 +57,6 @@ from docling.datamodel.vlm_model_specs import ( DOLPHIN_TRANSFORMERS, SMOLDOCLING_MLX, SMOLDOCLING_TRANSFORMERS, - SMOLVLM500_MLX, - SMOLVLM500_TRANSFORMERS, ) from docling.models.layout_model import LayoutModel from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions @@ -155,29 +153,29 @@ class ThreadedMultiStageVlmPipelineOptions(PaginatedPipelineOptions): """Create default pipeline options with custom VLM configurations from example.""" # Configure VLM options based on the custom pipeline example - # formula_opts = DOLPHIN_TRANSFORMERS.model_copy() - # 
formula_opts.prompt = "Read text in the image. " + # base_model = SMOLVLM256_TRANSFORMERS + # smoldocling_model = SMOLDOCLING_TRANSFORMERS - # text_opts = DOLPHIN_TRANSFORMERS.model_copy() - # text_opts.prompt = "Read text in the image. " - - base_model = SMOLVLM500_MLX + base_model = SMOLDOCLING_TRANSFORMERS + smoldocling_model = SMOLDOCLING_TRANSFORMERS text_opts = base_model.model_copy() # text_opts.prompt = "Convert this page to docling." - text_opts.prompt = "What does this say?" + text_opts.prompt = "What does it say?" text_opts.response_format = ResponseFormat.PLAINTEXT + text_opts.max_new_tokens = 4096 formula_opts = base_model.model_copy() # formula_opts.prompt = "Convert formula to latex." - formula_opts.prompt = "What does this say?" + formula_opts.prompt = "What does it say?" formula_opts.response_format = ResponseFormat.PLAINTEXT + formula_opts.max_new_tokens = 4096 - code_opts = SMOLDOCLING_TRANSFORMERS.model_copy() + code_opts = smoldocling_model.model_copy() code_opts.prompt = "Convert code to text." code_opts.response_format = ResponseFormat.DOCTAGS - table_opts = SMOLDOCLING_TRANSFORMERS.model_copy() + table_opts = smoldocling_model.model_copy() table_opts.prompt = "Convert this table to OTSL." table_opts.response_format = ResponseFormat.OTSL