Adjust example instantiation of multi-stage VLM pipeline

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2025-12-08 12:48:28 +00:00 · 2025-08-18 14:36:42 +02:00
parent 3d07f1c78e
commit 4a107f4f57
3 changed files with 8 additions and 13 deletions
--- a/docling/datamodel/vlm_model_specs.py
+++ b/docling/datamodel/vlm_model_specs.py
@@ -229,7 +229,6 @@ DOLPHIN_TRANSFORMERS = InlineVlmOptions(
    ],
    scale=2.0,
    temperature=0.0,
-    max_new_tokens=4096,
 )


--- a/docling/models/vlm_models_inline/hf_transformers_model.py
+++ b/docling/models/vlm_models_inline/hf_transformers_model.py
@@ -280,9 +280,7 @@ class HuggingFaceTransformersVlmModel(BaseVlmPageModel, HuggingFaceModelDownload
            padding=True,  # pad across batch for both text and vision
            # no truncation by default; match SmolDocling examples
        )
-        inputs = {
-            k: (v.to(self.device) if hasattr(v, "to") else v) for k, v in inputs.items()
-        }
+        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        # -- Optional stopping criteria
        stopping_criteria = None
@@ -302,7 +300,7 @@ class HuggingFaceTransformersVlmModel(BaseVlmPageModel, HuggingFaceModelDownload
            "max_new_tokens": self.max_new_tokens,
            "use_cache": self.use_cache,
            "generation_config": self.generation_config,
-            "temperature": self.temperature,
+            # "temperature": self.temperature,
            **self.vlm_options.extra_generation_config,
        }
        if stopping_criteria is not None:
--- a/docling/pipeline/threaded_multistage_vlm_pipeline.py
+++ b/docling/pipeline/threaded_multistage_vlm_pipeline.py
@@ -160,16 +160,14 @@ class ThreadedMultiStageVlmPipelineOptions(PaginatedPipelineOptions):
        smoldocling_model = SMOLDOCLING_TRANSFORMERS

        text_opts = base_model.model_copy()
-        # text_opts.prompt = "Convert this page to docling."
-        text_opts.prompt = "What does it say?"
-        text_opts.response_format = ResponseFormat.PLAINTEXT
-        text_opts.max_new_tokens = 4096
+        text_opts.prompt = "Convert this page to docling."
+        text_opts.response_format = ResponseFormat.DOCTAGS
+        text_opts.max_new_tokens = 1024

        formula_opts = base_model.model_copy()
-        # formula_opts.prompt = "Convert formula to latex."
-        formula_opts.prompt = "What does it say?"
-        formula_opts.response_format = ResponseFormat.PLAINTEXT
-        formula_opts.max_new_tokens = 4096
+        formula_opts.prompt = "Convert formula to latex."
+        formula_opts.response_format = ResponseFormat.DOCTAGS
+        formula_opts.max_new_tokens = 512

        code_opts = smoldocling_model.model_copy()
        code_opts.prompt = "Convert code to text."