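Tweak defaults for vLLM and the threaded multi-stage VLM pipeline

- vLLM: set gpu_memory_utilization to 0.3 (hardcoded for now) and drop
  the empty CUDA branch from the device-specific configuration.
- Threaded multi-stage VLM pipeline: use SMOLDOCLING_TRANSFORMERS as the
  base model for the formula, text and table options (formula and text
  previously used DOLPHIN_TRANSFORMERS) and update their prompts.
- Add a dedicated "code" task for DocItemLabel.CODE and remove that
  label from the "text" task.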

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2025-12-08 12:48:28 +00:00 · 2025-08-15 14:49:34 +02:00
parent 16fea9cd8b
commit 1aa522792a
2 changed files with 29 additions and 11 deletions
--- a/docling/models/vlm_models_inline/vllm_model.py
+++ b/docling/models/vlm_models_inline/vllm_model.py
@@ -61,16 +61,15 @@ class VllmVlmModel(BaseVlmPageModel, HuggingFaceModelDownloadMixin):
            # Initialize VLLM LLM
            llm_kwargs = {
                "model": str(artifacts_path),
-                "model_impl": "transformers",
                "limit_mm_per_prompt": {"image": 1},
                "trust_remote_code": vlm_options.trust_remote_code,
+                "model_impl": "transformers",
+                "gpu_memory_utilization": 0.3,  # hardcoded for now, leaves room for ~3 different models.
            }

            # Add device-specific configurations
-            if self.device.startswith("cuda"):
-                # VLLM automatically detects GPU
-                pass
-            elif self.device == "cpu":
+
+            if self.device == "cpu":
                llm_kwargs["device"] = "cpu"

            # Add quantization if specified
--- a/docling/pipeline/threaded_multistage_vlm_pipeline.py
+++ b/docling/pipeline/threaded_multistage_vlm_pipeline.py
@@ -55,6 +55,7 @@ from docling.datamodel.pipeline_options_vlm_model import (
 from docling.datamodel.settings import settings
 from docling.datamodel.vlm_model_specs import (
    DOLPHIN_TRANSFORMERS,
+    SMOLDOCLING_MLX,
    SMOLDOCLING_TRANSFORMERS,
 )
 from docling.models.layout_model import LayoutModel
@@ -152,13 +153,27 @@ class ThreadedMultiStageVlmPipelineOptions(PaginatedPipelineOptions):
        """Create default pipeline options with custom VLM configurations from example."""

        # Configure VLM options based on the custom pipeline example
-        formula_opts = DOLPHIN_TRANSFORMERS.model_copy()
-        formula_opts.prompt = "<s>Read text in the image. <Answer/>"
+        # formula_opts = DOLPHIN_TRANSFORMERS.model_copy()
+        # formula_opts.prompt = "<s>Read text in the image. <Answer/>"

-        text_opts = DOLPHIN_TRANSFORMERS.model_copy()
-        text_opts.prompt = "<s>Read text in the image. <Answer/>"
+        # text_opts = DOLPHIN_TRANSFORMERS.model_copy()
+        # text_opts.prompt = "<s>Read text in the image. <Answer/>"

-        table_opts = SMOLDOCLING_TRANSFORMERS.model_copy()
+        base_model = SMOLDOCLING_TRANSFORMERS
+
+        formula_opts = base_model.model_copy()
+        formula_opts.prompt = "Convert formula to latex."
+        formula_opts.response_format = ResponseFormat.OTSL
+
+        code_opts = base_model.model_copy()
+        code_opts.prompt = "Convert code to text."
+        code_opts.response_format = ResponseFormat.OTSL
+
+        text_opts = base_model.model_copy()
+        text_opts.prompt = "Convert this page to docling."
+        text_opts.response_format = ResponseFormat.OTSL
+
+        table_opts = base_model.model_copy()
        table_opts.prompt = "Convert this table to OTSL."
        table_opts.response_format = ResponseFormat.OTSL

@@ -175,6 +190,11 @@ class ThreadedMultiStageVlmPipelineOptions(PaginatedPipelineOptions):
                    labels=[DocItemLabel.FORMULA],
                    batch_size=16,
                ),
+                "code": VlmTaskConfig(
+                    vlm_options=code_opts,
+                    labels=[DocItemLabel.CODE],
+                    batch_size=16,
+                ),
                "text": VlmTaskConfig(
                    vlm_options=text_opts,
                    labels=[
@@ -189,7 +209,6 @@ class ThreadedMultiStageVlmPipelineOptions(PaginatedPipelineOptions):
                        DocItemLabel.CHECKBOX_SELECTED,
                        DocItemLabel.CHECKBOX_UNSELECTED,
                        DocItemLabel.HANDWRITTEN_TEXT,
-                        DocItemLabel.CODE,
                        DocItemLabel.EMPTY_VALUE,
                    ],
                    batch_size=16,