From 1aa522792ac682600c6e8e4a46c5a25bbe447a05 Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Fri, 15 Aug 2025 14:49:34 +0200 Subject: [PATCH] Tweak defaults Signed-off-by: Christoph Auer --- .../models/vlm_models_inline/vllm_model.py | 9 +++--- .../threaded_multistage_vlm_pipeline.py | 31 +++++++++++++++---- 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/docling/models/vlm_models_inline/vllm_model.py b/docling/models/vlm_models_inline/vllm_model.py index 61c84cde..bc18ea32 100644 --- a/docling/models/vlm_models_inline/vllm_model.py +++ b/docling/models/vlm_models_inline/vllm_model.py @@ -61,16 +61,15 @@ class VllmVlmModel(BaseVlmPageModel, HuggingFaceModelDownloadMixin): # Initialize VLLM LLM llm_kwargs = { "model": str(artifacts_path), - "model_impl": "transformers", "limit_mm_per_prompt": {"image": 1}, "trust_remote_code": vlm_options.trust_remote_code, + "model_impl": "transformers", + "gpu_memory_utilization": 0.3, # hardcoded for now, leaves room for ~3 different models. } # Add device-specific configurations - if self.device.startswith("cuda"): - # VLLM automatically detects GPU - pass - elif self.device == "cpu": + + if self.device == "cpu": llm_kwargs["device"] = "cpu" # Add quantization if specified diff --git a/docling/pipeline/threaded_multistage_vlm_pipeline.py b/docling/pipeline/threaded_multistage_vlm_pipeline.py index 3ea2483a..9921af87 100644 --- a/docling/pipeline/threaded_multistage_vlm_pipeline.py +++ b/docling/pipeline/threaded_multistage_vlm_pipeline.py @@ -55,6 +55,7 @@ from docling.datamodel.pipeline_options_vlm_model import ( from docling.datamodel.settings import settings from docling.datamodel.vlm_model_specs import ( DOLPHIN_TRANSFORMERS, + SMOLDOCLING_MLX, SMOLDOCLING_TRANSFORMERS, ) from docling.models.layout_model import LayoutModel @@ -152,13 +153,27 @@ class ThreadedMultiStageVlmPipelineOptions(PaginatedPipelineOptions): """Create default pipeline options with custom VLM configurations from example.""" # Configure VLM options based on the custom pipeline example - formula_opts = DOLPHIN_TRANSFORMERS.model_copy() - formula_opts.prompt = "Read text in the image. " + # formula_opts = DOLPHIN_TRANSFORMERS.model_copy() + # formula_opts.prompt = "Read text in the image. " - text_opts = DOLPHIN_TRANSFORMERS.model_copy() - text_opts.prompt = "Read text in the image. " + # text_opts = DOLPHIN_TRANSFORMERS.model_copy() + # text_opts.prompt = "Read text in the image. " - table_opts = SMOLDOCLING_TRANSFORMERS.model_copy() + base_model = SMOLDOCLING_TRANSFORMERS + + formula_opts = base_model.model_copy() + formula_opts.prompt = "Convert formula to latex." + formula_opts.response_format = ResponseFormat.OTSL + + code_opts = base_model.model_copy() + code_opts.prompt = "Convert code to text." + code_opts.response_format = ResponseFormat.OTSL + + text_opts = base_model.model_copy() + text_opts.prompt = "Convert this page to docling." + text_opts.response_format = ResponseFormat.OTSL + + table_opts = base_model.model_copy() table_opts.prompt = "Convert this table to OTSL." table_opts.response_format = ResponseFormat.OTSL @@ -175,6 +190,11 @@ class ThreadedMultiStageVlmPipelineOptions(PaginatedPipelineOptions): labels=[DocItemLabel.FORMULA], batch_size=16, ), + "code": VlmTaskConfig( + vlm_options=code_opts, + labels=[DocItemLabel.CODE], + batch_size=16, + ), "text": VlmTaskConfig( vlm_options=text_opts, labels=[ @@ -189,7 +209,6 @@ class ThreadedMultiStageVlmPipelineOptions(PaginatedPipelineOptions): DocItemLabel.CHECKBOX_SELECTED, DocItemLabel.CHECKBOX_UNSELECTED, DocItemLabel.HANDWRITTEN_TEXT, - DocItemLabel.CODE, DocItemLabel.EMPTY_VALUE, ], batch_size=16,