From 3d07f1c78e14c88a2bb485b3a3eb7b631e3f5d7c Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Mon, 18 Aug 2025 13:37:46 +0200 Subject: [PATCH] Cleanup hf_transformers_model batching impl Signed-off-by: Christoph Auer --- docling/datamodel/vlm_model_specs.py | 30 +++++++++---------- .../hf_transformers_model.py | 17 ++++------- .../threaded_multistage_vlm_pipeline.py | 22 +++++++------- 3 files changed, 30 insertions(+), 39 deletions(-) diff --git a/docling/datamodel/vlm_model_specs.py b/docling/datamodel/vlm_model_specs.py index 18be529e..8ffa7f21 100644 --- a/docling/datamodel/vlm_model_specs.py +++ b/docling/datamodel/vlm_model_specs.py @@ -35,11 +35,11 @@ SMOLDOCLING_TRANSFORMERS = InlineVlmOptions( prompt="Convert this page to docling.", response_format=ResponseFormat.DOCTAGS, inference_framework=InferenceFramework.TRANSFORMERS, - transformers_model_type=TransformersModelType.AUTOMODEL_VISION2SEQ, + transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT, supported_devices=[ AcceleratorDevice.CPU, AcceleratorDevice.CUDA, - AcceleratorDevice.MPS, + # AcceleratorDevice.MPS, ], scale=2.0, temperature=0.0, @@ -51,7 +51,7 @@ SMOLDOCLING_VLLM = InlineVlmOptions( prompt="Convert this page to docling.", response_format=ResponseFormat.DOCTAGS, inference_framework=InferenceFramework.VLLM, - transformers_model_type=TransformersModelType.AUTOMODEL_VISION2SEQ, + transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT, supported_devices=[ AcceleratorDevice.CPU, AcceleratorDevice.CUDA, @@ -61,9 +61,9 @@ SMOLDOCLING_VLLM = InlineVlmOptions( stop_strings=["", ""], ) -# SmolVLM-500-Instruct -SMOLVLM500_TRANSFORMERS = InlineVlmOptions( - repo_id="HuggingFaceTB/SmolVLM-500M-Instruct", +# SmolVLM-256M-Instruct +SMOLVLM256_TRANSFORMERS = InlineVlmOptions( + repo_id="HuggingFaceTB/SmolVLM-256M-Instruct", prompt="Transcribe this image to plain text.", response_format=ResponseFormat.PLAINTEXT, inference_framework=InferenceFramework.TRANSFORMERS, @@ 
-71,16 +71,16 @@ SMOLVLM500_TRANSFORMERS = InlineVlmOptions( supported_devices=[ AcceleratorDevice.CPU, AcceleratorDevice.CUDA, - AcceleratorDevice.MPS, + # AcceleratorDevice.MPS, ], scale=2.0, temperature=0.0, ) -# SmolVLM-500-Instruct -SMOLVLM500_MLX = InlineVlmOptions( - repo_id="moot20/SmolVLM-500M-Instruct-MLX", - prompt="Transcribe this image to plain text.", +# SmolVLM-256M-Instruct +SMOLVLM256_MLX = InlineVlmOptions( + repo_id="moot20/SmolVLM-256M-Instruct-MLX", + prompt="Extract the text.", response_format=ResponseFormat.DOCTAGS, inference_framework=InferenceFramework.MLX, transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT, @@ -91,16 +91,16 @@ SMOLVLM500_MLX = InlineVlmOptions( temperature=0.0, ) -SMOLVLM500_VLLM = InlineVlmOptions( - repo_id="HuggingFaceTB/SmolVLM-500M-Instruct", +SMOLVLM256_VLLM = InlineVlmOptions( + repo_id="HuggingFaceTB/SmolVLM-256M-Instruct", prompt="Transcribe this image to plain text.", - response_format=ResponseFormat.DOCTAGS, + response_format=ResponseFormat.PLAINTEXT, inference_framework=InferenceFramework.VLLM, transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT, supported_devices=[ AcceleratorDevice.CPU, AcceleratorDevice.CUDA, - AcceleratorDevice.MPS, + # AcceleratorDevice.MPS, ], scale=2.0, temperature=0.0, diff --git a/docling/models/vlm_models_inline/hf_transformers_model.py b/docling/models/vlm_models_inline/hf_transformers_model.py index 598d71e3..c894d4e9 100644 --- a/docling/models/vlm_models_inline/hf_transformers_model.py +++ b/docling/models/vlm_models_inline/hf_transformers_model.py @@ -107,6 +107,8 @@ class HuggingFaceTransformersVlmModel(BaseVlmPageModel, HuggingFaceModelDownload artifacts_path, trust_remote_code=vlm_options.trust_remote_code, ) + self.processor.tokenizer.padding_side = "left" + self.vlm_model = model_cls.from_pretrained( artifacts_path, device_map=self.device, @@ -215,7 +217,7 @@ class HuggingFaceTransformersVlmModel(BaseVlmPageModel, 
HuggingFaceModelDownload } ] prompt = self.processor.apply_chat_template( - messages, add_generation_prompt=False + messages, add_generation_prompt=True ) return prompt @@ -311,17 +313,8 @@ class HuggingFaceTransformersVlmModel(BaseVlmPageModel, HuggingFaceModelDownload generated_ids = self.vlm_model.generate(**gen_kwargs) generation_time = time.time() - start_time - # -- Trim per sample using attention_mask (robust for batched prompts) - if "attention_mask" not in inputs: - raise RuntimeError( - "Processor did not return 'attention_mask'. Ensure padding=True and text tokenization are enabled." - ) - input_lengths = inputs["attention_mask"].sum(dim=1).tolist() - - trimmed_sequences: list[list[int]] = [ - generated_ids[i, int(input_lengths[i]) :].tolist() - for i in range(generated_ids.shape[0]) - ] + input_len = inputs["input_ids"].shape[1] # common right-aligned prompt length + trimmed_sequences = generated_ids[:, input_len:] # only newly generated tokens # -- Decode with the processor/tokenizer (skip specials, keep DocTags as text) decode_fn = getattr(self.processor, "batch_decode", None) diff --git a/docling/pipeline/threaded_multistage_vlm_pipeline.py b/docling/pipeline/threaded_multistage_vlm_pipeline.py index 3a967250..a2e9c1bd 100644 --- a/docling/pipeline/threaded_multistage_vlm_pipeline.py +++ b/docling/pipeline/threaded_multistage_vlm_pipeline.py @@ -57,8 +57,6 @@ from docling.datamodel.vlm_model_specs import ( DOLPHIN_TRANSFORMERS, SMOLDOCLING_MLX, SMOLDOCLING_TRANSFORMERS, - SMOLVLM500_MLX, - SMOLVLM500_TRANSFORMERS, ) from docling.models.layout_model import LayoutModel from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions @@ -155,29 +153,29 @@ class ThreadedMultiStageVlmPipelineOptions(PaginatedPipelineOptions): """Create default pipeline options with custom VLM configurations from example.""" # Configure VLM options based on the custom pipeline example - # formula_opts = DOLPHIN_TRANSFORMERS.model_copy() - # 
formula_opts.prompt = "Read text in the image. " + # base_model = SMOLVLM256_TRANSFORMERS + # smoldocling_model = SMOLDOCLING_TRANSFORMERS - # text_opts = DOLPHIN_TRANSFORMERS.model_copy() - # text_opts.prompt = "Read text in the image. " - - base_model = SMOLVLM500_MLX + base_model = SMOLDOCLING_TRANSFORMERS + smoldocling_model = SMOLDOCLING_TRANSFORMERS text_opts = base_model.model_copy() # text_opts.prompt = "Convert this page to docling." - text_opts.prompt = "What does this say?" + text_opts.prompt = "What does it say?" text_opts.response_format = ResponseFormat.PLAINTEXT + text_opts.max_new_tokens = 4096 formula_opts = base_model.model_copy() # formula_opts.prompt = "Convert formula to latex." - formula_opts.prompt = "What does this say?" + formula_opts.prompt = "What does it say?" formula_opts.response_format = ResponseFormat.PLAINTEXT + formula_opts.max_new_tokens = 4096 - code_opts = SMOLDOCLING_TRANSFORMERS.model_copy() + code_opts = smoldocling_model.model_copy() code_opts.prompt = "Convert code to text." code_opts.response_format = ResponseFormat.DOCTAGS - table_opts = SMOLDOCLING_TRANSFORMERS.model_copy() + table_opts = smoldocling_model.model_copy() table_opts.prompt = "Convert this table to OTSL." table_opts.response_format = ResponseFormat.OTSL