Clean up hf_transformers_model batching impl

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer
2025-08-18 13:37:46 +02:00
parent fead482e92
commit 3d07f1c78e
3 changed files with 30 additions and 39 deletions

View File

@@ -35,11 +35,11 @@ SMOLDOCLING_TRANSFORMERS = InlineVlmOptions(
prompt="Convert this page to docling.", prompt="Convert this page to docling.",
response_format=ResponseFormat.DOCTAGS, response_format=ResponseFormat.DOCTAGS,
inference_framework=InferenceFramework.TRANSFORMERS, inference_framework=InferenceFramework.TRANSFORMERS,
transformers_model_type=TransformersModelType.AUTOMODEL_VISION2SEQ, transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
supported_devices=[ supported_devices=[
AcceleratorDevice.CPU, AcceleratorDevice.CPU,
AcceleratorDevice.CUDA, AcceleratorDevice.CUDA,
AcceleratorDevice.MPS, # AcceleratorDevice.MPS,
], ],
scale=2.0, scale=2.0,
temperature=0.0, temperature=0.0,
@@ -51,7 +51,7 @@ SMOLDOCLING_VLLM = InlineVlmOptions(
prompt="Convert this page to docling.", prompt="Convert this page to docling.",
response_format=ResponseFormat.DOCTAGS, response_format=ResponseFormat.DOCTAGS,
inference_framework=InferenceFramework.VLLM, inference_framework=InferenceFramework.VLLM,
transformers_model_type=TransformersModelType.AUTOMODEL_VISION2SEQ, transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
supported_devices=[ supported_devices=[
AcceleratorDevice.CPU, AcceleratorDevice.CPU,
AcceleratorDevice.CUDA, AcceleratorDevice.CUDA,
@@ -61,9 +61,9 @@ SMOLDOCLING_VLLM = InlineVlmOptions(
stop_strings=["</doctag>", "<end_of_utterance>"], stop_strings=["</doctag>", "<end_of_utterance>"],
) )
# SmolVLM-500-Instruct # SmolVLM-256M-Instruct
SMOLVLM500_TRANSFORMERS = InlineVlmOptions( SMOLVLM256_TRANSFORMERS = InlineVlmOptions(
repo_id="HuggingFaceTB/SmolVLM-500M-Instruct", repo_id="HuggingFaceTB/SmolVLM-256M-Instruct",
prompt="Transcribe this image to plain text.", prompt="Transcribe this image to plain text.",
response_format=ResponseFormat.PLAINTEXT, response_format=ResponseFormat.PLAINTEXT,
inference_framework=InferenceFramework.TRANSFORMERS, inference_framework=InferenceFramework.TRANSFORMERS,
@@ -71,16 +71,16 @@ SMOLVLM500_TRANSFORMERS = InlineVlmOptions(
supported_devices=[ supported_devices=[
AcceleratorDevice.CPU, AcceleratorDevice.CPU,
AcceleratorDevice.CUDA, AcceleratorDevice.CUDA,
AcceleratorDevice.MPS, # AcceleratorDevice.MPS,
], ],
scale=2.0, scale=2.0,
temperature=0.0, temperature=0.0,
) )
# SmolVLM-500-Instruct # SmolVLM2-2.2b-Instruct
SMOLVLM500_MLX = InlineVlmOptions( SMOLVLM256_MLX = InlineVlmOptions(
repo_id="moot20/SmolVLM-500M-Instruct-MLX", repo_id="moot20/SmolVLM-256M-Instruct-MLX",
prompt="Transcribe this image to plain text.", prompt="Extract the text.",
response_format=ResponseFormat.DOCTAGS, response_format=ResponseFormat.DOCTAGS,
inference_framework=InferenceFramework.MLX, inference_framework=InferenceFramework.MLX,
transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT, transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
@@ -91,16 +91,16 @@ SMOLVLM500_MLX = InlineVlmOptions(
temperature=0.0, temperature=0.0,
) )
SMOLVLM500_VLLM = InlineVlmOptions( SMOLVLM256_VLLM = InlineVlmOptions(
repo_id="HuggingFaceTB/SmolVLM-500M-Instruct", repo_id="HuggingFaceTB/SmolVLM-256M-Instruct",
prompt="Transcribe this image to plain text.", prompt="Transcribe this image to plain text.",
response_format=ResponseFormat.DOCTAGS, response_format=ResponseFormat.PLAINTEXT,
inference_framework=InferenceFramework.VLLM, inference_framework=InferenceFramework.VLLM,
transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT, transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
supported_devices=[ supported_devices=[
AcceleratorDevice.CPU, AcceleratorDevice.CPU,
AcceleratorDevice.CUDA, AcceleratorDevice.CUDA,
AcceleratorDevice.MPS, # AcceleratorDevice.MPS,
], ],
scale=2.0, scale=2.0,
temperature=0.0, temperature=0.0,

View File

@@ -107,6 +107,8 @@ class HuggingFaceTransformersVlmModel(BaseVlmPageModel, HuggingFaceModelDownload
artifacts_path, artifacts_path,
trust_remote_code=vlm_options.trust_remote_code, trust_remote_code=vlm_options.trust_remote_code,
) )
self.processor.tokenizer.padding_side = "left"
self.vlm_model = model_cls.from_pretrained( self.vlm_model = model_cls.from_pretrained(
artifacts_path, artifacts_path,
device_map=self.device, device_map=self.device,
@@ -215,7 +217,7 @@ class HuggingFaceTransformersVlmModel(BaseVlmPageModel, HuggingFaceModelDownload
} }
] ]
prompt = self.processor.apply_chat_template( prompt = self.processor.apply_chat_template(
messages, add_generation_prompt=False messages, add_generation_prompt=True
) )
return prompt return prompt
@@ -311,17 +313,8 @@ class HuggingFaceTransformersVlmModel(BaseVlmPageModel, HuggingFaceModelDownload
generated_ids = self.vlm_model.generate(**gen_kwargs) generated_ids = self.vlm_model.generate(**gen_kwargs)
generation_time = time.time() - start_time generation_time = time.time() - start_time
# -- Trim per sample using attention_mask (robust for batched prompts) input_len = inputs["input_ids"].shape[1] # common right-aligned prompt length
if "attention_mask" not in inputs: trimmed_sequences = generated_ids[:, input_len:] # only newly generated tokens
raise RuntimeError(
"Processor did not return 'attention_mask'. Ensure padding=True and text tokenization are enabled."
)
input_lengths = inputs["attention_mask"].sum(dim=1).tolist()
trimmed_sequences: list[list[int]] = [
generated_ids[i, int(input_lengths[i]) :].tolist()
for i in range(generated_ids.shape[0])
]
# -- Decode with the processor/tokenizer (skip specials, keep DocTags as text) # -- Decode with the processor/tokenizer (skip specials, keep DocTags as text)
decode_fn = getattr(self.processor, "batch_decode", None) decode_fn = getattr(self.processor, "batch_decode", None)

View File

@@ -57,8 +57,6 @@ from docling.datamodel.vlm_model_specs import (
DOLPHIN_TRANSFORMERS, DOLPHIN_TRANSFORMERS,
SMOLDOCLING_MLX, SMOLDOCLING_MLX,
SMOLDOCLING_TRANSFORMERS, SMOLDOCLING_TRANSFORMERS,
SMOLVLM500_MLX,
SMOLVLM500_TRANSFORMERS,
) )
from docling.models.layout_model import LayoutModel from docling.models.layout_model import LayoutModel
from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
@@ -155,29 +153,29 @@ class ThreadedMultiStageVlmPipelineOptions(PaginatedPipelineOptions):
"""Create default pipeline options with custom VLM configurations from example.""" """Create default pipeline options with custom VLM configurations from example."""
# Configure VLM options based on the custom pipeline example # Configure VLM options based on the custom pipeline example
# formula_opts = DOLPHIN_TRANSFORMERS.model_copy() # base_model = SMOLVLM256_TRANSFORMERS
# formula_opts.prompt = "<s>Read text in the image. <Answer/>" # smoldocling_model = SMOLDOCLING_TRANSFORMERS
# text_opts = DOLPHIN_TRANSFORMERS.model_copy() base_model = SMOLDOCLING_TRANSFORMERS
# text_opts.prompt = "<s>Read text in the image. <Answer/>" smoldocling_model = SMOLDOCLING_TRANSFORMERS
base_model = SMOLVLM500_MLX
text_opts = base_model.model_copy() text_opts = base_model.model_copy()
# text_opts.prompt = "Convert this page to docling." # text_opts.prompt = "Convert this page to docling."
text_opts.prompt = "What does this say?" text_opts.prompt = "What does it say?"
text_opts.response_format = ResponseFormat.PLAINTEXT text_opts.response_format = ResponseFormat.PLAINTEXT
text_opts.max_new_tokens = 4096
formula_opts = base_model.model_copy() formula_opts = base_model.model_copy()
# formula_opts.prompt = "Convert formula to latex." # formula_opts.prompt = "Convert formula to latex."
formula_opts.prompt = "What does this say?" formula_opts.prompt = "What does it say?"
formula_opts.response_format = ResponseFormat.PLAINTEXT formula_opts.response_format = ResponseFormat.PLAINTEXT
formula_opts.max_new_tokens = 4096
code_opts = SMOLDOCLING_TRANSFORMERS.model_copy() code_opts = smoldocling_model.model_copy()
code_opts.prompt = "Convert code to text." code_opts.prompt = "Convert code to text."
code_opts.response_format = ResponseFormat.DOCTAGS code_opts.response_format = ResponseFormat.DOCTAGS
table_opts = SMOLDOCLING_TRANSFORMERS.model_copy() table_opts = smoldocling_model.model_copy()
table_opts.prompt = "Convert this table to OTSL." table_opts.prompt = "Convert this table to OTSL."
table_opts.response_format = ResponseFormat.OTSL table_opts.response_format = ResponseFormat.OTSL