Cleanup hf_transformers_model batching impl
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

@@ -35,11 +35,11 @@ SMOLDOCLING_TRANSFORMERS = InlineVlmOptions(
     prompt="Convert this page to docling.",
     response_format=ResponseFormat.DOCTAGS,
     inference_framework=InferenceFramework.TRANSFORMERS,
-    transformers_model_type=TransformersModelType.AUTOMODEL_VISION2SEQ,
+    transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
     supported_devices=[
         AcceleratorDevice.CPU,
         AcceleratorDevice.CUDA,
-        AcceleratorDevice.MPS,
+        # AcceleratorDevice.MPS,
     ],
     scale=2.0,
     temperature=0.0,
@@ -51,7 +51,7 @@ SMOLDOCLING_VLLM = InlineVlmOptions(
     prompt="Convert this page to docling.",
     response_format=ResponseFormat.DOCTAGS,
     inference_framework=InferenceFramework.VLLM,
-    transformers_model_type=TransformersModelType.AUTOMODEL_VISION2SEQ,
+    transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
     supported_devices=[
         AcceleratorDevice.CPU,
         AcceleratorDevice.CUDA,
@@ -61,9 +61,9 @@ SMOLDOCLING_VLLM = InlineVlmOptions(
     stop_strings=["</doctag>", "<end_of_utterance>"],
 )
 
-# SmolVLM-500-Instruct
-SMOLVLM500_TRANSFORMERS = InlineVlmOptions(
-    repo_id="HuggingFaceTB/SmolVLM-500M-Instruct",
+# SmolVLM-256M-Instruct
+SMOLVLM256_TRANSFORMERS = InlineVlmOptions(
+    repo_id="HuggingFaceTB/SmolVLM-256M-Instruct",
     prompt="Transcribe this image to plain text.",
     response_format=ResponseFormat.PLAINTEXT,
     inference_framework=InferenceFramework.TRANSFORMERS,
@@ -71,16 +71,16 @@ SMOLVLM500_TRANSFORMERS = InlineVlmOptions(
     supported_devices=[
         AcceleratorDevice.CPU,
         AcceleratorDevice.CUDA,
-        AcceleratorDevice.MPS,
+        # AcceleratorDevice.MPS,
     ],
     scale=2.0,
     temperature=0.0,
 )
 
-# SmolVLM-500-Instruct
-SMOLVLM500_MLX = InlineVlmOptions(
-    repo_id="moot20/SmolVLM-500M-Instruct-MLX",
-    prompt="Transcribe this image to plain text.",
+# SmolVLM2-2.2b-Instruct
+SMOLVLM256_MLX = InlineVlmOptions(
+    repo_id="moot20/SmolVLM-256M-Instruct-MLX",
+    prompt="Extract the text.",
     response_format=ResponseFormat.DOCTAGS,
     inference_framework=InferenceFramework.MLX,
     transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
@@ -91,16 +91,16 @@ SMOLVLM500_MLX = InlineVlmOptions(
     temperature=0.0,
 )
 
-SMOLVLM500_VLLM = InlineVlmOptions(
-    repo_id="HuggingFaceTB/SmolVLM-500M-Instruct",
+SMOLVLM256_VLLM = InlineVlmOptions(
+    repo_id="HuggingFaceTB/SmolVLM-256M-Instruct",
     prompt="Transcribe this image to plain text.",
-    response_format=ResponseFormat.DOCTAGS,
+    response_format=ResponseFormat.PLAINTEXT,
     inference_framework=InferenceFramework.VLLM,
     transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
     supported_devices=[
         AcceleratorDevice.CPU,
         AcceleratorDevice.CUDA,
-        AcceleratorDevice.MPS,
+        # AcceleratorDevice.MPS,
     ],
     scale=2.0,
     temperature=0.0,
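
The TransformersModelType change in the specs above switches which transformers auto class is used to load these checkpoints. A minimal sketch of the difference, assuming the enum values map to the corresponding transformers Auto* classes (repo id reused from the SmolVLM spec above):

# Sketch only: the auto-class switch implied by
# AUTOMODEL_VISION2SEQ -> AUTOMODEL_IMAGETEXTTOTEXT (mapping assumed).
from transformers import AutoModelForImageTextToText, AutoModelForVision2Seq

repo_id = "HuggingFaceTB/SmolVLM-256M-Instruct"

# Previous option resolved to the Vision2Seq auto class:
# model = AutoModelForVision2Seq.from_pretrained(repo_id)

# Updated option resolves to the image-text-to-text auto class instead:
model = AutoModelForImageTextToText.from_pretrained(repo_id)
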
@@ -107,6 +107,8 @@ class HuggingFaceTransformersVlmModel(BaseVlmPageModel, HuggingFaceModelDownload
             artifacts_path,
             trust_remote_code=vlm_options.trust_remote_code,
         )
+        self.processor.tokenizer.padding_side = "left"
+
         self.vlm_model = model_cls.from_pretrained(
             artifacts_path,
             device_map=self.device,
@@ -215,7 +217,7 @@ class HuggingFaceTransformersVlmModel(BaseVlmPageModel, HuggingFaceModelDownload
             }
         ]
         prompt = self.processor.apply_chat_template(
-            messages, add_generation_prompt=False
+            messages, add_generation_prompt=True
         )
         return prompt
 
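
The switch to add_generation_prompt=True changes how the chat template is closed before generation. A small illustration, with the processor and message shape assumed from the surrounding code:

# Sketch: with add_generation_prompt=False the rendered prompt ends after the
# user turn; with True the template also appends the assistant-turn opener,
# so generate() continues with the model's answer rather than more user text.
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-256M-Instruct")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Convert this page to docling."},
        ],
    }
]
print(processor.apply_chat_template(messages, add_generation_prompt=False))
print(processor.apply_chat_template(messages, add_generation_prompt=True))
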
@@ -311,17 +313,8 @@ class HuggingFaceTransformersVlmModel(BaseVlmPageModel, HuggingFaceModelDownload
         generated_ids = self.vlm_model.generate(**gen_kwargs)
         generation_time = time.time() - start_time
 
-        # -- Trim per sample using attention_mask (robust for batched prompts)
-        if "attention_mask" not in inputs:
-            raise RuntimeError(
-                "Processor did not return 'attention_mask'. Ensure padding=True and text tokenization are enabled."
-            )
-        input_lengths = inputs["attention_mask"].sum(dim=1).tolist()
-
-        trimmed_sequences: list[list[int]] = [
-            generated_ids[i, int(input_lengths[i]) :].tolist()
-            for i in range(generated_ids.shape[0])
-        ]
+        input_len = inputs["input_ids"].shape[1]  # common right-aligned prompt length
+        trimmed_sequences = generated_ids[:, input_len:]  # only newly generated tokens
 
         # -- Decode with the processor/tokenizer (skip specials, keep DocTags as text)
         decode_fn = getattr(self.processor, "batch_decode", None)
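
Taken together, the left padding set when the processor is created and the simplified trimming above form the usual batched-generation pattern: with padding_side="left" every prompt ends at the same position, so one slice at the shared prompt length isolates the newly generated tokens for every sample. A self-contained sketch of that pattern (model choice, prompt, and dummy images are illustrative, not docling's exact code):

# Sketch of left-padded batched generation + trimming; not docling's exact code.
import torch
from PIL import Image
from transformers import AutoModelForImageTextToText, AutoProcessor

repo_id = "HuggingFaceTB/SmolVLM-256M-Instruct"
processor = AutoProcessor.from_pretrained(repo_id)
processor.tokenizer.padding_side = "left"  # align prompts to the right edge
model = AutoModelForImageTextToText.from_pretrained(repo_id)

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Transcribe this image to plain text."},
        ],
    }
]
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)

pages = [Image.new("RGB", (640, 480), "white") for _ in range(2)]  # stand-in page images
inputs = processor(
    text=[prompt] * len(pages),
    images=[[img] for img in pages],
    padding=True,
    return_tensors="pt",
)

with torch.inference_mode():
    generated_ids = model.generate(**inputs, max_new_tokens=64)

# With left padding, every row's prompt occupies the first input_len positions,
# so a single slice keeps only the newly generated tokens of each sample.
input_len = inputs["input_ids"].shape[1]
trimmed = generated_ids[:, input_len:]
print(processor.batch_decode(trimmed, skip_special_tokens=True))
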
@@ -57,8 +57,6 @@ from docling.datamodel.vlm_model_specs import (
     DOLPHIN_TRANSFORMERS,
     SMOLDOCLING_MLX,
     SMOLDOCLING_TRANSFORMERS,
-    SMOLVLM500_MLX,
-    SMOLVLM500_TRANSFORMERS,
 )
 from docling.models.layout_model import LayoutModel
 from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
@@ -155,29 +153,29 @@ class ThreadedMultiStageVlmPipelineOptions(PaginatedPipelineOptions):
         """Create default pipeline options with custom VLM configurations from example."""
 
         # Configure VLM options based on the custom pipeline example
-        # formula_opts = DOLPHIN_TRANSFORMERS.model_copy()
-        # formula_opts.prompt = "<s>Read text in the image. <Answer/>"
+        # base_model = SMOLVLM256_TRANSFORMERS
+        # smoldocling_model = SMOLDOCLING_TRANSFORMERS
 
-        # text_opts = DOLPHIN_TRANSFORMERS.model_copy()
-        # text_opts.prompt = "<s>Read text in the image. <Answer/>"
+        base_model = SMOLDOCLING_TRANSFORMERS
+        smoldocling_model = SMOLDOCLING_TRANSFORMERS
 
-        base_model = SMOLVLM500_MLX
-
         text_opts = base_model.model_copy()
         # text_opts.prompt = "Convert this page to docling."
-        text_opts.prompt = "What does this say?"
+        text_opts.prompt = "What does it say?"
         text_opts.response_format = ResponseFormat.PLAINTEXT
+        text_opts.max_new_tokens = 4096
 
         formula_opts = base_model.model_copy()
         # formula_opts.prompt = "Convert formula to latex."
-        formula_opts.prompt = "What does this say?"
+        formula_opts.prompt = "What does it say?"
         formula_opts.response_format = ResponseFormat.PLAINTEXT
+        formula_opts.max_new_tokens = 4096
 
-        code_opts = SMOLDOCLING_TRANSFORMERS.model_copy()
+        code_opts = smoldocling_model.model_copy()
         code_opts.prompt = "Convert code to text."
         code_opts.response_format = ResponseFormat.DOCTAGS
 
-        table_opts = SMOLDOCLING_TRANSFORMERS.model_copy()
+        table_opts = smoldocling_model.model_copy()
         table_opts.prompt = "Convert this table to OTSL."
         table_opts.response_format = ResponseFormat.OTSL
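
The per-stage options in the pipeline example above rely on pydantic's model_copy(): each stage starts from a shared base spec and overrides only its prompt, response format, and, for some stages, max_new_tokens. A short sketch of why the copy matters (import path for ResponseFormat assumed):

# Sketch: InlineVlmOptions is a pydantic model, so model_copy() returns an
# independent instance; mutating the copy leaves the shared base spec intact.
from docling.datamodel.pipeline_options_vlm_model import ResponseFormat  # path assumed
from docling.datamodel.vlm_model_specs import SMOLDOCLING_TRANSFORMERS

base_model = SMOLDOCLING_TRANSFORMERS
text_opts = base_model.model_copy()
text_opts.prompt = "What does it say?"
text_opts.response_format = ResponseFormat.PLAINTEXT
text_opts.max_new_tokens = 4096

assert base_model.prompt == "Convert this page to docling."  # base spec unchanged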