Mirror of https://github.com/DS4SD/docling.git (synced 2025-08-01 15:02:21 +00:00)
Expose control over using flash_attention_2
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
parent 84c77c9fcb
commit 10f64a948c
@@ -41,6 +41,7 @@ class AcceleratorOptions(BaseSettings):
     num_threads: int = 4
     device: Union[str, AcceleratorDevice] = "auto"
+    cuda_use_flash_attention2: bool = False

     @field_validator("device")
     def validate_device(cls, value):
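The new field makes flash attention opt-in and off by default. A minimal sketch of setting it alongside the existing accelerator settings (values are illustrative, and the import path assumes AcceleratorOptions sits next to AcceleratorDevice in docling.datamodel.pipeline_options):

    from docling.datamodel.pipeline_options import AcceleratorDevice, AcceleratorOptions

    # Illustrative configuration: cuda_use_flash_attention2 defaults to False,
    # so flash_attention_2 is only used when explicitly requested.
    accelerator_options = AcceleratorOptions(
        num_threads=4,
        device=AcceleratorDevice.CUDA,
        cuda_use_flash_attention2=True,
    )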
@@ -64,9 +64,12 @@ class HuggingFaceVlmModel(BasePageModel):
             self.vlm_model = AutoModelForVision2Seq.from_pretrained(
                 artifacts_path,
                 torch_dtype=torch.bfloat16,
-                # _attn_implementation=(
-                #     "flash_attention_2" if self.device.startswith("cuda") else "eager"
-                # ),
+                _attn_implementation=(
+                    "flash_attention_2"
+                    if self.device.startswith("cuda")
+                    and accelerator_options.cuda_use_flash_attention2
+                    else "eager"
+                ),
             ).to(self.device)

         else:
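The selection above follows a simple rule: request "flash_attention_2" only when running on CUDA and the new flag is set, otherwise fall back to "eager". A standalone sketch of that rule (the helper name is hypothetical; the commit inlines the expression):

    def select_attn_implementation(device: str, use_flash_attention2: bool) -> str:
        # Hypothetical helper mirroring the inlined expression above:
        # flash_attention_2 only on CUDA and only when explicitly enabled.
        if device.startswith("cuda") and use_flash_attention2:
            return "flash_attention_2"
        return "eager"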
@@ -74,9 +77,12 @@ class HuggingFaceVlmModel(BasePageModel):
                 artifacts_path,
                 torch_dtype="auto",
                 quantization_config=self.param_quantization_config,
-                # _attn_implementation=(
-                #     "flash_attention_2" if self.device.startswith("cuda") else "eager"
-                # ),
+                _attn_implementation=(
+                    "flash_attention_2"
+                    if self.device.startswith("cuda")
+                    and accelerator_options.cuda_use_flash_attention2
+                    else "eager"
+                ),
             ).to(self.device)

     @staticmethod
@@ -6,6 +6,7 @@ import yaml

 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import (
+    AcceleratorDevice,
     VlmPipelineOptions,
     granite_vision_vlm_conversion_options,
     smoldocling_vlm_conversion_options,
@@ -24,9 +25,13 @@ pipeline_options.generate_page_images = True
 # If force_backend_text = True, text from backend will be used instead of generated text
 pipeline_options.force_backend_text = False

+## Enable flash_attention_2 with CUDA:
+# pipeline_options.accelerator_options.device = AcceleratorDevice.CUDA
+# pipeline_options.accelerator_options.cuda_use_flash_attention2 = True
+
 pipeline_options.vlm_options = smoldocling_vlm_conversion_options

-# Choose alternative VLM models:
+## Choose alternative VLM models:
 # pipeline_options.vlm_options = granite_vision_vlm_conversion_options

 from docling_core.types.doc import DocItemLabel, ImageRefMode
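Putting the pieces together, a minimal end-to-end sketch of opting in to flash_attention_2 in the VLM pipeline; it assumes the rest of the example script's setup (DocumentConverter, PdfFormatOption, and VlmPipeline), and the input path is purely illustrative:

    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import (
        AcceleratorDevice,
        VlmPipelineOptions,
        smoldocling_vlm_conversion_options,
    )
    from docling.document_converter import DocumentConverter, PdfFormatOption
    from docling.pipeline.vlm_pipeline import VlmPipeline

    pipeline_options = VlmPipelineOptions()
    pipeline_options.generate_page_images = True
    # Opt in to flash_attention_2 on CUDA via the new accelerator option.
    pipeline_options.accelerator_options.device = AcceleratorDevice.CUDA
    pipeline_options.accelerator_options.cuda_use_flash_attention2 = True
    pipeline_options.vlm_options = smoldocling_vlm_conversion_options

    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=VlmPipeline,
                pipeline_options=pipeline_options,
            )
        }
    )
    result = converter.convert("example.pdf")  # illustrative input path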