fix: enable cuda_use_flash_attention2 for PictureDescriptionVlmModel

Signed-off-by: Zach Cox <zach.s.cox@gmail.com>
Zach Cox 2025-04-29 19:33:58 -04:00
parent 976e92e289
commit 0961cda5fb


@@ -57,7 +57,10 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
                 artifacts_path,
                 torch_dtype=torch.bfloat16,
                 _attn_implementation=(
-                    "flash_attention_2" if self.device.startswith("cuda") else "eager"
+                    "flash_attention_2"
+                    if self.device.startswith("cuda")
+                    and accelerator_options.cuda_use_flash_attention2
+                    else "eager"
                ),
            ).to(self.device)
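
For reference, a minimal, self-contained sketch of the selection logic this change introduces. The AcceleratorOptionsStub class below is a hypothetical stand-in for the real accelerator options object; the only assumption carried over from the diff is that it exposes a cuda_use_flash_attention2 flag.

from dataclasses import dataclass

# Hypothetical stand-in for the accelerator options passed to the model;
# the real object in the diff above exposes cuda_use_flash_attention2.
@dataclass
class AcceleratorOptionsStub:
    cuda_use_flash_attention2: bool = False

def pick_attn_implementation(device: str, opts: AcceleratorOptionsStub) -> str:
    # Mirrors the new condition: use flash_attention_2 only on CUDA devices
    # and only when the user has opted in; otherwise fall back to eager.
    if device.startswith("cuda") and opts.cuda_use_flash_attention2:
        return "flash_attention_2"
    return "eager"

# Flash attention requires both CUDA and the explicit opt-in flag.
assert pick_attn_implementation("cuda:0", AcceleratorOptionsStub(cuda_use_flash_attention2=True)) == "flash_attention_2"
assert pick_attn_implementation("cuda:0", AcceleratorOptionsStub()) == "eager"
assert pick_attn_implementation("cpu", AcceleratorOptionsStub(cuda_use_flash_attention2=True)) == "eager"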