From 0961cda5fb40fb848c9df0fe13bc224c956e1e46 Mon Sep 17 00:00:00 2001
From: Zach Cox
Date: Tue, 29 Apr 2025 19:33:58 -0400
Subject: [PATCH] fix: enable use_cuda_flash_attention2 for PictureDescriptionVlmModel

Signed-off-by: Zach Cox
---
 docling/models/picture_description_vlm_model.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/docling/models/picture_description_vlm_model.py b/docling/models/picture_description_vlm_model.py
index 374f575d..679e80c2 100644
--- a/docling/models/picture_description_vlm_model.py
+++ b/docling/models/picture_description_vlm_model.py
@@ -57,7 +57,10 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
                 artifacts_path,
                 torch_dtype=torch.bfloat16,
                 _attn_implementation=(
-                    "flash_attention_2" if self.device.startswith("cuda") else "eager"
+                    "flash_attention_2"
+                    if self.device.startswith("cuda")
+                    and accelerator_options.cuda_use_flash_attention2
+                    else "eager"
                 ),
             ).to(self.device)
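
A minimal sketch (not part of the patch) of the selection logic this change
introduces: flash_attention_2 is chosen only when the resolved device is CUDA
and the user has opted in via accelerator_options.cuda_use_flash_attention2,
as shown in the diff above. The standalone helper name below is hypothetical
and used here only for illustration.

    # Hypothetical helper mirroring the ternary expression in the patch.
    def select_attn_implementation(device: str, cuda_use_flash_attention2: bool) -> str:
        # Flash Attention 2 requires a CUDA device and an explicit opt-in;
        # everything else falls back to the default "eager" implementation.
        if device.startswith("cuda") and cuda_use_flash_attention2:
            return "flash_attention_2"
        return "eager"

    assert select_attn_implementation("cuda:0", True) == "flash_attention_2"
    assert select_attn_implementation("cuda:0", False) == "eager"  # opt-in flag not set
    assert select_attn_implementation("cpu", True) == "eager"      # no CUDA device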