From 0fa752e58b593440ced0dd1cec0630f9b7b92664 Mon Sep 17 00:00:00 2001
From: Wing Lian <wing@axolotl.ai>
Date: Thu, 21 Aug 2025 15:04:10 -0400
Subject: [PATCH] upgrade flash-attn to 2.8.3 for gpt-oss attn sink support
 (#3082)

---
 examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offload.yaml  | 2 +-
 examples/gpt-oss/gpt-oss-20b-fft-deepspeed-zero3.yaml | 2 +-
 examples/gpt-oss/gpt-oss-20b-fft-fsdp2-offload.yaml   | 2 +-
 examples/gpt-oss/gpt-oss-20b-fft-fsdp2.yaml           | 2 +-
 examples/gpt-oss/gpt-oss-20b-sft-lora-singlegpu.yaml  | 2 +-
 setup.py                                              | 4 ++--
 6 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offload.yaml b/examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offload.yaml
index 4b4fbd89b..62f3167e8 100644
--- a/examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offload.yaml
+++ b/examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offload.yaml
@@ -44,7 +44,7 @@ bf16: true
 tf32: true
 
 flash_attention: true
-attn_implementation: kernels-community/vllm-flash-attn3
+attn_implementation: kernels-community/vllm-flash-attn3  # optional: this override can be removed with flash-attn >= 2.8.3, which adds the attention sink support used by gpt-oss
 
 gradient_checkpointing: true
 activation_offloading: true
diff --git a/examples/gpt-oss/gpt-oss-20b-fft-deepspeed-zero3.yaml b/examples/gpt-oss/gpt-oss-20b-fft-deepspeed-zero3.yaml
index 440f0c509..ccb84e28e 100644
--- a/examples/gpt-oss/gpt-oss-20b-fft-deepspeed-zero3.yaml
+++ b/examples/gpt-oss/gpt-oss-20b-fft-deepspeed-zero3.yaml
@@ -40,7 +40,7 @@ bf16: true
 tf32: true
 
 flash_attention: true
-attn_implementation: kernels-community/vllm-flash-attn3
+attn_implementation: kernels-community/vllm-flash-attn3  # optional: this override can be removed with flash-attn >= 2.8.3, which adds the attention sink support used by gpt-oss
 
 gradient_checkpointing: true
 activation_offloading: true
diff --git a/examples/gpt-oss/gpt-oss-20b-fft-fsdp2-offload.yaml b/examples/gpt-oss/gpt-oss-20b-fft-fsdp2-offload.yaml
index 1b142b6c3..69a3c434d 100644
--- a/examples/gpt-oss/gpt-oss-20b-fft-fsdp2-offload.yaml
+++ b/examples/gpt-oss/gpt-oss-20b-fft-fsdp2-offload.yaml
@@ -41,7 +41,7 @@ bf16: true
 tf32: true
 
 flash_attention: true
-attn_implementation: kernels-community/vllm-flash-attn3
+attn_implementation: kernels-community/vllm-flash-attn3  # optional: this override can be removed with flash-attn >= 2.8.3, which adds the attention sink support used by gpt-oss
 
 gradient_checkpointing: true
 activation_offloading: true
diff --git a/examples/gpt-oss/gpt-oss-20b-fft-fsdp2.yaml b/examples/gpt-oss/gpt-oss-20b-fft-fsdp2.yaml
index bdbb70fae..4a0f1ad70 100644
--- a/examples/gpt-oss/gpt-oss-20b-fft-fsdp2.yaml
+++ b/examples/gpt-oss/gpt-oss-20b-fft-fsdp2.yaml
@@ -40,7 +40,7 @@ bf16: true
 tf32: true
 
 flash_attention: true
-attn_implementation: kernels-community/vllm-flash-attn3
+attn_implementation: kernels-community/vllm-flash-attn3  # optional: this override can be removed with flash-attn >= 2.8.3, which adds the attention sink support used by gpt-oss
 
 gradient_checkpointing: true
 activation_offloading: true
diff --git a/examples/gpt-oss/gpt-oss-20b-sft-lora-singlegpu.yaml b/examples/gpt-oss/gpt-oss-20b-sft-lora-singlegpu.yaml
index c4e1a982d..b6deacb1b 100644
--- a/examples/gpt-oss/gpt-oss-20b-sft-lora-singlegpu.yaml
+++ b/examples/gpt-oss/gpt-oss-20b-sft-lora-singlegpu.yaml
@@ -53,7 +53,7 @@ bf16: true
 tf32: true
 
 flash_attention: true
-attn_implementation: kernels-community/vllm-flash-attn3
+attn_implementation: kernels-community/vllm-flash-attn3  # optional: this override can be removed with flash-attn >= 2.8.3, which adds the attention sink support used by gpt-oss
 
 gradient_checkpointing: true
 activation_offloading: true
diff --git a/setup.py b/setup.py
index de6f19e56..5aab9d7c0 100644
--- a/setup.py
+++ b/setup.py
@@ -118,9 +118,9 @@ def get_package_version():
 
 
 extras_require = {
-    "flash-attn": ["flash-attn==2.8.2"],
+    "flash-attn": ["flash-attn==2.8.3"],
     "ring-flash-attn": [
-        "flash-attn==2.8.2",
+        "flash-attn==2.8.3",
         "ring-flash-attn>=0.1.7",
         "yunchang==0.6.0",
     ],