fix duplicate attn_implementation key in gpt-oss example YAML configs and flaky caplog tests

2026-04-25 08:53:28 +00:00
parent aeca18a8b0
commit 6886def92c
7 changed files with 28 additions and 18 deletions
--- a/examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offload.yaml
+++ b/examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offload.yaml
@@ -47,7 +47,6 @@ learning_rate: 2e-5
 bf16: true
 tf32: true

-attn_implementation: flash_attention_2
 attn_implementation: kernels-community/vllm-flash-attn3  # this is not needed if using flash_attn >= 2.8.3

 gradient_checkpointing: true
--- a/examples/gpt-oss/gpt-oss-20b-fft-deepspeed-zero3.yaml
+++ b/examples/gpt-oss/gpt-oss-20b-fft-deepspeed-zero3.yaml
@@ -43,7 +43,6 @@ learning_rate: 2e-5
 bf16: true
 tf32: true

-attn_implementation: flash_attention_2
 attn_implementation: kernels-community/vllm-flash-attn3  # this is not needed if using flash_attn >= 2.8.3

 gradient_checkpointing: true
--- a/examples/gpt-oss/gpt-oss-20b-fft-fsdp2-offload.yaml
+++ b/examples/gpt-oss/gpt-oss-20b-fft-fsdp2-offload.yaml
@@ -44,7 +44,6 @@ learning_rate: 2e-5
 bf16: true
 tf32: true

-attn_implementation: flash_attention_2
 attn_implementation: kernels-community/vllm-flash-attn3  # this is not needed if using flash_attn >= 2.8.3

 gradient_checkpointing: true
--- a/examples/gpt-oss/gpt-oss-20b-fft-fsdp2.yaml
+++ b/examples/gpt-oss/gpt-oss-20b-fft-fsdp2.yaml
@@ -43,7 +43,6 @@ learning_rate: 2e-5
 bf16: true
 tf32: true

-attn_implementation: flash_attention_2
 attn_implementation: kernels-community/vllm-flash-attn3  # this is not needed if using flash_attn >= 2.8.3

 gradient_checkpointing: true
--- a/examples/gpt-oss/gpt-oss-20b-sft-lora-singlegpu.yaml
+++ b/examples/gpt-oss/gpt-oss-20b-sft-lora-singlegpu.yaml
@@ -56,7 +56,6 @@ learning_rate: 2e-4
 bf16: true
 tf32: true

-attn_implementation: flash_attention_2
 attn_implementation: kernels-community/vllm-flash-attn3  # this is not needed if using flash_attn >= 2.8.3

 gradient_checkpointing: true
--- a/examples/gpt-oss/gpt-oss-safeguard-20b-sft-lora-singlegpu.yaml
+++ b/examples/gpt-oss/gpt-oss-safeguard-20b-sft-lora-singlegpu.yaml
@@ -56,7 +56,6 @@ learning_rate: 2e-4
 bf16: true
 tf32: true

-attn_implementation: flash_attention_2
 attn_implementation: kernels-community/vllm-flash-attn3  # this is not needed if using flash_attn >= 2.8.3

 gradient_checkpointing: true