Move FA3 tests to multigpu since we only run those on Hopper

Wing Lian
2025-05-18 10:08:08 -07:00
parent bb6464c4c6
commit 0735454782
2 changed files with 9 additions and 8 deletions
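
The use_flash_attention_3 values parametrized in this commit (False and "auto") are only exercised on Hopper-class GPUs, which is why the parametrization moves into the multi-GPU suite that runs on that hardware. Below is a minimal standalone sketch of that kind of gating; is_hopper_gpu is a hypothetical helper, not part of axolotl, and is shown only to illustrate how a test can skip the FA3 variant on unsupported hardware.

import pytest
import torch


def is_hopper_gpu() -> bool:
    # Hypothetical helper: Hopper (H100/H200) reports CUDA compute capability 9.x.
    if not torch.cuda.is_available():
        return False
    major, _minor = torch.cuda.get_device_capability(0)
    return major >= 9


@pytest.mark.parametrize("use_flash_attention_3", [False, "auto"])
def test_fa3_config(use_flash_attention_3):
    # Skip the FA3 variant on anything that is not a Hopper-class GPU.
    if use_flash_attention_3 and not is_hopper_gpu():
        pytest.skip("flash attention 3 requires a Hopper-class GPU")
    # Build a config the same way the real tests do, passing the
    # parametrized flag through to the cfg dict.
    cfg = {"flash_attention": True, "use_flash_attention_3": use_flash_attention_3}
    assert cfg["use_flash_attention_3"] in (False, "auto")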

View File

@@ -101,7 +101,13 @@ class TestMultiGPULlama:
"gradient_accumulation_steps",
[1, 2],
)
def test_lora_ddp_packed(self, temp_dir, gradient_accumulation_steps):
@pytest.mark.parametrize(
"use_flash_attention_3",
[False, "auto"],
)
def test_lora_ddp_packed(
self, temp_dir, gradient_accumulation_steps, use_flash_attention_3
):
# pylint: disable=duplicate-code
cfg = DictDefault(
{
@@ -138,6 +144,7 @@ class TestMultiGPULlama:
"flash_attention": True,
"use_tensorboard": True,
"bf16": True,
"use_flash_attention_3": use_flash_attention_3,
}
)

View File

@@ -5,7 +5,6 @@ E2E tests for packed training
 import logging
 import os
-import pytest
 from transformers.utils import is_torch_bf16_gpu_available
 from axolotl.cli.args import TrainerCliArgs
@@ -25,11 +24,7 @@ class TestPackedLlama:
     Test case for Packed training of llama models
     """
-    @pytest.mark.parametrize(
-        "use_flash_attention_3",
-        [False, "auto"],
-    )
-    def test_loss_packed(self, temp_dir, use_flash_attention_3):
+    def test_loss_packed(self, temp_dir):
         # pylint: disable=duplicate-code
         cfg = DictDefault(
             {
@@ -57,7 +52,6 @@ class TestPackedLlama:
"lr_scheduler": "cosine",
"max_steps": 5,
"use_tensorboard": True,
"use_flash_attention_3": use_flash_attention_3,
}
)
if is_torch_bf16_gpu_available():