diff --git a/tests/e2e/multigpu/test_llama.py b/tests/e2e/multigpu/test_llama.py
index 38e6e741a..705d9240f 100644
--- a/tests/e2e/multigpu/test_llama.py
+++ b/tests/e2e/multigpu/test_llama.py
@@ -101,7 +101,13 @@ class TestMultiGPULlama:
         "gradient_accumulation_steps",
         [1, 2],
     )
-    def test_lora_ddp_packed(self, temp_dir, gradient_accumulation_steps):
+    @pytest.mark.parametrize(
+        "use_flash_attention_3",
+        [False, "auto"],
+    )
+    def test_lora_ddp_packed(
+        self, temp_dir, gradient_accumulation_steps, use_flash_attention_3
+    ):
         # pylint: disable=duplicate-code
         cfg = DictDefault(
             {
@@ -138,6 +144,7 @@ class TestMultiGPULlama:
                 "flash_attention": True,
                 "use_tensorboard": True,
                 "bf16": True,
+                "use_flash_attention_3": use_flash_attention_3,
             }
         )

diff --git a/tests/e2e/test_packing_loss.py b/tests/e2e/test_packing_loss.py
index 45894718e..e6ceb5156 100644
--- a/tests/e2e/test_packing_loss.py
+++ b/tests/e2e/test_packing_loss.py
@@ -5,7 +5,6 @@ E2E tests for packed training
 import logging
 import os
 
-import pytest
 from transformers.utils import is_torch_bf16_gpu_available
 
 from axolotl.cli.args import TrainerCliArgs
@@ -25,11 +24,7 @@ class TestPackedLlama:
     Test case for Packed training of llama models
     """
 
-    @pytest.mark.parametrize(
-        "use_flash_attention_3",
-        [False, "auto"],
-    )
-    def test_loss_packed(self, temp_dir, use_flash_attention_3):
+    def test_loss_packed(self, temp_dir):
         # pylint: disable=duplicate-code
         cfg = DictDefault(
             {
@@ -57,7 +52,6 @@ class TestPackedLlama:
                 "lr_scheduler": "cosine",
                 "max_steps": 5,
                 "use_tensorboard": True,
-                "use_flash_attention_3": use_flash_attention_3,
             }
         )
         if is_torch_bf16_gpu_available():
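
For context, a minimal sketch (not part of this diff) of what the stacked @pytest.mark.parametrize decorators on test_lora_ddp_packed produce: pytest expands stacked decorators into the Cartesian product of their values, so the test is now collected once per combination of gradient_accumulation_steps and use_flash_attention_3 (four cases total). The test name below is hypothetical.

import pytest


@pytest.mark.parametrize("gradient_accumulation_steps", [1, 2])
@pytest.mark.parametrize("use_flash_attention_3", [False, "auto"])
def test_param_product_sketch(gradient_accumulation_steps, use_flash_attention_3):
    # pytest collects one test case per combination of the two parameters:
    # (1, False), (1, "auto"), (2, False), (2, "auto")
    assert gradient_accumulation_steps in (1, 2)
    assert use_flash_attention_3 in (False, "auto")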