diff --git a/tests/e2e/multigpu/test_llama.py b/tests/e2e/multigpu/test_llama.py
index 38e6e741a..705d9240f 100644
--- a/tests/e2e/multigpu/test_llama.py
+++ b/tests/e2e/multigpu/test_llama.py
@@ -101,7 +101,13 @@ class TestMultiGPULlama:
         "gradient_accumulation_steps",
         [1, 2],
     )
-    def test_lora_ddp_packed(self, temp_dir, gradient_accumulation_steps):
+    @pytest.mark.parametrize(
+        "use_flash_attention_3",
+        [False, "auto"],
+    )
+    def test_lora_ddp_packed(
+        self, temp_dir, gradient_accumulation_steps, use_flash_attention_3
+    ):
         # pylint: disable=duplicate-code
         cfg = DictDefault(
             {
@@ -138,6 +144,7 @@ class TestMultiGPULlama:
                 "flash_attention": True,
                 "use_tensorboard": True,
                 "bf16": True,
+                "use_flash_attention_3": use_flash_attention_3,
             }
         )

diff --git a/tests/e2e/test_packing_loss.py b/tests/e2e/test_packing_loss.py
index 45894718e..e6ceb5156 100644
--- a/tests/e2e/test_packing_loss.py
+++ b/tests/e2e/test_packing_loss.py
@@ -5,7 +5,6 @@ E2E tests for packed training
 import logging
 import os
 
-import pytest
 from transformers.utils import is_torch_bf16_gpu_available
 
 from axolotl.cli.args import TrainerCliArgs
@@ -25,11 +24,7 @@ class TestPackedLlama:
     Test case for Packed training of llama models
     """
 
-    @pytest.mark.parametrize(
-        "use_flash_attention_3",
-        [False, "auto"],
-    )
-    def test_loss_packed(self, temp_dir, use_flash_attention_3):
+    def test_loss_packed(self, temp_dir):
         # pylint: disable=duplicate-code
         cfg = DictDefault(
             {
@@ -57,7 +52,6 @@ class TestPackedLlama:
                 "lr_scheduler": "cosine",
                 "max_steps": 5,
                 "use_tensorboard": True,
-                "use_flash_attention_3": use_flash_attention_3,
             }
         )
         if is_torch_bf16_gpu_available():
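
For context, a minimal sketch (not part of this diff) of what the stacked @pytest.mark.parametrize decorators on test_lora_ddp_packed produce: pytest expands stacked decorators into the Cartesian product of their values, so the test is now collected once per combination of gradient_accumulation_steps and use_flash_attention_3 (four cases total). The test name below is hypothetical.

import pytest


@pytest.mark.parametrize("gradient_accumulation_steps", [1, 2])
@pytest.mark.parametrize("use_flash_attention_3", [False, "auto"])
def test_param_product_sketch(gradient_accumulation_steps, use_flash_attention_3):
    # pytest collects one test case per combination of the two parameters:
    # (1, False), (1, "auto"), (2, False), (2, "auto")
    assert gradient_accumulation_steps in (1, 2)
    assert use_flash_attention_3 in (False, "auto")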