From 0102ca5943eefa5c374e38bc9eab2510f577b476 Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Sun, 6 Jul 2025 19:11:46 -0400
Subject: [PATCH] fix cfg merge

---
 tests/conftest.py                |  7 +---
 tests/e2e/multigpu/test_llama.py | 64 ++++++++++++++++----------------
 2 files changed, 34 insertions(+), 37 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 51b5b1049..23012cfc3 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -10,7 +10,7 @@ import shutil
 import sys
 import tempfile
 import time
-from pathlib import Path, PosixPath
+from pathlib import Path
 from typing import Generator
 
 import datasets
@@ -432,11 +432,6 @@ def module_temp_dir() -> Generator[str, None, None]:
     shutil.rmtree(_temp_dir)
 
 
-@pytest.fixture(scope="function", autouse=True)
-def unique_triton_cache_dir(temp_dir: str | PosixPath) -> None:
-    os.environ["TRITON_CACHE_DIR"] = str(temp_dir) + "/.triton/cache"
-
-
 @pytest.fixture(scope="function", autouse=True)
 def cleanup_monkeypatches():
     from transformers import Trainer
diff --git a/tests/e2e/multigpu/test_llama.py b/tests/e2e/multigpu/test_llama.py
index 4ced79632..30e7446b1 100644
--- a/tests/e2e/multigpu/test_llama.py
+++ b/tests/e2e/multigpu/test_llama.py
@@ -32,7 +32,7 @@ def sft_base_cfg():
     cfg = DictDefault(
         base_model="HuggingFaceTB/SmolLM2-135M",
         tokenizer_config="HuggingFaceTB/SmolLM2-135M",  # this has to be manually set since we haven't done validation
-        sequence_len=2048,
+        sequence_len=1024,
         special_tokens={
             "pad_token": "<|endoftext|>",
         },
@@ -442,36 +442,38 @@ class TestMultiGPULlama:
         self, temp_dir, sft_prepared_dataset_alpaca_cfg, fsdp_state_dict_type
     ):
         # pylint: disable=duplicate-code
-        cfg = DictDefault(
-            {
-                "pad_to_sequence_len": True,
-                "num_epochs": 1,
-                "max_steps": 2,
-                "micro_batch_size": 2,
-                "gradient_accumulation_steps": 2,
-                # "gradient_checkpointing": True,
-                "output_dir": temp_dir,
-                "dataset_prepared_path": temp_dir + "/last_run_prepared",
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_torch_fused",
-                "lr_scheduler": "cosine",
-                "flash_attention": True,
-                "fsdp": [
-                    "full_shard",
-                    "auto_wrap",
-                ],
-                "fsdp_config": {
-                    "fsdp_limit_all_gathers": True,
-                    "fsdp_offload_params": False,
-                    "fsdp_sync_module_states": True,
-                    "fsdp_use_orig_params": False,
-                    "fsdp_cpu_ram_efficient_loading": False,
-                    "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
-                    "fsdp_state_dict_type": fsdp_state_dict_type,
-                    "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
-                },
-                "use_tensorboard": True,
-            }
+        cfg = (
+            DictDefault(
+                {
+                    "pad_to_sequence_len": True,
+                    "num_epochs": 1,
+                    "max_steps": 2,
+                    "micro_batch_size": 2,
+                    "gradient_accumulation_steps": 2,
+                    # "gradient_checkpointing": True,
+                    "output_dir": temp_dir,
+                    "dataset_prepared_path": temp_dir + "/last_run_prepared",
+                    "learning_rate": 0.00001,
+                    "optimizer": "adamw_torch_fused",
+                    "lr_scheduler": "cosine",
+                    "flash_attention": True,
+                    "fsdp": [
+                        "full_shard",
+                        "auto_wrap",
+                    ],
+                    "fsdp_config": {
+                        "fsdp_limit_all_gathers": True,
+                        "fsdp_offload_params": False,
+                        "fsdp_sync_module_states": True,
+                        "fsdp_use_orig_params": False,
+                        "fsdp_cpu_ram_efficient_loading": False,
+                        "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
+                        "fsdp_state_dict_type": fsdp_state_dict_type,
+                        "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
+                    },
+                    "use_tensorboard": True,
+                }
+            ) | sft_prepared_dataset_alpaca_cfg
         )
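
Note on the merge semantics (a minimal sketch, not part of the patch): this
assumes DictDefault follows plain-dict "|" union (PEP 584), where the
right-hand operand wins on key collisions, so values from the
sft_prepared_dataset_alpaca_cfg fixture override the test-local overrides.
The names and values below are illustrative stand-ins, not taken from the
repo:

    # Plain dicts standing in for DictDefault; axolotl's DictDefault is
    # assumed to behave the same way under "|" (right operand wins on
    # duplicate keys).
    test_overrides = {
        "max_steps": 2,
        "sequence_len": 512,  # hypothetical local value
    }
    fixture_cfg = {
        "base_model": "HuggingFaceTB/SmolLM2-135M",
        "sequence_len": 1024,  # hypothetical fixture value
    }

    cfg = test_overrides | fixture_cfg
    print(cfg["sequence_len"])  # 1024: the fixture (right operand) wins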