From 6130e40c37d7ddefc8ab10b5690a44ff99df88ad Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Sun, 22 Mar 2026 10:38:46 -0400
Subject: [PATCH] fix flaky tests; should be using train loss from final step
 rather than final avg train loss

---
 tests/e2e/multigpu/patched/test_sp.py        |  2 +-
 tests/e2e/multigpu/solo/test_flex.py         |  2 +-
 tests/e2e/multigpu/test_dist_muon_fsdp2.py   |  2 +-
 tests/e2e/multigpu/test_fp8_fsdp2.py         |  2 +-
 tests/e2e/multigpu/test_fsdp1.py             |  2 +-
 tests/e2e/multigpu/test_fsdp2.py             |  2 +-
 tests/e2e/multigpu/test_gemma3.py            |  2 +-
 tests/e2e/multigpu/test_llama.py             | 24 ++++++++++++------------
 tests/e2e/multigpu/test_ray.py               |  6 +++---
 tests/e2e/multigpu/test_tp.py                |  2 +-
 tests/e2e/patched/test_fa_xentropy.py        |  2 +-
 tests/e2e/patched/test_flattening.py         |  2 +-
 tests/e2e/patched/test_unsloth_qlora.py      |  6 +++---
 tests/e2e/solo/test_flex.py                  |  2 +-
 tests/e2e/test_embeddings_lr.py              |  8 ++------
 tests/e2e/test_llama_pretrain.py             |  2 +-
 tests/e2e/test_packing_loss.py               |  2 +-
 .../e2e/test_process_reward_model_smollm2.py |  2 +-
 tests/e2e/test_qat.py                        |  2 +-
 tests/e2e/test_reward_model_smollm2.py       |  2 +-
 tests/e2e/test_streaming.py                  |  2 +-
 21 files changed, 37 insertions(+), 41 deletions(-)

diff --git a/tests/e2e/multigpu/patched/test_sp.py b/tests/e2e/multigpu/patched/test_sp.py
index a005e6742..ba0636879 100644
--- a/tests/e2e/multigpu/patched/test_sp.py
+++ b/tests/e2e/multigpu/patched/test_sp.py
@@ -94,7 +94,7 @@ class TestSequenceParallelism:
 
         check_tensorboard(
             temp_dir + "/runs",
-            "train/train_loss",
+            "train/loss",
             threshold,
             "Train Loss (%s) is too high",
         )
diff --git a/tests/e2e/multigpu/solo/test_flex.py b/tests/e2e/multigpu/solo/test_flex.py
index 881d75c25..ae98a2bab 100644
--- a/tests/e2e/multigpu/solo/test_flex.py
+++ b/tests/e2e/multigpu/solo/test_flex.py
@@ -86,5 +86,5 @@ class TestPackedFlex:
         )
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.1, "Train Loss (%s) is too high"
         )
diff --git a/tests/e2e/multigpu/test_dist_muon_fsdp2.py b/tests/e2e/multigpu/test_dist_muon_fsdp2.py
index 93db473a9..0cdeb0c66 100644
--- a/tests/e2e/multigpu/test_dist_muon_fsdp2.py
+++ b/tests/e2e/multigpu/test_dist_muon_fsdp2.py
@@ -37,7 +37,7 @@ def verify_training_success(temp_dir):
     event_file = os.path.join(tb_log_path, event_files[0])
     reader = SummaryReader(event_file)
     df = reader.scalars
-    train_loss_df = df[df.tag == "train/train_loss"]
+    train_loss_df = df[df.tag == "train/loss"]
     if len(train_loss_df) > 0:
         final_loss = train_loss_df.value.values[-1]
         assert not torch.isnan(torch.tensor(final_loss)), (
diff --git a/tests/e2e/multigpu/test_fp8_fsdp2.py b/tests/e2e/multigpu/test_fp8_fsdp2.py
index 8d7c01ce8..cc6c5d440 100644
--- a/tests/e2e/multigpu/test_fp8_fsdp2.py
+++ b/tests/e2e/multigpu/test_fp8_fsdp2.py
@@ -37,7 +37,7 @@ def verify_fp8_training_success(temp_dir):
     event_file = os.path.join(tb_log_path, event_files[0])
     reader = SummaryReader(event_file)
     df = reader.scalars
-    train_loss_df = df[df.tag == "train/train_loss"]
+    train_loss_df = df[df.tag == "train/loss"]
    if len(train_loss_df) > 0:
         final_loss = train_loss_df.value.values[-1]
         assert not torch.isnan(torch.tensor(final_loss)), (
diff --git a/tests/e2e/multigpu/test_fsdp1.py b/tests/e2e/multigpu/test_fsdp1.py
index 5b6724791..e8537970c 100644
--- a/tests/e2e/multigpu/test_fsdp1.py
+++ b/tests/e2e/multigpu/test_fsdp1.py
@@ -38,7 +38,7 @@ def verify_training_success(temp_dir):
     event_file = os.path.join(tb_log_path, event_files[0])
     reader = SummaryReader(event_file)
     df = reader.scalars
-    train_loss_df = df[df.tag == "train/train_loss"]
+    train_loss_df = df[df.tag == "train/loss"]
     if len(train_loss_df) > 0:
         final_loss = train_loss_df.value.values[-1]
         assert not torch.isnan(torch.tensor(final_loss)), (
diff --git a/tests/e2e/multigpu/test_fsdp2.py b/tests/e2e/multigpu/test_fsdp2.py
index a70ff9aa7..1c5c78aeb 100644
--- a/tests/e2e/multigpu/test_fsdp2.py
+++ b/tests/e2e/multigpu/test_fsdp2.py
@@ -38,7 +38,7 @@ def verify_training_success(temp_dir):
     event_file = os.path.join(tb_log_path, event_files[0])
     reader = SummaryReader(event_file)
     df = reader.scalars
-    train_loss_df = df[df.tag == "train/train_loss"]
+    train_loss_df = df[df.tag == "train/loss"]
     if len(train_loss_df) > 0:
         final_loss = train_loss_df.value.values[-1]
         assert not torch.isnan(torch.tensor(final_loss)), (
diff --git a/tests/e2e/multigpu/test_gemma3.py b/tests/e2e/multigpu/test_gemma3.py
index 34f98c037..931f62ac5 100644
--- a/tests/e2e/multigpu/test_gemma3.py
+++ b/tests/e2e/multigpu/test_gemma3.py
@@ -94,5 +94,5 @@ class TestMultiGPUGemma3:
         )
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 1.8, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 1.8, "Train Loss (%s) is too high"
         )
diff --git a/tests/e2e/multigpu/test_llama.py b/tests/e2e/multigpu/test_llama.py
index 1e3757dcf..fe1f8a5ee 100644
--- a/tests/e2e/multigpu/test_llama.py
+++ b/tests/e2e/multigpu/test_llama.py
@@ -90,7 +90,7 @@ class TestMultiGPULlama:
         )
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.8, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.8, "Train Loss (%s) is too high"
         )
 
     @pytest.mark.parametrize(
@@ -156,7 +156,7 @@ class TestMultiGPULlama:
         )
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
         )
 
     def test_dpo_lora_ddp(self, temp_dir):
@@ -233,7 +233,7 @@ class TestMultiGPULlama:
         loss_threshold = 2.3
         check_tensorboard(
             temp_dir + "/runs",
-            "train/train_loss",
+            "train/loss",
             loss_threshold,
             "Train Loss (%s) is too high",
         )
@@ -312,7 +312,7 @@ class TestMultiGPULlama:
         loss_threshold = 2.3
         check_tensorboard(
             temp_dir + "/runs",
-            "train/train_loss",
+            "train/loss",
             loss_threshold,
             "Train Loss (%s) is too high",
         )
@@ -385,7 +385,7 @@ class TestMultiGPULlama:
         )
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
         )
 
     @pytest.mark.parametrize(
@@ -461,7 +461,7 @@ class TestMultiGPULlama:
         )
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
         )
 
     @require_torch_2_6_0
@@ -543,7 +543,7 @@ class TestMultiGPULlama:
         )
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.1, "Train Loss (%s) is too high"
         )
 
     def test_fsdp_qlora_prequant_packed(self, temp_dir):
@@ -623,7 +623,7 @@ class TestMultiGPULlama:
         )
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
         )
 
     @pytest.mark.parametrize(
@@ -708,7 +708,7 @@ class TestMultiGPULlama:
         )
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.45, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.45, "Train Loss (%s) is too high"
         )
 
     @pytest.mark.parametrize(
@@ -784,7 +784,7 @@ class TestMultiGPULlama:
         )
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
         )
 
     @pytest.mark.parametrize(
@@ -859,7 +859,7 @@ class TestMultiGPULlama:
         )
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.5, "Train Loss (%s) is too high"
         )
 
     @pytest.mark.skip(
@@ -925,5 +925,5 @@ class TestMultiGPULlama:
         )
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 4.0, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 4.0, "Train Loss (%s) is too high"
         )
diff --git a/tests/e2e/multigpu/test_ray.py b/tests/e2e/multigpu/test_ray.py
index df41b1444..494b7eb05 100644
--- a/tests/e2e/multigpu/test_ray.py
+++ b/tests/e2e/multigpu/test_ray.py
@@ -79,7 +79,7 @@ class TestMultiGPURay:
         )
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
         )
 
     @require_torch_2_7_0
@@ -138,7 +138,7 @@ class TestMultiGPURay:
         )
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
         )
 
     @require_torch_2_7_0
@@ -205,5 +205,5 @@ class TestMultiGPURay:
         )
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
         )
diff --git a/tests/e2e/multigpu/test_tp.py b/tests/e2e/multigpu/test_tp.py
index 9891a0906..905c753ac 100644
--- a/tests/e2e/multigpu/test_tp.py
+++ b/tests/e2e/multigpu/test_tp.py
@@ -64,5 +64,5 @@ class TestTensorParallel:
         )
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 1.0, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 1.0, "Train Loss (%s) is too high"
         )
diff --git a/tests/e2e/patched/test_fa_xentropy.py b/tests/e2e/patched/test_fa_xentropy.py
index 9f4699854..d14a5a5de 100644
--- a/tests/e2e/patched/test_fa_xentropy.py
+++ b/tests/e2e/patched/test_fa_xentropy.py
@@ -78,5 +78,5 @@ class TestFAXentropyLlama:
         check_model_output_exists(temp_dir, cfg)
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 1.5, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 1.5, "Train Loss (%s) is too high"
         )
diff --git a/tests/e2e/patched/test_flattening.py b/tests/e2e/patched/test_flattening.py
index 2c247d406..e7546c092 100644
--- a/tests/e2e/patched/test_flattening.py
+++ b/tests/e2e/patched/test_flattening.py
@@ -77,5 +77,5 @@ class TestFAFlattening:
         check_model_output_exists(temp_dir, cfg)
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 1.5, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 1.5, "Train Loss (%s) is too high"
         )
diff --git a/tests/e2e/patched/test_unsloth_qlora.py b/tests/e2e/patched/test_unsloth_qlora.py
index bf00e8a5f..b75ce3588 100644
--- a/tests/e2e/patched/test_unsloth_qlora.py
+++ b/tests/e2e/patched/test_unsloth_qlora.py
@@ -73,7 +73,7 @@ class TestUnslothQLoRA:
         check_model_output_exists(temp_dir, cfg)
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.0, "Train Loss (%s) is too high"
         )
 
     def test_unsloth_llama_qlora_unpacked(self, temp_dir):
@@ -124,7 +124,7 @@ class TestUnslothQLoRA:
         check_model_output_exists(temp_dir, cfg)
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.0, "Train Loss (%s) is too high"
         )
 
     @pytest.mark.parametrize(
@@ -180,5 +180,5 @@ class TestUnslothQLoRA:
         check_model_output_exists(temp_dir, cfg)
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.0, "Train Loss (%s) is too high"
         )
diff --git a/tests/e2e/solo/test_flex.py b/tests/e2e/solo/test_flex.py
index abe8fb69a..1d08ef082 100644
--- a/tests/e2e/solo/test_flex.py
+++ b/tests/e2e/solo/test_flex.py
@@ -63,5 +63,5 @@ class TestPackedFlex(unittest.TestCase):
         train(cfg=cfg, dataset_meta=dataset_meta)
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.1, "Train Loss (%s) is too high"
         )
diff --git a/tests/e2e/test_embeddings_lr.py b/tests/e2e/test_embeddings_lr.py
index 2b2e8e5e8..a9d3b20a3 100644
--- a/tests/e2e/test_embeddings_lr.py
+++ b/tests/e2e/test_embeddings_lr.py
@@ -57,9 +57,7 @@ class TestEmbeddingsLrScale(unittest.TestCase):
         train(cfg=cfg, dataset_meta=dataset_meta)
         check_model_output_exists(temp_dir, cfg)
 
-        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.0, "Loss is too high"
-        )
+        check_tensorboard(temp_dir + "/runs", "train/loss", 2.0, "Loss is too high")
 
     @with_temp_dir
     def test_train_w_embedding_lr(self, temp_dir):
@@ -100,6 +98,4 @@ class TestEmbeddingsLrScale(unittest.TestCase):
         train(cfg=cfg, dataset_meta=dataset_meta)
         check_model_output_exists(temp_dir, cfg)
 
-        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.0, "Loss is too high"
-        )
+        check_tensorboard(temp_dir + "/runs", "train/loss", 2.0, "Loss is too high")
diff --git a/tests/e2e/test_llama_pretrain.py b/tests/e2e/test_llama_pretrain.py
index 3aa594fbd..e259491ef 100644
--- a/tests/e2e/test_llama_pretrain.py
+++ b/tests/e2e/test_llama_pretrain.py
@@ -66,7 +66,7 @@ class TestPretrainLlama:
         loss_threshold = 6.5
         check_tensorboard(
             temp_dir + "/runs",
-            "train/train_loss",
+            "train/loss",
             loss_threshold,
             "Train Loss (%s) is too high",
         )
diff --git a/tests/e2e/test_packing_loss.py b/tests/e2e/test_packing_loss.py
index 7cb979ce6..f3f578702 100644
--- a/tests/e2e/test_packing_loss.py
+++ b/tests/e2e/test_packing_loss.py
@@ -62,5 +62,5 @@ class TestPackedLlama(unittest.TestCase):
         train(cfg=cfg, dataset_meta=dataset_meta)
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.0, "Train Loss (%s) is too high"
         )
diff --git a/tests/e2e/test_process_reward_model_smollm2.py b/tests/e2e/test_process_reward_model_smollm2.py
index 9d83aabbc..33c65093c 100644
--- a/tests/e2e/test_process_reward_model_smollm2.py
+++ b/tests/e2e/test_process_reward_model_smollm2.py
@@ -57,7 +57,7 @@ class TestProcessRewardSmolLM2(unittest.TestCase):
         train(cfg=cfg, dataset_meta=dataset_meta)
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.7, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.7, "Train Loss (%s) is too high"
         )
 
         check_model_output_exists(temp_dir, cfg)
diff --git a/tests/e2e/test_qat.py b/tests/e2e/test_qat.py
index 251d5b17b..b2706e142 100644
--- a/tests/e2e/test_qat.py
+++ b/tests/e2e/test_qat.py
@@ -128,7 +128,7 @@ class TestQATLlama:
         loss_threshold = 2.3
         check_tensorboard(
             temp_dir + "/runs",
-            "train/train_loss",
+            "train/loss",
             loss_threshold,
             "Train Loss (%s) is too high",
         )
diff --git a/tests/e2e/test_reward_model_smollm2.py b/tests/e2e/test_reward_model_smollm2.py
index cc768b173..58259e5e9 100644
--- a/tests/e2e/test_reward_model_smollm2.py
+++ b/tests/e2e/test_reward_model_smollm2.py
@@ -66,6 +66,6 @@ class TestRewardModelLoraSmolLM2(unittest.TestCase):
         train(cfg=cfg, dataset_meta=dataset_meta)
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.5, "Train Loss (%s) is too high"
         )
         check_model_output_exists(temp_dir, cfg)
diff --git a/tests/e2e/test_streaming.py b/tests/e2e/test_streaming.py
index 404fb53da..53329a9ae 100644
--- a/tests/e2e/test_streaming.py
+++ b/tests/e2e/test_streaming.py
@@ -66,7 +66,7 @@ class TestStreamingDatasets:
         # Verify training actually happened by checking loss decrease
         check_tensorboard(
             temp_dir + "/runs",
-            "train/train_loss",
+            "train/loss",
             3.0,
             "Train Loss (%s) is too high",
         )
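
Note for reviewers: the `check_tensorboard` helper asserted on throughout this
patch lives in the shared e2e test utilities and is not part of this diff. A
minimal sketch of what it is assumed to do, written against the tbparse
`SummaryReader` API that the `verify_*` helpers above already use (the real
helper's name matches, but its signature and internals are an assumption here):

    # Hypothetical sketch, not the repo's actual helper.
    from tbparse import SummaryReader

    def check_tensorboard(log_dir, tag, threshold, message):
        """Assert the most recently logged value for `tag` is below `threshold`."""
        # tbparse reads every event file under log_dir into a DataFrame
        # with `tag` and `value` columns, one row per logged scalar.
        df = SummaryReader(log_dir).scalars
        values = df[df.tag == tag].value.values
        assert len(values) > 0, f"no scalars logged under tag {tag!r}"
        final = values[-1]  # loss at the final logged step, not the run average
        assert final < threshold, message % final

The tag switch matters because "train/loss" is logged at each logging step, so
its last entry is the loss of the final step, whereas "train/train_loss" is the
single end-of-run average reported by the trainer; thresholding the average made
the tests flaky, as the commit subject describes.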