From 6130e40c37d7ddefc8ab10b5690a44ff99df88ad Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Sun, 22 Mar 2026 10:38:46 -0400
Subject: [PATCH] fix flaky tests; should be using train loss from final step
 rather than final avg train loss

---
 tests/e2e/multigpu/patched/test_sp.py        |  2 +-
 tests/e2e/multigpu/solo/test_flex.py         |  2 +-
 tests/e2e/multigpu/test_dist_muon_fsdp2.py   |  2 +-
 tests/e2e/multigpu/test_fp8_fsdp2.py         |  2 +-
 tests/e2e/multigpu/test_fsdp1.py             |  2 +-
 tests/e2e/multigpu/test_fsdp2.py             |  2 +-
 tests/e2e/multigpu/test_gemma3.py            |  2 +-
 tests/e2e/multigpu/test_llama.py             | 24 ++++++++++++------------
 tests/e2e/multigpu/test_ray.py               |  6 +++---
 tests/e2e/multigpu/test_tp.py                |  2 +-
 tests/e2e/patched/test_fa_xentropy.py        |  2 +-
 tests/e2e/patched/test_flattening.py         |  2 +-
 tests/e2e/patched/test_unsloth_qlora.py      |  6 +++---
 tests/e2e/solo/test_flex.py                  |  2 +-
 tests/e2e/test_embeddings_lr.py              |  8 ++------
 tests/e2e/test_llama_pretrain.py             |  2 +-
 tests/e2e/test_packing_loss.py               |  2 +-
 .../e2e/test_process_reward_model_smollm2.py |  2 +-
 tests/e2e/test_qat.py                        |  2 +-
 tests/e2e/test_reward_model_smollm2.py       |  2 +-
 tests/e2e/test_streaming.py                  |  2 +-
 21 files changed, 37 insertions(+), 41 deletions(-)

diff --git a/tests/e2e/multigpu/patched/test_sp.py b/tests/e2e/multigpu/patched/test_sp.py
index a005e6742..ba0636879 100644
--- a/tests/e2e/multigpu/patched/test_sp.py
+++ b/tests/e2e/multigpu/patched/test_sp.py
@@ -94,7 +94,7 @@ class TestSequenceParallelism:
 
         check_tensorboard(
             temp_dir + "/runs",
-            "train/train_loss",
+            "train/loss",
             threshold,
             "Train Loss (%s) is too high",
         )
diff --git a/tests/e2e/multigpu/solo/test_flex.py b/tests/e2e/multigpu/solo/test_flex.py
index 881d75c25..ae98a2bab 100644
--- a/tests/e2e/multigpu/solo/test_flex.py
+++ b/tests/e2e/multigpu/solo/test_flex.py
@@ -86,5 +86,5 @@ class TestPackedFlex:
         )
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.1, "Train Loss (%s) is too high"
         )
diff --git a/tests/e2e/multigpu/test_dist_muon_fsdp2.py b/tests/e2e/multigpu/test_dist_muon_fsdp2.py
index 93db473a9..0cdeb0c66 100644
--- a/tests/e2e/multigpu/test_dist_muon_fsdp2.py
+++ b/tests/e2e/multigpu/test_dist_muon_fsdp2.py
@@ -37,7 +37,7 @@ def verify_training_success(temp_dir):
     event_file = os.path.join(tb_log_path, event_files[0])
     reader = SummaryReader(event_file)
     df = reader.scalars
-    train_loss_df = df[df.tag == "train/train_loss"]
+    train_loss_df = df[df.tag == "train/loss"]
     if len(train_loss_df) > 0:
         final_loss = train_loss_df.value.values[-1]
         assert not torch.isnan(torch.tensor(final_loss)), (
diff --git a/tests/e2e/multigpu/test_fp8_fsdp2.py b/tests/e2e/multigpu/test_fp8_fsdp2.py
index 8d7c01ce8..cc6c5d440 100644
--- a/tests/e2e/multigpu/test_fp8_fsdp2.py
+++ b/tests/e2e/multigpu/test_fp8_fsdp2.py
@@ -37,7 +37,7 @@ def verify_fp8_training_success(temp_dir):
     event_file = os.path.join(tb_log_path, event_files[0])
     reader = SummaryReader(event_file)
     df = reader.scalars
-    train_loss_df = df[df.tag == "train/train_loss"]
+    train_loss_df = df[df.tag == "train/loss"]
    if len(train_loss_df) > 0:
         final_loss = train_loss_df.value.values[-1]
         assert not torch.isnan(torch.tensor(final_loss)), (
diff --git a/tests/e2e/multigpu/test_fsdp1.py b/tests/e2e/multigpu/test_fsdp1.py
index 5b6724791..e8537970c 100644
--- a/tests/e2e/multigpu/test_fsdp1.py
+++ b/tests/e2e/multigpu/test_fsdp1.py
@@ -38,7 +38,7 @@ def verify_training_success(temp_dir):
     event_file = os.path.join(tb_log_path, event_files[0])
     reader = SummaryReader(event_file)
     df = reader.scalars
-    train_loss_df = df[df.tag == "train/train_loss"]
+    train_loss_df = df[df.tag == "train/loss"]
     if len(train_loss_df) > 0:
         final_loss = train_loss_df.value.values[-1]
         assert not torch.isnan(torch.tensor(final_loss)), (
diff --git a/tests/e2e/multigpu/test_fsdp2.py b/tests/e2e/multigpu/test_fsdp2.py
index a70ff9aa7..1c5c78aeb 100644
--- a/tests/e2e/multigpu/test_fsdp2.py
+++ b/tests/e2e/multigpu/test_fsdp2.py
@@ -38,7 +38,7 @@ def verify_training_success(temp_dir):
     event_file = os.path.join(tb_log_path, event_files[0])
     reader = SummaryReader(event_file)
     df = reader.scalars
-    train_loss_df = df[df.tag == "train/train_loss"]
+    train_loss_df = df[df.tag == "train/loss"]
     if len(train_loss_df) > 0:
         final_loss = train_loss_df.value.values[-1]
         assert not torch.isnan(torch.tensor(final_loss)), (
diff --git a/tests/e2e/multigpu/test_gemma3.py b/tests/e2e/multigpu/test_gemma3.py
index 34f98c037..931f62ac5 100644
--- a/tests/e2e/multigpu/test_gemma3.py
+++ b/tests/e2e/multigpu/test_gemma3.py
@@ -94,5 +94,5 @@ class TestMultiGPUGemma3:
         )
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 1.8, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 1.8, "Train Loss (%s) is too high"
         )
diff --git a/tests/e2e/multigpu/test_llama.py b/tests/e2e/multigpu/test_llama.py
index 1e3757dcf..fe1f8a5ee 100644
--- a/tests/e2e/multigpu/test_llama.py
+++ b/tests/e2e/multigpu/test_llama.py
@@ -90,7 +90,7 @@ class TestMultiGPULlama:
         )
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.8, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.8, "Train Loss (%s) is too high"
         )
 
     @pytest.mark.parametrize(
@@ -156,7 +156,7 @@ class TestMultiGPULlama:
         )
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
         )
 
     def test_dpo_lora_ddp(self, temp_dir):
@@ -233,7 +233,7 @@ class TestMultiGPULlama:
         loss_threshold = 2.3
         check_tensorboard(
             temp_dir + "/runs",
-            "train/train_loss",
+            "train/loss",
             loss_threshold,
             "Train Loss (%s) is too high",
         )
@@ -312,7 +312,7 @@ class TestMultiGPULlama:
         loss_threshold = 2.3
         check_tensorboard(
             temp_dir + "/runs",
-            "train/train_loss",
+            "train/loss",
             loss_threshold,
             "Train Loss (%s) is too high",
         )
@@ -385,7 +385,7 @@ class TestMultiGPULlama:
         )
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
         )
 
     @pytest.mark.parametrize(
@@ -461,7 +461,7 @@ class TestMultiGPULlama:
         )
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
         )
 
     @require_torch_2_6_0
@@ -543,7 +543,7 @@ class TestMultiGPULlama:
         )
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.1, "Train Loss (%s) is too high"
         )
 
     def test_fsdp_qlora_prequant_packed(self, temp_dir):
@@ -623,7 +623,7 @@ class TestMultiGPULlama:
         )
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
         )
 
     @pytest.mark.parametrize(
@@ -708,7 +708,7 @@ class TestMultiGPULlama:
         )
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.45, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.45, "Train Loss (%s) is too high"
         )
 
     @pytest.mark.parametrize(
@@ -784,7 +784,7 @@ class TestMultiGPULlama:
         )
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
         )
 
     @pytest.mark.parametrize(
@@ -859,7 +859,7 @@ class TestMultiGPULlama:
         )
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.5, "Train Loss (%s) is too high"
         )
 
     @pytest.mark.skip(
@@ -925,5 +925,5 @@ class TestMultiGPULlama:
         )
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 4.0, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 4.0, "Train Loss (%s) is too high"
         )
diff --git a/tests/e2e/multigpu/test_ray.py b/tests/e2e/multigpu/test_ray.py
index df41b1444..494b7eb05 100644
--- a/tests/e2e/multigpu/test_ray.py
+++ b/tests/e2e/multigpu/test_ray.py
@@ -79,7 +79,7 @@ class TestMultiGPURay:
         )
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
         )
 
     @require_torch_2_7_0
@@ -138,7 +138,7 @@ class TestMultiGPURay:
         )
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
         )
 
     @require_torch_2_7_0
@@ -205,5 +205,5 @@ class TestMultiGPURay:
         )
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
         )
diff --git a/tests/e2e/multigpu/test_tp.py b/tests/e2e/multigpu/test_tp.py
index 9891a0906..905c753ac 100644
--- a/tests/e2e/multigpu/test_tp.py
+++ b/tests/e2e/multigpu/test_tp.py
@@ -64,5 +64,5 @@ class TestTensorParallel:
         )
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 1.0, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 1.0, "Train Loss (%s) is too high"
         )
diff --git a/tests/e2e/patched/test_fa_xentropy.py b/tests/e2e/patched/test_fa_xentropy.py
index 9f4699854..d14a5a5de 100644
--- a/tests/e2e/patched/test_fa_xentropy.py
+++ b/tests/e2e/patched/test_fa_xentropy.py
@@ -78,5 +78,5 @@ class TestFAXentropyLlama:
         check_model_output_exists(temp_dir, cfg)
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 1.5, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 1.5, "Train Loss (%s) is too high"
         )
diff --git a/tests/e2e/patched/test_flattening.py b/tests/e2e/patched/test_flattening.py
index 2c247d406..e7546c092 100644
--- a/tests/e2e/patched/test_flattening.py
+++ b/tests/e2e/patched/test_flattening.py
@@ -77,5 +77,5 @@ class TestFAFlattening:
         check_model_output_exists(temp_dir, cfg)
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 1.5, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 1.5, "Train Loss (%s) is too high"
         )
diff --git a/tests/e2e/patched/test_unsloth_qlora.py b/tests/e2e/patched/test_unsloth_qlora.py
index bf00e8a5f..b75ce3588 100644
--- a/tests/e2e/patched/test_unsloth_qlora.py
+++ b/tests/e2e/patched/test_unsloth_qlora.py
@@ -73,7 +73,7 @@ class TestUnslothQLoRA:
         check_model_output_exists(temp_dir, cfg)
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.0, "Train Loss (%s) is too high"
         )
 
     def test_unsloth_llama_qlora_unpacked(self, temp_dir):
@@ -124,7 +124,7 @@ class TestUnslothQLoRA:
         check_model_output_exists(temp_dir, cfg)
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.0, "Train Loss (%s) is too high"
         )
 
     @pytest.mark.parametrize(
@@ -180,5 +180,5 @@ class TestUnslothQLoRA:
         check_model_output_exists(temp_dir, cfg)
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.0, "Train Loss (%s) is too high"
         )
diff --git a/tests/e2e/solo/test_flex.py b/tests/e2e/solo/test_flex.py
index abe8fb69a..1d08ef082 100644
--- a/tests/e2e/solo/test_flex.py
+++ b/tests/e2e/solo/test_flex.py
@@ -63,5 +63,5 @@ class TestPackedFlex(unittest.TestCase):
         train(cfg=cfg, dataset_meta=dataset_meta)
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.1, "Train Loss (%s) is too high"
         )
diff --git a/tests/e2e/test_embeddings_lr.py b/tests/e2e/test_embeddings_lr.py
index 2b2e8e5e8..a9d3b20a3 100644
--- a/tests/e2e/test_embeddings_lr.py
+++ b/tests/e2e/test_embeddings_lr.py
@@ -57,9 +57,7 @@ class TestEmbeddingsLrScale(unittest.TestCase):
         train(cfg=cfg, dataset_meta=dataset_meta)
         check_model_output_exists(temp_dir, cfg)
 
-        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.0, "Loss is too high"
-        )
+        check_tensorboard(temp_dir + "/runs", "train/loss", 2.0, "Loss is too high")
 
     @with_temp_dir
     def test_train_w_embedding_lr(self, temp_dir):
@@ -100,6 +98,4 @@ class TestEmbeddingsLrScale(unittest.TestCase):
         train(cfg=cfg, dataset_meta=dataset_meta)
         check_model_output_exists(temp_dir, cfg)
 
-        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.0, "Loss is too high"
-        )
+        check_tensorboard(temp_dir + "/runs", "train/loss", 2.0, "Loss is too high")
diff --git a/tests/e2e/test_llama_pretrain.py b/tests/e2e/test_llama_pretrain.py
index 3aa594fbd..e259491ef 100644
--- a/tests/e2e/test_llama_pretrain.py
+++ b/tests/e2e/test_llama_pretrain.py
@@ -66,7 +66,7 @@ class TestPretrainLlama:
         loss_threshold = 6.5
         check_tensorboard(
             temp_dir + "/runs",
-            "train/train_loss",
+            "train/loss",
             loss_threshold,
             "Train Loss (%s) is too high",
         )
diff --git a/tests/e2e/test_packing_loss.py b/tests/e2e/test_packing_loss.py
index 7cb979ce6..f3f578702 100644
--- a/tests/e2e/test_packing_loss.py
+++ b/tests/e2e/test_packing_loss.py
@@ -62,5 +62,5 @@ class TestPackedLlama(unittest.TestCase):
         train(cfg=cfg, dataset_meta=dataset_meta)
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.0, "Train Loss (%s) is too high"
         )
diff --git a/tests/e2e/test_process_reward_model_smollm2.py b/tests/e2e/test_process_reward_model_smollm2.py
index 9d83aabbc..33c65093c 100644
--- a/tests/e2e/test_process_reward_model_smollm2.py
+++ b/tests/e2e/test_process_reward_model_smollm2.py
@@ -57,7 +57,7 @@ class TestProcessRewardSmolLM2(unittest.TestCase):
         train(cfg=cfg, dataset_meta=dataset_meta)
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.7, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.7, "Train Loss (%s) is too high"
         )
 
         check_model_output_exists(temp_dir, cfg)
diff --git a/tests/e2e/test_qat.py b/tests/e2e/test_qat.py
index 251d5b17b..b2706e142 100644
--- a/tests/e2e/test_qat.py
+++ b/tests/e2e/test_qat.py
@@ -128,7 +128,7 @@ class TestQATLlama:
         loss_threshold = 2.3
         check_tensorboard(
             temp_dir + "/runs",
-            "train/train_loss",
+            "train/loss",
             loss_threshold,
             "Train Loss (%s) is too high",
         )
diff --git a/tests/e2e/test_reward_model_smollm2.py b/tests/e2e/test_reward_model_smollm2.py
index cc768b173..58259e5e9 100644
--- a/tests/e2e/test_reward_model_smollm2.py
+++ b/tests/e2e/test_reward_model_smollm2.py
@@ -66,6 +66,6 @@ class TestRewardModelLoraSmolLM2(unittest.TestCase):
         train(cfg=cfg, dataset_meta=dataset_meta)
 
         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/loss", 2.5, "Train Loss (%s) is too high"
         )
         check_model_output_exists(temp_dir, cfg)
diff --git a/tests/e2e/test_streaming.py b/tests/e2e/test_streaming.py
index 404fb53da..53329a9ae 100644
--- a/tests/e2e/test_streaming.py
+++ b/tests/e2e/test_streaming.py
@@ -66,7 +66,7 @@ class TestStreamingDatasets:
         # Verify training actually happened by checking loss decrease
         check_tensorboard(
             temp_dir + "/runs",
-            "train/train_loss",
+            "train/loss",
             3.0,
             "Train Loss (%s) is too high",
         )
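
Note for reviewers: the `check_tensorboard` helper asserted on throughout this
patch lives in the shared e2e test utilities and is not part of this diff. A
minimal sketch of what it is assumed to do, written against the tbparse
`SummaryReader` API that the `verify_*` helpers above already use (the real
helper's name matches, but its signature and internals are an assumption here):

    # Hypothetical sketch, not the repo's actual helper.
    from tbparse import SummaryReader

    def check_tensorboard(log_dir, tag, threshold, message):
        """Assert the most recently logged value for `tag` is below `threshold`."""
        # tbparse reads every event file under log_dir into a DataFrame
        # with `tag` and `value` columns, one row per logged scalar.
        df = SummaryReader(log_dir).scalars
        values = df[df.tag == tag].value.values
        assert len(values) > 0, f"no scalars logged under tag {tag!r}"
        final = values[-1]  # loss at the final logged step, not the run average
        assert final < threshold, message % final

The tag switch matters because "train/loss" is logged at each logging step, so
its last entry is the loss of the final step, whereas "train/train_loss" is the
single end-of-run average reported by the trainer; thresholding the average made
the tests flaky, as the commit subject describes.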