Fix flaky tests: use the train loss from the final step rather than the final average train loss

2026-03-22 10:38:46 -04:00
parent 5b2e3f00ce
commit 6130e40c37
21 changed files with 37 additions and 41 deletions
--- a/tests/e2e/multigpu/test_fp8_fsdp2.py
+++ b/tests/e2e/multigpu/test_fp8_fsdp2.py
@@ -37,7 +37,7 @@ def verify_fp8_training_success(temp_dir):
            event_file = os.path.join(tb_log_path, event_files[0])
            reader = SummaryReader(event_file)
            df = reader.scalars
-            train_loss_df = df[df.tag == "train/train_loss"]
+            train_loss_df = df[df.tag == "train/loss"]
            if len(train_loss_df) > 0:
                final_loss = train_loss_df.value.values[-1]
                assert not torch.isnan(torch.tensor(final_loss)), (