Fix flaky tests: check the train loss from the final step rather than the final average train loss

2026-03-22 10:38:46 -04:00
parent 5b2e3f00ce
commit 6130e40c37
21 changed files with 37 additions and 41 deletions
--- a/tests/e2e/multigpu/patched/test_sp.py
+++ b/tests/e2e/multigpu/patched/test_sp.py
@@ -94,7 +94,7 @@ class TestSequenceParallelism:

        check_tensorboard(
            temp_dir + "/runs",
-            "train/train_loss",
+            "train/loss",
            threshold,
            "Train Loss (%s) is too high",
        )