fix flaky tests; should be using train loss from final step rather than final avg train loss
This commit is contained in:
@@ -94,7 +94,7 @@ class TestSequenceParallelism:
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs",
|
||||
"train/train_loss",
|
||||
"train/loss",
|
||||
threshold,
|
||||
"Train Loss (%s) is too high",
|
||||
)
|
||||
|
||||
@@ -86,5 +86,5 @@ class TestPackedFlex:
|
||||
)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss (%s) is too high"
|
||||
temp_dir + "/runs", "train/loss", 2.1, "Train Loss (%s) is too high"
|
||||
)
|
||||
|
||||
@@ -37,7 +37,7 @@ def verify_training_success(temp_dir):
|
||||
event_file = os.path.join(tb_log_path, event_files[0])
|
||||
reader = SummaryReader(event_file)
|
||||
df = reader.scalars
|
||||
train_loss_df = df[df.tag == "train/train_loss"]
|
||||
train_loss_df = df[df.tag == "train/loss"]
|
||||
if len(train_loss_df) > 0:
|
||||
final_loss = train_loss_df.value.values[-1]
|
||||
assert not torch.isnan(torch.tensor(final_loss)), (
|
||||
|
||||
@@ -37,7 +37,7 @@ def verify_fp8_training_success(temp_dir):
|
||||
event_file = os.path.join(tb_log_path, event_files[0])
|
||||
reader = SummaryReader(event_file)
|
||||
df = reader.scalars
|
||||
train_loss_df = df[df.tag == "train/train_loss"]
|
||||
train_loss_df = df[df.tag == "train/loss"]
|
||||
if len(train_loss_df) > 0:
|
||||
final_loss = train_loss_df.value.values[-1]
|
||||
assert not torch.isnan(torch.tensor(final_loss)), (
|
||||
|
||||
@@ -38,7 +38,7 @@ def verify_training_success(temp_dir):
|
||||
event_file = os.path.join(tb_log_path, event_files[0])
|
||||
reader = SummaryReader(event_file)
|
||||
df = reader.scalars
|
||||
train_loss_df = df[df.tag == "train/train_loss"]
|
||||
train_loss_df = df[df.tag == "train/loss"]
|
||||
if len(train_loss_df) > 0:
|
||||
final_loss = train_loss_df.value.values[-1]
|
||||
assert not torch.isnan(torch.tensor(final_loss)), (
|
||||
|
||||
@@ -38,7 +38,7 @@ def verify_training_success(temp_dir):
|
||||
event_file = os.path.join(tb_log_path, event_files[0])
|
||||
reader = SummaryReader(event_file)
|
||||
df = reader.scalars
|
||||
train_loss_df = df[df.tag == "train/train_loss"]
|
||||
train_loss_df = df[df.tag == "train/loss"]
|
||||
if len(train_loss_df) > 0:
|
||||
final_loss = train_loss_df.value.values[-1]
|
||||
assert not torch.isnan(torch.tensor(final_loss)), (
|
||||
|
||||
@@ -94,5 +94,5 @@ class TestMultiGPUGemma3:
|
||||
)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 1.8, "Train Loss (%s) is too high"
|
||||
temp_dir + "/runs", "train/loss", 1.8, "Train Loss (%s) is too high"
|
||||
)
|
||||
|
||||
@@ -90,7 +90,7 @@ class TestMultiGPULlama:
|
||||
)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.8, "Train Loss (%s) is too high"
|
||||
temp_dir + "/runs", "train/loss", 2.8, "Train Loss (%s) is too high"
|
||||
)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@@ -156,7 +156,7 @@ class TestMultiGPULlama:
|
||||
)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
|
||||
temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
|
||||
)
|
||||
|
||||
def test_dpo_lora_ddp(self, temp_dir):
|
||||
@@ -233,7 +233,7 @@ class TestMultiGPULlama:
|
||||
loss_threshold = 2.3
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs",
|
||||
"train/train_loss",
|
||||
"train/loss",
|
||||
loss_threshold,
|
||||
"Train Loss (%s) is too high",
|
||||
)
|
||||
@@ -312,7 +312,7 @@ class TestMultiGPULlama:
|
||||
loss_threshold = 2.3
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs",
|
||||
"train/train_loss",
|
||||
"train/loss",
|
||||
loss_threshold,
|
||||
"Train Loss (%s) is too high",
|
||||
)
|
||||
@@ -385,7 +385,7 @@ class TestMultiGPULlama:
|
||||
)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
|
||||
temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
|
||||
)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@@ -461,7 +461,7 @@ class TestMultiGPULlama:
|
||||
)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
|
||||
temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
|
||||
)
|
||||
|
||||
@require_torch_2_6_0
|
||||
@@ -543,7 +543,7 @@ class TestMultiGPULlama:
|
||||
)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss (%s) is too high"
|
||||
temp_dir + "/runs", "train/loss", 2.1, "Train Loss (%s) is too high"
|
||||
)
|
||||
|
||||
def test_fsdp_qlora_prequant_packed(self, temp_dir):
|
||||
@@ -623,7 +623,7 @@ class TestMultiGPULlama:
|
||||
)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
|
||||
temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
|
||||
)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@@ -708,7 +708,7 @@ class TestMultiGPULlama:
|
||||
)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.45, "Train Loss (%s) is too high"
|
||||
temp_dir + "/runs", "train/loss", 2.45, "Train Loss (%s) is too high"
|
||||
)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@@ -784,7 +784,7 @@ class TestMultiGPULlama:
|
||||
)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
|
||||
temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
|
||||
)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@@ -859,7 +859,7 @@ class TestMultiGPULlama:
|
||||
)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
|
||||
temp_dir + "/runs", "train/loss", 2.5, "Train Loss (%s) is too high"
|
||||
)
|
||||
|
||||
@pytest.mark.skip(
|
||||
@@ -925,5 +925,5 @@ class TestMultiGPULlama:
|
||||
)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 4.0, "Train Loss (%s) is too high"
|
||||
temp_dir + "/runs", "train/loss", 4.0, "Train Loss (%s) is too high"
|
||||
)
|
||||
|
||||
@@ -79,7 +79,7 @@ class TestMultiGPURay:
|
||||
)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
|
||||
temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
|
||||
)
|
||||
|
||||
@require_torch_2_7_0
|
||||
@@ -138,7 +138,7 @@ class TestMultiGPURay:
|
||||
)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
|
||||
temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
|
||||
)
|
||||
|
||||
@require_torch_2_7_0
|
||||
@@ -205,5 +205,5 @@ class TestMultiGPURay:
|
||||
)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
|
||||
temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
|
||||
)
|
||||
|
||||
@@ -64,5 +64,5 @@ class TestTensorParallel:
|
||||
)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 1.0, "Train Loss (%s) is too high"
|
||||
temp_dir + "/runs", "train/loss", 1.0, "Train Loss (%s) is too high"
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user