Fix flaky tests: the loss checks should use the train loss from the final step (`train/loss`) rather than the final average train loss (`train/train_loss`).
This commit is contained in:
@@ -94,7 +94,7 @@ class TestSequenceParallelism:
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs",
|
||||
"train/train_loss",
|
||||
"train/loss",
|
||||
threshold,
|
||||
"Train Loss (%s) is too high",
|
||||
)
|
||||
|
||||
@@ -86,5 +86,5 @@ class TestPackedFlex:
|
||||
)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss (%s) is too high"
|
||||
temp_dir + "/runs", "train/loss", 2.1, "Train Loss (%s) is too high"
|
||||
)
|
||||
|
||||
@@ -37,7 +37,7 @@ def verify_training_success(temp_dir):
|
||||
event_file = os.path.join(tb_log_path, event_files[0])
|
||||
reader = SummaryReader(event_file)
|
||||
df = reader.scalars
|
||||
train_loss_df = df[df.tag == "train/train_loss"]
|
||||
train_loss_df = df[df.tag == "train/loss"]
|
||||
if len(train_loss_df) > 0:
|
||||
final_loss = train_loss_df.value.values[-1]
|
||||
assert not torch.isnan(torch.tensor(final_loss)), (
|
||||
|
||||
@@ -37,7 +37,7 @@ def verify_fp8_training_success(temp_dir):
|
||||
event_file = os.path.join(tb_log_path, event_files[0])
|
||||
reader = SummaryReader(event_file)
|
||||
df = reader.scalars
|
||||
train_loss_df = df[df.tag == "train/train_loss"]
|
||||
train_loss_df = df[df.tag == "train/loss"]
|
||||
if len(train_loss_df) > 0:
|
||||
final_loss = train_loss_df.value.values[-1]
|
||||
assert not torch.isnan(torch.tensor(final_loss)), (
|
||||
|
||||
@@ -38,7 +38,7 @@ def verify_training_success(temp_dir):
|
||||
event_file = os.path.join(tb_log_path, event_files[0])
|
||||
reader = SummaryReader(event_file)
|
||||
df = reader.scalars
|
||||
train_loss_df = df[df.tag == "train/train_loss"]
|
||||
train_loss_df = df[df.tag == "train/loss"]
|
||||
if len(train_loss_df) > 0:
|
||||
final_loss = train_loss_df.value.values[-1]
|
||||
assert not torch.isnan(torch.tensor(final_loss)), (
|
||||
|
||||
@@ -38,7 +38,7 @@ def verify_training_success(temp_dir):
|
||||
event_file = os.path.join(tb_log_path, event_files[0])
|
||||
reader = SummaryReader(event_file)
|
||||
df = reader.scalars
|
||||
train_loss_df = df[df.tag == "train/train_loss"]
|
||||
train_loss_df = df[df.tag == "train/loss"]
|
||||
if len(train_loss_df) > 0:
|
||||
final_loss = train_loss_df.value.values[-1]
|
||||
assert not torch.isnan(torch.tensor(final_loss)), (
|
||||
|
||||
@@ -94,5 +94,5 @@ class TestMultiGPUGemma3:
|
||||
)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 1.8, "Train Loss (%s) is too high"
|
||||
temp_dir + "/runs", "train/loss", 1.8, "Train Loss (%s) is too high"
|
||||
)
|
||||
|
||||
@@ -90,7 +90,7 @@ class TestMultiGPULlama:
|
||||
)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.8, "Train Loss (%s) is too high"
|
||||
temp_dir + "/runs", "train/loss", 2.8, "Train Loss (%s) is too high"
|
||||
)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@@ -156,7 +156,7 @@ class TestMultiGPULlama:
|
||||
)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
|
||||
temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
|
||||
)
|
||||
|
||||
def test_dpo_lora_ddp(self, temp_dir):
|
||||
@@ -233,7 +233,7 @@ class TestMultiGPULlama:
|
||||
loss_threshold = 2.3
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs",
|
||||
"train/train_loss",
|
||||
"train/loss",
|
||||
loss_threshold,
|
||||
"Train Loss (%s) is too high",
|
||||
)
|
||||
@@ -312,7 +312,7 @@ class TestMultiGPULlama:
|
||||
loss_threshold = 2.3
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs",
|
||||
"train/train_loss",
|
||||
"train/loss",
|
||||
loss_threshold,
|
||||
"Train Loss (%s) is too high",
|
||||
)
|
||||
@@ -385,7 +385,7 @@ class TestMultiGPULlama:
|
||||
)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
|
||||
temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
|
||||
)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@@ -461,7 +461,7 @@ class TestMultiGPULlama:
|
||||
)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
|
||||
temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
|
||||
)
|
||||
|
||||
@require_torch_2_6_0
|
||||
@@ -543,7 +543,7 @@ class TestMultiGPULlama:
|
||||
)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss (%s) is too high"
|
||||
temp_dir + "/runs", "train/loss", 2.1, "Train Loss (%s) is too high"
|
||||
)
|
||||
|
||||
def test_fsdp_qlora_prequant_packed(self, temp_dir):
|
||||
@@ -623,7 +623,7 @@ class TestMultiGPULlama:
|
||||
)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
|
||||
temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
|
||||
)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@@ -708,7 +708,7 @@ class TestMultiGPULlama:
|
||||
)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.45, "Train Loss (%s) is too high"
|
||||
temp_dir + "/runs", "train/loss", 2.45, "Train Loss (%s) is too high"
|
||||
)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@@ -784,7 +784,7 @@ class TestMultiGPULlama:
|
||||
)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
|
||||
temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
|
||||
)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@@ -859,7 +859,7 @@ class TestMultiGPULlama:
|
||||
)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
|
||||
temp_dir + "/runs", "train/loss", 2.5, "Train Loss (%s) is too high"
|
||||
)
|
||||
|
||||
@pytest.mark.skip(
|
||||
@@ -925,5 +925,5 @@ class TestMultiGPULlama:
|
||||
)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 4.0, "Train Loss (%s) is too high"
|
||||
temp_dir + "/runs", "train/loss", 4.0, "Train Loss (%s) is too high"
|
||||
)
|
||||
|
||||
@@ -79,7 +79,7 @@ class TestMultiGPURay:
|
||||
)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
|
||||
temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
|
||||
)
|
||||
|
||||
@require_torch_2_7_0
|
||||
@@ -138,7 +138,7 @@ class TestMultiGPURay:
|
||||
)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
|
||||
temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
|
||||
)
|
||||
|
||||
@require_torch_2_7_0
|
||||
@@ -205,5 +205,5 @@ class TestMultiGPURay:
|
||||
)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
|
||||
temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
|
||||
)
|
||||
|
||||
@@ -64,5 +64,5 @@ class TestTensorParallel:
|
||||
)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 1.0, "Train Loss (%s) is too high"
|
||||
temp_dir + "/runs", "train/loss", 1.0, "Train Loss (%s) is too high"
|
||||
)
|
||||
|
||||
@@ -78,5 +78,5 @@ class TestFAXentropyLlama:
|
||||
check_model_output_exists(temp_dir, cfg)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 1.5, "Train Loss (%s) is too high"
|
||||
temp_dir + "/runs", "train/loss", 1.5, "Train Loss (%s) is too high"
|
||||
)
|
||||
|
||||
@@ -77,5 +77,5 @@ class TestFAFlattening:
|
||||
check_model_output_exists(temp_dir, cfg)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 1.5, "Train Loss (%s) is too high"
|
||||
temp_dir + "/runs", "train/loss", 1.5, "Train Loss (%s) is too high"
|
||||
)
|
||||
|
||||
@@ -73,7 +73,7 @@ class TestUnslothQLoRA:
|
||||
check_model_output_exists(temp_dir, cfg)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high"
|
||||
temp_dir + "/runs", "train/loss", 2.0, "Train Loss (%s) is too high"
|
||||
)
|
||||
|
||||
def test_unsloth_llama_qlora_unpacked(self, temp_dir):
|
||||
@@ -124,7 +124,7 @@ class TestUnslothQLoRA:
|
||||
check_model_output_exists(temp_dir, cfg)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high"
|
||||
temp_dir + "/runs", "train/loss", 2.0, "Train Loss (%s) is too high"
|
||||
)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@@ -180,5 +180,5 @@ class TestUnslothQLoRA:
|
||||
check_model_output_exists(temp_dir, cfg)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high"
|
||||
temp_dir + "/runs", "train/loss", 2.0, "Train Loss (%s) is too high"
|
||||
)
|
||||
|
||||
@@ -63,5 +63,5 @@ class TestPackedFlex(unittest.TestCase):
|
||||
train(cfg=cfg, dataset_meta=dataset_meta)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss (%s) is too high"
|
||||
temp_dir + "/runs", "train/loss", 2.1, "Train Loss (%s) is too high"
|
||||
)
|
||||
|
||||
@@ -57,9 +57,7 @@ class TestEmbeddingsLrScale(unittest.TestCase):
|
||||
train(cfg=cfg, dataset_meta=dataset_meta)
|
||||
check_model_output_exists(temp_dir, cfg)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.0, "Loss is too high"
|
||||
)
|
||||
check_tensorboard(temp_dir + "/runs", "train/loss", 2.0, "Loss is too high")
|
||||
|
||||
@with_temp_dir
|
||||
def test_train_w_embedding_lr(self, temp_dir):
|
||||
@@ -100,6 +98,4 @@ class TestEmbeddingsLrScale(unittest.TestCase):
|
||||
train(cfg=cfg, dataset_meta=dataset_meta)
|
||||
check_model_output_exists(temp_dir, cfg)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.0, "Loss is too high"
|
||||
)
|
||||
check_tensorboard(temp_dir + "/runs", "train/loss", 2.0, "Loss is too high")
|
||||
|
||||
@@ -66,7 +66,7 @@ class TestPretrainLlama:
|
||||
loss_threshold = 6.5
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs",
|
||||
"train/train_loss",
|
||||
"train/loss",
|
||||
loss_threshold,
|
||||
"Train Loss (%s) is too high",
|
||||
)
|
||||
|
||||
@@ -62,5 +62,5 @@ class TestPackedLlama(unittest.TestCase):
|
||||
train(cfg=cfg, dataset_meta=dataset_meta)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high"
|
||||
temp_dir + "/runs", "train/loss", 2.0, "Train Loss (%s) is too high"
|
||||
)
|
||||
|
||||
@@ -57,7 +57,7 @@ class TestProcessRewardSmolLM2(unittest.TestCase):
|
||||
|
||||
train(cfg=cfg, dataset_meta=dataset_meta)
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.7, "Train Loss (%s) is too high"
|
||||
temp_dir + "/runs", "train/loss", 2.7, "Train Loss (%s) is too high"
|
||||
)
|
||||
|
||||
check_model_output_exists(temp_dir, cfg)
|
||||
|
||||
@@ -128,7 +128,7 @@ class TestQATLlama:
|
||||
loss_threshold = 2.3
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs",
|
||||
"train/train_loss",
|
||||
"train/loss",
|
||||
loss_threshold,
|
||||
"Train Loss (%s) is too high",
|
||||
)
|
||||
|
||||
@@ -66,6 +66,6 @@ class TestRewardModelLoraSmolLM2(unittest.TestCase):
|
||||
|
||||
train(cfg=cfg, dataset_meta=dataset_meta)
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
|
||||
temp_dir + "/runs", "train/loss", 2.5, "Train Loss (%s) is too high"
|
||||
)
|
||||
check_model_output_exists(temp_dir, cfg)
|
||||
|
||||
@@ -66,7 +66,7 @@ class TestStreamingDatasets:
|
||||
# Verify training actually happened by checking loss decrease
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs",
|
||||
"train/train_loss",
|
||||
"train/loss",
|
||||
3.0,
|
||||
"Train Loss (%s) is too high",
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user