Fix flaky tests: check the train loss from the final step (`train/loss`) rather than the final average train loss (`train/train_loss`).
This commit is contained in:
@@ -94,7 +94,7 @@ class TestSequenceParallelism:
|
|||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs",
|
temp_dir + "/runs",
|
||||||
"train/train_loss",
|
"train/loss",
|
||||||
threshold,
|
threshold,
|
||||||
"Train Loss (%s) is too high",
|
"Train Loss (%s) is too high",
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -86,5 +86,5 @@ class TestPackedFlex:
|
|||||||
)
|
)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 2.1, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -37,7 +37,7 @@ def verify_training_success(temp_dir):
|
|||||||
event_file = os.path.join(tb_log_path, event_files[0])
|
event_file = os.path.join(tb_log_path, event_files[0])
|
||||||
reader = SummaryReader(event_file)
|
reader = SummaryReader(event_file)
|
||||||
df = reader.scalars
|
df = reader.scalars
|
||||||
train_loss_df = df[df.tag == "train/train_loss"]
|
train_loss_df = df[df.tag == "train/loss"]
|
||||||
if len(train_loss_df) > 0:
|
if len(train_loss_df) > 0:
|
||||||
final_loss = train_loss_df.value.values[-1]
|
final_loss = train_loss_df.value.values[-1]
|
||||||
assert not torch.isnan(torch.tensor(final_loss)), (
|
assert not torch.isnan(torch.tensor(final_loss)), (
|
||||||
|
|||||||
@@ -37,7 +37,7 @@ def verify_fp8_training_success(temp_dir):
|
|||||||
event_file = os.path.join(tb_log_path, event_files[0])
|
event_file = os.path.join(tb_log_path, event_files[0])
|
||||||
reader = SummaryReader(event_file)
|
reader = SummaryReader(event_file)
|
||||||
df = reader.scalars
|
df = reader.scalars
|
||||||
train_loss_df = df[df.tag == "train/train_loss"]
|
train_loss_df = df[df.tag == "train/loss"]
|
||||||
if len(train_loss_df) > 0:
|
if len(train_loss_df) > 0:
|
||||||
final_loss = train_loss_df.value.values[-1]
|
final_loss = train_loss_df.value.values[-1]
|
||||||
assert not torch.isnan(torch.tensor(final_loss)), (
|
assert not torch.isnan(torch.tensor(final_loss)), (
|
||||||
|
|||||||
@@ -38,7 +38,7 @@ def verify_training_success(temp_dir):
|
|||||||
event_file = os.path.join(tb_log_path, event_files[0])
|
event_file = os.path.join(tb_log_path, event_files[0])
|
||||||
reader = SummaryReader(event_file)
|
reader = SummaryReader(event_file)
|
||||||
df = reader.scalars
|
df = reader.scalars
|
||||||
train_loss_df = df[df.tag == "train/train_loss"]
|
train_loss_df = df[df.tag == "train/loss"]
|
||||||
if len(train_loss_df) > 0:
|
if len(train_loss_df) > 0:
|
||||||
final_loss = train_loss_df.value.values[-1]
|
final_loss = train_loss_df.value.values[-1]
|
||||||
assert not torch.isnan(torch.tensor(final_loss)), (
|
assert not torch.isnan(torch.tensor(final_loss)), (
|
||||||
|
|||||||
@@ -38,7 +38,7 @@ def verify_training_success(temp_dir):
|
|||||||
event_file = os.path.join(tb_log_path, event_files[0])
|
event_file = os.path.join(tb_log_path, event_files[0])
|
||||||
reader = SummaryReader(event_file)
|
reader = SummaryReader(event_file)
|
||||||
df = reader.scalars
|
df = reader.scalars
|
||||||
train_loss_df = df[df.tag == "train/train_loss"]
|
train_loss_df = df[df.tag == "train/loss"]
|
||||||
if len(train_loss_df) > 0:
|
if len(train_loss_df) > 0:
|
||||||
final_loss = train_loss_df.value.values[-1]
|
final_loss = train_loss_df.value.values[-1]
|
||||||
assert not torch.isnan(torch.tensor(final_loss)), (
|
assert not torch.isnan(torch.tensor(final_loss)), (
|
||||||
|
|||||||
@@ -94,5 +94,5 @@ class TestMultiGPUGemma3:
|
|||||||
)
|
)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 1.8, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 1.8, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -90,7 +90,7 @@ class TestMultiGPULlama:
|
|||||||
)
|
)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 2.8, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 2.8, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
@@ -156,7 +156,7 @@ class TestMultiGPULlama:
|
|||||||
)
|
)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|
||||||
def test_dpo_lora_ddp(self, temp_dir):
|
def test_dpo_lora_ddp(self, temp_dir):
|
||||||
@@ -233,7 +233,7 @@ class TestMultiGPULlama:
|
|||||||
loss_threshold = 2.3
|
loss_threshold = 2.3
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs",
|
temp_dir + "/runs",
|
||||||
"train/train_loss",
|
"train/loss",
|
||||||
loss_threshold,
|
loss_threshold,
|
||||||
"Train Loss (%s) is too high",
|
"Train Loss (%s) is too high",
|
||||||
)
|
)
|
||||||
@@ -312,7 +312,7 @@ class TestMultiGPULlama:
|
|||||||
loss_threshold = 2.3
|
loss_threshold = 2.3
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs",
|
temp_dir + "/runs",
|
||||||
"train/train_loss",
|
"train/loss",
|
||||||
loss_threshold,
|
loss_threshold,
|
||||||
"Train Loss (%s) is too high",
|
"Train Loss (%s) is too high",
|
||||||
)
|
)
|
||||||
@@ -385,7 +385,7 @@ class TestMultiGPULlama:
|
|||||||
)
|
)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
@@ -461,7 +461,7 @@ class TestMultiGPULlama:
|
|||||||
)
|
)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|
||||||
@require_torch_2_6_0
|
@require_torch_2_6_0
|
||||||
@@ -543,7 +543,7 @@ class TestMultiGPULlama:
|
|||||||
)
|
)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 2.1, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|
||||||
def test_fsdp_qlora_prequant_packed(self, temp_dir):
|
def test_fsdp_qlora_prequant_packed(self, temp_dir):
|
||||||
@@ -623,7 +623,7 @@ class TestMultiGPULlama:
|
|||||||
)
|
)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
@@ -708,7 +708,7 @@ class TestMultiGPULlama:
|
|||||||
)
|
)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 2.45, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 2.45, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
@@ -784,7 +784,7 @@ class TestMultiGPULlama:
|
|||||||
)
|
)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
@@ -859,7 +859,7 @@ class TestMultiGPULlama:
|
|||||||
)
|
)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 2.5, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|
||||||
@pytest.mark.skip(
|
@pytest.mark.skip(
|
||||||
@@ -925,5 +925,5 @@ class TestMultiGPULlama:
|
|||||||
)
|
)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 4.0, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 4.0, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -79,7 +79,7 @@ class TestMultiGPURay:
|
|||||||
)
|
)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|
||||||
@require_torch_2_7_0
|
@require_torch_2_7_0
|
||||||
@@ -138,7 +138,7 @@ class TestMultiGPURay:
|
|||||||
)
|
)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|
||||||
@require_torch_2_7_0
|
@require_torch_2_7_0
|
||||||
@@ -205,5 +205,5 @@ class TestMultiGPURay:
|
|||||||
)
|
)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -64,5 +64,5 @@ class TestTensorParallel:
|
|||||||
)
|
)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 1.0, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 1.0, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -78,5 +78,5 @@ class TestFAXentropyLlama:
|
|||||||
check_model_output_exists(temp_dir, cfg)
|
check_model_output_exists(temp_dir, cfg)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 1.5, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 1.5, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -77,5 +77,5 @@ class TestFAFlattening:
|
|||||||
check_model_output_exists(temp_dir, cfg)
|
check_model_output_exists(temp_dir, cfg)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 1.5, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 1.5, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -73,7 +73,7 @@ class TestUnslothQLoRA:
|
|||||||
check_model_output_exists(temp_dir, cfg)
|
check_model_output_exists(temp_dir, cfg)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 2.0, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|
||||||
def test_unsloth_llama_qlora_unpacked(self, temp_dir):
|
def test_unsloth_llama_qlora_unpacked(self, temp_dir):
|
||||||
@@ -124,7 +124,7 @@ class TestUnslothQLoRA:
|
|||||||
check_model_output_exists(temp_dir, cfg)
|
check_model_output_exists(temp_dir, cfg)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 2.0, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
@@ -180,5 +180,5 @@ class TestUnslothQLoRA:
|
|||||||
check_model_output_exists(temp_dir, cfg)
|
check_model_output_exists(temp_dir, cfg)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 2.0, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -63,5 +63,5 @@ class TestPackedFlex(unittest.TestCase):
|
|||||||
train(cfg=cfg, dataset_meta=dataset_meta)
|
train(cfg=cfg, dataset_meta=dataset_meta)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 2.1, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -57,9 +57,7 @@ class TestEmbeddingsLrScale(unittest.TestCase):
|
|||||||
train(cfg=cfg, dataset_meta=dataset_meta)
|
train(cfg=cfg, dataset_meta=dataset_meta)
|
||||||
check_model_output_exists(temp_dir, cfg)
|
check_model_output_exists(temp_dir, cfg)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(temp_dir + "/runs", "train/loss", 2.0, "Loss is too high")
|
||||||
temp_dir + "/runs", "train/train_loss", 2.0, "Loss is too high"
|
|
||||||
)
|
|
||||||
|
|
||||||
@with_temp_dir
|
@with_temp_dir
|
||||||
def test_train_w_embedding_lr(self, temp_dir):
|
def test_train_w_embedding_lr(self, temp_dir):
|
||||||
@@ -100,6 +98,4 @@ class TestEmbeddingsLrScale(unittest.TestCase):
|
|||||||
train(cfg=cfg, dataset_meta=dataset_meta)
|
train(cfg=cfg, dataset_meta=dataset_meta)
|
||||||
check_model_output_exists(temp_dir, cfg)
|
check_model_output_exists(temp_dir, cfg)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(temp_dir + "/runs", "train/loss", 2.0, "Loss is too high")
|
||||||
temp_dir + "/runs", "train/train_loss", 2.0, "Loss is too high"
|
|
||||||
)
|
|
||||||
|
|||||||
@@ -66,7 +66,7 @@ class TestPretrainLlama:
|
|||||||
loss_threshold = 6.5
|
loss_threshold = 6.5
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs",
|
temp_dir + "/runs",
|
||||||
"train/train_loss",
|
"train/loss",
|
||||||
loss_threshold,
|
loss_threshold,
|
||||||
"Train Loss (%s) is too high",
|
"Train Loss (%s) is too high",
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -62,5 +62,5 @@ class TestPackedLlama(unittest.TestCase):
|
|||||||
train(cfg=cfg, dataset_meta=dataset_meta)
|
train(cfg=cfg, dataset_meta=dataset_meta)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 2.0, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -57,7 +57,7 @@ class TestProcessRewardSmolLM2(unittest.TestCase):
|
|||||||
|
|
||||||
train(cfg=cfg, dataset_meta=dataset_meta)
|
train(cfg=cfg, dataset_meta=dataset_meta)
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 2.7, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 2.7, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|
||||||
check_model_output_exists(temp_dir, cfg)
|
check_model_output_exists(temp_dir, cfg)
|
||||||
|
|||||||
@@ -128,7 +128,7 @@ class TestQATLlama:
|
|||||||
loss_threshold = 2.3
|
loss_threshold = 2.3
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs",
|
temp_dir + "/runs",
|
||||||
"train/train_loss",
|
"train/loss",
|
||||||
loss_threshold,
|
loss_threshold,
|
||||||
"Train Loss (%s) is too high",
|
"Train Loss (%s) is too high",
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -66,6 +66,6 @@ class TestRewardModelLoraSmolLM2(unittest.TestCase):
|
|||||||
|
|
||||||
train(cfg=cfg, dataset_meta=dataset_meta)
|
train(cfg=cfg, dataset_meta=dataset_meta)
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 2.5, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
check_model_output_exists(temp_dir, cfg)
|
check_model_output_exists(temp_dir, cfg)
|
||||||
|
|||||||
@@ -66,7 +66,7 @@ class TestStreamingDatasets:
|
|||||||
# Verify training actually happened by checking loss decrease
|
# Verify training actually happened by checking loss decrease
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs",
|
temp_dir + "/runs",
|
||||||
"train/train_loss",
|
"train/loss",
|
||||||
3.0,
|
3.0,
|
||||||
"Train Loss (%s) is too high",
|
"Train Loss (%s) is too high",
|
||||||
)
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user