Compare commits

...

3 Commits

Author SHA1 Message Date
Wing Lian
598c965043 use train_loss for sp test 2026-03-22 12:00:55 -04:00
Wing Lian
a96733930e retry and more info on download failure 2026-03-22 11:09:33 -04:00
Wing Lian
6130e40c37 fix flaky tests; should be using train loss from final step rather than final avg train loss 2026-03-22 10:38:46 -04:00
22 changed files with 39 additions and 41 deletions

View File

@@ -3,7 +3,8 @@ set -e
python -c "import torch; assert '$PYTORCH_VERSION' in torch.__version__"
curl --silent -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst | tar -xpf - -C "${HF_HOME}/hub/" --use-compress-program unzstd --strip-components=1
set -o pipefail
curl --silent --show-error --fail --retry 3 --retry-delay 5 -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst | tar -xpf - -C "${HF_HOME}/hub/" --use-compress-program unzstd --strip-components=1
# hf download "NousResearch/Meta-Llama-3-8B"
# hf download "NousResearch/Meta-Llama-3-8B-Instruct"
# hf download "microsoft/Phi-4-reasoning"

View File

@@ -37,6 +37,7 @@ coverage:
only_pulls: false
flags: null
paths: null
informational: true
parsers:
gcov:

View File

@@ -86,5 +86,5 @@ class TestPackedFlex:
)
check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/loss", 2.1, "Train Loss (%s) is too high"
)

View File

@@ -37,7 +37,7 @@ def verify_training_success(temp_dir):
event_file = os.path.join(tb_log_path, event_files[0])
reader = SummaryReader(event_file)
df = reader.scalars
train_loss_df = df[df.tag == "train/train_loss"]
train_loss_df = df[df.tag == "train/loss"]
if len(train_loss_df) > 0:
final_loss = train_loss_df.value.values[-1]
assert not torch.isnan(torch.tensor(final_loss)), (

View File

@@ -37,7 +37,7 @@ def verify_fp8_training_success(temp_dir):
event_file = os.path.join(tb_log_path, event_files[0])
reader = SummaryReader(event_file)
df = reader.scalars
train_loss_df = df[df.tag == "train/train_loss"]
train_loss_df = df[df.tag == "train/loss"]
if len(train_loss_df) > 0:
final_loss = train_loss_df.value.values[-1]
assert not torch.isnan(torch.tensor(final_loss)), (

View File

@@ -38,7 +38,7 @@ def verify_training_success(temp_dir):
event_file = os.path.join(tb_log_path, event_files[0])
reader = SummaryReader(event_file)
df = reader.scalars
train_loss_df = df[df.tag == "train/train_loss"]
train_loss_df = df[df.tag == "train/loss"]
if len(train_loss_df) > 0:
final_loss = train_loss_df.value.values[-1]
assert not torch.isnan(torch.tensor(final_loss)), (

View File

@@ -38,7 +38,7 @@ def verify_training_success(temp_dir):
event_file = os.path.join(tb_log_path, event_files[0])
reader = SummaryReader(event_file)
df = reader.scalars
train_loss_df = df[df.tag == "train/train_loss"]
train_loss_df = df[df.tag == "train/loss"]
if len(train_loss_df) > 0:
final_loss = train_loss_df.value.values[-1]
assert not torch.isnan(torch.tensor(final_loss)), (

View File

@@ -94,5 +94,5 @@ class TestMultiGPUGemma3:
)
check_tensorboard(
temp_dir + "/runs", "train/train_loss", 1.8, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/loss", 1.8, "Train Loss (%s) is too high"
)

View File

@@ -90,7 +90,7 @@ class TestMultiGPULlama:
)
check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.8, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/loss", 2.8, "Train Loss (%s) is too high"
)
@pytest.mark.parametrize(
@@ -156,7 +156,7 @@ class TestMultiGPULlama:
)
check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
)
def test_dpo_lora_ddp(self, temp_dir):
@@ -233,7 +233,7 @@ class TestMultiGPULlama:
loss_threshold = 2.3
check_tensorboard(
temp_dir + "/runs",
"train/train_loss",
"train/loss",
loss_threshold,
"Train Loss (%s) is too high",
)
@@ -312,7 +312,7 @@ class TestMultiGPULlama:
loss_threshold = 2.3
check_tensorboard(
temp_dir + "/runs",
"train/train_loss",
"train/loss",
loss_threshold,
"Train Loss (%s) is too high",
)
@@ -385,7 +385,7 @@ class TestMultiGPULlama:
)
check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
)
@pytest.mark.parametrize(
@@ -461,7 +461,7 @@ class TestMultiGPULlama:
)
check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
)
@require_torch_2_6_0
@@ -543,7 +543,7 @@ class TestMultiGPULlama:
)
check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/loss", 2.1, "Train Loss (%s) is too high"
)
def test_fsdp_qlora_prequant_packed(self, temp_dir):
@@ -623,7 +623,7 @@ class TestMultiGPULlama:
)
check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
)
@pytest.mark.parametrize(
@@ -708,7 +708,7 @@ class TestMultiGPULlama:
)
check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.45, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/loss", 2.45, "Train Loss (%s) is too high"
)
@pytest.mark.parametrize(
@@ -784,7 +784,7 @@ class TestMultiGPULlama:
)
check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
)
@pytest.mark.parametrize(
@@ -859,7 +859,7 @@ class TestMultiGPULlama:
)
check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/loss", 2.5, "Train Loss (%s) is too high"
)
@pytest.mark.skip(
@@ -925,5 +925,5 @@ class TestMultiGPULlama:
)
check_tensorboard(
temp_dir + "/runs", "train/train_loss", 4.0, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/loss", 4.0, "Train Loss (%s) is too high"
)

View File

@@ -79,7 +79,7 @@ class TestMultiGPURay:
)
check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
)
@require_torch_2_7_0
@@ -138,7 +138,7 @@ class TestMultiGPURay:
)
check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
)
@require_torch_2_7_0
@@ -205,5 +205,5 @@ class TestMultiGPURay:
)
check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
)

View File

@@ -64,5 +64,5 @@ class TestTensorParallel:
)
check_tensorboard(
temp_dir + "/runs", "train/train_loss", 1.0, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/loss", 1.0, "Train Loss (%s) is too high"
)

View File

@@ -78,5 +78,5 @@ class TestFAXentropyLlama:
check_model_output_exists(temp_dir, cfg)
check_tensorboard(
temp_dir + "/runs", "train/train_loss", 1.5, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/loss", 1.5, "Train Loss (%s) is too high"
)

View File

@@ -77,5 +77,5 @@ class TestFAFlattening:
check_model_output_exists(temp_dir, cfg)
check_tensorboard(
temp_dir + "/runs", "train/train_loss", 1.5, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/loss", 1.5, "Train Loss (%s) is too high"
)

View File

@@ -73,7 +73,7 @@ class TestUnslothQLoRA:
check_model_output_exists(temp_dir, cfg)
check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/loss", 2.0, "Train Loss (%s) is too high"
)
def test_unsloth_llama_qlora_unpacked(self, temp_dir):
@@ -124,7 +124,7 @@ class TestUnslothQLoRA:
check_model_output_exists(temp_dir, cfg)
check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/loss", 2.0, "Train Loss (%s) is too high"
)
@pytest.mark.parametrize(
@@ -180,5 +180,5 @@ class TestUnslothQLoRA:
check_model_output_exists(temp_dir, cfg)
check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/loss", 2.0, "Train Loss (%s) is too high"
)

View File

@@ -63,5 +63,5 @@ class TestPackedFlex(unittest.TestCase):
train(cfg=cfg, dataset_meta=dataset_meta)
check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/loss", 2.1, "Train Loss (%s) is too high"
)

View File

@@ -57,9 +57,7 @@ class TestEmbeddingsLrScale(unittest.TestCase):
train(cfg=cfg, dataset_meta=dataset_meta)
check_model_output_exists(temp_dir, cfg)
check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.0, "Loss is too high"
)
check_tensorboard(temp_dir + "/runs", "train/loss", 2.0, "Loss is too high")
@with_temp_dir
def test_train_w_embedding_lr(self, temp_dir):
@@ -100,6 +98,4 @@ class TestEmbeddingsLrScale(unittest.TestCase):
train(cfg=cfg, dataset_meta=dataset_meta)
check_model_output_exists(temp_dir, cfg)
check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.0, "Loss is too high"
)
check_tensorboard(temp_dir + "/runs", "train/loss", 2.0, "Loss is too high")

View File

@@ -66,7 +66,7 @@ class TestPretrainLlama:
loss_threshold = 6.5
check_tensorboard(
temp_dir + "/runs",
"train/train_loss",
"train/loss",
loss_threshold,
"Train Loss (%s) is too high",
)

View File

@@ -62,5 +62,5 @@ class TestPackedLlama(unittest.TestCase):
train(cfg=cfg, dataset_meta=dataset_meta)
check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/loss", 2.0, "Train Loss (%s) is too high"
)

View File

@@ -57,7 +57,7 @@ class TestProcessRewardSmolLM2(unittest.TestCase):
train(cfg=cfg, dataset_meta=dataset_meta)
check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.7, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/loss", 2.7, "Train Loss (%s) is too high"
)
check_model_output_exists(temp_dir, cfg)

View File

@@ -128,7 +128,7 @@ class TestQATLlama:
loss_threshold = 2.3
check_tensorboard(
temp_dir + "/runs",
"train/train_loss",
"train/loss",
loss_threshold,
"Train Loss (%s) is too high",
)

View File

@@ -66,6 +66,6 @@ class TestRewardModelLoraSmolLM2(unittest.TestCase):
train(cfg=cfg, dataset_meta=dataset_meta)
check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/loss", 2.5, "Train Loss (%s) is too high"
)
check_model_output_exists(temp_dir, cfg)

View File

@@ -66,7 +66,7 @@ class TestStreamingDatasets:
# Verify training actually happened by checking the logged train loss against an upper bound
check_tensorboard(
temp_dir + "/runs",
"train/train_loss",
"train/loss",
3.0,
"Train Loss (%s) is too high",
)