Compare commits
3 Commits
main
...
tensorboar
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
598c965043 | ||
|
|
a96733930e | ||
|
|
6130e40c37 |
@@ -3,7 +3,8 @@ set -e
|
|||||||
|
|
||||||
python -c "import torch; assert '$PYTORCH_VERSION' in torch.__version__"
|
python -c "import torch; assert '$PYTORCH_VERSION' in torch.__version__"
|
||||||
|
|
||||||
curl --silent -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst | tar -xpf - -C "${HF_HOME}/hub/" --use-compress-program unzstd --strip-components=1
|
set -o pipefail
|
||||||
|
curl --silent --show-error --fail --retry 3 --retry-delay 5 -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst | tar -xpf - -C "${HF_HOME}/hub/" --use-compress-program unzstd --strip-components=1
|
||||||
# hf download "NousResearch/Meta-Llama-3-8B"
|
# hf download "NousResearch/Meta-Llama-3-8B"
|
||||||
# hf download "NousResearch/Meta-Llama-3-8B-Instruct"
|
# hf download "NousResearch/Meta-Llama-3-8B-Instruct"
|
||||||
# hf download "microsoft/Phi-4-reasoning"
|
# hf download "microsoft/Phi-4-reasoning"
|
||||||
|
|||||||
@@ -37,6 +37,7 @@ coverage:
|
|||||||
only_pulls: false
|
only_pulls: false
|
||||||
flags: null
|
flags: null
|
||||||
paths: null
|
paths: null
|
||||||
|
informational: true
|
||||||
|
|
||||||
parsers:
|
parsers:
|
||||||
gcov:
|
gcov:
|
||||||
|
|||||||
@@ -86,5 +86,5 @@ class TestPackedFlex:
|
|||||||
)
|
)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 2.1, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -37,7 +37,7 @@ def verify_training_success(temp_dir):
|
|||||||
event_file = os.path.join(tb_log_path, event_files[0])
|
event_file = os.path.join(tb_log_path, event_files[0])
|
||||||
reader = SummaryReader(event_file)
|
reader = SummaryReader(event_file)
|
||||||
df = reader.scalars
|
df = reader.scalars
|
||||||
train_loss_df = df[df.tag == "train/train_loss"]
|
train_loss_df = df[df.tag == "train/loss"]
|
||||||
if len(train_loss_df) > 0:
|
if len(train_loss_df) > 0:
|
||||||
final_loss = train_loss_df.value.values[-1]
|
final_loss = train_loss_df.value.values[-1]
|
||||||
assert not torch.isnan(torch.tensor(final_loss)), (
|
assert not torch.isnan(torch.tensor(final_loss)), (
|
||||||
|
|||||||
@@ -37,7 +37,7 @@ def verify_fp8_training_success(temp_dir):
|
|||||||
event_file = os.path.join(tb_log_path, event_files[0])
|
event_file = os.path.join(tb_log_path, event_files[0])
|
||||||
reader = SummaryReader(event_file)
|
reader = SummaryReader(event_file)
|
||||||
df = reader.scalars
|
df = reader.scalars
|
||||||
train_loss_df = df[df.tag == "train/train_loss"]
|
train_loss_df = df[df.tag == "train/loss"]
|
||||||
if len(train_loss_df) > 0:
|
if len(train_loss_df) > 0:
|
||||||
final_loss = train_loss_df.value.values[-1]
|
final_loss = train_loss_df.value.values[-1]
|
||||||
assert not torch.isnan(torch.tensor(final_loss)), (
|
assert not torch.isnan(torch.tensor(final_loss)), (
|
||||||
|
|||||||
@@ -38,7 +38,7 @@ def verify_training_success(temp_dir):
|
|||||||
event_file = os.path.join(tb_log_path, event_files[0])
|
event_file = os.path.join(tb_log_path, event_files[0])
|
||||||
reader = SummaryReader(event_file)
|
reader = SummaryReader(event_file)
|
||||||
df = reader.scalars
|
df = reader.scalars
|
||||||
train_loss_df = df[df.tag == "train/train_loss"]
|
train_loss_df = df[df.tag == "train/loss"]
|
||||||
if len(train_loss_df) > 0:
|
if len(train_loss_df) > 0:
|
||||||
final_loss = train_loss_df.value.values[-1]
|
final_loss = train_loss_df.value.values[-1]
|
||||||
assert not torch.isnan(torch.tensor(final_loss)), (
|
assert not torch.isnan(torch.tensor(final_loss)), (
|
||||||
|
|||||||
@@ -38,7 +38,7 @@ def verify_training_success(temp_dir):
|
|||||||
event_file = os.path.join(tb_log_path, event_files[0])
|
event_file = os.path.join(tb_log_path, event_files[0])
|
||||||
reader = SummaryReader(event_file)
|
reader = SummaryReader(event_file)
|
||||||
df = reader.scalars
|
df = reader.scalars
|
||||||
train_loss_df = df[df.tag == "train/train_loss"]
|
train_loss_df = df[df.tag == "train/loss"]
|
||||||
if len(train_loss_df) > 0:
|
if len(train_loss_df) > 0:
|
||||||
final_loss = train_loss_df.value.values[-1]
|
final_loss = train_loss_df.value.values[-1]
|
||||||
assert not torch.isnan(torch.tensor(final_loss)), (
|
assert not torch.isnan(torch.tensor(final_loss)), (
|
||||||
|
|||||||
@@ -94,5 +94,5 @@ class TestMultiGPUGemma3:
|
|||||||
)
|
)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 1.8, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 1.8, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -90,7 +90,7 @@ class TestMultiGPULlama:
|
|||||||
)
|
)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 2.8, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 2.8, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
@@ -156,7 +156,7 @@ class TestMultiGPULlama:
|
|||||||
)
|
)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|
||||||
def test_dpo_lora_ddp(self, temp_dir):
|
def test_dpo_lora_ddp(self, temp_dir):
|
||||||
@@ -233,7 +233,7 @@ class TestMultiGPULlama:
|
|||||||
loss_threshold = 2.3
|
loss_threshold = 2.3
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs",
|
temp_dir + "/runs",
|
||||||
"train/train_loss",
|
"train/loss",
|
||||||
loss_threshold,
|
loss_threshold,
|
||||||
"Train Loss (%s) is too high",
|
"Train Loss (%s) is too high",
|
||||||
)
|
)
|
||||||
@@ -312,7 +312,7 @@ class TestMultiGPULlama:
|
|||||||
loss_threshold = 2.3
|
loss_threshold = 2.3
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs",
|
temp_dir + "/runs",
|
||||||
"train/train_loss",
|
"train/loss",
|
||||||
loss_threshold,
|
loss_threshold,
|
||||||
"Train Loss (%s) is too high",
|
"Train Loss (%s) is too high",
|
||||||
)
|
)
|
||||||
@@ -385,7 +385,7 @@ class TestMultiGPULlama:
|
|||||||
)
|
)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
@@ -461,7 +461,7 @@ class TestMultiGPULlama:
|
|||||||
)
|
)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|
||||||
@require_torch_2_6_0
|
@require_torch_2_6_0
|
||||||
@@ -543,7 +543,7 @@ class TestMultiGPULlama:
|
|||||||
)
|
)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 2.1, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|
||||||
def test_fsdp_qlora_prequant_packed(self, temp_dir):
|
def test_fsdp_qlora_prequant_packed(self, temp_dir):
|
||||||
@@ -623,7 +623,7 @@ class TestMultiGPULlama:
|
|||||||
)
|
)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
@@ -708,7 +708,7 @@ class TestMultiGPULlama:
|
|||||||
)
|
)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 2.45, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 2.45, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
@@ -784,7 +784,7 @@ class TestMultiGPULlama:
|
|||||||
)
|
)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
@@ -859,7 +859,7 @@ class TestMultiGPULlama:
|
|||||||
)
|
)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 2.5, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|
||||||
@pytest.mark.skip(
|
@pytest.mark.skip(
|
||||||
@@ -925,5 +925,5 @@ class TestMultiGPULlama:
|
|||||||
)
|
)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 4.0, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 4.0, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -79,7 +79,7 @@ class TestMultiGPURay:
|
|||||||
)
|
)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|
||||||
@require_torch_2_7_0
|
@require_torch_2_7_0
|
||||||
@@ -138,7 +138,7 @@ class TestMultiGPURay:
|
|||||||
)
|
)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|
||||||
@require_torch_2_7_0
|
@require_torch_2_7_0
|
||||||
@@ -205,5 +205,5 @@ class TestMultiGPURay:
|
|||||||
)
|
)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 2.3, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -64,5 +64,5 @@ class TestTensorParallel:
|
|||||||
)
|
)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 1.0, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 1.0, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -78,5 +78,5 @@ class TestFAXentropyLlama:
|
|||||||
check_model_output_exists(temp_dir, cfg)
|
check_model_output_exists(temp_dir, cfg)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 1.5, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 1.5, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -77,5 +77,5 @@ class TestFAFlattening:
|
|||||||
check_model_output_exists(temp_dir, cfg)
|
check_model_output_exists(temp_dir, cfg)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 1.5, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 1.5, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -73,7 +73,7 @@ class TestUnslothQLoRA:
|
|||||||
check_model_output_exists(temp_dir, cfg)
|
check_model_output_exists(temp_dir, cfg)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 2.0, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|
||||||
def test_unsloth_llama_qlora_unpacked(self, temp_dir):
|
def test_unsloth_llama_qlora_unpacked(self, temp_dir):
|
||||||
@@ -124,7 +124,7 @@ class TestUnslothQLoRA:
|
|||||||
check_model_output_exists(temp_dir, cfg)
|
check_model_output_exists(temp_dir, cfg)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 2.0, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
@@ -180,5 +180,5 @@ class TestUnslothQLoRA:
|
|||||||
check_model_output_exists(temp_dir, cfg)
|
check_model_output_exists(temp_dir, cfg)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 2.0, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -63,5 +63,5 @@ class TestPackedFlex(unittest.TestCase):
|
|||||||
train(cfg=cfg, dataset_meta=dataset_meta)
|
train(cfg=cfg, dataset_meta=dataset_meta)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 2.1, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -57,9 +57,7 @@ class TestEmbeddingsLrScale(unittest.TestCase):
|
|||||||
train(cfg=cfg, dataset_meta=dataset_meta)
|
train(cfg=cfg, dataset_meta=dataset_meta)
|
||||||
check_model_output_exists(temp_dir, cfg)
|
check_model_output_exists(temp_dir, cfg)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(temp_dir + "/runs", "train/loss", 2.0, "Loss is too high")
|
||||||
temp_dir + "/runs", "train/train_loss", 2.0, "Loss is too high"
|
|
||||||
)
|
|
||||||
|
|
||||||
@with_temp_dir
|
@with_temp_dir
|
||||||
def test_train_w_embedding_lr(self, temp_dir):
|
def test_train_w_embedding_lr(self, temp_dir):
|
||||||
@@ -100,6 +98,4 @@ class TestEmbeddingsLrScale(unittest.TestCase):
|
|||||||
train(cfg=cfg, dataset_meta=dataset_meta)
|
train(cfg=cfg, dataset_meta=dataset_meta)
|
||||||
check_model_output_exists(temp_dir, cfg)
|
check_model_output_exists(temp_dir, cfg)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(temp_dir + "/runs", "train/loss", 2.0, "Loss is too high")
|
||||||
temp_dir + "/runs", "train/train_loss", 2.0, "Loss is too high"
|
|
||||||
)
|
|
||||||
|
|||||||
@@ -66,7 +66,7 @@ class TestPretrainLlama:
|
|||||||
loss_threshold = 6.5
|
loss_threshold = 6.5
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs",
|
temp_dir + "/runs",
|
||||||
"train/train_loss",
|
"train/loss",
|
||||||
loss_threshold,
|
loss_threshold,
|
||||||
"Train Loss (%s) is too high",
|
"Train Loss (%s) is too high",
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -62,5 +62,5 @@ class TestPackedLlama(unittest.TestCase):
|
|||||||
train(cfg=cfg, dataset_meta=dataset_meta)
|
train(cfg=cfg, dataset_meta=dataset_meta)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 2.0, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -57,7 +57,7 @@ class TestProcessRewardSmolLM2(unittest.TestCase):
|
|||||||
|
|
||||||
train(cfg=cfg, dataset_meta=dataset_meta)
|
train(cfg=cfg, dataset_meta=dataset_meta)
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 2.7, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 2.7, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|
||||||
check_model_output_exists(temp_dir, cfg)
|
check_model_output_exists(temp_dir, cfg)
|
||||||
|
|||||||
@@ -128,7 +128,7 @@ class TestQATLlama:
|
|||||||
loss_threshold = 2.3
|
loss_threshold = 2.3
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs",
|
temp_dir + "/runs",
|
||||||
"train/train_loss",
|
"train/loss",
|
||||||
loss_threshold,
|
loss_threshold,
|
||||||
"Train Loss (%s) is too high",
|
"Train Loss (%s) is too high",
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -66,6 +66,6 @@ class TestRewardModelLoraSmolLM2(unittest.TestCase):
|
|||||||
|
|
||||||
train(cfg=cfg, dataset_meta=dataset_meta)
|
train(cfg=cfg, dataset_meta=dataset_meta)
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 2.5, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
check_model_output_exists(temp_dir, cfg)
|
check_model_output_exists(temp_dir, cfg)
|
||||||
|
|||||||
@@ -66,7 +66,7 @@ class TestStreamingDatasets:
|
|||||||
# Verify training actually happened by checking loss decrease
|
# Verify training actually happened by checking loss decrease
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs",
|
temp_dir + "/runs",
|
||||||
"train/train_loss",
|
"train/loss",
|
||||||
3.0,
|
3.0,
|
||||||
"Train Loss (%s) is too high",
|
"Train Loss (%s) is too high",
|
||||||
)
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user