From a0670abc94bd70927ad80ef7f1e8a3b74f3f14f5 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Fri, 18 Apr 2025 08:11:11 -0700 Subject: [PATCH] add output for train loss in assertian err --- tests/e2e/integrations/test_kd.py | 4 ++-- tests/e2e/multigpu/solo/test_flex.py | 2 +- tests/e2e/multigpu/test_gemma3.py | 2 +- tests/e2e/multigpu/test_llama.py | 20 ++++++++++---------- tests/e2e/multigpu/test_ray.py | 4 ++-- tests/e2e/multigpu/test_sp.py | 2 +- tests/e2e/patched/test_fa_xentropy.py | 2 +- tests/e2e/patched/test_unsloth_qlora.py | 6 +++--- tests/e2e/solo/test_flex.py | 2 +- tests/e2e/test_llama_pretrain.py | 2 +- tests/e2e/test_packing_loss.py | 2 +- tests/e2e/test_reward_model_smollm2.py | 2 +- 12 files changed, 25 insertions(+), 25 deletions(-) diff --git a/tests/e2e/integrations/test_kd.py b/tests/e2e/integrations/test_kd.py index 9bfe5aaef..681674f1c 100644 --- a/tests/e2e/integrations/test_kd.py +++ b/tests/e2e/integrations/test_kd.py @@ -90,7 +90,7 @@ class TestKnowledgeDistillation: train(cfg=cfg, dataset_meta=dataset_meta) assert (Path(temp_dir) / "model.safetensors").exists() check_tensorboard( - temp_dir + "/runs", "train/loss", 1.0, "Train Loss is too high" + temp_dir + "/runs", "train/loss", 1.0, "Train loss (%s) is too high" ) @pytest.mark.parametrize( @@ -121,5 +121,5 @@ class TestKnowledgeDistillation: train(cfg=cfg, dataset_meta=dataset_meta) assert (Path(temp_dir) / "adapter_model.safetensors").exists() check_tensorboard( - temp_dir + "/runs", "train/loss", 1.0, "Train Loss is too high" + temp_dir + "/runs", "train/loss", 1.0, "Train loss (%s) is too high" ) diff --git a/tests/e2e/multigpu/solo/test_flex.py b/tests/e2e/multigpu/solo/test_flex.py index cbe3794b3..785788451 100644 --- a/tests/e2e/multigpu/solo/test_flex.py +++ b/tests/e2e/multigpu/solo/test_flex.py @@ -89,5 +89,5 @@ class TestPackedFlex: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.0, 
"Train loss (%s) is too high" ) diff --git a/tests/e2e/multigpu/test_gemma3.py b/tests/e2e/multigpu/test_gemma3.py index 9de3ed82f..7de594718 100644 --- a/tests/e2e/multigpu/test_gemma3.py +++ b/tests/e2e/multigpu/test_gemma3.py @@ -96,5 +96,5 @@ class TestMultiGPUGemma3: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 1.8, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 1.8, "Train loss (%s) is too high" ) diff --git a/tests/e2e/multigpu/test_llama.py b/tests/e2e/multigpu/test_llama.py index 6f875c4a7..28183d3f0 100644 --- a/tests/e2e/multigpu/test_llama.py +++ b/tests/e2e/multigpu/test_llama.py @@ -94,7 +94,7 @@ class TestMultiGPULlama: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.3, "Train loss (%s) is too high" ) @pytest.mark.parametrize( @@ -159,7 +159,7 @@ class TestMultiGPULlama: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.3, "Train loss (%s) is too high" ) def test_dpo_lora_ddp(self, temp_dir): @@ -385,7 +385,7 @@ class TestMultiGPULlama: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.3, "Train loss (%s) is too high" ) @pytest.mark.parametrize( @@ -457,7 +457,7 @@ class TestMultiGPULlama: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.3, "Train loss (%s) is too high" ) @require_torch_2_6_0 @@ -538,7 +538,7 @@ class TestMultiGPULlama: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.1, "Train loss (%s) is too high" ) def test_fsdp_qlora_prequant_packed(self, temp_dir): @@ -618,7 +618,7 @@ class TestMultiGPULlama: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.3, 
"Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.3, "Train loss (%s) is too high" ) @pytest.mark.parametrize( @@ -702,7 +702,7 @@ class TestMultiGPULlama: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.3, "Train loss (%s) is too high" ) @pytest.mark.parametrize( @@ -776,7 +776,7 @@ class TestMultiGPULlama: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.3, "Train loss (%s) is too high" ) @pytest.mark.parametrize( @@ -850,7 +850,7 @@ class TestMultiGPULlama: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.3, "Train loss (%s) is too high" ) @pytest.mark.skip( @@ -917,5 +917,5 @@ class TestMultiGPULlama: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 4.0, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 4.0, "Train loss (%s) is too high" ) diff --git a/tests/e2e/multigpu/test_ray.py b/tests/e2e/multigpu/test_ray.py index 843adac91..5a816fb57 100644 --- a/tests/e2e/multigpu/test_ray.py +++ b/tests/e2e/multigpu/test_ray.py @@ -80,7 +80,7 @@ class TestMultiGPURay: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.3, "Train loss (%s) is too high" ) @require_torch_lt_2_6_0 @@ -138,5 +138,5 @@ class TestMultiGPURay: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.3, "Train loss (%s) is too high" ) diff --git a/tests/e2e/multigpu/test_sp.py b/tests/e2e/multigpu/test_sp.py index 72e5cb88c..dc1f7494b 100644 --- a/tests/e2e/multigpu/test_sp.py +++ b/tests/e2e/multigpu/test_sp.py @@ -93,7 +93,7 @@ class TestSequenceParallelism: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 
2.6, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.6, "Train loss (%s) is too high" ) @pytest.mark.parametrize( diff --git a/tests/e2e/patched/test_fa_xentropy.py b/tests/e2e/patched/test_fa_xentropy.py index f71e4fb4a..39bb38c69 100644 --- a/tests/e2e/patched/test_fa_xentropy.py +++ b/tests/e2e/patched/test_fa_xentropy.py @@ -86,5 +86,5 @@ class TestFAXentropyLlama: check_model_output_exists(temp_dir, cfg) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 1.5, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 1.5, "Train loss (%s) is too high" ) diff --git a/tests/e2e/patched/test_unsloth_qlora.py b/tests/e2e/patched/test_unsloth_qlora.py index 4cea0d26f..be31bf106 100644 --- a/tests/e2e/patched/test_unsloth_qlora.py +++ b/tests/e2e/patched/test_unsloth_qlora.py @@ -80,7 +80,7 @@ class TestUnslothQLoRA: check_model_output_exists(temp_dir, cfg) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.0, "Train loss (%s) is too high" ) def test_unsloth_llama_qlora_unpacked(self, temp_dir): @@ -130,7 +130,7 @@ class TestUnslothQLoRA: check_model_output_exists(temp_dir, cfg) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.0, "Train loss (%s) is too high" ) @pytest.mark.parametrize( @@ -185,5 +185,5 @@ class TestUnslothQLoRA: check_model_output_exists(temp_dir, cfg) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.0, "Train loss (%s) is too high" ) diff --git a/tests/e2e/solo/test_flex.py b/tests/e2e/solo/test_flex.py index 6de813e37..28c926bcd 100644 --- a/tests/e2e/solo/test_flex.py +++ b/tests/e2e/solo/test_flex.py @@ -69,5 +69,5 @@ class TestPackedFlex(unittest.TestCase): train(cfg=cfg, dataset_meta=dataset_meta) check_tensorboard( - temp_dir + "/runs", 
"train/train_loss", 2.0, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.0, "Train loss (%s) is too high" ) diff --git a/tests/e2e/test_llama_pretrain.py b/tests/e2e/test_llama_pretrain.py index 647285e46..a8bfb60da 100644 --- a/tests/e2e/test_llama_pretrain.py +++ b/tests/e2e/test_llama_pretrain.py @@ -84,5 +84,5 @@ class TestPretrainLlama: temp_dir + "/runs", "train/train_loss", loss_threshold, - "Train Loss is too high", + "Train loss (%s) is too high", ) diff --git a/tests/e2e/test_packing_loss.py b/tests/e2e/test_packing_loss.py index 4e8e70419..e4c448963 100644 --- a/tests/e2e/test_packing_loss.py +++ b/tests/e2e/test_packing_loss.py @@ -68,5 +68,5 @@ class TestPackedLlama(unittest.TestCase): train(cfg=cfg, dataset_meta=dataset_meta) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.0, "Train loss (%s) is too high" ) diff --git a/tests/e2e/test_reward_model_smollm2.py b/tests/e2e/test_reward_model_smollm2.py index 240c4b392..9411ba304 100644 --- a/tests/e2e/test_reward_model_smollm2.py +++ b/tests/e2e/test_reward_model_smollm2.py @@ -73,6 +73,6 @@ class TestRewardModelLoraSmolLM2(unittest.TestCase): train(cfg=cfg, dataset_meta=dataset_meta) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.5, "Train loss (%s) is too high" ) check_model_output_exists(temp_dir, cfg)