From f77408a3d040be5e48248d8dd0da300b4b4f948d Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 23 Apr 2026 23:47:28 +0000 Subject: [PATCH] fix tests --- .../integrations/test_cut_cross_entropy.py | 15 +++++++----- tests/e2e/multigpu/test_dist_muon_fsdp2.py | 6 +++-- tests/e2e/multigpu/test_fsdp1.py | 5 ++-- tests/e2e/multigpu/test_fsdp2.py | 3 ++- tests/e2e/test_falcon.py | 6 +++++ tests/e2e/test_phi.py | 4 ++++ tests/e2e/utils.py | 23 +++++++++++-------- 7 files changed, 42 insertions(+), 20 deletions(-) diff --git a/tests/e2e/integrations/test_cut_cross_entropy.py b/tests/e2e/integrations/test_cut_cross_entropy.py index 2cebefdd0..75fb9d0db 100644 --- a/tests/e2e/integrations/test_cut_cross_entropy.py +++ b/tests/e2e/integrations/test_cut_cross_entropy.py @@ -38,14 +38,16 @@ def min_cfg(temp_dir): "num_epochs": 1, "micro_batch_size": 8, "gradient_accumulation_steps": 1, - "learning_rate": 0.00001, + "learning_rate": 5e-4, "optimizer": "adamw_torch_fused", "output_dir": temp_dir, "lr_scheduler": "cosine", - "max_steps": 10, + "max_steps": 40, + "warmup_steps": 5, "bf16": "auto", "save_first_step": False, "use_tensorboard": True, + "seed": 42, } @@ -72,8 +74,8 @@ class TestCutCrossEntropyIntegration: temp_dir + "/runs", initial_window=5, final_window=5, - max_initial=5.0, - max_final=4.7, + max_initial=2.2, + max_final=2.0, ) def test_qwen2_w_cce(self, temp_dir): @@ -106,6 +108,7 @@ class TestCutCrossEntropyIntegration: "bf16": "auto", "save_first_step": False, "use_tensorboard": True, + "seed": 42, } ) cfg = validate_config(cfg) @@ -159,6 +162,6 @@ class TestCutCrossEntropyIntegration: temp_dir + "/runs", initial_window=5, final_window=5, - max_initial=5.0, - max_final=4.7, + max_initial=2.2, + max_final=2.0, ) diff --git a/tests/e2e/multigpu/test_dist_muon_fsdp2.py b/tests/e2e/multigpu/test_dist_muon_fsdp2.py index 86ed665a1..68fa69ca7 100644 --- a/tests/e2e/multigpu/test_dist_muon_fsdp2.py +++ b/tests/e2e/multigpu/test_dist_muon_fsdp2.py @@ -56,7 +56,8 @@ class TestDistMuon: }, ], "num_epochs": 1, - "max_steps": 20, + "max_steps": 30, + "warmup_steps": 3, "micro_batch_size": 2, "gradient_accumulation_steps": 1, "output_dir": temp_dir, @@ -118,7 +119,8 @@ class TestDistMuon: "lora_dropout": 0.05, "lora_target_linear": True, "num_epochs": 1, - "max_steps": 20, + "max_steps": 30, + "warmup_steps": 3, "micro_batch_size": 2, "gradient_accumulation_steps": 1, "output_dir": temp_dir, diff --git a/tests/e2e/multigpu/test_fsdp1.py b/tests/e2e/multigpu/test_fsdp1.py index eb696562a..1b45ff9de 100644 --- a/tests/e2e/multigpu/test_fsdp1.py +++ b/tests/e2e/multigpu/test_fsdp1.py @@ -133,10 +133,11 @@ class TestFSDP1: "load_in_4bit": adapter_config["load_in_4bit"], "lora_r": 8, "lora_alpha": 16, - "lora_dropout": 0.05, + "lora_dropout": 0.0, "lora_target_linear": True, "num_epochs": 1, - "max_steps": 20, + "max_steps": 30, + "warmup_steps": 3, "micro_batch_size": 2, "gradient_accumulation_steps": 1, "output_dir": temp_dir, diff --git a/tests/e2e/multigpu/test_fsdp2.py b/tests/e2e/multigpu/test_fsdp2.py index 3686a9c59..4ae434333 100644 --- a/tests/e2e/multigpu/test_fsdp2.py +++ b/tests/e2e/multigpu/test_fsdp2.py @@ -314,7 +314,8 @@ class TestFSDP2: "lora_alpha": 16, "lora_target_linear": True, "num_epochs": 1, - "max_steps": 20, + "max_steps": 30, + "warmup_steps": 3, "micro_batch_size": 2, "gradient_accumulation_steps": 1, "output_dir": temp_dir, diff --git a/tests/e2e/test_falcon.py b/tests/e2e/test_falcon.py index 6a7ead1e0..42ec16107 100644 --- a/tests/e2e/test_falcon.py +++ b/tests/e2e/test_falcon.py @@ -57,12 +57,14 @@ class TestFalcon(unittest.TestCase): "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "max_steps": 50, + "warmup_steps": 5, "logging_steps": 1, "save_steps": 50, "eval_steps": 50, "bf16": "auto", "save_first_step": False, "use_tensorboard": True, + "seed": 42, } ) @@ -120,12 +122,14 @@ class TestFalcon(unittest.TestCase): "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "max_steps": 50, + "warmup_steps": 5, "logging_steps": 1, "save_steps": 50, "eval_steps": 50, "bf16": "auto", "save_first_step": False, "use_tensorboard": True, + "seed": 42, } ) @@ -169,12 +173,14 @@ class TestFalcon(unittest.TestCase): "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "max_steps": 50, + "warmup_steps": 5, "logging_steps": 1, "save_steps": 50, "eval_steps": 50, "bf16": "auto", "save_first_step": False, "use_tensorboard": True, + "seed": 42, } ) diff --git a/tests/e2e/test_phi.py b/tests/e2e/test_phi.py index 393713b20..c2a637883 100644 --- a/tests/e2e/test_phi.py +++ b/tests/e2e/test_phi.py @@ -53,12 +53,14 @@ class TestPhi(unittest.TestCase): "lr_scheduler": "cosine", "flash_attention": True, "max_steps": 50, + "warmup_steps": 5, "logging_steps": 1, "save_steps": 50, "eval_steps": 50, "bf16": "auto", "save_first_step": False, "use_tensorboard": True, + "seed": 42, } ) cfg = validate_config(cfg) @@ -111,12 +113,14 @@ class TestPhi(unittest.TestCase): "lr_scheduler": "cosine", "flash_attention": True, "max_steps": 50, + "warmup_steps": 5, "logging_steps": 1, "save_steps": 50, "eval_steps": 50, "bf16": "auto", "save_first_step": False, "use_tensorboard": True, + "seed": 42, } ) cfg = validate_config(cfg) diff --git a/tests/e2e/utils.py b/tests/e2e/utils.py index f0f5d9f13..8306b72ce 100644 --- a/tests/e2e/utils.py +++ b/tests/e2e/utils.py @@ -207,9 +207,10 @@ def check_tensorboard_loss_decreased( min_delta: float | None = None, max_initial: float | None = None, max_final: float | None = None, - max_loss_ratio: float = 1.10, + max_loss_ratio: float = 0.95, ) -> None: - """Check that training didn't regress — loss stayed in a sensible range. + """Check that training actually learned — loss went down and stayed in + a sensible range. Used with the tiny ``axolotl-ai-co/tiny-*`` CI models, where pretraining was brief enough that final loss won't clear the absolute thresholds used @@ -228,14 +229,17 @@ def check_tensorboard_loss_decreased( known-good run. Both are optional but strongly encouraged — loss going *down* from a bad starting scale still looks like "learning." - 2. **Training diverged.** ``max_loss_ratio`` (default 1.10) requires - ``final <= initial * ratio``. Allows small noise in flat-loss cases - (common with tiny pretrained models that start near optimum), but - a final loss 10%+ above initial flags instability / NaNs / drift. + 2. **Loss didn't go down enough.** ``max_loss_ratio`` (default 0.95) + requires ``final <= initial * ratio``. A default below 1.0 means the + final window mean must sit at least 5% below the initial window mean + — real learning, not noise that happened to land below start. Only + raise this for configs where a smaller drop is expected *and* + documented (e.g. DPO with near-trivial pairs); in that case you are + intentionally weakening the test. ``min_delta`` is optional; when set, additionally requires ``final + min_delta <= initial`` — use for configs with enough signal - to demand a strict decrease. + to demand a specific minimum absolute drop. """ tb_log_path = most_recent_subdir(temp_run_dir) event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0]) @@ -270,10 +274,11 @@ def check_tensorboard_loss_decreased( ) assert final > 1e-5, "Expected loss to be greater than zero" assert final <= initial * max_loss_ratio, ( - f"Loss regressed for {chosen_tag}: " + f"Loss did not decrease for {chosen_tag}: " f"initial(mean of first {initial_window})={initial:.4f}, " f"final(mean of last {final_window})={final:.4f}, " - f"ratio={final / initial:.4f} (max allowed {max_loss_ratio})" + f"ratio={final / initial:.4f} (max allowed {max_loss_ratio}). " + f"Expected final <= initial — training did not learn." ) if min_delta is not None: assert final + min_delta <= initial, (