From 1b9520cc8b8115a8bc0c1b3bce00345443bed54d Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Sat, 25 Apr 2026 02:17:48 +0000
Subject: [PATCH] more train steps

---
 tests/e2e/multigpu/test_dist_muon_fsdp2.py   | 24 +++++----
 tests/e2e/multigpu/test_fsdp1.py             | 27 +++++++---
 tests/e2e/multigpu/test_fsdp2.py             | 55 ++++++++++++++------
 tests/e2e/patched/test_falcon_samplepack.py  |  2 +
 tests/e2e/patched/test_mistral_samplepack.py |  2 +
 tests/e2e/patched/test_mixtral_samplepack.py | 28 +++++-----
 tests/e2e/patched/test_phi_multipack.py      |  2 +
 tests/e2e/test_falcon.py                     | 14 ++---
 8 files changed, 105 insertions(+), 49 deletions(-)

diff --git a/tests/e2e/multigpu/test_dist_muon_fsdp2.py b/tests/e2e/multigpu/test_dist_muon_fsdp2.py
index 68fa69ca7..05841bb64 100644
--- a/tests/e2e/multigpu/test_dist_muon_fsdp2.py
+++ b/tests/e2e/multigpu/test_dist_muon_fsdp2.py
@@ -31,8 +31,8 @@ def verify_training_success(temp_dir):
 
     check_tensorboard_loss_decreased(
         temp_dir + "/runs",
-        initial_window=3,
-        final_window=3,
+        initial_window=10,
+        final_window=10,
         max_initial=5.0,
         max_final=4.7,
     )
@@ -56,12 +56,12 @@ class TestDistMuon:
                     },
                 ],
                 "num_epochs": 1,
-                "max_steps": 30,
-                "warmup_steps": 3,
+                "max_steps": 80,
+                "warmup_steps": 5,
                 "micro_batch_size": 2,
                 "gradient_accumulation_steps": 1,
                 "output_dir": temp_dir,
-                "learning_rate": 0.02,
+                "learning_rate": 2e-3,
                 "optimizer": "muon",
                 "weight_decay": 0.01,
                 "lr_scheduler": "cosine",
@@ -76,6 +76,9 @@ class TestDistMuon:
                     "reshard_after_forward": True,
                 },
                 "use_tensorboard": True,
+                "seed": 42,
+                "sample_packing": True,
+                "pad_to_sequence_len": True,
                 "bf16": True,
             }
         )
@@ -116,15 +119,15 @@ class TestDistMuon:
                 "adapter": "lora",
                 "lora_r": 8,
                 "lora_alpha": 16,
-                "lora_dropout": 0.05,
+                "lora_dropout": 0.0,
                 "lora_target_linear": True,
                 "num_epochs": 1,
-                "max_steps": 30,
-                "warmup_steps": 3,
+                "max_steps": 80,
+                "warmup_steps": 5,
                 "micro_batch_size": 2,
                 "gradient_accumulation_steps": 1,
                 "output_dir": temp_dir,
-                "learning_rate": 0.02,
+                "learning_rate": 2e-3,
                 "optimizer": "muon",
                 "weight_decay": 0.01,
                 "lr_scheduler": "cosine",
@@ -139,6 +142,9 @@ class TestDistMuon:
                     "reshard_after_forward": True,
                 },
                 "use_tensorboard": True,
+                "seed": 42,
+                "sample_packing": True,
+                "pad_to_sequence_len": True,
                 "bf16": True,
             }
         )
diff --git a/tests/e2e/multigpu/test_fsdp1.py b/tests/e2e/multigpu/test_fsdp1.py
index 1b45ff9de..c6a8a47e9 100644
--- a/tests/e2e/multigpu/test_fsdp1.py
+++ b/tests/e2e/multigpu/test_fsdp1.py
@@ -32,8 +32,8 @@ def verify_training_success(temp_dir):
 
     check_tensorboard_loss_decreased(
         temp_dir + "/runs",
-        initial_window=3,
-        final_window=3,
+        initial_window=10,
+        final_window=10,
         max_initial=5.0,
         max_final=4.7,
     )
@@ -60,7 +60,8 @@ class TestFSDP1:
                    },
                ],
                "num_epochs": 1,
-               "max_steps": 20,
+               "max_steps": 80,
+               "warmup_steps": 5,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
@@ -80,6 +81,9 @@ class TestFSDP1:
                    "fsdp_use_orig_params": False,
                },
                "use_tensorboard": True,
+               "seed": 42,
+               "sample_packing": True,
+               "pad_to_sequence_len": True,
                "bf16": True,
            }
        )
@@ -136,12 +140,12 @@ class TestFSDP1:
                "lora_dropout": 0.0,
                "lora_target_linear": True,
                "num_epochs": 1,
-               "max_steps": 30,
-               "warmup_steps": 3,
+               "max_steps": 80,
+               "warmup_steps": 5,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
-               "learning_rate": 2e-4,
+               "learning_rate": 1e-3,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "flash_attention": True,
@@ -157,6 +161,9 @@ class TestFSDP1:
                    "fsdp_use_orig_params": False,
                },
                "use_tensorboard": True,
+               "seed": 42,
+               "sample_packing": True,
+               "pad_to_sequence_len": True,
                "bf16": True,
            }
        )
@@ -217,6 +224,9 @@ class TestFSDP1:
                    "fsdp_use_orig_params": False,
                },
                "use_tensorboard": True,
+               "seed": 42,
+               "sample_packing": True,
+               "pad_to_sequence_len": True,
            }
        )
 
@@ -279,7 +289,7 @@ class TestFSDP1:
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
-               "learning_rate": 2e-4,
+               "learning_rate": 1e-3,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "flash_attention": True,
@@ -295,6 +305,9 @@ class TestFSDP1:
                    "fsdp_use_orig_params": False,
                },
                "use_tensorboard": True,
+               "seed": 42,
+               "sample_packing": True,
+               "pad_to_sequence_len": True,
                "bf16": "auto",
                "tf32": True,
            }
diff --git a/tests/e2e/multigpu/test_fsdp2.py b/tests/e2e/multigpu/test_fsdp2.py
index 4ae434333..b48ddd436 100644
--- a/tests/e2e/multigpu/test_fsdp2.py
+++ b/tests/e2e/multigpu/test_fsdp2.py
@@ -32,8 +32,8 @@ def verify_training_success(temp_dir):
 
     check_tensorboard_loss_decreased(
         temp_dir + "/runs",
-        initial_window=3,
-        final_window=3,
+        initial_window=10,
+        final_window=10,
         max_initial=5.0,
         max_final=4.7,
     )
@@ -61,7 +61,8 @@ class TestFSDP2:
                    },
                ],
                "num_epochs": 1,
-               "max_steps": 20,
+               "max_steps": 80,
+               "warmup_steps": 5,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
@@ -79,6 +80,9 @@ class TestFSDP2:
                    "reshard_after_forward": True,
                },
                "use_tensorboard": True,
+               "seed": 42,
+               "sample_packing": True,
+               "pad_to_sequence_len": True,
                "bf16": True,
            }
        )
@@ -121,14 +125,15 @@ class TestFSDP2:
                "adapter": "lora",
                "lora_r": 8,
                "lora_alpha": 16,
-               "lora_dropout": 0.05,
+               "lora_dropout": 0.0,
                "lora_target_linear": True,
                "num_epochs": 1,
-               "max_steps": 20,
+               "max_steps": 80,
+               "warmup_steps": 5,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
-               "learning_rate": 2e-4,
+               "learning_rate": 1e-3,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "flash_attention": True,
@@ -142,6 +147,9 @@ class TestFSDP2:
                    "reshard_after_forward": True,
                },
                "use_tensorboard": True,
+               "seed": 42,
+               "sample_packing": True,
+               "pad_to_sequence_len": True,
                "bf16": True,
                # explicitly disable LORA kernels, as they may be auto-enabled
                "lora_mlp_kernel": False,
@@ -188,11 +196,12 @@ class TestFSDP2:
                "lora_alpha": 16,
                "lora_target_linear": True,
                "num_epochs": 1,
-               "max_steps": 20,
+               "max_steps": 80,
+               "warmup_steps": 5,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
-               "learning_rate": 2e-4,
+               "learning_rate": 1e-3,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "flash_attention": True,
@@ -206,6 +215,9 @@ class TestFSDP2:
                    "reshard_after_forward": True,
                },
                "use_tensorboard": True,
+               "seed": 42,
+               "sample_packing": True,
+               "pad_to_sequence_len": True,
                "bf16": True,
                "lora_mlp_kernel": True,
                "lora_qkv_kernel": True,
@@ -250,14 +262,15 @@ class TestFSDP2:
                "adapter": "qlora",
                "lora_r": 8,
                "lora_alpha": 16,
-               "lora_dropout": 0.05,
+               "lora_dropout": 0.0,
                "lora_target_linear": True,
                "num_epochs": 1,
-               "max_steps": 20,
+               "max_steps": 80,
+               "warmup_steps": 5,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
-               "learning_rate": 2e-4,
+               "learning_rate": 1e-3,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "flash_attention": True,
@@ -271,6 +284,9 @@ class TestFSDP2:
                    "reshard_after_forward": True,
                },
                "use_tensorboard": True,
+               "seed": 42,
+               "sample_packing": True,
+               "pad_to_sequence_len": True,
                "bf16": True,
            }
        )
@@ -314,12 +330,12 @@ class TestFSDP2:
                "lora_alpha": 16,
                "lora_target_linear": True,
                "num_epochs": 1,
-               "max_steps": 30,
-               "warmup_steps": 3,
+               "max_steps": 80,
+               "warmup_steps": 5,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
-               "learning_rate": 2e-4,
+               "learning_rate": 1e-3,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "flash_attention": True,
@@ -333,6 +349,9 @@ class TestFSDP2:
                    "reshard_after_forward": True,
                },
                "use_tensorboard": True,
+               "seed": 42,
+               "sample_packing": True,
+               "pad_to_sequence_len": True,
                "bf16": True,
                "lora_mlp_kernel": True,
                "lora_qkv_kernel": True,
@@ -395,6 +414,9 @@ class TestFSDP2:
                    "reshard_after_forward": True,
                },
                "use_tensorboard": True,
+               "seed": 42,
+               "sample_packing": True,
+               "pad_to_sequence_len": True,
            }
        )
 
@@ -443,7 +465,7 @@ class TestFSDP2:
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
-               "learning_rate": 2e-4,
+               "learning_rate": 1e-3,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "flash_attention": True,
@@ -457,6 +479,9 @@ class TestFSDP2:
                    "reshard_after_forward": True,
                },
                "use_tensorboard": True,
+               "seed": 42,
+               "sample_packing": True,
+               "pad_to_sequence_len": True,
            }
        )
 
diff --git a/tests/e2e/patched/test_falcon_samplepack.py b/tests/e2e/patched/test_falcon_samplepack.py
index 3166eff52..1d688585e 100644
--- a/tests/e2e/patched/test_falcon_samplepack.py
+++ b/tests/e2e/patched/test_falcon_samplepack.py
@@ -61,6 +61,7 @@ class TestFalconPatched(unittest.TestCase):
                "bf16": "auto",
                "save_first_step": False,
                "use_tensorboard": True,
+               "seed": 42,
            }
        )
        cfg = validate_config(cfg)
@@ -110,6 +111,7 @@ class TestFalconPatched(unittest.TestCase):
                "bf16": "auto",
                "save_first_step": False,
                "use_tensorboard": True,
+               "seed": 42,
            }
        )
        cfg = validate_config(cfg)
diff --git a/tests/e2e/patched/test_mistral_samplepack.py b/tests/e2e/patched/test_mistral_samplepack.py
index 7b38319b3..ab59a000c 100644
--- a/tests/e2e/patched/test_mistral_samplepack.py
+++ b/tests/e2e/patched/test_mistral_samplepack.py
@@ -63,6 +63,7 @@ class TestMistral(unittest.TestCase):
                "bf16": "auto",
                "save_first_step": False,
                "use_tensorboard": True,
+               "seed": 42,
            }
        )
        cfg = validate_config(cfg)
@@ -113,6 +114,7 @@ class TestMistral(unittest.TestCase):
                "bf16": "auto",
                "save_first_step": False,
                "use_tensorboard": True,
+               "seed": 42,
            }
        )
        cfg = validate_config(cfg)
diff --git a/tests/e2e/patched/test_mixtral_samplepack.py b/tests/e2e/patched/test_mixtral_samplepack.py
index 92a3810c4..3c6eb8d12 100644
--- a/tests/e2e/patched/test_mixtral_samplepack.py
+++ b/tests/e2e/patched/test_mixtral_samplepack.py
@@ -33,7 +33,7 @@ class TestMixtral(unittest.TestCase):
                "adapter": "qlora",
                "lora_r": 16,
                "lora_alpha": 32,
-               "lora_dropout": 0.1,
+               "lora_dropout": 0.0,
                "lora_target_linear": True,
                "val_set_size": 0.05,
                "special_tokens": {},
@@ -47,16 +47,18 @@ class TestMixtral(unittest.TestCase):
                "micro_batch_size": 4,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
-               "learning_rate": 2e-4,
+               "learning_rate": 3e-3,
                "optimizer": "adamw_bnb_8bit",
                "lr_scheduler": "cosine",
-               "max_steps": 50,
+               "max_steps": 80,
+               "warmup_steps": 5,
                "logging_steps": 1,
-               "save_steps": 50,
-               "eval_steps": 50,
+               "save_steps": 80,
+               "eval_steps": 80,
                "bf16": "auto",
                "save_first_step": False,
                "use_tensorboard": True,
+               "seed": 42,
            }
        )
        cfg = validate_config(cfg)
@@ -67,8 +69,8 @@ class TestMixtral(unittest.TestCase):
        check_model_output_exists(temp_dir, cfg)
        check_tensorboard_loss_decreased(
            temp_dir + "/runs",
-           initial_window=5,
-           final_window=5,
+           initial_window=10,
+           final_window=10,
            max_initial=6.0,
            max_final=4.7,
        )
@@ -93,16 +95,18 @@ class TestMixtral(unittest.TestCase):
                "micro_batch_size": 4,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
-               "learning_rate": 2e-4,
-               "optimizer": "adamw_bnb_8bit",
+               "learning_rate": 5e-4,
+               "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
-               "max_steps": 50,
+               "max_steps": 80,
+               "warmup_steps": 5,
                "logging_steps": 1,
-               "save_steps": 50,
-               "eval_steps": 50,
+               "save_steps": 80,
+               "eval_steps": 80,
                "bf16": "auto",
                "save_first_step": False,
                "use_tensorboard": True,
+               "seed": 42,
            }
        )
        cfg = validate_config(cfg)
diff --git a/tests/e2e/patched/test_phi_multipack.py b/tests/e2e/patched/test_phi_multipack.py
index 2e8d45f05..c3c8ff569 100644
--- a/tests/e2e/patched/test_phi_multipack.py
+++ b/tests/e2e/patched/test_phi_multipack.py
@@ -60,6 +60,7 @@ class TestPhiMultipack(unittest.TestCase):
                "bf16": "auto",
                "save_first_step": False,
                "use_tensorboard": True,
+               "seed": 42,
            }
        )
 
@@ -120,6 +121,7 @@ class TestPhiMultipack(unittest.TestCase):
                "bf16": "auto",
                "save_first_step": False,
                "use_tensorboard": True,
+               "seed": 42,
            }
        )
 
diff --git a/tests/e2e/test_falcon.py b/tests/e2e/test_falcon.py
index 42ec16107..19de202d2 100644
--- a/tests/e2e/test_falcon.py
+++ b/tests/e2e/test_falcon.py
@@ -166,17 +166,19 @@ class TestFalcon(unittest.TestCase):
                    },
                ],
                "num_epochs": 2,
+               "sample_packing": True,
+               "pad_to_sequence_len": True,
                "micro_batch_size": 4,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
-               "learning_rate": 2e-4,
+               "learning_rate": 5e-4,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
-               "max_steps": 50,
+               "max_steps": 80,
                "warmup_steps": 5,
                "logging_steps": 1,
-               "save_steps": 50,
-               "eval_steps": 50,
+               "save_steps": 80,
+               "eval_steps": 80,
                "bf16": "auto",
                "save_first_step": False,
                "use_tensorboard": True,
@@ -192,8 +194,8 @@ class TestFalcon(unittest.TestCase):
        check_model_output_exists(temp_dir, cfg)
        check_tensorboard_loss_decreased(
            temp_dir + "/runs",
-           initial_window=5,
-           final_window=5,
+           initial_window=10,
+           final_window=10,
            max_initial=5.0,
            max_final=4.7,
        )
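
Reviewer note, kept below the diff so it stays out of the commit itself: the pass/fail gate in all of these tests is the windowed TensorBoard loss comparison, and this patch widens the windows from 3-5 samples to 10 while extending the runs to 80 steps, so a single noisy batch can no longer decide the outcome. As a rough illustration of what such a windowed check does, the sketch below reads the newest event file and compares window means. The helper name, the "train/loss" scalar tag, and the event-file layout are assumptions made for illustration, not the repo's actual check_tensorboard_loss_decreased implementation:

# Sketch only -- approximates the windowed loss gate these tests rely on.
# Assumptions: event files live somewhere under log_dir and the loss is
# logged under the scalar tag "train/loss". Not the repo's actual helper.
import glob
import os

from tensorboard.backend.event_processing.event_accumulator import EventAccumulator


def windowed_loss_check(log_dir, initial_window=10, final_window=10,
                        max_initial=5.0, max_final=4.7, tag="train/loss"):
    # Pick the newest event file written under the runs directory.
    event_files = sorted(
        glob.glob(os.path.join(log_dir, "**", "events.out.tfevents.*"),
                  recursive=True),
        key=os.path.getmtime,
    )
    assert event_files, f"no tensorboard event files under {log_dir}"

    accumulator = EventAccumulator(event_files[-1])
    accumulator.Reload()
    losses = [event.value for event in accumulator.Scalars(tag)]
    assert len(losses) >= initial_window + final_window, "too few logged steps"

    # Compare the mean of the first N losses against the mean of the last N;
    # averaging over 10 steps instead of 3 damps single-batch noise.
    initial_mean = sum(losses[:initial_window]) / initial_window
    final_mean = sum(losses[-final_window:]) / final_window
    assert initial_mean < max_initial, f"initial loss too high: {initial_mean}"
    assert final_mean < max_final, f"final loss too high: {final_mean}"
    assert final_mean < initial_mean, "loss did not decrease"

The fixed seed (42) and sample packing added throughout make those window means far more reproducible across runs, which is what lets the bumped learning rates keep the same max_final bound.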