add muon optimizer
optimizer_cls_and_kwargs is on trainer_kwargs; only add adamw_kwargs if they're non-null; fix mocks; better handling of override and check the optimizer; unwrap optimizer
This commit is contained in:
@@ -28,7 +28,7 @@ class TestTrainCommand(BaseCliTest):
|
||||
config_path.write_text(valid_test_config)
|
||||
|
||||
with patch("axolotl.cli.train.train") as mock_train:
|
||||
mock_train.return_value = (MagicMock(), MagicMock())
|
||||
mock_train.return_value = (MagicMock(), MagicMock(), MagicMock())
|
||||
|
||||
result = cli_runner.invoke(
|
||||
cli,
|
||||
@@ -48,7 +48,7 @@ class TestTrainCommand(BaseCliTest):
|
||||
config_path = self._test_cli_overrides(tmp_path, valid_test_config)
|
||||
|
||||
with patch("axolotl.cli.train.train") as mock_train:
|
||||
mock_train.return_value = (MagicMock(), MagicMock())
|
||||
mock_train.return_value = (MagicMock(), MagicMock(), MagicMock())
|
||||
|
||||
result = cli_runner.invoke(
|
||||
cli,
|
||||
|
||||
@@ -75,7 +75,7 @@ class TestMixtral(unittest.TestCase):
|
||||
cli_args = TrainerCliArgs()
|
||||
dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
|
||||
|
||||
model, _ = train(cfg=cfg, dataset_meta=dataset_meta)
|
||||
model, _, _ = train(cfg=cfg, dataset_meta=dataset_meta)
|
||||
assert (
|
||||
model.base_model.model.model.layers[0].block_sparse_moe.gate.weight.dtype
|
||||
== torch.float32
|
||||
@@ -131,7 +131,7 @@ class TestMixtral(unittest.TestCase):
|
||||
cli_args = TrainerCliArgs()
|
||||
dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
|
||||
|
||||
model, _ = train(cfg=cfg, dataset_meta=dataset_meta)
|
||||
model, _, _ = train(cfg=cfg, dataset_meta=dataset_meta)
|
||||
assert (
|
||||
model.base_model.model.model.layers[0].block_sparse_moe.gate.weight.dtype
|
||||
== torch.float32
|
||||
@@ -190,7 +190,7 @@ class TestMixtral(unittest.TestCase):
|
||||
cli_args = TrainerCliArgs()
|
||||
dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
|
||||
|
||||
model, _ = train(cfg=cfg, dataset_meta=dataset_meta)
|
||||
model, _, _ = train(cfg=cfg, dataset_meta=dataset_meta)
|
||||
assert (
|
||||
model.base_model.model.model.layers[0].block_sparse_moe.gate.weight.dtype
|
||||
== torch.float32
|
||||
@@ -249,7 +249,7 @@ class TestMixtral(unittest.TestCase):
|
||||
cli_args = TrainerCliArgs()
|
||||
dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
|
||||
|
||||
model, _ = train(cfg=cfg, dataset_meta=dataset_meta)
|
||||
model, _, _ = train(cfg=cfg, dataset_meta=dataset_meta)
|
||||
assert (
|
||||
model.base_model.model.model.layers[0].block_sparse_moe.gate.weight.dtype
|
||||
== torch.float32
|
||||
|
||||
@@ -65,8 +65,9 @@ class TestCustomOptimizers(unittest.TestCase):
|
||||
cli_args = TrainerCliArgs()
|
||||
dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
|
||||
|
||||
train(cfg=cfg, dataset_meta=dataset_meta)
|
||||
_, _, trainer = train(cfg=cfg, dataset_meta=dataset_meta)
|
||||
check_model_output_exists(temp_dir, cfg)
|
||||
assert trainer.optimizer.optimizer.__class__.__name__ == "AdamW"
|
||||
|
||||
@with_temp_dir
|
||||
@require_torch_2_5_1
|
||||
@@ -111,8 +112,57 @@ class TestCustomOptimizers(unittest.TestCase):
|
||||
cli_args = TrainerCliArgs()
|
||||
dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
|
||||
|
||||
train(cfg=cfg, dataset_meta=dataset_meta)
|
||||
_, _, trainer = train(cfg=cfg, dataset_meta=dataset_meta)
|
||||
check_model_output_exists(temp_dir, cfg)
|
||||
assert "ADOPT" in trainer.optimizer.optimizer.__class__.__name__
|
||||
|
||||
@with_temp_dir
|
||||
@require_torch_2_5_1
|
||||
def test_muon(self, temp_dir):
|
||||
# pylint: disable=duplicate-code
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"base_model": "JackFram/llama-68m",
|
||||
"tokenizer_type": "LlamaTokenizer",
|
||||
"sequence_len": 1024,
|
||||
"load_in_8bit": True,
|
||||
"adapter": "lora",
|
||||
"lora_r": 8,
|
||||
"lora_alpha": 16,
|
||||
"lora_dropout": 0.05,
|
||||
"lora_target_linear": True,
|
||||
"val_set_size": 0.1,
|
||||
"special_tokens": {
|
||||
"unk_token": "<unk>",
|
||||
"bos_token": "<s>",
|
||||
"eos_token": "</s>",
|
||||
},
|
||||
"datasets": [
|
||||
{
|
||||
"path": "mhenrichsen/alpaca_2k_test",
|
||||
"type": "alpaca",
|
||||
},
|
||||
],
|
||||
"num_epochs": 1,
|
||||
"max_steps": 5,
|
||||
"micro_batch_size": 8,
|
||||
"gradient_accumulation_steps": 1,
|
||||
"output_dir": temp_dir,
|
||||
"learning_rate": 0.00001,
|
||||
"optimizer": "muon",
|
||||
"lr_scheduler": "cosine",
|
||||
"weight_decay": 0.01,
|
||||
}
|
||||
)
|
||||
|
||||
cfg = validate_config(cfg)
|
||||
normalize_config(cfg)
|
||||
cli_args = TrainerCliArgs()
|
||||
dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
|
||||
|
||||
_, _, trainer = train(cfg=cfg, dataset_meta=dataset_meta)
|
||||
check_model_output_exists(temp_dir, cfg)
|
||||
assert "Muon" in trainer.optimizer.optimizer.__class__.__name__
|
||||
|
||||
@with_temp_dir
|
||||
def test_fft_schedule_free_adamw(self, temp_dir):
|
||||
|
||||
Reference in New Issue
Block a user