Fix Axolotl ReLoRA optimizer reset scope (#3646)

* Fix Axolotl ReLoRA optimizer reset scope
* fix: make relora reset method honor relora_prune_ratio

When relora_prune_method='reset' and relora_prune_ratio was explicitly
set, the ratio was silently ignored and replaced with the hardcoded
_FULL_RESET_RATIO (0.999). Fix by moving the default-ratio logic to
ReLoRACallback.on_step_begin: a ratio of None maps to _FULL_RESET_RATIO
for 'reset' and to 0.9 for other methods. reset_optimizer now uses the
same random pruning path for both 'random' and 'reset'.

Also consolidate the mismatched three-layer defaults: the schema default
for relora_prune_method is now 'magnitude' (the single canonical source),
and the dataclass defaults for both fields are changed to None to
eliminate the conflicting fallback layer.

Tests updated: removed the test case that verified the old broken
behavior (reset ignoring the ratio) and added two cases proving that
reset honors the passed ratio. The E2E reset fixture now uses ratio=0.5
to make it unambiguous that the ratio is honored.

* Fix ReLoRA uint8 pruning regression

---------

Signed-off-by: Wing Lian <wing@axolotl.ai>
Co-authored-by: Axolotl Swarm <no-reply@axolotl.ai>
This commit is contained in:
Wing Lian
2026-05-09 17:52:35 -04:00
committed by GitHub
parent 5352d41d32
commit e2f01de0e8
7 changed files with 361 additions and 37 deletions

View File

@@ -56,7 +56,72 @@ class TestReLoraLlama(unittest.TestCase):
],
"warmup_steps": 10,
"num_epochs": 2,
"max_steps": 105, # at least 2x relora_steps
"max_steps": 105, # at least 2x restart cadence
"micro_batch_size": 2,
"gradient_accumulation_steps": 1,
"output_dir": temp_dir,
"learning_rate": 0.00001,
"optimizer": "adamw_8bit",
"lr_scheduler": "cosine",
"use_tensorboard": True,
"save_first_step": False,
}
)
cfg = validate_config(cfg)
normalize_config(cfg)
dataset_meta = load_datasets(cfg=cfg)
train(cfg=cfg, dataset_meta=dataset_meta)
check_model_output_exists(Path(temp_dir) / "checkpoint-100/adapter", cfg)
assert (Path(temp_dir) / "checkpoint-100/relora/model.safetensors").exists(), (
"Relora model checkpoint not found"
)
check_tensorboard(
temp_dir + "/runs", "train/grad_norm", 0.2, "grad_norm is too high"
)
@with_temp_dir
def test_relora_reset_method(self, temp_dir):
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
"sequence_len": 2048,
"sample_packing": True,
"pad_to_sequence_len": True,
"flash_attention": True,
"load_in_8bit": True,
"adapter": "lora",
"lora_r": 8,
"lora_alpha": 16,
"lora_dropout": 0.05,
"lora_target_modules": ["q_proj", "v_proj"],
"relora": True,
"jagged_restart_steps": 50,
"jagged_restart_warmup_steps": 10,
"jagged_restart_anneal_steps": 10,
"relora_prune_ratio": 0.5, # explicitly honored by reset (not ignored)
"relora_prune_method": "reset",
"relora_cpu_offload": True,
"val_set_size": 0.0,
"special_tokens": {
"pad_token": "<|endoftext|>",
},
"chat_template": "chatml",
"datasets": [
{
"path": "mlabonne/FineTome-100k",
"type": "chat_template",
"split": "train[:10%]",
"field_messages": "conversations",
"message_field_role": "from",
"message_field_content": "value",
},
],
"warmup_steps": 10,
"num_epochs": 2,
"max_steps": 105,
"micro_batch_size": 2,
"gradient_accumulation_steps": 1,
"output_dir": temp_dir,