diff --git a/examples/glm4/glm4.5-fft-fsdp2-offload.yaml b/examples/glm4/glm4.5-fft-fsdp2-offload.yaml
new file mode 100644
index 000000000..eb1f59607
--- /dev/null
+++ b/examples/glm4/glm4.5-fft-fsdp2-offload.yaml
@@ -0,0 +1,63 @@
+base_model: zai-org/GLM-4.5-Air
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
+
+plugins:
+  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
+
+experimental_skip_move_to_device: true # prevent OOM by NOT putting model to GPU before sharding
+
+datasets:
+  - path: winglian/pirate-ultrachat-10k
+    type: chat_template
+dataset_prepared_path: last_run_prepared
+val_set_size: 0
+output_dir: ./outputs/out
+
+sequence_len: 2048
+sample_packing: true
+eval_sample_packing: true
+
+# Full fine-tune: no `adapter` is configured here, so LoRA/QLoRA options are intentionally omitted.
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+gradient_accumulation_steps: 1
+micro_batch_size: 1
+num_epochs: 1
+optimizer: adamw_torch_4bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+bf16: auto
+tf32: false
+
+# gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+loss_watchdog_threshold: 5.0
+loss_watchdog_patience: 3
+
+warmup_ratio: 0.1
+evals_per_epoch: 1
+saves_per_epoch: 1
+weight_decay: 0.0
+special_tokens:
+
+fsdp_version: 2
+fsdp_config:
+  offload_params: true
+  cpu_ram_efficient_loading: true
+  auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  transformer_layer_cls_to_wrap: Glm4MoeDecoderLayer
+  state_dict_type: FULL_STATE_DICT
+  reshard_after_forward: true
+  activation_checkpointing: true
+
+# save_first_step: true # uncomment this to validate checkpoint saving works with your config
diff --git a/examples/glm4/glm4.5-qlora.yaml b/examples/glm4/glm4.5-qlora.yaml
index bc5a42500..f7b9236c7 100644
--- a/examples/glm4/glm4.5-qlora.yaml
+++ b/examples/glm4/glm4.5-qlora.yaml
@@ -2,6 +2,9 @@ base_model: zai-org/GLM-4.5-Air
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
 
+plugins:
+  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
+
 load_in_4bit: true
 
 datasets:
@@ -40,7 +43,7 @@ wandb_log_model:
 gradient_accumulation_steps: 2
 micro_batch_size: 2
 num_epochs: 1
-optimizer: adamw_8bit
+optimizer: adamw_torch_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002
 