From a28eb600e90eb5aeabe358a83ed42a2a610c97f6 Mon Sep 17 00:00:00 2001
From: NanoCode012
Date: Wed, 13 Aug 2025 13:57:15 +0700
Subject: [PATCH] feat: add readme and better examples

---
 examples/glm45/README.md                      | 63 +++++++++++++++++++
 ...dp2-offload.yaml => glm4.5-fft-fsdp2.yaml} |  6 +--
 ...m4.5-qlora.yaml => glm4.5-lora-fsdp2.yaml} | 31 +++++++-----
 3 files changed, 84 insertions(+), 16 deletions(-)
 create mode 100644 examples/glm45/README.md
 rename examples/glm45/{glm4.5-fft-fsdp2-offload.yaml => glm4.5-fft-fsdp2.yaml} (88%)
 rename examples/glm45/{glm4.5-qlora.yaml => glm4.5-lora-fsdp2.yaml} (65%)

diff --git a/examples/glm45/README.md b/examples/glm45/README.md
new file mode 100644
index 000000000..ec9be2f8b
--- /dev/null
+++ b/examples/glm45/README.md
@@ -0,0 +1,63 @@
+# Finetune GLM4.5 with Axolotl
+
+[UNSTABLE]
+
+```bash
+# LoRA SFT (4xH200 @ 84GB/GPU)
+axolotl train examples/glm45/glm4.5-lora-fsdp2.yaml
+
+# FFT SFT (4xH200)
+# Known issue: checkpointing error on backward pass
+# Without checkpointing => OOM
+axolotl train examples/glm45/glm4.5-fft-fsdp2.yaml
+```
+
+## Dataset
+
+In addition to the normal OpenAI Messages format, GLM4.5 supports an extra parameter for thinking content in the assistant message.
+
+```json
+{
+    "role": "assistant",
+    "reasoning_content": "...", // optional: the thinking can instead be embedded in `content`
+    "content": "..."
+}
+```
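+
+For reference, a complete sample row might look like the following (values are illustrative; this assumes the default `messages` field of `chat_template` datasets):
+
+```json
+{
+  "messages": [
+    {"role": "user", "content": "What is 2 + 2?"},
+    {
+      "role": "assistant",
+      "reasoning_content": "The user asks for a simple sum: 2 + 2 = 4.",
+      "content": "2 + 2 is 4."
+    }
+  ]
+}
+```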
+
+Note:
+- The role name for tools in this template is `tool`.
+- You will see the following Axolotl warning. This is expected, as the template does not use the EOS token.
+```bash
+EOS token '<|endoftext|>' not found in chat_template. Please check if your template/EOS token is correct.
+```
+- Make sure to set the extra attributes below if needed:
+```yaml
+datasets:
+  - path: ...
+    type: chat_template
+    message_property_mappings:
+      role: role
+      content: content
+
+    # tool_calls: tool_calls # uncomment if using tools
+    # reasoning_content: reasoning_content # uncomment if you have reasoning
+
+# Uncomment if training on the tool role (you would rarely if ever need this)
+# eot_tokens:
+#   - <|observation|>
+```
diff --git a/examples/glm45/glm4.5-fft-fsdp2-offload.yaml b/examples/glm45/glm4.5-fft-fsdp2.yaml
similarity index 88%
rename from examples/glm45/glm4.5-fft-fsdp2-offload.yaml
rename to examples/glm45/glm4.5-fft-fsdp2.yaml
index c23dd2b4d..6dc62f04d 100644
--- a/examples/glm45/glm4.5-fft-fsdp2-offload.yaml
+++ b/examples/glm45/glm4.5-fft-fsdp2.yaml
@@ -50,12 +50,10 @@ special_tokens:
 
 fsdp_version: 2
 fsdp_config:
-  offload_params: true
+  offload_params: false
   cpu_ram_efficient_loading: true
   auto_wrap_policy: TRANSFORMER_BASED_WRAP
   transformer_layer_cls_to_wrap: Glm4MoeDecoderLayer
-  state_dict_type: FULL_STATE_DICT
+  state_dict_type: SHARDED_STATE_DICT
   reshard_after_forward: true
   activation_checkpointing: true
-
-# save_first_step: true # uncomment this to validate checkpoint saving works with your config
diff --git a/examples/glm45/glm4.5-qlora.yaml b/examples/glm45/glm4.5-lora-fsdp2.yaml
similarity index 65%
rename from examples/glm45/glm4.5-qlora.yaml
rename to examples/glm45/glm4.5-lora-fsdp2.yaml
index f7b9236c7..bdef9465d 100644
--- a/examples/glm45/glm4.5-qlora.yaml
+++ b/examples/glm45/glm4.5-lora-fsdp2.yaml
@@ -5,7 +5,7 @@ base_model: zai-org/GLM-4.5-Air
 plugins:
   - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
 
-load_in_4bit: true
+experimental_skip_move_to_device: true # prevent OOM by NOT moving the model to the GPU before sharding
 
 datasets:
   - path: winglian/pirate-ultrachat-10k
@@ -14,14 +14,9 @@ dataset_prepared_path: last_run_prepared
 val_set_size: 0
 output_dir: ./outputs/qlora-out
 
-adapter: qlora
+adapter: lora
 lora_model_dir:
 
-sequence_len: 2048
-sample_packing: true
-eval_sample_packing: true
-
-
 lora_r: 16
 lora_alpha: 32
 lora_dropout: 0.05
@@ -34,23 +29,27 @@ lora_target_modules:
   - k_proj
   - o_proj
 
+sequence_len: 2048
+sample_packing: true
+eval_sample_packing: true
+
 wandb_project:
 wandb_entity:
 wandb_watch:
 wandb_name:
 wandb_log_model:
 
-gradient_accumulation_steps: 2
-micro_batch_size: 2
+gradient_accumulation_steps: 1
+micro_batch_size: 1
 num_epochs: 1
-optimizer: adamw_torch_8bit
+optimizer: adamw_torch_4bit
 lr_scheduler: cosine
 learning_rate: 0.0002
 
 bf16: auto
 tf32: false
 
-gradient_checkpointing: true
+# gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
@@ -64,4 +63,12 @@ saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
 
-# save_first_step: true # uncomment this to validate checkpoint saving works with your config
+fsdp_version: 2
+fsdp_config:
+  offload_params: false
+  cpu_ram_efficient_loading: true
+  auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  transformer_layer_cls_to_wrap: Glm4MoeDecoderLayer
+  state_dict_type: SHARDED_STATE_DICT
+  reshard_after_forward: true
+  # activation_checkpointing: false