From a28eb600e90eb5aeabe358a83ed42a2a610c97f6 Mon Sep 17 00:00:00 2001
From: NanoCode012
Date: Wed, 13 Aug 2025 13:57:15 +0700
Subject: [PATCH] feat: add readme and better examples

---
 examples/glm45/README.md                      | 63 +++++++++++++++++++
 ...dp2-offload.yaml => glm4.5-fft-fsdp2.yaml} |  6 +--
 ...m4.5-qlora.yaml => glm4.5-lora-fsdp2.yaml} | 31 +++++++-----
 3 files changed, 84 insertions(+), 16 deletions(-)
 create mode 100644 examples/glm45/README.md
 rename examples/glm45/{glm4.5-fft-fsdp2-offload.yaml => glm4.5-fft-fsdp2.yaml} (88%)
 rename examples/glm45/{glm4.5-qlora.yaml => glm4.5-lora-fsdp2.yaml} (65%)

diff --git a/examples/glm45/README.md b/examples/glm45/README.md
new file mode 100644
index 000000000..ec9be2f8b
--- /dev/null
+++ b/examples/glm45/README.md
@@ -0,0 +1,63 @@
+# Finetune GLM4.5 with Axolotl
+
+[UNSTABLE]
+
+```bash
+# LoRA SFT (4xH200 @ 84GB/GPU)
+axolotl train examples/glm45/glm4.5-lora-fsdp2.yaml
+
+# FFT SFT (4xH200)
+# Known issue: checkpointing error on backward pass
+# Without checkpointing => OOM
+axolotl train examples/glm45/glm4.5-fft-fsdp2.yaml
+```
+
+## Dataset
+
+In addition to the normal OpenAI Messages format, GLM4.5 supports an extra parameter for thinking content in the assistant message.
+
+```json
+{
+    "role": "assistant",
+    "reasoning_content": "...", // optional: the thinking can instead be embedded in `content`
+    "content": "..."
+}
+```
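+
+For reference, a complete sample row might look like the following (values are illustrative; this assumes the default `messages` field of `chat_template` datasets):
+
+```json
+{
+  "messages": [
+    {"role": "user", "content": "What is 2 + 2?"},
+    {
+      "role": "assistant",
+      "reasoning_content": "The user asks for a simple sum: 2 + 2 = 4.",
+      "content": "2 + 2 is 4."
+    }
+  ]
+}
+```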
+
+Note:
+- The role name for tools in this template is `tool`.
+- You will see the following Axolotl warning. This is expected, as the template does not use the EOS token.
+```bash
+EOS token '<|endoftext|>' not found in chat_template. Please check if your template/EOS token is correct.
+```
+- Make sure to set the extra attributes below if needed:
+```yaml
+datasets:
+  - path: ...
+    type: chat_template
+    message_property_mappings:
+      role: role
+      content: content
+
+    # tool_calls: tool_calls # uncomment if using tools
+    # reasoning_content: reasoning_content # uncomment if you have reasoning
+
+# Uncomment if training on the tool role (you would rarely if ever need this)
+# eot_tokens:
+#   - <|observation|>
+```
diff --git a/examples/glm45/glm4.5-fft-fsdp2-offload.yaml b/examples/glm45/glm4.5-fft-fsdp2.yaml
similarity index 88%
rename from examples/glm45/glm4.5-fft-fsdp2-offload.yaml
rename to examples/glm45/glm4.5-fft-fsdp2.yaml
index c23dd2b4d..6dc62f04d 100644
--- a/examples/glm45/glm4.5-fft-fsdp2-offload.yaml
+++ b/examples/glm45/glm4.5-fft-fsdp2.yaml
@@ -50,12 +50,10 @@ special_tokens:
 
 fsdp_version: 2
 fsdp_config:
-  offload_params: true
+  offload_params: false
   cpu_ram_efficient_loading: true
   auto_wrap_policy: TRANSFORMER_BASED_WRAP
   transformer_layer_cls_to_wrap: Glm4MoeDecoderLayer
-  state_dict_type: FULL_STATE_DICT
+  state_dict_type: SHARDED_STATE_DICT
   reshard_after_forward: true
   activation_checkpointing: true
-
-# save_first_step: true # uncomment this to validate checkpoint saving works with your config
diff --git a/examples/glm45/glm4.5-qlora.yaml b/examples/glm45/glm4.5-lora-fsdp2.yaml
similarity index 65%
rename from examples/glm45/glm4.5-qlora.yaml
rename to examples/glm45/glm4.5-lora-fsdp2.yaml
index f7b9236c7..bdef9465d 100644
--- a/examples/glm45/glm4.5-qlora.yaml
+++ b/examples/glm45/glm4.5-lora-fsdp2.yaml
@@ -5,7 +5,7 @@ base_model: zai-org/GLM-4.5-Air
 plugins:
   - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
 
-load_in_4bit: true
+experimental_skip_move_to_device: true # prevent OOM by NOT moving the model to the GPU before sharding
 
 datasets:
   - path: winglian/pirate-ultrachat-10k
@@ -14,14 +14,9 @@ dataset_prepared_path: last_run_prepared
 val_set_size: 0
 output_dir: ./outputs/qlora-out
 
-adapter: qlora
+adapter: lora
 lora_model_dir:
 
-sequence_len: 2048
-sample_packing: true
-eval_sample_packing: true
-
-
 lora_r: 16
 lora_alpha: 32
 lora_dropout: 0.05
@@ -34,23 +29,27 @@ lora_target_modules:
   - k_proj
   - o_proj
 
+sequence_len: 2048
+sample_packing: true
+eval_sample_packing: true
+
 wandb_project:
 wandb_entity:
 wandb_watch:
 wandb_name:
 wandb_log_model:
 
-gradient_accumulation_steps: 2
-micro_batch_size: 2
+gradient_accumulation_steps: 1
+micro_batch_size: 1
 num_epochs: 1
-optimizer: adamw_torch_8bit
+optimizer: adamw_torch_4bit
 lr_scheduler: cosine
 learning_rate: 0.0002
 
 bf16: auto
 tf32: false
 
-gradient_checkpointing: true
+# gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
@@ -64,4 +63,12 @@ saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
 
-# save_first_step: true # uncomment this to validate checkpoint saving works with your config
+fsdp_version: 2
+fsdp_config:
+  offload_params: false
+  cpu_ram_efficient_loading: true
+  auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  transformer_layer_cls_to_wrap: Glm4MoeDecoderLayer
+  state_dict_type: SHARDED_STATE_DICT
+  reshard_after_forward: true
+  # activation_checkpointing: false