Revert "checkpoint model on first step callback (#2906)"

This reverts commit 10ba1622f7.
2025-07-15 15:01:12 -04:00
162 changed files with 206 additions and 1016 deletions
--- a/.github/workflows/tests-nightly.yml
+++ b/.github/workflows/tests-nightly.yml
@@ -92,7 +92,7 @@ jobs:
    if: github.repository_owner == 'axolotl-ai-cloud'
    # this job needs to be run on self-hosted GPU runners...
    runs-on: [self-hosted, modal]
-    timeout-minutes: 120
+    timeout-minutes: 60
    needs: [pre-commit, pytest]
    strategy:
@@ -116,7 +116,7 @@ jobs:
      - name: Install Modal
        run: |
          python -m pip install --upgrade pip
-          pip install modal==1.0.2 jinja2
+          pip install modal==0.71.8 jinja2
      - name: Update env vars
        run: |
          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
--- a/examples/cloud/modal.yaml
+++ b/examples/cloud/modal.yaml
@@ -26,5 +26,3 @@ timeout: 86400
 # Preprocess specific configurations
 memory_preprocess: 32
 timeout_preprocess: 14400
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/cohere/command-r-7b-qlora.yml
+++ b/examples/cohere/command-r-7b-qlora.yml
@@ -35,6 +35,7 @@ wandb_watch:
 wandb_name:
 wandb_log_model:
 gradient_accumulation_steps: 4
 micro_batch_size: 1
 num_epochs: 4
@@ -55,5 +56,3 @@ evals_per_epoch:
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/colab-notebooks/colab-axolotl-example.ipynb
+++ b/examples/colab-notebooks/colab-axolotl-example.ipynb
@@ -40,7 +40,7 @@
        "%%capture\n",
        "# This step can take ~5-10 minutes to install dependencies\n",
        "!pip install --no-build-isolation axolotl[flash-attn]>=0.9.1\n",
-        "!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@50cef19\""
+        "!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@78b2a45713a54c9bedf8b33f5e31cf07a1a57154\""
      ]
    },
    {
--- a/examples/deepcogito/cogito-v1-preview-llama-3B-lora.yml
+++ b/examples/deepcogito/cogito-v1-preview-llama-3B-lora.yml
@@ -56,5 +56,3 @@ evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/deepcogito/cogito-v1-preview-qwen-14B-lora.yml
+++ b/examples/deepcogito/cogito-v1-preview-qwen-14B-lora.yml
@@ -56,5 +56,3 @@ evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/deepseek-v2/fft-fsdp-16b.yaml
+++ b/examples/deepseek-v2/fft-fsdp-16b.yaml
@@ -55,5 +55,3 @@ fsdp_config:
  fsdp_transformer_layer_cls_to_wrap: DeepseekV2DecoderLayer
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_sharding_strategy: FULL_SHARD
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/deepseek-v2/qlora-fsdp-2_5.yaml
+++ b/examples/deepseek-v2/qlora-fsdp-2_5.yaml
@@ -79,5 +79,3 @@ fsdp_config:
  fsdp_transformer_layer_cls_to_wrap: DeepseekV2DecoderLayer
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_sharding_strategy: FULL_SHARD
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/devstral/devstral-small-qlora.yml
+++ b/examples/devstral/devstral-small-qlora.yml
@@ -62,5 +62,3 @@ saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/falcon-h1/falcon-h1-1b-deep-qlora.yaml
+++ b/examples/falcon-h1/falcon-h1-1b-deep-qlora.yaml
@@ -69,5 +69,3 @@ evals_per_epoch:
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/falcon-h1/falcon-h1-1b-qlora.yaml
+++ b/examples/falcon-h1/falcon-h1-1b-qlora.yaml
@@ -46,6 +46,7 @@ wandb_watch:
 wandb_name:
 wandb_log_model:
 gradient_accumulation_steps: 4
 micro_batch_size: 1
 num_epochs: 4
@@ -68,5 +69,3 @@ evals_per_epoch:
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/falcon-h1/falcon-h1-34b-qlora.yaml
+++ b/examples/falcon-h1/falcon-h1-34b-qlora.yaml
@@ -69,5 +69,3 @@ evals_per_epoch:
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/falcon-h1/falcon-h1-3b-qlora.yaml
+++ b/examples/falcon-h1/falcon-h1-3b-qlora.yaml
@@ -69,5 +69,3 @@ evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/falcon-h1/falcon-h1-500m-qlora.yaml
+++ b/examples/falcon-h1/falcon-h1-500m-qlora.yaml
@@ -69,5 +69,3 @@ evals_per_epoch:
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/falcon-h1/falcon-h1-7b-qlora.yaml
+++ b/examples/falcon-h1/falcon-h1-7b-qlora.yaml
@@ -69,5 +69,3 @@ evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/gemma2/qlora.yml
+++ b/examples/gemma2/qlora.yml
@@ -60,5 +60,3 @@ evals_per_epoch:
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/gemma2/reward-model.yaml
+++ b/examples/gemma2/reward-model.yaml
@@ -50,5 +50,3 @@ evals_per_epoch:
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/gemma3/gemma-3-1b-qlora.yml
+++ b/examples/gemma3/gemma-3-1b-qlora.yml
@@ -66,5 +66,3 @@ evals_per_epoch:
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/gemma3/gemma-3-4b-qlora.yml
+++ b/examples/gemma3/gemma-3-4b-qlora.yml
@@ -60,5 +60,3 @@ warmup_ratio: 0.1
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/gemma3/gemma-3-4b-vision-qlora.yml
+++ b/examples/gemma3/gemma-3-4b-vision-qlora.yml
@@ -62,5 +62,3 @@ warmup_ratio: 0.1
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/glm4/qlora-32b.yaml
+++ b/examples/glm4/qlora-32b.yaml
@@ -60,5 +60,3 @@ evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/jamba/qlora.yaml
+++ b/examples/jamba/qlora.yaml
@@ -54,5 +54,3 @@ evals_per_epoch:
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/jamba/qlora_deepspeed.yaml
+++ b/examples/jamba/qlora_deepspeed.yaml
@@ -55,5 +55,3 @@ saves_per_epoch: 1
 deepspeed: deepspeed_configs/zero2.json
 weight_decay: 0.0
 special_tokens:
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/jamba/qlora_fsdp_large.yaml
+++ b/examples/jamba/qlora_fsdp_large.yaml
@@ -64,5 +64,3 @@ fsdp_config:
  fsdp_transformer_layer_cls_to_wrap: JambaAttentionDecoderLayer,JambaMambaDecoderLayer
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_sharding_strategy: FULL_SHARD
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/lfm2/lfm2-350m-fft.yaml
+++ b/examples/lfm2/lfm2-350m-fft.yaml
@@ -46,5 +46,3 @@ evals_per_epoch: 2
 saves_per_epoch: 1
 weight_decay: 0.0
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-2/fft_optimized.yml
+++ b/examples/llama-2/fft_optimized.yml
@@ -55,5 +55,3 @@ saves_per_epoch: 1
 deepspeed: #deepspeed_configs/zero2.json # multi-gpu only
 weight_decay: 0.1
 special_tokens:
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-2/gptq-lora.yml
+++ b/examples/llama-2/gptq-lora.yml
@@ -64,5 +64,3 @@ special_tokens:
  bos_token: "<s>"
  eos_token: "</s>"
  unk_token: "<unk>"
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-2/lisa.yml
+++ b/examples/llama-2/lisa.yml
@@ -60,5 +60,3 @@ special_tokens:
  bos_token: "<s>"
  eos_token: "</s>"
  unk_token: "<unk>"
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-2/loftq.yml
+++ b/examples/llama-2/loftq.yml
@@ -52,5 +52,3 @@ evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-2/lora.yml
+++ b/examples/llama-2/lora.yml
@@ -52,5 +52,3 @@ evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-2/qlora-fsdp.yml
+++ b/examples/llama-2/qlora-fsdp.yml
@@ -67,5 +67,3 @@ fsdp_config:
  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
  fsdp_state_dict_type: FULL_STATE_DICT
 special_tokens:
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-2/qlora.yml
+++ b/examples/llama-2/qlora.yml
@@ -53,5 +53,3 @@ evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-2/relora.yml
+++ b/examples/llama-2/relora.yml
@@ -58,5 +58,3 @@ special_tokens:
  bos_token: "<s>"
  eos_token: "</s>"
  unk_token: "<unk>"
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-3-vision/lora-11b.yaml
+++ b/examples/llama-3-vision/lora-11b.yaml
@@ -57,5 +57,3 @@ warmup_ratio: 0.1
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-3/3b-qat-fsdp2.yaml
+++ b/examples/llama-3/3b-qat-fsdp2.yaml
@@ -77,5 +77,3 @@ fsdp_config:
 special_tokens:
  pad_token: <|end_of_text|>
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-3/fft-8b-liger-fsdp.yaml
+++ b/examples/llama-3/fft-8b-liger-fsdp.yaml
@@ -72,5 +72,3 @@ fsdp_config:
 special_tokens:
  pad_token: <|finetune_right_pad_id|>
  eos_token: <|eot_id|>
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-3/fft-8b.yaml
+++ b/examples/llama-3/fft-8b.yaml
@@ -42,5 +42,3 @@ saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
  pad_token: <|end_of_text|>
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-3/instruct-dpo-lora-8b.yml
+++ b/examples/llama-3/instruct-dpo-lora-8b.yml
@@ -71,5 +71,3 @@ warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-3/instruct-lora-8b.yml
+++ b/examples/llama-3/instruct-lora-8b.yml
@@ -64,5 +64,3 @@ saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
   pad_token: <|end_of_text|>
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-3/lora-1b-deduplicate-dpo.yml
+++ b/examples/llama-3/lora-1b-deduplicate-dpo.yml
@@ -83,5 +83,3 @@ warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-3/lora-1b-deduplicate-sft.yml
+++ b/examples/llama-3/lora-1b-deduplicate-sft.yml
@@ -61,5 +61,3 @@ saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
   pad_token: <|end_of_text|>
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-3/lora-1b-kernels.yml
+++ b/examples/llama-3/lora-1b-kernels.yml
@@ -65,5 +65,3 @@ saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
  pad_token: "<|end_of_text|>"
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-3/lora-1b-ray.yml
+++ b/examples/llama-3/lora-1b-ray.yml
@@ -64,5 +64,3 @@ special_tokens:
 use_ray: true
 ray_num_workers: 4
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-3/lora-1b-sample-packing-sequentially.yml
+++ b/examples/llama-3/lora-1b-sample-packing-sequentially.yml
@@ -63,5 +63,3 @@ saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
  pad_token: <|end_of_text|>
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-3/lora-1b.yml
+++ b/examples/llama-3/lora-1b.yml
@@ -60,5 +60,3 @@ saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
  pad_token: "<|end_of_text|>"
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-3/lora-8b.yml
+++ b/examples/llama-3/lora-8b.yml
@@ -57,5 +57,3 @@ saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
   pad_token: <|end_of_text|>
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-3/qlora-1b-kto.yaml
+++ b/examples/llama-3/qlora-1b-kto.yaml
@@ -61,5 +61,3 @@ saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
  pad_token: "<|end_of_text|>"
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-3/qlora-1b.yml
+++ b/examples/llama-3/qlora-1b.yml
@@ -62,5 +62,3 @@ saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
  pad_token: "<|end_of_text|>"
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-3/qlora-fsdp-405b.yaml
+++ b/examples/llama-3/qlora-fsdp-405b.yaml
@@ -60,5 +60,3 @@ fsdp_config:
  fsdp_sharding_strategy: FULL_SHARD
 special_tokens:
  pad_token: <|finetune_right_pad_id|>
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-3/qlora-fsdp-70b.yaml
+++ b/examples/llama-3/qlora-fsdp-70b.yaml
@@ -69,5 +69,3 @@ fsdp_config:
  fsdp_sharding_strategy: FULL_SHARD
 special_tokens:
  pad_token: <|end_of_text|>
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-3/qlora.yml
+++ b/examples/llama-3/qlora.yml
@@ -54,5 +54,3 @@ saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
  pad_token: "<|end_of_text|>"
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-3/sparse-finetuning.yaml
+++ b/examples/llama-3/sparse-finetuning.yaml
@@ -75,5 +75,3 @@ llmcompressor:
          ]
          start: 0
  save_compressed: true
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-4/do-no-use-fa2/maverick-qlora-fsdp1.yaml
+++ b/examples/llama-4/do-no-use-fa2/maverick-qlora-fsdp1.yaml
@@ -86,5 +86,3 @@ fsdp_config:
 special_tokens:
  pad_token: <|finetune_right_pad_id|>
  eos_token: <|eot|>
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-4/do-no-use-fa2/scout-qlora-fsdp1.yaml
+++ b/examples/llama-4/do-no-use-fa2/scout-qlora-fsdp1.yaml
@@ -90,5 +90,3 @@ fsdp_config:
 special_tokens:
  pad_token: <|finetune_right_pad_id|>
  eos_token: <|eot|>
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-4/do-no-use-fa2/scout-qlora-single-h100.yaml
+++ b/examples/llama-4/do-no-use-fa2/scout-qlora-single-h100.yaml
@@ -83,5 +83,3 @@ weight_decay: 0.0
 special_tokens:
  pad_token: <|finetune_right_pad_id|>
  eos_token: <|eot|>
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-4/do-no-use-fa2/scout-vision-qlora-fsdp.yaml
+++ b/examples/llama-4/do-no-use-fa2/scout-vision-qlora-fsdp.yaml
@@ -86,5 +86,3 @@ fsdp_config:
 special_tokens:
  pad_token: <|finetune_right_pad_id|>
  eos_token: <|eot|>
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-4/scout-qlora-flexattn-fsdp2.yaml
+++ b/examples/llama-4/scout-qlora-flexattn-fsdp2.yaml
@@ -84,5 +84,3 @@ fsdp_config:
 special_tokens:
  pad_token: <|finetune_right_pad_id|>
  eos_token: <|eot|>
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-4/scout-qlora-single-h100-flex.yaml
+++ b/examples/llama-4/scout-qlora-single-h100-flex.yaml
@@ -82,5 +82,3 @@ weight_decay: 0.0
 special_tokens:
  pad_token: <|finetune_right_pad_id|>
  eos_token: <|eot|>
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-4/scout-vision-qlora-fsdp2-flex.yaml
+++ b/examples/llama-4/scout-vision-qlora-fsdp2-flex.yaml
@@ -87,5 +87,3 @@ fsdp_config:
 special_tokens:
  pad_token: <|finetune_right_pad_id|>
  eos_token: <|eot|>
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llava/lora-7b.yaml
+++ b/examples/llava/lora-7b.yaml
@@ -53,5 +53,3 @@ warmup_ratio: 0.1
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/magistral/magistral-small-fsdp-qlora.yaml
+++ b/examples/magistral/magistral-small-fsdp-qlora.yaml
@@ -70,5 +70,3 @@ fsdp_config:
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_transformer_layer_cls_to_wrap: MistralDecoderLayer
  fsdp_activation_checkpointing: true
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/magistral/magistral-small-qlora.yaml
+++ b/examples/magistral/magistral-small-qlora.yaml
@@ -61,5 +61,3 @@ flash_attention: true
 warmup_ratio: 0.1
 evals_per_epoch: 1
 saves_per_epoch: 1
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/mamba/config.yml
+++ b/examples/mamba/config.yml
@@ -48,5 +48,3 @@ weight_decay: 0.0
 special_tokens:
 tokens:
 save_safetensors: False
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/mistral/bigstral-ds-zero3.yaml
+++ b/examples/mistral/bigstral-ds-zero3.yaml
@@ -53,5 +53,3 @@ special_tokens:
  eos_token: "<|im_end|>"
 tokens:
  - "<|im_start|>"
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/mistral/config.yml
+++ b/examples/mistral/config.yml
@@ -43,5 +43,3 @@ evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/mistral/lora-mps.yml
+++ b/examples/mistral/lora-mps.yml
@@ -64,5 +64,3 @@ evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/mistral/lora.yml
+++ b/examples/mistral/lora.yml
@@ -64,5 +64,3 @@ evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/mistral/mistral-dpo-qlora.yml
+++ b/examples/mistral/mistral-dpo-qlora.yml
@@ -80,5 +80,3 @@ weight_decay: 0.0
 special_tokens:
  bos_token: "<|im_start|>"
  eos_token: "<|im_end|>"
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/mistral/mistral-qlora-fsdp.yml
+++ b/examples/mistral/mistral-qlora-fsdp.yml
@@ -74,5 +74,3 @@ fsdp_config:
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
 special_tokens:
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/mistral/mistral-qlora-orpo.yml
+++ b/examples/mistral/mistral-qlora-orpo.yml
@@ -69,5 +69,3 @@ evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/mistral/mistral-small-3.1-24B-lora.yml
+++ b/examples/mistral/mistral-small-3.1-24B-lora.yml
@@ -56,5 +56,3 @@ evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/mistral/mixtral-8x22b-qlora-fsdp.yml
+++ b/examples/mistral/mixtral-8x22b-qlora-fsdp.yml
@@ -72,5 +72,3 @@ fsdp_config:
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
 special_tokens:
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/mistral/mixtral-qlora-fsdp.yml
+++ b/examples/mistral/mixtral-qlora-fsdp.yml
@@ -77,5 +77,3 @@ fsdp_config:
  fsdp_forward_prefetch: false
  fsdp_backward_prefetch: BACKWARD_PRE
 special_tokens:
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/mistral/mixtral.yml
+++ b/examples/mistral/mixtral.yml
@@ -81,5 +81,3 @@ saves_per_epoch: 1
 deepspeed: deepspeed_configs/zero2.json
 weight_decay: 0.0
 special_tokens:
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/mistral/mixtral_22.yml
+++ b/examples/mistral/mixtral_22.yml
@@ -51,5 +51,3 @@ special_tokens:
  eos_token: "<|im_end|>"
 tokens:
  - "<|im_start|>"
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/mistral/qlora.yml
+++ b/examples/mistral/qlora.yml
@@ -64,5 +64,3 @@ evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/orpheus/finetune.yml
+++ b/examples/orpheus/finetune.yml
@@ -50,5 +50,3 @@ weight_decay: 0.05
 special_tokens:
  pad_token: <custom_token_7>
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/phi/lora-3.5.yaml
+++ b/examples/phi/lora-3.5.yaml
@@ -63,5 +63,3 @@ warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 4
 weight_decay: 0.0
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/phi/phi-ft.yml
+++ b/examples/phi/phi-ft.yml
@@ -57,5 +57,3 @@ weight_decay: 0.1
 resize_token_embeddings_to_32x: true
 special_tokens:
  pad_token: "<|endoftext|>"
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/phi/phi-qlora.yml
+++ b/examples/phi/phi-qlora.yml
@@ -60,5 +60,3 @@ weight_decay: 0.1
 resize_token_embeddings_to_32x: true
 special_tokens:
  pad_token: "<|endoftext|>"
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/phi/phi2-ft.yml
+++ b/examples/phi/phi2-ft.yml
@@ -57,5 +57,3 @@ weight_decay: 0.1
 resize_token_embeddings_to_32x: true
 special_tokens:
  pad_token: "<|endoftext|>"
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/phi/phi3-ft-fsdp.yml
+++ b/examples/phi/phi3-ft-fsdp.yml
@@ -71,5 +71,3 @@ fsdp_config:
 resize_token_embeddings_to_32x: true
 special_tokens:
  pad_token: "<|endoftext|>"
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/phi/phi3-ft.yml
+++ b/examples/phi/phi3-ft.yml
@@ -59,5 +59,3 @@ warmup_ratio: 0.2
 debug: true
 weight_decay: 0.1
 resize_token_embeddings_to_32x: true
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/pixtral/lora-12b.yml
+++ b/examples/pixtral/lora-12b.yml
@@ -55,5 +55,3 @@ saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
  pad_token: <pad>
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/qwen2-vl/lora-7b.yaml
+++ b/examples/qwen2-vl/lora-7b.yaml
@@ -53,5 +53,3 @@ warmup_ratio: 0.1
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/qwen2/dpo.yaml
+++ b/examples/qwen2/dpo.yaml
@@ -54,5 +54,3 @@ warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/qwen2/prm.yaml
+++ b/examples/qwen2/prm.yaml
@@ -55,5 +55,3 @@ eval_steps: 100
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/qwen2/qlora-fsdp.yaml
+++ b/examples/qwen2/qlora-fsdp.yaml
@@ -67,5 +67,3 @@ fsdp_config:
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_sharding_strategy: FULL_SHARD
 special_tokens:
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/qwen2/reward-model.yaml
+++ b/examples/qwen2/reward-model.yaml
@@ -26,6 +26,7 @@ wandb_watch:
 wandb_name:
 wandb_log_model:
 gradient_accumulation_steps: 4
 micro_batch_size: 2
 num_epochs: 4
@@ -49,5 +50,3 @@ evals_per_epoch:
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/qwen2_5-vl/lora-7b.yaml
+++ b/examples/qwen2_5-vl/lora-7b.yaml
@@ -53,5 +53,3 @@ warmup_ratio: 0.1
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/qwen3/32b-qlora.yaml
+++ b/examples/qwen3/32b-qlora.yaml
@@ -67,5 +67,3 @@ evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/qwen3/8b-qat-fsdp2.yml
+++ b/examples/qwen3/8b-qat-fsdp2.yml
@@ -76,5 +76,3 @@ fsdp_config:
  fsdp_activation_checkpointing: true
 special_tokens:
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/qwen3/qlora-fsdp.yaml
+++ b/examples/qwen3/qlora-fsdp.yaml
@@ -66,5 +66,3 @@ fsdp_config:
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_sharding_strategy: FULL_SHARD
 special_tokens:
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/requirements.txt
+++ b/requirements.txt
@@ -26,7 +26,7 @@ hf_transfer
 sentencepiece
 gradio==5.23.3
-modal==1.0.2
+modal==0.70.5
 pydantic==2.10.6
 addict
 fire
--- a/scripts/cutcrossentropy_install.py
+++ b/scripts/cutcrossentropy_install.py
@@ -29,5 +29,5 @@ UV_PREFIX = "uv " if USE_UV else ""
 print(
    UNINSTALL_PREFIX
-    + f'{UV_PREFIX}pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@50cef19"'
+    + f'{UV_PREFIX}pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@865b899"'
 )
--- a/src/axolotl/core/builders/base.py
+++ b/src/axolotl/core/builders/base.py
@@ -36,7 +36,6 @@ from axolotl.utils.callbacks import (
    GCCallback,
    GPUStatsCallback,
    SaveAxolotlConfigtoWandBCallback,
    SaveModelOnFirstStepCallback,
 )
 from axolotl.utils.callbacks.profiler import PytorchProfilerCallback
 from axolotl.utils.schemas.enums import CustomSupportedOptimizers
@@ -136,8 +135,6 @@ class TrainerBuilderBase(abc.ABC):
            callbacks.append(
                SaveAxolotlConfigtoCometCallback(self.cfg.axolotl_config_path)
            )
        if self.cfg.save_first_step:
            callbacks.append(SaveModelOnFirstStepCallback())
        callbacks.append(GPUStatsCallback(cfg=self.cfg))
--- a/src/axolotl/integrations/cut_cross_entropy/README.md
+++ b/src/axolotl/integrations/cut_cross_entropy/README.md
@@ -19,7 +19,7 @@ python scripts/cutcrossentropy_install.py | sh
 - If you are installing from pip
 ```bash
-pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@50cef19"
+pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@865b899"
 ```
 ## Usage
--- a/src/axolotl/integrations/cut_cross_entropy/init.py
+++ b/src/axolotl/integrations/cut_cross_entropy/init.py
@@ -19,13 +19,11 @@ Cut Cross Entropy is an optimized implementation of cross entropy loss
 from Apple's ML team.
 """
 import importlib
 from functools import partial
 import torch
 from axolotl.integrations.base import BasePlugin
 from axolotl.utils import get_pytorch_version
 from axolotl.utils.callbacks.models import get_causal_lm_model_cls_prefix
 from axolotl.utils.logging import get_logger
 from .args import CutCrossEntropyArgs  # pylint: disable=unused-import. # noqa: F401
@@ -34,7 +32,7 @@ LOG = get_logger(__name__)
 _CCE_INSTALL_MESSAGE = (
    "Please install Axolotl's fork of cut_cross_entropy with transformers support using "
-    '`pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@50cef19"`'
+    '`pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@865b899"`'
 )
@@ -86,7 +84,6 @@ class CutCrossEntropyPlugin(BasePlugin):
        """Apply cut cross entropy before model loading if enabled."""
        if cfg.cut_cross_entropy:
            self._check_requirements()
            self.patch_llama_like(cfg.model_config_type)
            from cut_cross_entropy.transformers.patch import cce_patch
@@ -96,48 +93,3 @@ class CutCrossEntropyPlugin(BasePlugin):
            # The patch checks model_type internally
            cce_patch(cfg.model_config_type)
    def patch_llama_like(
        self,
        model_type: str,
    ) -> None:
        """
        Register a generic, llama-style Cut Cross Entropy patch for `model_type`
        unless `PATCH_FNS` already has an entry for it.
        """
        from cut_cross_entropy.transformers.patch import PATCH_FNS
        # A patch function is already registered for this model type: nothing to add.
        if model_type in PATCH_FNS:
            return
        def patch_generic(
            maybe_model, patch_options, model_type: str
        ):  # pylint: disable=unused-argument
            """Point the `<Prefix>ForCausalLM.forward` of `model_type` at llama's `cce_forward`."""
            import cut_cross_entropy.transformers.llama
            from cut_cross_entropy.transformers.llama import cce_forward
            modeling_path = f"transformers.models.{model_type}.modeling_{model_type}"
            try:
                # Resolve the modeling module and its CausalLM class by name
                prefix, _ = get_causal_lm_model_cls_prefix(model_type)
                causal_lm_name = f"{prefix}ForCausalLM"
                modeling = __import__(modeling_path, fromlist=[causal_lm_name])
                causal_lm_cls = getattr(modeling, causal_lm_name)
                # Store the patch options on the llama module before swapping the forward
                cut_cross_entropy.transformers.llama._PATCH_OPTS = (  # pylint: disable=protected-access
                    patch_options
                )
                causal_lm_cls.forward = cce_forward
            # pylint: disable=duplicate-code
            except (ImportError, AttributeError) as err:
                raise RuntimeError(
                    f"Could not import ForCausalLM class for model_type: {model_type}. "
                    f"Error: {str(err)}"
                ) from err
        LOG.warning_once(
            "Setting up generic cce patch for model type: %s", model_type
        )
        LOG.warning_once(
            f"Generic Cut Cross Entropy + {model_type} support is experimental and may not work as expected."
        )
        # Bind model_type now so the registered callable only needs the remaining two arguments.
        PATCH_FNS[model_type] = partial(patch_generic, model_type=model_type)
--- a/src/axolotl/integrations/kd/kernels/models.py
+++ b/src/axolotl/integrations/kd/kernels/models.py
@@ -22,8 +22,6 @@ except ImportError:
        TransformersKwargs,
    )
 from axolotl.utils.callbacks.models import get_causal_lm_model_cls_prefix
 def kldiv_forward_llama_like(
    self,
@@ -99,7 +97,7 @@ def kldiv_forward_llama_like(
 def apply_kernel(model_type):
    # Dynamically import the module and attention class
    module_path = f"transformers.models.{model_type}.modeling_{model_type}"
-    model_cls_prefix, _ = get_causal_lm_model_cls_prefix(model_type)
+    model_cls_prefix = "".join([part.capitalize() for part in model_type.split("_")])
    module = __import__(module_path, fromlist=[f"{model_cls_prefix}ForCausalLM"])
    model_cls = getattr(module, f"{model_cls_prefix}ForCausalLM")
    model_cls.forward = kldiv_forward_llama_like
--- a/src/axolotl/integrations/liger/init.py
+++ b/src/axolotl/integrations/liger/init.py
@@ -18,10 +18,170 @@ Module for the Plugin for LIGER integraton with Axolotl.
 Liger Kernel is the collection of Triton-native kernels for LLM Training.
 It is designed to be performant, correct, and light-weight.
 """
-from .args import LigerArgs
+import inspect
-from .plugin import LigerPlugin
+import sys
-__all__ = [
+from axolotl.integrations.base import BasePlugin
-    "LigerArgs",
+from axolotl.utils.logging import get_logger
-    "LigerPlugin",
+
-]
+from .args import LigerArgs  # pylint: disable=unused-import. # noqa: F401
 from .utils import patch_with_compile_disable
 LOG = get_logger(__name__)
 class LigerPlugin(BasePlugin):
    """
    Plugin for LIGER integration with Axolotl.
    Before the model is loaded, replaces selected modeling components with their
    Liger Kernel counterparts according to the `liger_*` options on the config.
    """
    def get_input_args(self):
        """Return the import path of `LigerArgs` as a string."""
        return "axolotl.integrations.liger.LigerArgs"
    def pre_model_load(self, cfg):
        """
        Monkey-patch the modeling code for `cfg.model_config_type` with Liger kernels.
        Args:
            cfg: config object; this method reads `torch_compile`, `model_config_type`,
                `base_model`, `trust_remote_code` and the `liger_*` flags from it.
        Raises:
            ValueError: if both `liger_cross_entropy` and
                `liger_fused_linear_cross_entropy` are set.
        """
        if cfg.torch_compile:
            # torch compile will unnecessarily attempt to optimize the triton kernel unless explicitly disabled
            import liger_kernel.ops.fused_linear_cross_entropy
            patch_with_compile_disable(
                liger_kernel.ops.fused_linear_cross_entropy,
                "fused_linear_cross_entropy_forward",
            )
            patch_with_compile_disable(
                liger_kernel.ops.fused_linear_cross_entropy,
                "fused_linear_cross_entropy_backward",
            )
        from liger_kernel.transformers.cross_entropy import LigerCrossEntropyLoss
        from liger_kernel.transformers.functional import liger_cross_entropy
        from liger_kernel.transformers.layer_norm import LigerLayerNorm
        from liger_kernel.transformers.monkey_patch import MODEL_TYPE_TO_APPLY_LIGER_FN
        from liger_kernel.transformers.rms_norm import LigerRMSNorm
        from liger_kernel.transformers.rope import liger_rotary_pos_emb
        from liger_kernel.transformers.swiglu import LigerSwiGLUMLP
        if cfg.liger_cross_entropy and cfg.liger_fused_linear_cross_entropy:
            raise ValueError(
                "Cannot have both `liger_cross_entropy` and `liger_fused_linear_cross_entropy` set."
            )
        if cfg.model_config_type in MODEL_TYPE_TO_APPLY_LIGER_FN:
            # Liger ships an apply function for this model type; only pass the
            # keyword arguments that its signature actually declares.
            apply_liger_fn = MODEL_TYPE_TO_APPLY_LIGER_FN[cfg.model_config_type]
            liger_fn_sig = inspect.signature(apply_liger_fn)
            kwargs = {}
            if "rope" in liger_fn_sig.parameters:
                kwargs["rope"] = cfg.liger_rope
            if "cross_entropy" in liger_fn_sig.parameters:
                kwargs["cross_entropy"] = cfg.liger_cross_entropy
            if "fused_linear_cross_entropy" in liger_fn_sig.parameters:
                kwargs["fused_linear_cross_entropy"] = (
                    cfg.liger_fused_linear_cross_entropy
                )
            if "rms_norm" in liger_fn_sig.parameters:
                kwargs["rms_norm"] = cfg.liger_rms_norm
            if "layer_norm" in liger_fn_sig.parameters:
                kwargs["layer_norm"] = cfg.liger_layer_norm
            # The GLU option maps onto whichever of `geglu` / `swiglu` the function accepts.
            if "geglu" in liger_fn_sig.parameters:
                kwargs["geglu"] = cfg.liger_glu_activation
            elif "swiglu" in liger_fn_sig.parameters:
                kwargs["swiglu"] = cfg.liger_glu_activation
            LOG.info(f"Applying LIGER to {cfg.model_config_type} with kwargs: {kwargs}")
            apply_liger_fn(**kwargs)
        elif cfg.model_config_type == "jamba":
            from transformers.models.jamba import modeling_jamba
            from .models.jamba import lce_forward as jamba_lce_forward
            if cfg.liger_rope:
                modeling_jamba.apply_rotary_pos_emb = liger_rotary_pos_emb
            if cfg.liger_rms_norm:
                modeling_jamba.JambaRMSNorm = LigerRMSNorm
            if cfg.liger_glu_activation:
                modeling_jamba.JambaMLP = LigerSwiGLUMLP
            if cfg.liger_layer_norm:
                modeling_jamba.nn.LayerNorm = LigerLayerNorm
            if cfg.liger_cross_entropy:
                from transformers.loss.loss_utils import nn
                nn.functional.cross_entropy = liger_cross_entropy
            if cfg.liger_fused_linear_cross_entropy:
                modeling_jamba.JambaForCausalLM.forward = jamba_lce_forward
        elif cfg.model_config_type == "deepseek_v2":
            from accelerate import init_empty_weights
            from transformers import AutoModelForCausalLM
            # Instantiate the model under init_empty_weights only to discover which
            # module defines its class, so that module's attributes can be patched.
            with init_empty_weights():
                model = AutoModelForCausalLM.from_pretrained(
                    cfg.base_model, trust_remote_code=cfg.trust_remote_code or False
                )
                modeling_mod = sys.modules[model.__class__.__module__]
            from .models.deepseekv2 import lce_forward as deepseekv2_lce_forward
            if cfg.liger_rope:
                # The DeepseekV2 version of RoPE is different than upstream LLaMA.
                # See https://github.com/linkedin/Liger-Kernel/issues/129#issuecomment-2313763528
                LOG.warning("Fused liger_rope is not supported for DeepseekV2.")
            if cfg.liger_glu_activation:
                LOG.warning("liger_glu_activation is not supported for DeepseekV2.")
            if cfg.liger_rms_norm:
                modeling_mod.DeepseekV2RMSNorm = LigerRMSNorm
            if cfg.liger_glu_activation:
                # NOTE(review): the warning above says this option is unsupported, yet the
                # MLP forward is still patched here - confirm which of the two is intended.
                modeling_mod.DeepseekV2MLP.forward = LigerSwiGLUMLP.forward
            if cfg.liger_layer_norm:
                # Do not assign LigerLayerNorm.forward to DeepseekV2MLP.forward: that would
                # put a layer-norm forward on the MLP class and overwrite the SwiGLU patch
                # made just above. Warn and leave the modeling code untouched instead.
                LOG.warning("liger_layer_norm is not supported for DeepseekV2.")
            if cfg.liger_cross_entropy:
                # We do not patch `nn.functional.cross_entropy` for DeepseekV2 as it still uses
                # nn.CrossEntropyLoss in the forward method.
                modeling_mod.CrossEntropyLoss = LigerCrossEntropyLoss
            if cfg.liger_fused_linear_cross_entropy:
                modeling_mod.DeepseekV2ForCausalLM.forward = deepseekv2_lce_forward
        elif cfg.model_config_type == "llama4":
            from axolotl.integrations.liger.models.llama4 import (
                apply_liger_kernel_to_llama4,
            )
            apply_liger_kernel_to_llama4(
                cross_entropy=cfg.liger_cross_entropy,
                fused_linear_cross_entropy=cfg.liger_fused_linear_cross_entropy,
                glu_activation=cfg.liger_glu_activation,
                rms_norm=cfg.liger_rms_norm,
                layer_norm=cfg.liger_layer_norm,
            )
        elif cfg.model_config_type == "qwen3":
            from axolotl.integrations.liger.models.qwen3 import (
                apply_liger_kernel_to_qwen3,
            )
            apply_liger_kernel_to_qwen3(
                cross_entropy=cfg.liger_cross_entropy,
                fused_linear_cross_entropy=cfg.liger_fused_linear_cross_entropy,
                glu_activation=cfg.liger_glu_activation,
                rms_norm=cfg.liger_rms_norm,
                layer_norm=cfg.liger_layer_norm,
            )
        elif cfg.model_config_type == "qwen3_moe":
            from axolotl.integrations.liger.models.qwen3_moe import (
                apply_liger_kernel_to_qwen3_moe,
            )
            apply_liger_kernel_to_qwen3_moe(
                cross_entropy=cfg.liger_cross_entropy,
                fused_linear_cross_entropy=cfg.liger_fused_linear_cross_entropy,
                glu_activation=cfg.liger_glu_activation,
                rms_norm=cfg.liger_rms_norm,
                layer_norm=cfg.liger_layer_norm,
            )
        elif cfg.model_config_type == "granitemoe":
            # granitemoe is routed through Liger's granite apply function.
            from liger_kernel.transformers import apply_liger_kernel_to_granite
            apply_liger_kernel_to_granite(
                rope=cfg.liger_rope,
                cross_entropy=cfg.liger_cross_entropy,
                fused_linear_cross_entropy=cfg.liger_fused_linear_cross_entropy,
                rms_norm=cfg.liger_rms_norm,
                swiglu=cfg.liger_glu_activation,
            )
        else:
            LOG.warning(
                f"Unsupported model config type: {cfg.model_config_type}. Liger not applied."
            )
--- a/Show More
+++ b/Show More