feat: add glm and glm4 multipack and cce (#2546)

* feat: add glm and glm4 multipack * feat: add glm4 example * feat: add cce for glm
2025-04-23 21:27:51 +07:00
parent 32e335dd51
commit a6d28d19b1
5 changed files with 129 additions and 0 deletions
--- a/examples/glm4/qlora-32b.yaml
+++ b/examples/glm4/qlora-32b.yaml
@@ -0,0 +1,62 @@
 base_model: THUDM/GLM-4-32B-0414
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
 load_in_4bit: true
 datasets:
  - path: teknium/GPT4-LLM-Cleaned
    type: alpaca
 dataset_prepared_path: last_run_prepared
 val_set_size: 0
 output_dir: ./outputs/qlora-out
 adapter: qlora
 lora_model_dir:
 sequence_len: 2048
 sample_packing: true
 eval_sample_packing: true
 pad_to_sequence_len: true
 lora_r: 16
 lora_alpha: 32
 lora_dropout: 0.05
 lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj
 wandb_project:
 wandb_entity:
 wandb_watch:
 wandb_name:
 wandb_log_model:
 gradient_accumulation_steps: 2
 micro_batch_size: 2
 num_epochs: 1
 optimizer: adamw_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002
 bf16: auto
 tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
 warmup_steps: 10
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
--- a/src/axolotl/integrations/cut_cross_entropy/README.md
+++ b/src/axolotl/integrations/cut_cross_entropy/README.md
@@ -47,6 +47,8 @@ cut_cross_entropy: true
 - qwen2
 - cohere
 - cohere2
 - glm
 - glm4
 ## Citation
--- a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/glm4.py
+++ b/src/axolotl/integrations/cut_cross_entropy/monkeypatch/glm4.py
@@ -0,0 +1,57 @@
 """GLM 4 patch. GLM family inherits from Llama."""
 from types import MethodType
 import transformers
 from cut_cross_entropy.transformers.utils import (
    PatchOptions,
    TransformersModelT,
 )
 def patch_glm(
    maybe_model: TransformersModelT | str | transformers.PretrainedConfig,
    patch_options: PatchOptions,
 ) -> TransformersModelT | None:
    # Set the _PATCH_OPTS in the llama patch file
    import cut_cross_entropy.transformers.llama as llama_patch
    llama_patch._PATCH_OPTS = patch_options  # pylint: disable=protected-access
    from cut_cross_entropy.transformers.llama import cce_forward
    from transformers.models.glm import modeling_glm
    if isinstance(maybe_model, transformers.PreTrainedModel):
        assert isinstance(
            maybe_model, modeling_glm.GlmForCausalLM
        ), f"Expected a GlmForCausalLM model. Got {type(maybe_model)}."
        maybe_model.forward = MethodType(cce_forward, maybe_model)
        return maybe_model
    modeling_glm.GlmForCausalLM.forward = cce_forward
    return None
 def patch_glm4(
    maybe_model: TransformersModelT | str | transformers.PretrainedConfig,
    patch_options: PatchOptions,
 ) -> TransformersModelT | None:
    # Set the _PATCH_OPTS in the llama patch file
    import cut_cross_entropy.transformers.llama as llama_patch
    llama_patch._PATCH_OPTS = patch_options  # pylint: disable=protected-access
    from cut_cross_entropy.transformers.llama import cce_forward
    from transformers.models.glm4 import modeling_glm4
    if isinstance(maybe_model, transformers.PreTrainedModel):
        assert isinstance(
            maybe_model, modeling_glm4.Glm4ForCausalLM
        ), f"Expected a Glm4ForCausalLM model. Got {type(maybe_model)}."
        maybe_model.forward = MethodType(cce_forward, maybe_model)
        return maybe_model
    modeling_glm4.Glm4ForCausalLM.forward = cce_forward
    return None
--- a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/patch.py
+++ b/src/axolotl/integrations/cut_cross_entropy/monkeypatch/patch.py
@@ -20,6 +20,10 @@ from axolotl.integrations.cut_cross_entropy.monkeypatch.gemma3 import (
    patch_gemma3,
    patch_gemma3_text,
 )
 from axolotl.integrations.cut_cross_entropy.monkeypatch.glm4 import (
    patch_glm,
    patch_glm4,
 )
 from axolotl.integrations.cut_cross_entropy.monkeypatch.llama4 import (
    patch_llama4,
    patch_llama4_text,
@@ -45,6 +49,8 @@ CUT_CROSS_ENTROPY_MODEL_MAPPING = {
    "qwen2": patch_qwen2,
    "cohere": patch_cohere,
    "cohere2": patch_cohere2,
    "glm": patch_glm,
    "glm4": patch_glm4,
 }
--- a/src/axolotl/monkeypatch/multipack.py
+++ b/src/axolotl/monkeypatch/multipack.py
@@ -31,6 +31,8 @@ SUPPORTED_MULTIPACK_MODEL_TYPES = [
    "starcoder2",
    "deepseek_v2",
    "deepseek_v3",
    "glm",
    "glm4",
 ]