From ede973b76cf92684582013b29cf03e9e769cd188 Mon Sep 17 00:00:00 2001 From: Dan Saunders Date: Mon, 28 Jul 2025 01:47:40 +0000 Subject: [PATCH] nits --- examples/llama-3/fft-8b-liger-fsdp.yaml | 1 - examples/llama-3/lora-1b-kernels.yml | 1 - src/axolotl/kernels/lora.py | 4 ++-- src/axolotl/monkeypatch/accelerate/fsdp2.py | 2 +- 4 files changed, 3 insertions(+), 5 deletions(-) diff --git a/examples/llama-3/fft-8b-liger-fsdp.yaml b/examples/llama-3/fft-8b-liger-fsdp.yaml index b3d990a8b..6a65a0ddd 100644 --- a/examples/llama-3/fft-8b-liger-fsdp.yaml +++ b/examples/llama-3/fft-8b-liger-fsdp.yaml @@ -9,7 +9,6 @@ liger_rms_norm: true liger_glu_activation: true liger_fused_linear_cross_entropy: true - chat_template: llama3 datasets: - path: mlabonne/FineTome-100k diff --git a/examples/llama-3/lora-1b-kernels.yml b/examples/llama-3/lora-1b-kernels.yml index 71e569ae0..20af97bf0 100644 --- a/examples/llama-3/lora-1b-kernels.yml +++ b/examples/llama-3/lora-1b-kernels.yml @@ -15,7 +15,6 @@ lora_model_dir: sequence_len: 2048 sample_packing: true - lora_r: 16 lora_alpha: 32 # Currently, we don't support dropout with our custom Triton kernels diff --git a/src/axolotl/kernels/lora.py b/src/axolotl/kernels/lora.py index 63c9e57bd..c77948155 100644 --- a/src/axolotl/kernels/lora.py +++ b/src/axolotl/kernels/lora.py @@ -102,8 +102,8 @@ def matmul_lora( del W if A is not None: - A, B = A.t(), B.t() - out += (X @ A.to(dtype)) @ (s * B.to(dtype)) + A, B = A.t().to(dtype), B.t().to(dtype) + out += (X @ A) @ (s * B) return out.view(batch, seq_len, -1) if reshape else out diff --git a/src/axolotl/monkeypatch/accelerate/fsdp2.py b/src/axolotl/monkeypatch/accelerate/fsdp2.py index 803659232..b1bb4bc21 100644 --- a/src/axolotl/monkeypatch/accelerate/fsdp2.py +++ b/src/axolotl/monkeypatch/accelerate/fsdp2.py @@ -221,7 +221,7 @@ def fsdp2_prepare_model(accelerator, model: torch.nn.Module) -> torch.nn.Module: transformer_auto_wrap_policy, ) - # We need the `auto_wrap_policy` original type to create a custom poilicy function for sharding + # We need the `auto_wrap_policy` original type to create a custom policy function for sharding # This is because `fully_shard` doesn't support old auto wrap policies, rather we have to imitate the behaviour if fsdp2_plugin.auto_wrap_policy is transformer_auto_wrap_policy: pass # auto_wrap_policy_type = "transformer"