use narrow as a view on the student logits instead of slicing

2025-02-04 09:34:26 -05:00
35 changed files with 113 additions and 11289 deletions
--- a/.github/workflows/base.yml
+++ b/.github/workflows/base.yml
@@ -22,6 +22,12 @@ jobs:
      fail-fast: false
      matrix:
        include:
+          - cuda: "124"
+            cuda_version: 12.4.1
+            cudnn_version: ""
+            python_version: "3.10"
+            pytorch: 2.4.1
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
          - cuda: "124"
            cuda_version: 12.4.1
            cudnn_version: ""
@@ -34,12 +40,6 @@ jobs:
            python_version: "3.11"
            pytorch: 2.5.1
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-          - cuda: "124"
-            cuda_version: 12.4.1
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: 2.6.0
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
    steps:
      - name: Checkout
        uses: actions/checkout@v4
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -19,7 +19,7 @@ jobs:
        - name: Setup Python
          uses: actions/setup-python@v5
          with:
-            python-version: '3.11'
+            python-version: '3.10'
        - name: install dependencies
          run: |
            python3 -m pip install jupyter
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -19,6 +19,6 @@ jobs:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
-          python-version: "3.11"
+          python-version: "3.10"
          cache: 'pip' # caching pip dependencies
      - uses: pre-commit/action@v3.0.1
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -26,11 +26,6 @@ jobs:
            pytorch: 2.5.1
            axolotl_extras:
            is_latest: true
-          - cuda: 124
-            cuda_version: 12.4.1
-            python_version: "3.11"
-            pytorch: 2.6.0
-            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
      - name: Checkout
--- a/.github/workflows/multi-gpu-e2e.yml
+++ b/.github/workflows/multi-gpu-e2e.yml
@@ -34,13 +34,6 @@ jobs:
            axolotl_extras:
            num_gpus: 2
            nightly_build: "true"
-          - cuda: 124
-            cuda_version: 12.4.1
-            python_version: "3.11"
-            pytorch: 2.6.0
-            axolotl_extras:
-            num_gpus: 2
-            nightly_build: "true"
    runs-on: [self-hosted, modal]
    timeout-minutes: 120
    steps:
@@ -49,7 +42,7 @@ jobs:
      - name: Install Python
        uses: actions/setup-python@v5
        with:
-          python-version: "3.11"
+          python-version: "3.10"
      - name: Install Modal
        run: |
          python -m pip install --upgrade pip
--- a/.github/workflows/nightlies.yml
+++ b/.github/workflows/nightlies.yml
@@ -22,11 +22,6 @@ jobs:
            python_version: "3.11"
            pytorch: 2.5.1
            axolotl_extras:
-          - cuda: 124
-            cuda_version: 12.4.1
-            python_version: "3.11"
-            pytorch: 2.6.0
-            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
      - name: Checkout
--- a/.github/workflows/pypi.yml
+++ b/.github/workflows/pypi.yml
@@ -36,7 +36,7 @@ jobs:
      - name: Setup Python
        uses: actions/setup-python@v5
        with:
-          python-version: "3.11"
+          python-version: "3.10"

      - name: Install dependencies
        run: |
--- a/.github/workflows/tests-nightly.yml
+++ b/.github/workflows/tests-nightly.yml
@@ -12,7 +12,7 @@ jobs:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
-          python-version: "3.11"
+          python-version: "3.10"
          cache: 'pip' # caching pip dependencies
      - uses: pre-commit/action@v3.0.1
        env:
@@ -25,8 +25,13 @@ jobs:
      fail-fast: false
      max-parallel: 2
      matrix:
-        python_version: ["3.11"]
-        pytorch_version: ["2.4.1", "2.5.1", "2.6.0"]
+        python_version: ["3.10", "3.11"]
+        pytorch_version: ["2.4.1", "2.5.1"]
+        exclude:
+          - python_version: "3.10"
+            pytorch_version: "2.4.1"
+          - python_version: "3.10"
+            pytorch_version: "2.5.1"
    timeout-minutes: 20

    steps:
@@ -107,20 +112,13 @@ jobs:
            num_gpus: 1
            axolotl_extras:
            nightly_build: "true"
-          - cuda: 124
-            cuda_version: 12.4.1
-            python_version: "3.11"
-            pytorch: 2.6.0
-            num_gpus: 1
-            axolotl_extras:
-            nightly_build: "true"
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Install Python
        uses: actions/setup-python@v5
        with:
-          python-version: "3.11"
+          python-version: "3.10"
      - name: Install Modal
        run: |
          python -m pip install --upgrade pip
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -35,7 +35,7 @@ jobs:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
-          python-version: "3.11"
+          python-version: "3.10"
          cache: 'pip' # caching pip dependencies
      - uses: pre-commit/action@v3.0.1
        env:
@@ -48,8 +48,13 @@ jobs:
      fail-fast: false
      max-parallel: 2
      matrix:
-        python_version: ["3.11"]
-        pytorch_version: ["2.4.1", "2.5.1", "2.6.0"]
+        python_version: ["3.10", "3.11"]
+        pytorch_version: ["2.4.1", "2.5.1"]
+        exclude:
+          - python_version: "3.10"
+            pytorch_version: "2.4.1"
+          - python_version: "3.10"
+            pytorch_version: "2.5.1"
    timeout-minutes: 20

    steps:
@@ -122,7 +127,7 @@ jobs:
      max-parallel: 1
      matrix:
        python_version: ["3.11"]
-        pytorch_version: ["2.4.1", "2.5.1", "2.6.0"]
+        pytorch_version: ["2.4.1", "2.5.1"]
    timeout-minutes: 20

    steps:
@@ -211,7 +216,7 @@ jobs:
      - name: Install Python
        uses: actions/setup-python@v5
        with:
-          python-version: "3.11"
+          python-version: "3.10"
      - name: Install Modal
        run: |
          python -m pip install --upgrade pip
@@ -246,19 +251,13 @@ jobs:
            pytorch: 2.4.1
            num_gpus: 1
            axolotl_extras:
-          - cuda: 124
-            cuda_version: 12.4.1
-            python_version: "3.11"
-            pytorch: 2.6.0
-            num_gpus: 1
-            axolotl_extras:
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Install Python
        uses: actions/setup-python@v5
        with:
-          python-version: "3.11"
+          python-version: "3.10"
      - name: Install Modal
        run: |
          python -m pip install --upgrade pip
--- a/README.md
+++ b/README.md
@@ -51,7 +51,7 @@ Features:

 **Requirements**:
 - NVIDIA GPU (Ampere or newer for `bf16` and Flash Attention) or AMD GPU
- Python 3.11
+- Python ≥3.10
 - PyTorch ≥2.4.1

 ### Installation
--- a/docs/config.qmd
+++ b/docs/config.qmd
@@ -46,10 +46,6 @@ overrides_of_model_config:
    type: # linear | dynamic
    factor: # float

-# optional overrides the base model loading from_pretrained
-overrides_of_model_kwargs:
-  # use_cache: False
-
 # optional overrides to the bnb 4bit quantization configuration
 # https://huggingface.co/docs/transformers/main/main_classes/quantization#transformers.BitsAndBytesConfig
 bnb_config_kwargs:
--- a/docs/faq.qmd
+++ b/docs/faq.qmd
@@ -19,7 +19,3 @@ description: Frequently asked questions
 **Q: AttributeError: 'DummyOptim' object has no attribute 'step'**

 > A: You may be using deepspeed with single gpu. Please don't set `deepspeed:` in yaml or cli.
-
-**Q: The codes is stuck on saving preprocessed datasets.**
-
-> A: This is usually an issue with the GPU. This can be resolved through setting the os environment variable `CUDA_VISIBLE_DEVICES=0`. If you are on runpod, this is usually a pod issue. Starting a new pod should take care of it. 
--- a/docs/multi-node.qmd
+++ b/docs/multi-node.qmd
@@ -3,18 +3,6 @@ title: Multi Node
 description: How to use Axolotl on multiple machines
 ---

-The below are three ways to train multi-node in Axolotl.
-
-::: {.callout-important}
-Each machine needs a copy of Axolotl, we suggest using the same commit to ensure compatibility.
-
-You will also need to have the same configuration file for your model on each machine.
-
-Make sure the main machine is reachable by other machines.
-:::
-
-# Accelerate
-
 You will need to create a configuration for accelerate, either by using `accelerate config` and follow the instructions or you can use one of the preset below:

 ~/.cache/huggingface/accelerate/default_config.yaml
@@ -38,7 +26,7 @@ tpu_use_sudo: false
 use_cpu: false
 ```

-Configure your model to use FSDP in the Axolotl yaml. For example:
+Configure your model to use FSDP, for example:
 ```yaml
 fsdp:
  - full_shard
@@ -49,40 +37,12 @@ fsdp_config:
  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
 ```

+## Machine configuration
+
+Each machine needs its own copy of Axolotl; we suggest using the same commit on all of them to ensure compatibility.
+
+You will also need to have the same configuration file for your model on each machine.
+
+On the main machine only, make sure the port you set as `main_process_port` is open for TCP connections and reachable by the other machines.
+
 All you have to do now is launch using accelerate as you would usually do on each machine and voila, the processes will start once you have launched accelerate on every machine.
-
-# Raytrain
-
-Please see ray train doc [here](ray-integration.qmd).
-
-# Torchrun
-
-If you are using Infiniband, we recommend torchrun to utilize the full bandwidth.
-
-Set the following env (change buffersize/socketname depending on your system):
-
-```yaml
-export NCCL_IB_DISABLE=0
-export NCCL_SOCKET_IFNAME="eth0,en,eth,em,bond"
-export NCCL_BUFFSIZE=2097152
-```
-
-Run the following on each node:
-
-```bash
-torchrun --nnodes $num_nodes --nproc_per_node $gpu_per_node --rdzv_id $rdzv_id --rdzv_backend c10d --rdzv_endpoint "$head_node_ip:$head_node_port" -m axolotl.cli.train config.yaml
-```
-
-Please make sure to substitute the placeholder variables.
-
- `num_nodes`: Number of nodes (containing GPUs)
- `gpu_per_node`: Number of gpus per node
- `head_node_ip`: IP of the head node (make sure other machines can connect to this)
- `head_node_port`: Port of the head node (make sure other machines can connect to this. Default 29400)
- `rdzv_id`: A unique job ID that is used by the job across nodes.
-
-::: {.callout-note}
-You need to call `axolotl.cli.train` instead of `axolotl train` as the latter calls accelerate under the hood
-:::
-
-More info on the available configs can be found on the Pytorch docs [here](https://pytorch.org/docs/stable/elastic/run.html)
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,10 +1,10 @@
 --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/

 # START section of dependencies that don't install on Darwin/MacOS
-bitsandbytes==0.45.2
+bitsandbytes==0.45.1
 triton>=3.0.0
 mamba-ssm==1.2.0.post1
-flash-attn==2.7.4.post1
+flash-attn==2.7.0.post2
 xformers>=0.0.23.post1
 autoawq==0.2.7.post3
 liger-kernel==0.5.2
@@ -13,7 +13,7 @@ liger-kernel==0.5.2
 packaging==23.2

 peft==0.14.0
-transformers==4.48.3
+transformers==4.48.1
 tokenizers>=0.21.0
 accelerate==1.3.0
 datasets==3.2.0
--- a/setup.py
+++ b/setup.py
@@ -71,15 +71,12 @@ def parse_requirements():
            else:
                raise ValueError("Invalid version format")

-            if (major, minor) >= (2, 6):
-                _install_requires.pop(_install_requires.index(xformers_version))
-                _install_requires.append("xformers==0.0.29.post2")
-            elif (major, minor) >= (2, 5):
+            if (major, minor) >= (2, 5):
                _install_requires.pop(_install_requires.index(xformers_version))
                if patch == 0:
                    _install_requires.append("xformers==0.0.28.post2")
                else:
-                    _install_requires.append("xformers==0.0.29")
+                    _install_requires.append("xformers==0.0.28.post3")
                _install_requires.pop(_install_requires.index(autoawq_version))
            elif (major, minor) >= (2, 4):
                if patch == 0:
--- a/src/axolotl/integrations/kd/trainer.py
+++ b/src/axolotl/integrations/kd/trainer.py
@@ -67,9 +67,8 @@ class AxolotlKDTrainer(AxolotlTrainer):
        outputs = model(**inputs)

        # FIXME: account for tokenizer.padding_side
-        student_logits = outputs["logits"][:, : seq_len - 1, :].contiguous()
+        shift_logits = outputs["logits"].narrow(1, 0, seq_len - 1).contiguous()

-        shift_logits = student_logits.contiguous()
        target_logprobs_for_loss = target_logprobs[..., 1:, :].contiguous()
        target_token_ids_for_loss = target_token_ids[..., 1:, :].contiguous()
        target_mask_for_loss = target_mask[..., 1:, :].contiguous()
--- a/src/axolotl/integrations/spectrum/model_snr_results/snr_results_Qwen-Qwen2.5-1.5B-Instruct.json
+++ b/src/axolotl/integrations/spectrum/model_snr_results/snr_results_Qwen-Qwen2.5-1.5B-Instruct.json
--- a/src/axolotl/integrations/spectrum/model_snr_results/snr_results_Qwen-Qwen2.5-1.5B.json
+++ b/src/axolotl/integrations/spectrum/model_snr_results/snr_results_Qwen-Qwen2.5-1.5B.json
--- a/src/axolotl/integrations/spectrum/model_snr_results/snr_results_Qwen-Qwen2.5-3B-Instruct.json
+++ b/src/axolotl/integrations/spectrum/model_snr_results/snr_results_Qwen-Qwen2.5-3B-Instruct.json
--- a/src/axolotl/integrations/spectrum/model_snr_results/snr_results_Qwen-Qwen2.5-3B.json
+++ b/src/axolotl/integrations/spectrum/model_snr_results/snr_results_Qwen-Qwen2.5-3B.json
--- a/src/axolotl/integrations/spectrum/model_snr_results/snr_results_Qwen-Qwen2.5-7B-Instruct.json
+++ b/src/axolotl/integrations/spectrum/model_snr_results/snr_results_Qwen-Qwen2.5-7B-Instruct.json
--- a/src/axolotl/integrations/spectrum/model_snr_results/snr_results_Qwen-Qwen2.5-7B.json
+++ b/src/axolotl/integrations/spectrum/model_snr_results/snr_results_Qwen-Qwen2.5-7B.json
--- a/src/axolotl/integrations/spectrum/model_snr_results/snr_results_google-gemma-2-2b.json
+++ b/src/axolotl/integrations/spectrum/model_snr_results/snr_results_google-gemma-2-2b.json
--- a/src/axolotl/integrations/spectrum/model_snr_results/snr_results_meta-llama-Llama-3.2-1B-Instruct.json
+++ b/src/axolotl/integrations/spectrum/model_snr_results/snr_results_meta-llama-Llama-3.2-1B-Instruct.json
@@ -1,590 +0,0 @@
-{
-    "model.layers.0.input_layernorm": {
-        "snr": Infinity,
-        "type": "input_layernorm"
-    },
-    "model.layers.1.input_layernorm": {
-        "snr": Infinity,
-        "type": "input_layernorm"
-    },
-    "model.layers.2.input_layernorm": {
-        "snr": Infinity,
-        "type": "input_layernorm"
-    },
-    "model.layers.3.input_layernorm": {
-        "snr": Infinity,
-        "type": "input_layernorm"
-    },
-    "model.layers.4.input_layernorm": {
-        "snr": Infinity,
-        "type": "input_layernorm"
-    },
-    "model.layers.5.input_layernorm": {
-        "snr": Infinity,
-        "type": "input_layernorm"
-    },
-    "model.layers.6.input_layernorm": {
-        "snr": Infinity,
-        "type": "input_layernorm"
-    },
-    "model.layers.7.input_layernorm": {
-        "snr": Infinity,
-        "type": "input_layernorm"
-    },
-    "model.layers.8.input_layernorm": {
-        "snr": Infinity,
-        "type": "input_layernorm"
-    },
-    "model.layers.9.input_layernorm": {
-        "snr": Infinity,
-        "type": "input_layernorm"
-    },
-    "model.layers.10.input_layernorm": {
-        "snr": Infinity,
-        "type": "input_layernorm"
-    },
-    "model.layers.11.input_layernorm": {
-        "snr": Infinity,
-        "type": "input_layernorm"
-    },
-    "model.layers.12.input_layernorm": {
-        "snr": Infinity,
-        "type": "input_layernorm"
-    },
-    "model.layers.13.input_layernorm": {
-        "snr": Infinity,
-        "type": "input_layernorm"
-    },
-    "model.layers.14.input_layernorm": {
-        "snr": Infinity,
-        "type": "input_layernorm"
-    },
-    "model.layers.15.input_layernorm": {
-        "snr": Infinity,
-        "type": "input_layernorm"
-    },
-    "lm_head": {
-        "snr": Infinity,
-        "type": "lm_head"
-    },
-    "model.layers.0.mlp.down_proj": {
-        "snr": 70.0594253540039,
-        "type": "mlp.down_proj"
-    },
-    "model.layers.1.mlp.down_proj": {
-        "snr": 11.135851860046387,
-        "type": "mlp.down_proj"
-    },
-    "model.layers.2.mlp.down_proj": {
-        "snr": 7.035482883453369,
-        "type": "mlp.down_proj"
-    },
-    "model.layers.3.mlp.down_proj": {
-        "snr": 6.422532081604004,
-        "type": "mlp.down_proj"
-    },
-    "model.layers.4.mlp.down_proj": {
-        "snr": 5.748020172119141,
-        "type": "mlp.down_proj"
-    },
-    "model.layers.5.mlp.down_proj": {
-        "snr": 3.885556697845459,
-        "type": "mlp.down_proj"
-    },
-    "model.layers.6.mlp.down_proj": {
-        "snr": 3.4336745738983154,
-        "type": "mlp.down_proj"
-    },
-    "model.layers.7.mlp.down_proj": {
-        "snr": 2.791595935821533,
-        "type": "mlp.down_proj"
-    },
-    "model.layers.8.mlp.down_proj": {
-        "snr": 5.36277961730957,
-        "type": "mlp.down_proj"
-    },
-    "model.layers.9.mlp.down_proj": {
-        "snr": 4.459208011627197,
-        "type": "mlp.down_proj"
-    },
-    "model.layers.10.mlp.down_proj": {
-        "snr": 6.272170066833496,
-        "type": "mlp.down_proj"
-    },
-    "model.layers.11.mlp.down_proj": {
-        "snr": 5.264761447906494,
-        "type": "mlp.down_proj"
-    },
-    "model.layers.12.mlp.down_proj": {
-        "snr": 4.324735641479492,
-        "type": "mlp.down_proj"
-    },
-    "model.layers.13.mlp.down_proj": {
-        "snr": 3.878648042678833,
-        "type": "mlp.down_proj"
-    },
-    "model.layers.14.mlp.down_proj": {
-        "snr": 2.9773054122924805,
-        "type": "mlp.down_proj"
-    },
-    "model.layers.15.mlp.down_proj": {
-        "snr": 4.471445560455322,
-        "type": "mlp.down_proj"
-    },
-    "model.layers.0.mlp.gate_proj": {
-        "snr": 25.227100372314453,
-        "type": "mlp.gate_proj"
-    },
-    "model.layers.1.mlp.gate_proj": {
-        "snr": 6.58299446105957,
-        "type": "mlp.gate_proj"
-    },
-    "model.layers.2.mlp.gate_proj": {
-        "snr": 3.4688243865966797,
-        "type": "mlp.gate_proj"
-    },
-    "model.layers.3.mlp.gate_proj": {
-        "snr": 1.555246114730835,
-        "type": "mlp.gate_proj"
-    },
-    "model.layers.4.mlp.gate_proj": {
-        "snr": 0.7770601511001587,
-        "type": "mlp.gate_proj"
-    },
-    "model.layers.5.mlp.gate_proj": {
-        "snr": 0.6239906549453735,
-        "type": "mlp.gate_proj"
-    },
-    "model.layers.6.mlp.gate_proj": {
-        "snr": 0.6440379023551941,
-        "type": "mlp.gate_proj"
-    },
-    "model.layers.7.mlp.gate_proj": {
-        "snr": 0.5120116472244263,
-        "type": "mlp.gate_proj"
-    },
-    "model.layers.8.mlp.gate_proj": {
-        "snr": 0.6544050574302673,
-        "type": "mlp.gate_proj"
-    },
-    "model.layers.9.mlp.gate_proj": {
-        "snr": 0.5381016731262207,
-        "type": "mlp.gate_proj"
-    },
-    "model.layers.10.mlp.gate_proj": {
-        "snr": 0.622873842716217,
-        "type": "mlp.gate_proj"
-    },
-    "model.layers.11.mlp.gate_proj": {
-        "snr": 0.9361700415611267,
-        "type": "mlp.gate_proj"
-    },
-    "model.layers.12.mlp.gate_proj": {
-        "snr": 1.475605845451355,
-        "type": "mlp.gate_proj"
-    },
-    "model.layers.13.mlp.gate_proj": {
-        "snr": 1.608325719833374,
-        "type": "mlp.gate_proj"
-    },
-    "model.layers.14.mlp.gate_proj": {
-        "snr": 1.0720024108886719,
-        "type": "mlp.gate_proj"
-    },
-    "model.layers.15.mlp.gate_proj": {
-        "snr": 0.7111338973045349,
-        "type": "mlp.gate_proj"
-    },
-    "model.layers.0.mlp.up_proj": {
-        "snr": 28.431896209716797,
-        "type": "mlp.up_proj"
-    },
-    "model.layers.1.mlp.up_proj": {
-        "snr": 15.546019554138184,
-        "type": "mlp.up_proj"
-    },
-    "model.layers.2.mlp.up_proj": {
-        "snr": 23.048023223876953,
-        "type": "mlp.up_proj"
-    },
-    "model.layers.3.mlp.up_proj": {
-        "snr": 25.790977478027344,
-        "type": "mlp.up_proj"
-    },
-    "model.layers.4.mlp.up_proj": {
-        "snr": 18.552549362182617,
-        "type": "mlp.up_proj"
-    },
-    "model.layers.5.mlp.up_proj": {
-        "snr": 8.85106372833252,
-        "type": "mlp.up_proj"
-    },
-    "model.layers.6.mlp.up_proj": {
-        "snr": 10.653799057006836,
-        "type": "mlp.up_proj"
-    },
-    "model.layers.7.mlp.up_proj": {
-        "snr": 7.365357875823975,
-        "type": "mlp.up_proj"
-    },
-    "model.layers.8.mlp.up_proj": {
-        "snr": 11.98373794555664,
-        "type": "mlp.up_proj"
-    },
-    "model.layers.9.mlp.up_proj": {
-        "snr": 8.04493236541748,
-        "type": "mlp.up_proj"
-    },
-    "model.layers.10.mlp.up_proj": {
-        "snr": 8.523039817810059,
-        "type": "mlp.up_proj"
-    },
-    "model.layers.11.mlp.up_proj": {
-        "snr": 5.381742477416992,
-        "type": "mlp.up_proj"
-    },
-    "model.layers.12.mlp.up_proj": {
-        "snr": 3.9845118522644043,
-        "type": "mlp.up_proj"
-    },
-    "model.layers.13.mlp.up_proj": {
-        "snr": 3.4893221855163574,
-        "type": "mlp.up_proj"
-    },
-    "model.layers.14.mlp.up_proj": {
-        "snr": 1.764201045036316,
-        "type": "mlp.up_proj"
-    },
-    "model.layers.15.mlp.up_proj": {
-        "snr": 0.9730708599090576,
-        "type": "mlp.up_proj"
-    },
-    "model.embed_tokens": {
-        "snr": Infinity,
-        "type": "model.embed_tokens"
-    },
-    "model.norm": {
-        "snr": Infinity,
-        "type": "model.norm"
-    },
-    "model.layers.0.post_attention_layernorm": {
-        "snr": Infinity,
-        "type": "post_attention_layernorm"
-    },
-    "model.layers.1.post_attention_layernorm": {
-        "snr": Infinity,
-        "type": "post_attention_layernorm"
-    },
-    "model.layers.2.post_attention_layernorm": {
-        "snr": Infinity,
-        "type": "post_attention_layernorm"
-    },
-    "model.layers.3.post_attention_layernorm": {
-        "snr": Infinity,
-        "type": "post_attention_layernorm"
-    },
-    "model.layers.4.post_attention_layernorm": {
-        "snr": Infinity,
-        "type": "post_attention_layernorm"
-    },
-    "model.layers.5.post_attention_layernorm": {
-        "snr": Infinity,
-        "type": "post_attention_layernorm"
-    },
-    "model.layers.6.post_attention_layernorm": {
-        "snr": Infinity,
-        "type": "post_attention_layernorm"
-    },
-    "model.layers.7.post_attention_layernorm": {
-        "snr": Infinity,
-        "type": "post_attention_layernorm"
-    },
-    "model.layers.8.post_attention_layernorm": {
-        "snr": Infinity,
-        "type": "post_attention_layernorm"
-    },
-    "model.layers.9.post_attention_layernorm": {
-        "snr": Infinity,
-        "type": "post_attention_layernorm"
-    },
-    "model.layers.10.post_attention_layernorm": {
-        "snr": Infinity,
-        "type": "post_attention_layernorm"
-    },
-    "model.layers.11.post_attention_layernorm": {
-        "snr": Infinity,
-        "type": "post_attention_layernorm"
-    },
-    "model.layers.12.post_attention_layernorm": {
-        "snr": Infinity,
-        "type": "post_attention_layernorm"
-    },
-    "model.layers.13.post_attention_layernorm": {
-        "snr": Infinity,
-        "type": "post_attention_layernorm"
-    },
-    "model.layers.14.post_attention_layernorm": {
-        "snr": Infinity,
-        "type": "post_attention_layernorm"
-    },
-    "model.layers.15.post_attention_layernorm": {
-        "snr": Infinity,
-        "type": "post_attention_layernorm"
-    },
-    "model.layers.0.self_attn.k_proj": {
-        "snr": 0.11727584153413773,
-        "type": "self_attn.k_proj"
-    },
-    "model.layers.1.self_attn.k_proj": {
-        "snr": 0.24786807596683502,
-        "type": "self_attn.k_proj"
-    },
-    "model.layers.2.self_attn.k_proj": {
-        "snr": 0.36378130316734314,
-        "type": "self_attn.k_proj"
-    },
-    "model.layers.3.self_attn.k_proj": {
-        "snr": 0.2983120381832123,
-        "type": "self_attn.k_proj"
-    },
-    "model.layers.4.self_attn.k_proj": {
-        "snr": 0.33789733052253723,
-        "type": "self_attn.k_proj"
-    },
-    "model.layers.5.self_attn.k_proj": {
-        "snr": 0.29155924916267395,
-        "type": "self_attn.k_proj"
-    },
-    "model.layers.6.self_attn.k_proj": {
-        "snr": 0.2537297010421753,
-        "type": "self_attn.k_proj"
-    },
-    "model.layers.7.self_attn.k_proj": {
-        "snr": 0.28204113245010376,
-        "type": "self_attn.k_proj"
-    },
-    "model.layers.8.self_attn.k_proj": {
-        "snr": 0.2776711583137512,
-        "type": "self_attn.k_proj"
-    },
-    "model.layers.9.self_attn.k_proj": {
-        "snr": 0.2927376627922058,
-        "type": "self_attn.k_proj"
-    },
-    "model.layers.10.self_attn.k_proj": {
-        "snr": 0.31486213207244873,
-        "type": "self_attn.k_proj"
-    },
-    "model.layers.11.self_attn.k_proj": {
-        "snr": 0.32363659143447876,
-        "type": "self_attn.k_proj"
-    },
-    "model.layers.12.self_attn.k_proj": {
-        "snr": 0.31382912397384644,
-        "type": "self_attn.k_proj"
-    },
-    "model.layers.13.self_attn.k_proj": {
-        "snr": 0.4635234773159027,
-        "type": "self_attn.k_proj"
-    },
-    "model.layers.14.self_attn.k_proj": {
-        "snr": 0.25379249453544617,
-        "type": "self_attn.k_proj"
-    },
-    "model.layers.15.self_attn.k_proj": {
-        "snr": 0.2628238797187805,
-        "type": "self_attn.k_proj"
-    },
-    "model.layers.0.self_attn.o_proj": {
-        "snr": 0.27602291107177734,
-        "type": "self_attn.o_proj"
-    },
-    "model.layers.1.self_attn.o_proj": {
-        "snr": 0.2149604707956314,
-        "type": "self_attn.o_proj"
-    },
-    "model.layers.2.self_attn.o_proj": {
-        "snr": 0.2540294826030731,
-        "type": "self_attn.o_proj"
-    },
-    "model.layers.3.self_attn.o_proj": {
-        "snr": 0.27978822588920593,
-        "type": "self_attn.o_proj"
-    },
-    "model.layers.4.self_attn.o_proj": {
-        "snr": 0.3121289908885956,
-        "type": "self_attn.o_proj"
-    },
-    "model.layers.5.self_attn.o_proj": {
-        "snr": 0.35037684440612793,
-        "type": "self_attn.o_proj"
-    },
-    "model.layers.6.self_attn.o_proj": {
-        "snr": 0.366205096244812,
-        "type": "self_attn.o_proj"
-    },
-    "model.layers.7.self_attn.o_proj": {
-        "snr": 0.3692712187767029,
-        "type": "self_attn.o_proj"
-    },
-    "model.layers.8.self_attn.o_proj": {
-        "snr": 0.3301038146018982,
-        "type": "self_attn.o_proj"
-    },
-    "model.layers.9.self_attn.o_proj": {
-        "snr": 0.3003396987915039,
-        "type": "self_attn.o_proj"
-    },
-    "model.layers.10.self_attn.o_proj": {
-        "snr": 0.30804169178009033,
-        "type": "self_attn.o_proj"
-    },
-    "model.layers.11.self_attn.o_proj": {
-        "snr": 0.28501132130622864,
-        "type": "self_attn.o_proj"
-    },
-    "model.layers.12.self_attn.o_proj": {
-        "snr": 0.2171541005373001,
-        "type": "self_attn.o_proj"
-    },
-    "model.layers.13.self_attn.o_proj": {
-        "snr": 0.19183959066867828,
-        "type": "self_attn.o_proj"
-    },
-    "model.layers.14.self_attn.o_proj": {
-        "snr": 0.19215913116931915,
-        "type": "self_attn.o_proj"
-    },
-    "model.layers.15.self_attn.o_proj": {
-        "snr": 0.25486502051353455,
-        "type": "self_attn.o_proj"
-    },
-    "model.layers.0.self_attn.q_proj": {
-        "snr": 0.03850084915757179,
-        "type": "self_attn.q_proj"
-    },
-    "model.layers.1.self_attn.q_proj": {
-        "snr": 0.0713055431842804,
-        "type": "self_attn.q_proj"
-    },
-    "model.layers.2.self_attn.q_proj": {
-        "snr": 0.07948919385671616,
-        "type": "self_attn.q_proj"
-    },
-    "model.layers.3.self_attn.q_proj": {
-        "snr": 0.08047746121883392,
-        "type": "self_attn.q_proj"
-    },
-    "model.layers.4.self_attn.q_proj": {
-        "snr": 0.0852593332529068,
-        "type": "self_attn.q_proj"
-    },
-    "model.layers.5.self_attn.q_proj": {
-        "snr": 0.09794823825359344,
-        "type": "self_attn.q_proj"
-    },
-    "model.layers.6.self_attn.q_proj": {
-        "snr": 0.09627152234315872,
-        "type": "self_attn.q_proj"
-    },
-    "model.layers.7.self_attn.q_proj": {
-        "snr": 0.11065381020307541,
-        "type": "self_attn.q_proj"
-    },
-    "model.layers.8.self_attn.q_proj": {
-        "snr": 0.12031875550746918,
-        "type": "self_attn.q_proj"
-    },
-    "model.layers.9.self_attn.q_proj": {
-        "snr": 0.09804573655128479,
-        "type": "self_attn.q_proj"
-    },
-    "model.layers.10.self_attn.q_proj": {
-        "snr": 0.10897502303123474,
-        "type": "self_attn.q_proj"
-    },
-    "model.layers.11.self_attn.q_proj": {
-        "snr": 0.09267337620258331,
-        "type": "self_attn.q_proj"
-    },
-    "model.layers.12.self_attn.q_proj": {
-        "snr": 0.08803492039442062,
-        "type": "self_attn.q_proj"
-    },
-    "model.layers.13.self_attn.q_proj": {
-        "snr": 0.0902542844414711,
-        "type": "self_attn.q_proj"
-    },
-    "model.layers.14.self_attn.q_proj": {
-        "snr": 0.10154066979885101,
-        "type": "self_attn.q_proj"
-    },
-    "model.layers.15.self_attn.q_proj": {
-        "snr": 0.09083802253007889,
-        "type": "self_attn.q_proj"
-    },
-    "model.layers.0.self_attn.v_proj": {
-        "snr": 2.842210054397583,
-        "type": "self_attn.v_proj"
-    },
-    "model.layers.1.self_attn.v_proj": {
-        "snr": 10.59461498260498,
-        "type": "self_attn.v_proj"
-    },
-    "model.layers.2.self_attn.v_proj": {
-        "snr": 8.993025779724121,
-        "type": "self_attn.v_proj"
-    },
-    "model.layers.3.self_attn.v_proj": {
-        "snr": 62.567787170410156,
-        "type": "self_attn.v_proj"
-    },
-    "model.layers.4.self_attn.v_proj": {
-        "snr": 23.80082893371582,
-        "type": "self_attn.v_proj"
-    },
-    "model.layers.5.self_attn.v_proj": {
-        "snr": 7.957369804382324,
-        "type": "self_attn.v_proj"
-    },
-    "model.layers.6.self_attn.v_proj": {
-        "snr": 12.01815414428711,
-        "type": "self_attn.v_proj"
-    },
-    "model.layers.7.self_attn.v_proj": {
-        "snr": 5.095500469207764,
-        "type": "self_attn.v_proj"
-    },
-    "model.layers.8.self_attn.v_proj": {
-        "snr": 11.719332695007324,
-        "type": "self_attn.v_proj"
-    },
-    "model.layers.9.self_attn.v_proj": {
-        "snr": 555.0869750976562,
-        "type": "self_attn.v_proj"
-    },
-    "model.layers.10.self_attn.v_proj": {
-        "snr": 22.95538330078125,
-        "type": "self_attn.v_proj"
-    },
-    "model.layers.11.self_attn.v_proj": {
-        "snr": 30.042158126831055,
-        "type": "self_attn.v_proj"
-    },
-    "model.layers.12.self_attn.v_proj": {
-        "snr": 9.577271461486816,
-        "type": "self_attn.v_proj"
-    },
-    "model.layers.13.self_attn.v_proj": {
-        "snr": 18.176361083984375,
-        "type": "self_attn.v_proj"
-    },
-    "model.layers.14.self_attn.v_proj": {
-        "snr": 1.5695856809616089,
-        "type": "self_attn.v_proj"
-    },
-    "model.layers.15.self_attn.v_proj": {
-        "snr": 2.7235565185546875,
-        "type": "self_attn.v_proj"
-    }
-}
--- a/src/axolotl/integrations/spectrum/model_snr_results/snr_results_meta-llama-Llama-3.2-1B.json
+++ b/src/axolotl/integrations/spectrum/model_snr_results/snr_results_meta-llama-Llama-3.2-1B.json
@@ -1,590 +0,0 @@
-{
-    "model.layers.0.input_layernorm": {
-        "snr": Infinity,
-        "type": "input_layernorm"
-    },
-    "model.layers.1.input_layernorm": {
-        "snr": Infinity,
-        "type": "input_layernorm"
-    },
-    "model.layers.2.input_layernorm": {
-        "snr": Infinity,
-        "type": "input_layernorm"
-    },
-    "model.layers.3.input_layernorm": {
-        "snr": Infinity,
-        "type": "input_layernorm"
-    },
-    "model.layers.4.input_layernorm": {
-        "snr": Infinity,
-        "type": "input_layernorm"
-    },
-    "model.layers.5.input_layernorm": {
-        "snr": Infinity,
-        "type": "input_layernorm"
-    },
-    "model.layers.6.input_layernorm": {
-        "snr": Infinity,
-        "type": "input_layernorm"
-    },
-    "model.layers.7.input_layernorm": {
-        "snr": Infinity,
-        "type": "input_layernorm"
-    },
-    "model.layers.8.input_layernorm": {
-        "snr": Infinity,
-        "type": "input_layernorm"
-    },
-    "model.layers.9.input_layernorm": {
-        "snr": Infinity,
-        "type": "input_layernorm"
-    },
-    "model.layers.10.input_layernorm": {
-        "snr": Infinity,
-        "type": "input_layernorm"
-    },
-    "model.layers.11.input_layernorm": {
-        "snr": Infinity,
-        "type": "input_layernorm"
-    },
-    "model.layers.12.input_layernorm": {
-        "snr": Infinity,
-        "type": "input_layernorm"
-    },
-    "model.layers.13.input_layernorm": {
-        "snr": Infinity,
-        "type": "input_layernorm"
-    },
-    "model.layers.14.input_layernorm": {
-        "snr": Infinity,
-        "type": "input_layernorm"
-    },
-    "model.layers.15.input_layernorm": {
-        "snr": Infinity,
-        "type": "input_layernorm"
-    },
-    "lm_head": {
-        "snr": Infinity,
-        "type": "lm_head"
-    },
-    "model.layers.0.mlp.down_proj": {
-        "snr": 57.09797286987305,
-        "type": "mlp.down_proj"
-    },
-    "model.layers.1.mlp.down_proj": {
-        "snr": 9.538983345031738,
-        "type": "mlp.down_proj"
-    },
-    "model.layers.2.mlp.down_proj": {
-        "snr": 6.227016925811768,
-        "type": "mlp.down_proj"
-    },
-    "model.layers.3.mlp.down_proj": {
-        "snr": 5.660686492919922,
-        "type": "mlp.down_proj"
-    },
-    "model.layers.4.mlp.down_proj": {
-        "snr": 5.178432464599609,
-        "type": "mlp.down_proj"
-    },
-    "model.layers.5.mlp.down_proj": {
-        "snr": 3.5638349056243896,
-        "type": "mlp.down_proj"
-    },
-    "model.layers.6.mlp.down_proj": {
-        "snr": 3.0918056964874268,
-        "type": "mlp.down_proj"
-    },
-    "model.layers.7.mlp.down_proj": {
-        "snr": 2.456392288208008,
-        "type": "mlp.down_proj"
-    },
-    "model.layers.8.mlp.down_proj": {
-        "snr": 4.525328636169434,
-        "type": "mlp.down_proj"
-    },
-    "model.layers.9.mlp.down_proj": {
-        "snr": 3.9409055709838867,
-        "type": "mlp.down_proj"
-    },
-    "model.layers.10.mlp.down_proj": {
-        "snr": 5.447249412536621,
-        "type": "mlp.down_proj"
-    },
-    "model.layers.11.mlp.down_proj": {
-        "snr": 4.807600975036621,
-        "type": "mlp.down_proj"
-    },
-    "model.layers.12.mlp.down_proj": {
-        "snr": 3.915374517440796,
-        "type": "mlp.down_proj"
-    },
-    "model.layers.13.mlp.down_proj": {
-        "snr": 3.4820363521575928,
-        "type": "mlp.down_proj"
-    },
-    "model.layers.14.mlp.down_proj": {
-        "snr": 2.6045074462890625,
-        "type": "mlp.down_proj"
-    },
-    "model.layers.15.mlp.down_proj": {
-        "snr": 3.7237701416015625,
-        "type": "mlp.down_proj"
-    },
-    "model.layers.0.mlp.gate_proj": {
-        "snr": 22.160131454467773,
-        "type": "mlp.gate_proj"
-    },
-    "model.layers.1.mlp.gate_proj": {
-        "snr": 6.072206020355225,
-        "type": "mlp.gate_proj"
-    },
-    "model.layers.2.mlp.gate_proj": {
-        "snr": 3.2467362880706787,
-        "type": "mlp.gate_proj"
-    },
-    "model.layers.3.mlp.gate_proj": {
-        "snr": 1.4111896753311157,
-        "type": "mlp.gate_proj"
-    },
-    "model.layers.4.mlp.gate_proj": {
-        "snr": 0.7405938506126404,
-        "type": "mlp.gate_proj"
-    },
-    "model.layers.5.mlp.gate_proj": {
-        "snr": 0.5916463136672974,
-        "type": "mlp.gate_proj"
-    },
-    "model.layers.6.mlp.gate_proj": {
-        "snr": 0.6149423718452454,
-        "type": "mlp.gate_proj"
-    },
-    "model.layers.7.mlp.gate_proj": {
-        "snr": 0.48369669914245605,
-        "type": "mlp.gate_proj"
-    },
-    "model.layers.8.mlp.gate_proj": {
-        "snr": 0.6047574877738953,
-        "type": "mlp.gate_proj"
-    },
-    "model.layers.9.mlp.gate_proj": {
-        "snr": 0.5092479586601257,
-        "type": "mlp.gate_proj"
-    },
-    "model.layers.10.mlp.gate_proj": {
-        "snr": 0.5999670624732971,
-        "type": "mlp.gate_proj"
-    },
-    "model.layers.11.mlp.gate_proj": {
-        "snr": 0.8980127573013306,
-        "type": "mlp.gate_proj"
-    },
-    "model.layers.12.mlp.gate_proj": {
-        "snr": 1.4252448081970215,
-        "type": "mlp.gate_proj"
-    },
-    "model.layers.13.mlp.gate_proj": {
-        "snr": 1.509937047958374,
-        "type": "mlp.gate_proj"
-    },
-    "model.layers.14.mlp.gate_proj": {
-        "snr": 1.0066585540771484,
-        "type": "mlp.gate_proj"
-    },
-    "model.layers.15.mlp.gate_proj": {
-        "snr": 0.6413647532463074,
-        "type": "mlp.gate_proj"
-    },
-    "model.layers.0.mlp.up_proj": {
-        "snr": 26.08852195739746,
-        "type": "mlp.up_proj"
-    },
-    "model.layers.1.mlp.up_proj": {
-        "snr": 13.382951736450195,
-        "type": "mlp.up_proj"
-    },
-    "model.layers.2.mlp.up_proj": {
-        "snr": 20.088768005371094,
-        "type": "mlp.up_proj"
-    },
-    "model.layers.3.mlp.up_proj": {
-        "snr": 23.0632381439209,
-        "type": "mlp.up_proj"
-    },
-    "model.layers.4.mlp.up_proj": {
-        "snr": 16.07433319091797,
-        "type": "mlp.up_proj"
-    },
-    "model.layers.5.mlp.up_proj": {
-        "snr": 8.00507640838623,
-        "type": "mlp.up_proj"
-    },
-    "model.layers.6.mlp.up_proj": {
-        "snr": 9.538354873657227,
-        "type": "mlp.up_proj"
-    },
-    "model.layers.7.mlp.up_proj": {
-        "snr": 6.286602973937988,
-        "type": "mlp.up_proj"
-    },
-    "model.layers.8.mlp.up_proj": {
-        "snr": 10.092820167541504,
-        "type": "mlp.up_proj"
-    },
-    "model.layers.9.mlp.up_proj": {
-        "snr": 7.193963527679443,
-        "type": "mlp.up_proj"
-    },
-    "model.layers.10.mlp.up_proj": {
-        "snr": 7.320116996765137,
-        "type": "mlp.up_proj"
-    },
-    "model.layers.11.mlp.up_proj": {
-        "snr": 4.8728532791137695,
-        "type": "mlp.up_proj"
-    },
-    "model.layers.12.mlp.up_proj": {
-        "snr": 3.596583366394043,
-        "type": "mlp.up_proj"
-    },
-    "model.layers.13.mlp.up_proj": {
-        "snr": 3.166161298751831,
-        "type": "mlp.up_proj"
-    },
-    "model.layers.14.mlp.up_proj": {
-        "snr": 1.5600818395614624,
-        "type": "mlp.up_proj"
-    },
-    "model.layers.15.mlp.up_proj": {
-        "snr": 0.8726214170455933,
-        "type": "mlp.up_proj"
-    },
-    "model.embed_tokens": {
-        "snr": Infinity,
-        "type": "model.embed_tokens"
-    },
-    "model.norm": {
-        "snr": Infinity,
-        "type": "model.norm"
-    },
-    "model.layers.0.post_attention_layernorm": {
-        "snr": Infinity,
-        "type": "post_attention_layernorm"
-    },
-    "model.layers.1.post_attention_layernorm": {
-        "snr": Infinity,
-        "type": "post_attention_layernorm"
-    },
-    "model.layers.2.post_attention_layernorm": {
-        "snr": Infinity,
-        "type": "post_attention_layernorm"
-    },
-    "model.layers.3.post_attention_layernorm": {
-        "snr": Infinity,
-        "type": "post_attention_layernorm"
-    },
-    "model.layers.4.post_attention_layernorm": {
-        "snr": Infinity,
-        "type": "post_attention_layernorm"
-    },
-    "model.layers.5.post_attention_layernorm": {
-        "snr": Infinity,
-        "type": "post_attention_layernorm"
-    },
-    "model.layers.6.post_attention_layernorm": {
-        "snr": Infinity,
-        "type": "post_attention_layernorm"
-    },
-    "model.layers.7.post_attention_layernorm": {
-        "snr": Infinity,
-        "type": "post_attention_layernorm"
-    },
-    "model.layers.8.post_attention_layernorm": {
-        "snr": Infinity,
-        "type": "post_attention_layernorm"
-    },
-    "model.layers.9.post_attention_layernorm": {
-        "snr": Infinity,
-        "type": "post_attention_layernorm"
-    },
-    "model.layers.10.post_attention_layernorm": {
-        "snr": Infinity,
-        "type": "post_attention_layernorm"
-    },
-    "model.layers.11.post_attention_layernorm": {
-        "snr": Infinity,
-        "type": "post_attention_layernorm"
-    },
-    "model.layers.12.post_attention_layernorm": {
-        "snr": Infinity,
-        "type": "post_attention_layernorm"
-    },
-    "model.layers.13.post_attention_layernorm": {
-        "snr": Infinity,
-        "type": "post_attention_layernorm"
-    },
-    "model.layers.14.post_attention_layernorm": {
-        "snr": Infinity,
-        "type": "post_attention_layernorm"
-    },
-    "model.layers.15.post_attention_layernorm": {
-        "snr": Infinity,
-        "type": "post_attention_layernorm"
-    },
-    "model.layers.0.self_attn.k_proj": {
-        "snr": 0.1154392883181572,
-        "type": "self_attn.k_proj"
-    },
-    "model.layers.1.self_attn.k_proj": {
-        "snr": 0.24299409985542297,
-        "type": "self_attn.k_proj"
-    },
-    "model.layers.2.self_attn.k_proj": {
-        "snr": 0.3624322712421417,
-        "type": "self_attn.k_proj"
-    },
-    "model.layers.3.self_attn.k_proj": {
-        "snr": 0.29509487748146057,
-        "type": "self_attn.k_proj"
-    },
-    "model.layers.4.self_attn.k_proj": {
-        "snr": 0.32953736186027527,
-        "type": "self_attn.k_proj"
-    },
-    "model.layers.5.self_attn.k_proj": {
-        "snr": 0.2908833622932434,
-        "type": "self_attn.k_proj"
-    },
-    "model.layers.6.self_attn.k_proj": {
-        "snr": 0.2488437294960022,
-        "type": "self_attn.k_proj"
-    },
-    "model.layers.7.self_attn.k_proj": {
-        "snr": 0.27847856283187866,
-        "type": "self_attn.k_proj"
-    },
-    "model.layers.8.self_attn.k_proj": {
-        "snr": 0.27143892645835876,
-        "type": "self_attn.k_proj"
-    },
-    "model.layers.9.self_attn.k_proj": {
-        "snr": 0.28804272413253784,
-        "type": "self_attn.k_proj"
-    },
-    "model.layers.10.self_attn.k_proj": {
-        "snr": 0.31197959184646606,
-        "type": "self_attn.k_proj"
-    },
-    "model.layers.11.self_attn.k_proj": {
-        "snr": 0.3203586935997009,
-        "type": "self_attn.k_proj"
-    },
-    "model.layers.12.self_attn.k_proj": {
-        "snr": 0.30905747413635254,
-        "type": "self_attn.k_proj"
-    },
-    "model.layers.13.self_attn.k_proj": {
-        "snr": 0.46828722953796387,
-        "type": "self_attn.k_proj"
-    },
-    "model.layers.14.self_attn.k_proj": {
-        "snr": 0.24205778539180756,
-        "type": "self_attn.k_proj"
-    },
-    "model.layers.15.self_attn.k_proj": {
-        "snr": 0.2559327781200409,
-        "type": "self_attn.k_proj"
-    },
-    "model.layers.0.self_attn.o_proj": {
-        "snr": 0.2638678550720215,
-        "type": "self_attn.o_proj"
-    },
-    "model.layers.1.self_attn.o_proj": {
-        "snr": 0.21109595894813538,
-        "type": "self_attn.o_proj"
-    },
-    "model.layers.2.self_attn.o_proj": {
-        "snr": 0.24751724302768707,
-        "type": "self_attn.o_proj"
-    },
-    "model.layers.3.self_attn.o_proj": {
-        "snr": 0.2728094160556793,
-        "type": "self_attn.o_proj"
-    },
-    "model.layers.4.self_attn.o_proj": {
-        "snr": 0.3001374304294586,
-        "type": "self_attn.o_proj"
-    },
-    "model.layers.5.self_attn.o_proj": {
-        "snr": 0.33903488516807556,
-        "type": "self_attn.o_proj"
-    },
-    "model.layers.6.self_attn.o_proj": {
-        "snr": 0.3530929982662201,
-        "type": "self_attn.o_proj"
-    },
-    "model.layers.7.self_attn.o_proj": {
-        "snr": 0.36753255128860474,
-        "type": "self_attn.o_proj"
-    },
-    "model.layers.8.self_attn.o_proj": {
-        "snr": 0.3373180329799652,
-        "type": "self_attn.o_proj"
-    },
-    "model.layers.9.self_attn.o_proj": {
-        "snr": 0.2970578670501709,
-        "type": "self_attn.o_proj"
-    },
-    "model.layers.10.self_attn.o_proj": {
-        "snr": 0.3076324760913849,
-        "type": "self_attn.o_proj"
-    },
-    "model.layers.11.self_attn.o_proj": {
-        "snr": 0.2766900658607483,
-        "type": "self_attn.o_proj"
-    },
-    "model.layers.12.self_attn.o_proj": {
-        "snr": 0.20973259210586548,
-        "type": "self_attn.o_proj"
-    },
-    "model.layers.13.self_attn.o_proj": {
-        "snr": 0.18185566365718842,
-        "type": "self_attn.o_proj"
-    },
-    "model.layers.14.self_attn.o_proj": {
-        "snr": 0.18329747021198273,
-        "type": "self_attn.o_proj"
-    },
-    "model.layers.15.self_attn.o_proj": {
-        "snr": 0.2437991499900818,
-        "type": "self_attn.o_proj"
-    },
-    "model.layers.0.self_attn.q_proj": {
-        "snr": 0.038040731102228165,
-        "type": "self_attn.q_proj"
-    },
-    "model.layers.1.self_attn.q_proj": {
-        "snr": 0.0707998052239418,
-        "type": "self_attn.q_proj"
-    },
-    "model.layers.2.self_attn.q_proj": {
-        "snr": 0.0787411704659462,
-        "type": "self_attn.q_proj"
-    },
-    "model.layers.3.self_attn.q_proj": {
-        "snr": 0.08089710026979446,
-        "type": "self_attn.q_proj"
-    },
-    "model.layers.4.self_attn.q_proj": {
-        "snr": 0.08591937273740768,
-        "type": "self_attn.q_proj"
-    },
-    "model.layers.5.self_attn.q_proj": {
-        "snr": 0.09852176159620285,
-        "type": "self_attn.q_proj"
-    },
-    "model.layers.6.self_attn.q_proj": {
-        "snr": 0.09690654277801514,
-        "type": "self_attn.q_proj"
-    },
-    "model.layers.7.self_attn.q_proj": {
-        "snr": 0.11181341856718063,
-        "type": "self_attn.q_proj"
-    },
-    "model.layers.8.self_attn.q_proj": {
-        "snr": 0.12042108923196793,
-        "type": "self_attn.q_proj"
-    },
-    "model.layers.9.self_attn.q_proj": {
-        "snr": 0.09799323976039886,
-        "type": "self_attn.q_proj"
-    },
-    "model.layers.10.self_attn.q_proj": {
-        "snr": 0.10901063680648804,
-        "type": "self_attn.q_proj"
-    },
-    "model.layers.11.self_attn.q_proj": {
-        "snr": 0.09307146072387695,
-        "type": "self_attn.q_proj"
-    },
-    "model.layers.12.self_attn.q_proj": {
-        "snr": 0.0880950540304184,
-        "type": "self_attn.q_proj"
-    },
-    "model.layers.13.self_attn.q_proj": {
-        "snr": 0.08886399120092392,
-        "type": "self_attn.q_proj"
-    },
-    "model.layers.14.self_attn.q_proj": {
-        "snr": 0.09955056011676788,
-        "type": "self_attn.q_proj"
-    },
-    "model.layers.15.self_attn.q_proj": {
-        "snr": 0.08929339051246643,
-        "type": "self_attn.q_proj"
-    },
-    "model.layers.0.self_attn.v_proj": {
-        "snr": 2.5501928329467773,
-        "type": "self_attn.v_proj"
-    },
-    "model.layers.1.self_attn.v_proj": {
-        "snr": 9.449499130249023,
-        "type": "self_attn.v_proj"
-    },
-    "model.layers.2.self_attn.v_proj": {
-        "snr": 7.9920830726623535,
-        "type": "self_attn.v_proj"
-    },
-    "model.layers.3.self_attn.v_proj": {
-        "snr": 50.69462585449219,
-        "type": "self_attn.v_proj"
-    },
-    "model.layers.4.self_attn.v_proj": {
-        "snr": 19.083511352539062,
-        "type": "self_attn.v_proj"
-    },
-    "model.layers.5.self_attn.v_proj": {
-        "snr": 7.21597146987915,
-        "type": "self_attn.v_proj"
-    },
-    "model.layers.6.self_attn.v_proj": {
-        "snr": 11.27744197845459,
-        "type": "self_attn.v_proj"
-    },
-    "model.layers.7.self_attn.v_proj": {
-        "snr": 4.579711437225342,
-        "type": "self_attn.v_proj"
-    },
-    "model.layers.8.self_attn.v_proj": {
-        "snr": 10.940719604492188,
-        "type": "self_attn.v_proj"
-    },
-    "model.layers.9.self_attn.v_proj": {
-        "snr": 553.4417724609375,
-        "type": "self_attn.v_proj"
-    },
-    "model.layers.10.self_attn.v_proj": {
-        "snr": 20.59434700012207,
-        "type": "self_attn.v_proj"
-    },
-    "model.layers.11.self_attn.v_proj": {
-        "snr": 26.636865615844727,
-        "type": "self_attn.v_proj"
-    },
-    "model.layers.12.self_attn.v_proj": {
-        "snr": 8.614749908447266,
-        "type": "self_attn.v_proj"
-    },
-    "model.layers.13.self_attn.v_proj": {
-        "snr": 17.722007751464844,
-        "type": "self_attn.v_proj"
-    },
-    "model.layers.14.self_attn.v_proj": {
-        "snr": 1.48500657081604,
-        "type": "self_attn.v_proj"
-    },
-    "model.layers.15.self_attn.v_proj": {
-        "snr": 2.5776851177215576,
-        "type": "self_attn.v_proj"
-    }
-}
--- a/src/axolotl/integrations/spectrum/model_snr_results/snr_results_meta-llama-Llama-3.2-3B-Instruct.json
+++ b/src/axolotl/integrations/spectrum/model_snr_results/snr_results_meta-llama-Llama-3.2-3B-Instruct.json
--- a/src/axolotl/integrations/spectrum/model_snr_results/snr_results_meta-llama-Llama-3.2-3B.json
+++ b/src/axolotl/integrations/spectrum/model_snr_results/snr_results_meta-llama-Llama-3.2-3B.json
--- a/src/axolotl/prompt_strategies/bradley_terry/chat_template.py
+++ b/src/axolotl/prompt_strategies/bradley_terry/chat_template.py
@@ -47,7 +47,7 @@ class BTChatTemplateStrategy(ChatTemplateStrategy):

        if len(chosen_tokenized["input_ids"]) > max_length:
            LOG.warning(
-                f"To-be-trimmed chosen sequence exceeds max sequence length: {len(chosen_tokenized['input_ids'])}",
+                f"Chosen sequence exceeds max sequence length: {len(chosen_tokenized['input_ids'])}",
            )

            chosen_tokenized["input_ids"] = chosen_tokenized["input_ids"][:max_length]
@@ -70,7 +70,7 @@ class BTChatTemplateStrategy(ChatTemplateStrategy):

        if len(rejected_tokenized["input_ids"]) > max_length:
            LOG.warning(
-                f"To-be-trimmed rejected sequence exceeds max sequence length: {len(rejected_tokenized['input_ids'])}",
+                f"Rejected sequence exceeds max sequence length: {len(rejected_tokenized['input_ids'])}",
            )

            rejected_tokenized["input_ids"] = rejected_tokenized["input_ids"][
--- a/src/axolotl/utils/config/models/input/v0_4_1/init.py
+++ b/src/axolotl/utils/config/models/input/v0_4_1/init.py
@@ -115,9 +115,6 @@ class RemappedParameters(BaseModel):
    overrides_of_model_config: Optional[Dict[str, Any]] = Field(
        default=None, alias="model_config"
    )
-    overrides_of_model_kwargs: Optional[Dict[str, Any]] = Field(
-        default=None, alias="model_kwargs"
-    )
    type_of_model: Optional[str] = Field(default=None, alias="model_type")
    revision_of_model: Optional[str] = Field(default=None, alias="model_revision")

@@ -429,6 +426,8 @@ class ModelInputConfig(BaseModel):
    )
    trust_remote_code: Optional[bool] = None

+    model_kwargs: Optional[Dict[str, Any]] = None
+
    @field_validator("trust_remote_code")
    @classmethod
    def hint_trust_remote_code(cls, trust_remote_code):
--- a/src/axolotl/utils/data/sft.py
+++ b/src/axolotl/utils/data/sft.py
@@ -46,7 +46,6 @@ from axolotl.utils.data.pretraining import wrap_pretraining_dataset
 from axolotl.utils.data.shared import load_dataset_w_config
 from axolotl.utils.data.utils import (
    deduplicate_and_log_datasets,
-    drop_long_seq_in_dataset,
    md5,
    retry_on_request_exceptions,
 )
@@ -57,7 +56,7 @@ from axolotl.utils.trainer import (
    process_datasets_for_packing,
 )

-LOG = logging.getLogger(__name__)
+LOG = logging.getLogger("axolotl")


@retry_on_request_exceptions(max_retries=3, delay=5)
@@ -340,11 +339,8 @@ def load_tokenized_prepared_datasets(
            else:
                LOG.debug("NOT shuffling merged datasets")

-        if not cfg.skip_prepare_dataset:
-            dataset = drop_long_seq_in_dataset(dataset, cfg)
-
-            if cfg.sample_packing:
-                dataset, _ = process_datasets_for_packing(cfg, dataset, None)
+        if cfg.sample_packing and not cfg.skip_prepare_dataset:
+            dataset, _ = process_datasets_for_packing(cfg, dataset, None)

        if cfg.local_rank == 0 and not cfg.skip_prepare_dataset:
            LOG.info(f"Saving merged prepared dataset to disk... {prepared_ds_path}")
--- a/src/axolotl/utils/data/utils.py
+++ b/src/axolotl/utils/data/utils.py
@@ -1,5 +1,4 @@
 """data handling helpers"""
-
 import functools
 import hashlib
 import logging
@@ -7,15 +6,10 @@ import time
 from enum import Enum

 import huggingface_hub
-import numpy as np
 import requests
-from datasets import Dataset, IterableDataset
+from datasets import Dataset

-from axolotl.utils.dict import DictDefault
-from axolotl.utils.samplers.utils import get_dataset_lengths
-from axolotl.utils.trainer import drop_long_seq
-
-LOG = logging.getLogger(__name__)
+LOG = logging.getLogger("axolotl")


 class RetryStrategy(Enum):
@@ -156,53 +150,3 @@ def deduplicate_and_log_datasets(
        )

    return train_dataset, eval_dataset, dataset
-
-
-def drop_long_seq_in_dataset(dataset: Dataset, cfg: DictDefault):
-    if "input_ids" not in dataset.column_names:
-        LOG.warning(
-            "Dataset does not contain 'input_ids' column. Skip drop long seq. This is expected for RewardModeling."
-        )
-        return dataset
-
-    drop_long = functools.partial(
-        drop_long_seq,
-        sequence_len=cfg.sequence_len,
-        min_sequence_len=cfg.min_sample_len,
-    )
-
-    try:
-        min_input_len = np.min(get_dataset_lengths(dataset))
-        LOG.debug(f"min_input_len: {min_input_len}")
-        max_input_len = np.max(get_dataset_lengths(dataset))
-        LOG.debug(f"max_input_len: {max_input_len}")
-    except AttributeError:
-        pass
-
-    try:
-        prior_len = len(dataset)
-    except TypeError:
-        # handle iterable datasets case
-        prior_len = None
-
-    filter_map_kwargs = {}
-    if not isinstance(dataset, IterableDataset):
-        filter_map_kwargs["num_proc"] = cfg.dataset_processes
-        filter_map_kwargs["load_from_cache_file"] = not cfg.is_preprocess
-
-    drop_long_kwargs = {}
-    if filter_map_kwargs:
-        drop_long_kwargs["desc"] = "Dropping Long Sequences"
-
-    dataset = dataset.filter(
-        drop_long,
-        batched=True,
-        **filter_map_kwargs,
-        **drop_long_kwargs,
-    )
-    if prior_len:
-        dropped = prior_len - len(dataset)
-        if dropped:
-            LOG.warning(f"Dropped {dropped} long samples from dataset")
-
-    return dataset
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -357,8 +357,8 @@ class ModelLoader:

        # init model kwargs
        self.model_kwargs: Dict[str, Any] = {}
-        if cfg.overrides_of_model_kwargs:
-            for key, val in cfg.overrides_of_model_kwargs.items():
+        if cfg.model_kwargs:
+            for key, val in cfg.model_kwargs.items():
                self.model_kwargs[key] = val

        # init model
--- a/src/axolotl/utils/samplers/utils.py
+++ b/src/axolotl/utils/samplers/utils.py
@@ -13,4 +13,5 @@ def get_dataset_lengths(dataset):
    else:
        input_ids = dataset.data.column("input_ids")
        lengths = np.vectorize(len)(np.array(input_ids, dtype=object))
+        return lengths
    return lengths
--- a/src/axolotl/utils/trainer.py
+++ b/src/axolotl/utils/trainer.py
@@ -1,5 +1,4 @@
 """Module containing the Trainer class and related functions"""
-
 import json
 import math
 import os
@@ -211,8 +210,6 @@ def drop_long_seq(sample, sequence_len=2048, min_sequence_len=2):

    Works for both single-example (list[int]) or batched (list[list[int]]).
    """
-    min_sequence_len = min_sequence_len or 2
-
    input_ids = sample["input_ids"]

    # Edge case: if input_ids is empty
@@ -235,6 +232,20 @@ def drop_long_seq(sample, sequence_len=2048, min_sequence_len=2):


 def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
+    drop_long = partial(
+        drop_long_seq,
+        sequence_len=cfg.sequence_len,
+        min_sequence_len=cfg.min_sample_len or 2,
+    )
+
+    try:
+        min_input_len = np.min(get_dataset_lengths(train_dataset))
+        LOG.debug(f"min_input_len: {min_input_len}", main_process_only=True)
+        max_input_len = np.max(get_dataset_lengths(train_dataset))
+        LOG.debug(f"max_input_len: {max_input_len}", main_process_only=True)
+    except AttributeError:
+        pass
+
    if cfg.model_config_type == "mamba":
        LOG.info("dropping attention_mask column")
        train_dataset = train_dataset.remove_columns("attention_mask")
@@ -248,6 +259,46 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
        if eval_dataset and "token_type_ids" in eval_dataset.column_names:
            eval_dataset = eval_dataset.remove_columns("token_type_ids")

+    filter_map_kwargs = {}
+    if not isinstance(train_dataset, IterableDataset):
+        filter_map_kwargs["num_proc"] = cfg.dataset_processes
+        filter_map_kwargs["load_from_cache_file"] = not cfg.is_preprocess
+
+    try:
+        prior_len = len(train_dataset)
+    except TypeError:
+        # handle iterable datasets case
+        prior_len = None
+    drop_long_kwargs = {}
+    if filter_map_kwargs:
+        drop_long_kwargs["desc"] = "Dropping Long Sequences"
+    train_dataset = train_dataset.filter(
+        drop_long,
+        batched=True,
+        **filter_map_kwargs,
+        **drop_long_kwargs,
+    )
+    if prior_len:
+        dropped = prior_len - len(train_dataset)
+        if dropped:
+            LOG.warning(f"Dropped {dropped} long samples from train dataset")
+
+    if eval_dataset:
+        try:
+            prior_len = len(eval_dataset)
+        except TypeError:
+            # handle iterable datasets case
+            prior_len = None
+        eval_dataset = eval_dataset.filter(
+            drop_long,
+            **filter_map_kwargs,
+            **drop_long_kwargs,
+        )
+        if prior_len:
+            dropped = prior_len - len(eval_dataset)
+            if dropped:
+                LOG.warning(f"Dropped {dropped} long samples from eval dataset")
+
    def drop_no_trainable_tokens(sample):
        """
        Drop samples if all labels are -100 (i.e., zero trainable tokens).
@@ -274,11 +325,6 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
    except TypeError:
        # handle iterable datasets case
        prior_len = None
-    filter_map_kwargs = {}
-    if not isinstance(train_dataset, IterableDataset):
-        filter_map_kwargs["num_proc"] = cfg.dataset_processes
-        filter_map_kwargs["load_from_cache_file"] = not cfg.is_preprocess
-
    drop_long_kwargs = {}
    if filter_map_kwargs:
        drop_long_kwargs["desc"] = "Drop Samples with Zero Trainable Tokens"
--- a/tests/e2e/test_reward_model_smollm2.py
+++ b/tests/e2e/test_reward_model_smollm2.py
@@ -33,7 +33,7 @@ class TestRewardModelLoraSmolLM2(unittest.TestCase):
                "num_labels": 1,
                "chat_template": "alpaca",
                "reward_model": True,
-                "sequence_len": 2048,
+                "sequence_len": 1024,
                "pad_to_sequence_len": True,
                "adapter": "lora",
                "lora_r": 8,