test v2batch w/ flex attn

Merge branch 'main' into flx_attn_support
Update faq.qmd (#2319 )
2025-02-13 00:11:45 -05:00 · 2025-02-11 23:31:56 -05:00 · 2025-02-11 13:18:31 -05:00 · 2025-02-08 06:02:02 -05:00 · 2025-02-08 06:01:48 -05:00 · 2025-02-07 21:34:16 -05:00
39 changed files with 11616 additions and 118 deletions
--- a/.github/workflows/base.yml
+++ b/.github/workflows/base.yml
@@ -22,12 +22,6 @@ jobs:
      fail-fast: false
      matrix:
        include:
-          - cuda: "124"
-            cuda_version: 12.4.1
-            cudnn_version: ""
-            python_version: "3.10"
-            pytorch: 2.4.1
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
          - cuda: "124"
            cuda_version: 12.4.1
            cudnn_version: ""
@@ -40,6 +34,12 @@ jobs:
            python_version: "3.11"
            pytorch: 2.5.1
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
+          - cuda: "124"
+            cuda_version: 12.4.1
+            cudnn_version: ""
+            python_version: "3.11"
+            pytorch: 2.6.0
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
    steps:
      - name: Checkout
        uses: actions/checkout@v4
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -19,7 +19,7 @@ jobs:
        - name: Setup Python
          uses: actions/setup-python@v5
          with:
-            python-version: '3.10'
+            python-version: '3.11'
        - name: install dependencies
          run: |
            python3 -m pip install jupyter
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -19,6 +19,6 @@ jobs:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
-          python-version: "3.10"
+          python-version: "3.11"
          cache: 'pip' # caching pip dependencies
      - uses: pre-commit/action@v3.0.1
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -26,6 +26,11 @@ jobs:
            pytorch: 2.5.1
            axolotl_extras:
            is_latest: true
+          - cuda: 124
+            cuda_version: 12.4.1
+            python_version: "3.11"
+            pytorch: 2.6.0
+            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
      - name: Checkout
--- a/.github/workflows/multi-gpu-e2e.yml
+++ b/.github/workflows/multi-gpu-e2e.yml
@@ -34,6 +34,13 @@ jobs:
            axolotl_extras:
            num_gpus: 2
            nightly_build: "true"
+          - cuda: 124
+            cuda_version: 12.4.1
+            python_version: "3.11"
+            pytorch: 2.6.0
+            axolotl_extras:
+            num_gpus: 2
+            nightly_build: "true"
    runs-on: [self-hosted, modal]
    timeout-minutes: 120
    steps:
@@ -42,7 +49,7 @@ jobs:
      - name: Install Python
        uses: actions/setup-python@v5
        with:
-          python-version: "3.10"
+          python-version: "3.11"
      - name: Install Modal
        run: |
          python -m pip install --upgrade pip
--- a/.github/workflows/nightlies.yml
+++ b/.github/workflows/nightlies.yml
@@ -22,6 +22,11 @@ jobs:
            python_version: "3.11"
            pytorch: 2.5.1
            axolotl_extras:
+          - cuda: 124
+            cuda_version: 12.4.1
+            python_version: "3.11"
+            pytorch: 2.6.0
+            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
      - name: Checkout
--- a/.github/workflows/pypi.yml
+++ b/.github/workflows/pypi.yml
@@ -36,7 +36,7 @@ jobs:
      - name: Setup Python
        uses: actions/setup-python@v5
        with:
-          python-version: "3.10"
+          python-version: "3.11"

      - name: Install dependencies
        run: |
--- a/.github/workflows/tests-nightly.yml
+++ b/.github/workflows/tests-nightly.yml
@@ -12,7 +12,7 @@ jobs:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
-          python-version: "3.10"
+          python-version: "3.11"
          cache: 'pip' # caching pip dependencies
      - uses: pre-commit/action@v3.0.1
        env:
@@ -25,13 +25,8 @@ jobs:
      fail-fast: false
      max-parallel: 2
      matrix:
-        python_version: ["3.10", "3.11"]
-        pytorch_version: ["2.4.1", "2.5.1"]
-        exclude:
-          - python_version: "3.10"
-            pytorch_version: "2.4.1"
-          - python_version: "3.10"
-            pytorch_version: "2.5.1"
+        python_version: ["3.11"]
+        pytorch_version: ["2.4.1", "2.5.1", "2.6.0"]
    timeout-minutes: 20

    steps:
@@ -112,13 +107,20 @@ jobs:
            num_gpus: 1
            axolotl_extras:
            nightly_build: "true"
+          - cuda: 124
+            cuda_version: 12.4.1
+            python_version: "3.11"
+            pytorch: 2.6.0
+            num_gpus: 1
+            axolotl_extras:
+            nightly_build: "true"
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Install Python
        uses: actions/setup-python@v5
        with:
-          python-version: "3.10"
+          python-version: "3.11"
      - name: Install Modal
        run: |
          python -m pip install --upgrade pip
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -35,7 +35,7 @@ jobs:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
-          python-version: "3.10"
+          python-version: "3.11"
          cache: 'pip' # caching pip dependencies
      - uses: pre-commit/action@v3.0.1
        env:
@@ -48,13 +48,8 @@ jobs:
      fail-fast: false
      max-parallel: 2
      matrix:
-        python_version: ["3.10", "3.11"]
-        pytorch_version: ["2.4.1", "2.5.1"]
-        exclude:
-          - python_version: "3.10"
-            pytorch_version: "2.4.1"
-          - python_version: "3.10"
-            pytorch_version: "2.5.1"
+        python_version: ["3.11"]
+        pytorch_version: ["2.4.1", "2.5.1", "2.6.0"]
    timeout-minutes: 20

    steps:
@@ -127,7 +122,7 @@ jobs:
      max-parallel: 1
      matrix:
        python_version: ["3.11"]
-        pytorch_version: ["2.4.1", "2.5.1"]
+        pytorch_version: ["2.4.1", "2.5.1", "2.6.0"]
    timeout-minutes: 20

    steps:
@@ -216,7 +211,7 @@ jobs:
      - name: Install Python
        uses: actions/setup-python@v5
        with:
-          python-version: "3.10"
+          python-version: "3.11"
      - name: Install Modal
        run: |
          python -m pip install --upgrade pip
@@ -251,13 +246,19 @@ jobs:
            pytorch: 2.4.1
            num_gpus: 1
            axolotl_extras:
+          - cuda: 124
+            cuda_version: 12.4.1
+            python_version: "3.11"
+            pytorch: 2.6.0
+            num_gpus: 1
+            axolotl_extras:
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Install Python
        uses: actions/setup-python@v5
        with:
-          python-version: "3.10"
+          python-version: "3.11"
      - name: Install Modal
        run: |
          python -m pip install --upgrade pip
--- a/README.md
+++ b/README.md
@@ -51,7 +51,7 @@ Features:

 **Requirements**:
 - NVIDIA GPU (Ampere or newer for `bf16` and Flash Attention) or AMD GPU
- Python ≥3.10
+- Python 3.11
 - PyTorch ≥2.4.1

 ### Installation
--- a/docs/config.qmd
+++ b/docs/config.qmd
@@ -46,6 +46,10 @@ overrides_of_model_config:
    type: # linear | dynamic
    factor: # float

+# optional overrides to the kwargs passed to from_pretrained when loading the base model
+overrides_of_model_kwargs:
+  # use_cache: False
+
 # optional overrides to the bnb 4bit quantization configuration
 # https://huggingface.co/docs/transformers/main/main_classes/quantization#transformers.BitsAndBytesConfig
 bnb_config_kwargs:
--- a/docs/faq.qmd
+++ b/docs/faq.qmd
@@ -19,3 +19,7 @@ description: Frequently asked questions
 **Q: AttributeError: 'DummyOptim' object has no attribute 'step'**

 > A: You may be using deepspeed with single gpu. Please don't set `deepspeed:` in yaml or cli.
+
+**Q: The code is stuck on saving preprocessed datasets.**
+
+> A: This is usually an issue with the GPU. It can be resolved by setting the OS environment variable `CUDA_VISIBLE_DEVICES=0`. If you are on RunPod, this is usually a pod issue; starting a new pod should take care of it.
--- a/docs/multi-node.qmd
+++ b/docs/multi-node.qmd
@@ -3,6 +3,18 @@ title: Multi Node
 description: How to use Axolotl on multiple machines
 ---

+Below are three ways to run multi-node training in Axolotl.
+
+::: {.callout-important}
+Each machine needs a copy of Axolotl; we suggest using the same commit on every machine to ensure compatibility.
+
+You will also need to have the same configuration file for your model on each machine.
+
+Make sure the main machine is reachable by other machines.
+:::
+
+# Accelerate
+
 You will need to create a configuration for accelerate, either by using `accelerate config` and follow the instructions or you can use one of the preset below:

 ~/.cache/huggingface/accelerate/default_config.yaml
@@ -26,7 +38,7 @@ tpu_use_sudo: false
 use_cpu: false
 ```

-Configure your model to use FSDP with for example:
+Configure your model to use FSDP in the Axolotl yaml. For example:
 ```yaml
 fsdp:
  - full_shard
@@ -37,12 +49,40 @@ fsdp_config:
  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
 ```

-## Machine configuration
-
-On each machine you need a copy of Axolotl, we suggest using the same commit to ensure compatibility.
-
-You will also need to have the same configuration file for your model on each machine.
-
-On the main machine only, make sure the port you set as `main_process_port` is open in TCP and reachable by other machines.
-
 All you have to do now is launch using accelerate as you would usually do on each machine and voila, the processes will start once you have launched accelerate on every machine.
+
+# Raytrain
+
+Please see the Ray Train documentation [here](ray-integration.qmd).
+
+# Torchrun
+
+If you are using Infiniband, we recommend torchrun to utilize the full bandwidth.
+
+Set the following environment variables (adjust the buffer size and socket interface names for your system):
+
+```bash
+export NCCL_IB_DISABLE=0
+export NCCL_SOCKET_IFNAME="eth0,en,eth,em,bond"
+export NCCL_BUFFSIZE=2097152
+```
+
+Run the following on each node:
+
+```bash
+torchrun --nnodes $num_nodes --nproc_per_node $gpu_per_node --rdzv_id $rdzv_id --rdzv_backend c10d --rdzv_endpoint "$head_node_ip:$head_node_port" -m axolotl.cli.train config.yaml
+```
+
+Please make sure to substitute the placeholder variables.
+
+- `num_nodes`: Number of nodes (containing GPUs)
+- `gpu_per_node`: Number of GPUs per node
+- `head_node_ip`: IP of the head node (make sure other machines can connect to this)
+- `head_node_port`: Port of the head node (make sure other machines can connect to it; defaults to 29400)
+- `rdzv_id`: A unique job ID that is used by the job across nodes.
+
+::: {.callout-note}
+You need to call `axolotl.cli.train` instead of `axolotl train`, as the latter calls accelerate under the hood.
+:::
+
+More info on the available configs can be found in the PyTorch docs [here](https://pytorch.org/docs/stable/elastic/run.html).
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,10 +1,10 @@
 --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/

 # START section of dependencies that don't install on Darwin/MacOS
-bitsandbytes==0.45.1
+bitsandbytes==0.45.2
 triton>=3.0.0
 mamba-ssm==1.2.0.post1
-flash-attn==2.7.0.post2
+flash-attn==2.7.4.post1
 xformers>=0.0.23.post1
 autoawq==0.2.7.post3
 liger-kernel==0.5.2
@@ -13,7 +13,7 @@ liger-kernel==0.5.2
 packaging==23.2

 peft==0.14.0
-transformers==4.48.1
+transformers==4.48.3
 tokenizers>=0.21.0
 accelerate==1.3.0
 datasets==3.2.0
--- a/setup.py
+++ b/setup.py
@@ -71,12 +71,15 @@ def parse_requirements():
            else:
                raise ValueError("Invalid version format")

-            if (major, minor) >= (2, 5):
+            if (major, minor) >= (2, 6):
+                _install_requires.pop(_install_requires.index(xformers_version))
+                _install_requires.append("xformers==0.0.29.post2")
+            elif (major, minor) >= (2, 5):
                _install_requires.pop(_install_requires.index(xformers_version))
                if patch == 0:
                    _install_requires.append("xformers==0.0.28.post2")
                else:
-                    _install_requires.append("xformers==0.0.28.post3")
+                    _install_requires.append("xformers==0.0.29")
                _install_requires.pop(_install_requires.index(autoawq_version))
            elif (major, minor) >= (2, 4):
                if patch == 0:
--- a/src/axolotl/core/trainer_builder.py
+++ b/src/axolotl/core/trainer_builder.py
@@ -79,6 +79,7 @@ from axolotl.utils.chat_templates import get_chat_template_from_config
 from axolotl.utils.collators import (
    BatchSamplerDataCollatorForSeq2Seq,
    DataCollatorForSeq2Seq,
+    FlexBatchSamplerDataCollatorForSeq2Seq,
    MambaDataCollator,
    V2BatchSamplerDataCollatorForSeq2Seq,
 )
@@ -816,6 +817,7 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
            Union[
                V2BatchSamplerDataCollatorForSeq2Seq,
                BatchSamplerDataCollatorForSeq2Seq,
+                FlexBatchSamplerDataCollatorForSeq2Seq,
                DataCollatorForSeq2Seq,
                DataCollatorWithFlattening,
                RewardDataCollatorWithPadding,
@@ -827,7 +829,9 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
            if "max_length" in kwargs:
                kwargs.pop("max_length")
        elif use_batch_sampler_collator:
-            if self.cfg.model_config_type in SUPPORTED_MULTIPACK_MODEL_TYPES:
+            if self.cfg.flex_attention is True:
+                collator = V2BatchSamplerDataCollatorForSeq2Seq
+            elif self.cfg.model_config_type in SUPPORTED_MULTIPACK_MODEL_TYPES:
                collator = V2BatchSamplerDataCollatorForSeq2Seq
            elif (
                self.cfg.model_config_type in ["llama"]
--- a/src/axolotl/integrations/spectrum/model_snr_results/snr_results_Qwen-Qwen2.5-1.5B-Instruct.json
+++ b/src/axolotl/integrations/spectrum/model_snr_results/snr_results_Qwen-Qwen2.5-1.5B-Instruct.json
--- a/src/axolotl/integrations/spectrum/model_snr_results/snr_results_Qwen-Qwen2.5-1.5B.json
+++ b/src/axolotl/integrations/spectrum/model_snr_results/snr_results_Qwen-Qwen2.5-1.5B.json
--- a/src/axolotl/integrations/spectrum/model_snr_results/snr_results_Qwen-Qwen2.5-3B-Instruct.json
+++ b/src/axolotl/integrations/spectrum/model_snr_results/snr_results_Qwen-Qwen2.5-3B-Instruct.json
--- a/src/axolotl/integrations/spectrum/model_snr_results/snr_results_Qwen-Qwen2.5-3B.json
+++ b/src/axolotl/integrations/spectrum/model_snr_results/snr_results_Qwen-Qwen2.5-3B.json
--- a/src/axolotl/integrations/spectrum/model_snr_results/snr_results_Qwen-Qwen2.5-7B-Instruct.json
+++ b/src/axolotl/integrations/spectrum/model_snr_results/snr_results_Qwen-Qwen2.5-7B-Instruct.json
--- a/src/axolotl/integrations/spectrum/model_snr_results/snr_results_Qwen-Qwen2.5-7B.json
+++ b/src/axolotl/integrations/spectrum/model_snr_results/snr_results_Qwen-Qwen2.5-7B.json
--- a/src/axolotl/integrations/spectrum/model_snr_results/snr_results_google-gemma-2-2b.json
+++ b/src/axolotl/integrations/spectrum/model_snr_results/snr_results_google-gemma-2-2b.json
--- a/src/axolotl/integrations/spectrum/model_snr_results/snr_results_meta-llama-Llama-3.2-1B-Instruct.json
+++ b/src/axolotl/integrations/spectrum/model_snr_results/snr_results_meta-llama-Llama-3.2-1B-Instruct.json
@@ -0,0 +1,590 @@
+{
+    "model.layers.0.input_layernorm": {
+        "snr": Infinity,
+        "type": "input_layernorm"
+    },
+    "model.layers.1.input_layernorm": {
+        "snr": Infinity,
+        "type": "input_layernorm"
+    },
+    "model.layers.2.input_layernorm": {
+        "snr": Infinity,
+        "type": "input_layernorm"
+    },
+    "model.layers.3.input_layernorm": {
+        "snr": Infinity,
+        "type": "input_layernorm"
+    },
+    "model.layers.4.input_layernorm": {
+        "snr": Infinity,
+        "type": "input_layernorm"
+    },
+    "model.layers.5.input_layernorm": {
+        "snr": Infinity,
+        "type": "input_layernorm"
+    },
+    "model.layers.6.input_layernorm": {
+        "snr": Infinity,
+        "type": "input_layernorm"
+    },
+    "model.layers.7.input_layernorm": {
+        "snr": Infinity,
+        "type": "input_layernorm"
+    },
+    "model.layers.8.input_layernorm": {
+        "snr": Infinity,
+        "type": "input_layernorm"
+    },
+    "model.layers.9.input_layernorm": {
+        "snr": Infinity,
+        "type": "input_layernorm"
+    },
+    "model.layers.10.input_layernorm": {
+        "snr": Infinity,
+        "type": "input_layernorm"
+    },
+    "model.layers.11.input_layernorm": {
+        "snr": Infinity,
+        "type": "input_layernorm"
+    },
+    "model.layers.12.input_layernorm": {
+        "snr": Infinity,
+        "type": "input_layernorm"
+    },
+    "model.layers.13.input_layernorm": {
+        "snr": Infinity,
+        "type": "input_layernorm"
+    },
+    "model.layers.14.input_layernorm": {
+        "snr": Infinity,
+        "type": "input_layernorm"
+    },
+    "model.layers.15.input_layernorm": {
+        "snr": Infinity,
+        "type": "input_layernorm"
+    },
+    "lm_head": {
+        "snr": Infinity,
+        "type": "lm_head"
+    },
+    "model.layers.0.mlp.down_proj": {
+        "snr": 70.0594253540039,
+        "type": "mlp.down_proj"
+    },
+    "model.layers.1.mlp.down_proj": {
+        "snr": 11.135851860046387,
+        "type": "mlp.down_proj"
+    },
+    "model.layers.2.mlp.down_proj": {
+        "snr": 7.035482883453369,
+        "type": "mlp.down_proj"
+    },
+    "model.layers.3.mlp.down_proj": {
+        "snr": 6.422532081604004,
+        "type": "mlp.down_proj"
+    },
+    "model.layers.4.mlp.down_proj": {
+        "snr": 5.748020172119141,
+        "type": "mlp.down_proj"
+    },
+    "model.layers.5.mlp.down_proj": {
+        "snr": 3.885556697845459,
+        "type": "mlp.down_proj"
+    },
+    "model.layers.6.mlp.down_proj": {
+        "snr": 3.4336745738983154,
+        "type": "mlp.down_proj"
+    },
+    "model.layers.7.mlp.down_proj": {
+        "snr": 2.791595935821533,
+        "type": "mlp.down_proj"
+    },
+    "model.layers.8.mlp.down_proj": {
+        "snr": 5.36277961730957,
+        "type": "mlp.down_proj"
+    },
+    "model.layers.9.mlp.down_proj": {
+        "snr": 4.459208011627197,
+        "type": "mlp.down_proj"
+    },
+    "model.layers.10.mlp.down_proj": {
+        "snr": 6.272170066833496,
+        "type": "mlp.down_proj"
+    },
+    "model.layers.11.mlp.down_proj": {
+        "snr": 5.264761447906494,
+        "type": "mlp.down_proj"
+    },
+    "model.layers.12.mlp.down_proj": {
+        "snr": 4.324735641479492,
+        "type": "mlp.down_proj"
+    },
+    "model.layers.13.mlp.down_proj": {
+        "snr": 3.878648042678833,
+        "type": "mlp.down_proj"
+    },
+    "model.layers.14.mlp.down_proj": {
+        "snr": 2.9773054122924805,
+        "type": "mlp.down_proj"
+    },
+    "model.layers.15.mlp.down_proj": {
+        "snr": 4.471445560455322,
+        "type": "mlp.down_proj"
+    },
+    "model.layers.0.mlp.gate_proj": {
+        "snr": 25.227100372314453,
+        "type": "mlp.gate_proj"
+    },
+    "model.layers.1.mlp.gate_proj": {
+        "snr": 6.58299446105957,
+        "type": "mlp.gate_proj"
+    },
+    "model.layers.2.mlp.gate_proj": {
+        "snr": 3.4688243865966797,
+        "type": "mlp.gate_proj"
+    },
+    "model.layers.3.mlp.gate_proj": {
+        "snr": 1.555246114730835,
+        "type": "mlp.gate_proj"
+    },
+    "model.layers.4.mlp.gate_proj": {
+        "snr": 0.7770601511001587,
+        "type": "mlp.gate_proj"
+    },
+    "model.layers.5.mlp.gate_proj": {
+        "snr": 0.6239906549453735,
+        "type": "mlp.gate_proj"
+    },
+    "model.layers.6.mlp.gate_proj": {
+        "snr": 0.6440379023551941,
+        "type": "mlp.gate_proj"
+    },
+    "model.layers.7.mlp.gate_proj": {
+        "snr": 0.5120116472244263,
+        "type": "mlp.gate_proj"
+    },
+    "model.layers.8.mlp.gate_proj": {
+        "snr": 0.6544050574302673,
+        "type": "mlp.gate_proj"
+    },
+    "model.layers.9.mlp.gate_proj": {
+        "snr": 0.5381016731262207,
+        "type": "mlp.gate_proj"
+    },
+    "model.layers.10.mlp.gate_proj": {
+        "snr": 0.622873842716217,
+        "type": "mlp.gate_proj"
+    },
+    "model.layers.11.mlp.gate_proj": {
+        "snr": 0.9361700415611267,
+        "type": "mlp.gate_proj"
+    },
+    "model.layers.12.mlp.gate_proj": {
+        "snr": 1.475605845451355,
+        "type": "mlp.gate_proj"
+    },
+    "model.layers.13.mlp.gate_proj": {
+        "snr": 1.608325719833374,
+        "type": "mlp.gate_proj"
+    },
+    "model.layers.14.mlp.gate_proj": {
+        "snr": 1.0720024108886719,
+        "type": "mlp.gate_proj"
+    },
+    "model.layers.15.mlp.gate_proj": {
+        "snr": 0.7111338973045349,
+        "type": "mlp.gate_proj"
+    },
+    "model.layers.0.mlp.up_proj": {
+        "snr": 28.431896209716797,
+        "type": "mlp.up_proj"
+    },
+    "model.layers.1.mlp.up_proj": {
+        "snr": 15.546019554138184,
+        "type": "mlp.up_proj"
+    },
+    "model.layers.2.mlp.up_proj": {
+        "snr": 23.048023223876953,
+        "type": "mlp.up_proj"
+    },
+    "model.layers.3.mlp.up_proj": {
+        "snr": 25.790977478027344,
+        "type": "mlp.up_proj"
+    },
+    "model.layers.4.mlp.up_proj": {
+        "snr": 18.552549362182617,
+        "type": "mlp.up_proj"
+    },
+    "model.layers.5.mlp.up_proj": {
+        "snr": 8.85106372833252,
+        "type": "mlp.up_proj"
+    },
+    "model.layers.6.mlp.up_proj": {
+        "snr": 10.653799057006836,
+        "type": "mlp.up_proj"
+    },
+    "model.layers.7.mlp.up_proj": {
+        "snr": 7.365357875823975,
+        "type": "mlp.up_proj"
+    },
+    "model.layers.8.mlp.up_proj": {
+        "snr": 11.98373794555664,
+        "type": "mlp.up_proj"
+    },
+    "model.layers.9.mlp.up_proj": {
+        "snr": 8.04493236541748,
+        "type": "mlp.up_proj"
+    },
+    "model.layers.10.mlp.up_proj": {
+        "snr": 8.523039817810059,
+        "type": "mlp.up_proj"
+    },
+    "model.layers.11.mlp.up_proj": {
+        "snr": 5.381742477416992,
+        "type": "mlp.up_proj"
+    },
+    "model.layers.12.mlp.up_proj": {
+        "snr": 3.9845118522644043,
+        "type": "mlp.up_proj"
+    },
+    "model.layers.13.mlp.up_proj": {
+        "snr": 3.4893221855163574,
+        "type": "mlp.up_proj"
+    },
+    "model.layers.14.mlp.up_proj": {
+        "snr": 1.764201045036316,
+        "type": "mlp.up_proj"
+    },
+    "model.layers.15.mlp.up_proj": {
+        "snr": 0.9730708599090576,
+        "type": "mlp.up_proj"
+    },
+    "model.embed_tokens": {
+        "snr": Infinity,
+        "type": "model.embed_tokens"
+    },
+    "model.norm": {
+        "snr": Infinity,
+        "type": "model.norm"
+    },
+    "model.layers.0.post_attention_layernorm": {
+        "snr": Infinity,
+        "type": "post_attention_layernorm"
+    },
+    "model.layers.1.post_attention_layernorm": {
+        "snr": Infinity,
+        "type": "post_attention_layernorm"
+    },
+    "model.layers.2.post_attention_layernorm": {
+        "snr": Infinity,
+        "type": "post_attention_layernorm"
+    },
+    "model.layers.3.post_attention_layernorm": {
+        "snr": Infinity,
+        "type": "post_attention_layernorm"
+    },
+    "model.layers.4.post_attention_layernorm": {
+        "snr": Infinity,
+        "type": "post_attention_layernorm"
+    },
+    "model.layers.5.post_attention_layernorm": {
+        "snr": Infinity,
+        "type": "post_attention_layernorm"
+    },
+    "model.layers.6.post_attention_layernorm": {
+        "snr": Infinity,
+        "type": "post_attention_layernorm"
+    },
+    "model.layers.7.post_attention_layernorm": {
+        "snr": Infinity,
+        "type": "post_attention_layernorm"
+    },
+    "model.layers.8.post_attention_layernorm": {
+        "snr": Infinity,
+        "type": "post_attention_layernorm"
+    },
+    "model.layers.9.post_attention_layernorm": {
+        "snr": Infinity,
+        "type": "post_attention_layernorm"
+    },
+    "model.layers.10.post_attention_layernorm": {
+        "snr": Infinity,
+        "type": "post_attention_layernorm"
+    },
+    "model.layers.11.post_attention_layernorm": {
+        "snr": Infinity,
+        "type": "post_attention_layernorm"
+    },
+    "model.layers.12.post_attention_layernorm": {
+        "snr": Infinity,
+        "type": "post_attention_layernorm"
+    },
+    "model.layers.13.post_attention_layernorm": {
+        "snr": Infinity,
+        "type": "post_attention_layernorm"
+    },
+    "model.layers.14.post_attention_layernorm": {
+        "snr": Infinity,
+        "type": "post_attention_layernorm"
+    },
+    "model.layers.15.post_attention_layernorm": {
+        "snr": Infinity,
+        "type": "post_attention_layernorm"
+    },
+    "model.layers.0.self_attn.k_proj": {
+        "snr": 0.11727584153413773,
+        "type": "self_attn.k_proj"
+    },
+    "model.layers.1.self_attn.k_proj": {
+        "snr": 0.24786807596683502,
+        "type": "self_attn.k_proj"
+    },
+    "model.layers.2.self_attn.k_proj": {
+        "snr": 0.36378130316734314,
+        "type": "self_attn.k_proj"
+    },
+    "model.layers.3.self_attn.k_proj": {
+        "snr": 0.2983120381832123,
+        "type": "self_attn.k_proj"
+    },
+    "model.layers.4.self_attn.k_proj": {
+        "snr": 0.33789733052253723,
+        "type": "self_attn.k_proj"
+    },
+    "model.layers.5.self_attn.k_proj": {
+        "snr": 0.29155924916267395,
+        "type": "self_attn.k_proj"
+    },
+    "model.layers.6.self_attn.k_proj": {
+        "snr": 0.2537297010421753,
+        "type": "self_attn.k_proj"
+    },
+    "model.layers.7.self_attn.k_proj": {
+        "snr": 0.28204113245010376,
+        "type": "self_attn.k_proj"
+    },
+    "model.layers.8.self_attn.k_proj": {
+        "snr": 0.2776711583137512,
+        "type": "self_attn.k_proj"
+    },
+    "model.layers.9.self_attn.k_proj": {
+        "snr": 0.2927376627922058,
+        "type": "self_attn.k_proj"
+    },
+    "model.layers.10.self_attn.k_proj": {
+        "snr": 0.31486213207244873,
+        "type": "self_attn.k_proj"
+    },
+    "model.layers.11.self_attn.k_proj": {
+        "snr": 0.32363659143447876,
+        "type": "self_attn.k_proj"
+    },
+    "model.layers.12.self_attn.k_proj": {
+        "snr": 0.31382912397384644,
+        "type": "self_attn.k_proj"
+    },
+    "model.layers.13.self_attn.k_proj": {
+        "snr": 0.4635234773159027,
+        "type": "self_attn.k_proj"
+    },
+    "model.layers.14.self_attn.k_proj": {
+        "snr": 0.25379249453544617,
+        "type": "self_attn.k_proj"
+    },
+    "model.layers.15.self_attn.k_proj": {
+        "snr": 0.2628238797187805,
+        "type": "self_attn.k_proj"
+    },
+    "model.layers.0.self_attn.o_proj": {
+        "snr": 0.27602291107177734,
+        "type": "self_attn.o_proj"
+    },
+    "model.layers.1.self_attn.o_proj": {
+        "snr": 0.2149604707956314,
+        "type": "self_attn.o_proj"
+    },
+    "model.layers.2.self_attn.o_proj": {
+        "snr": 0.2540294826030731,
+        "type": "self_attn.o_proj"
+    },
+    "model.layers.3.self_attn.o_proj": {
+        "snr": 0.27978822588920593,
+        "type": "self_attn.o_proj"
+    },
+    "model.layers.4.self_attn.o_proj": {
+        "snr": 0.3121289908885956,
+        "type": "self_attn.o_proj"
+    },
+    "model.layers.5.self_attn.o_proj": {
+        "snr": 0.35037684440612793,
+        "type": "self_attn.o_proj"
+    },
+    "model.layers.6.self_attn.o_proj": {
+        "snr": 0.366205096244812,
+        "type": "self_attn.o_proj"
+    },
+    "model.layers.7.self_attn.o_proj": {
+        "snr": 0.3692712187767029,
+        "type": "self_attn.o_proj"
+    },
+    "model.layers.8.self_attn.o_proj": {
+        "snr": 0.3301038146018982,
+        "type": "self_attn.o_proj"
+    },
+    "model.layers.9.self_attn.o_proj": {
+        "snr": 0.3003396987915039,
+        "type": "self_attn.o_proj"
+    },
+    "model.layers.10.self_attn.o_proj": {
+        "snr": 0.30804169178009033,
+        "type": "self_attn.o_proj"
+    },
+    "model.layers.11.self_attn.o_proj": {
+        "snr": 0.28501132130622864,
+        "type": "self_attn.o_proj"
+    },
+    "model.layers.12.self_attn.o_proj": {
+        "snr": 0.2171541005373001,
+        "type": "self_attn.o_proj"
+    },
+    "model.layers.13.self_attn.o_proj": {
+        "snr": 0.19183959066867828,
+        "type": "self_attn.o_proj"
+    },
+    "model.layers.14.self_attn.o_proj": {
+        "snr": 0.19215913116931915,
+        "type": "self_attn.o_proj"
+    },
+    "model.layers.15.self_attn.o_proj": {
+        "snr": 0.25486502051353455,
+        "type": "self_attn.o_proj"
+    },
+    "model.layers.0.self_attn.q_proj": {
+        "snr": 0.03850084915757179,
+        "type": "self_attn.q_proj"
+    },
+    "model.layers.1.self_attn.q_proj": {
+        "snr": 0.0713055431842804,
+        "type": "self_attn.q_proj"
+    },
+    "model.layers.2.self_attn.q_proj": {
+        "snr": 0.07948919385671616,
+        "type": "self_attn.q_proj"
+    },
+    "model.layers.3.self_attn.q_proj": {
+        "snr": 0.08047746121883392,
+        "type": "self_attn.q_proj"
+    },
+    "model.layers.4.self_attn.q_proj": {
+        "snr": 0.0852593332529068,
+        "type": "self_attn.q_proj"
+    },
+    "model.layers.5.self_attn.q_proj": {
+        "snr": 0.09794823825359344,
+        "type": "self_attn.q_proj"
+    },
+    "model.layers.6.self_attn.q_proj": {
+        "snr": 0.09627152234315872,
+        "type": "self_attn.q_proj"
+    },
+    "model.layers.7.self_attn.q_proj": {
+        "snr": 0.11065381020307541,
+        "type": "self_attn.q_proj"
+    },
+    "model.layers.8.self_attn.q_proj": {
+        "snr": 0.12031875550746918,
+        "type": "self_attn.q_proj"
+    },
+    "model.layers.9.self_attn.q_proj": {
+        "snr": 0.09804573655128479,
+        "type": "self_attn.q_proj"
+    },
+    "model.layers.10.self_attn.q_proj": {
+        "snr": 0.10897502303123474,
+        "type": "self_attn.q_proj"
+    },
+    "model.layers.11.self_attn.q_proj": {
+        "snr": 0.09267337620258331,
+        "type": "self_attn.q_proj"
+    },
+    "model.layers.12.self_attn.q_proj": {
+        "snr": 0.08803492039442062,
+        "type": "self_attn.q_proj"
+    },
+    "model.layers.13.self_attn.q_proj": {
+        "snr": 0.0902542844414711,
+        "type": "self_attn.q_proj"
+    },
+    "model.layers.14.self_attn.q_proj": {
+        "snr": 0.10154066979885101,
+        "type": "self_attn.q_proj"
+    },
+    "model.layers.15.self_attn.q_proj": {
+        "snr": 0.09083802253007889,
+        "type": "self_attn.q_proj"
+    },
+    "model.layers.0.self_attn.v_proj": {
+        "snr": 2.842210054397583,
+        "type": "self_attn.v_proj"
+    },
+    "model.layers.1.self_attn.v_proj": {
+        "snr": 10.59461498260498,
+        "type": "self_attn.v_proj"
+    },
+    "model.layers.2.self_attn.v_proj": {
+        "snr": 8.993025779724121,
+        "type": "self_attn.v_proj"
+    },
+    "model.layers.3.self_attn.v_proj": {
+        "snr": 62.567787170410156,
+        "type": "self_attn.v_proj"
+    },
+    "model.layers.4.self_attn.v_proj": {
+        "snr": 23.80082893371582,
+        "type": "self_attn.v_proj"
+    },
+    "model.layers.5.self_attn.v_proj": {
+        "snr": 7.957369804382324,
+        "type": "self_attn.v_proj"
+    },
+    "model.layers.6.self_attn.v_proj": {
+        "snr": 12.01815414428711,
+        "type": "self_attn.v_proj"
+    },
+    "model.layers.7.self_attn.v_proj": {
+        "snr": 5.095500469207764,
+        "type": "self_attn.v_proj"
+    },
+    "model.layers.8.self_attn.v_proj": {
+        "snr": 11.719332695007324,
+        "type": "self_attn.v_proj"
+    },
+    "model.layers.9.self_attn.v_proj": {
+        "snr": 555.0869750976562,
+        "type": "self_attn.v_proj"
+    },
+    "model.layers.10.self_attn.v_proj": {
+        "snr": 22.95538330078125,
+        "type": "self_attn.v_proj"
+    },
+    "model.layers.11.self_attn.v_proj": {
+        "snr": 30.042158126831055,
+        "type": "self_attn.v_proj"
+    },
+    "model.layers.12.self_attn.v_proj": {
+        "snr": 9.577271461486816,
+        "type": "self_attn.v_proj"
+    },
+    "model.layers.13.self_attn.v_proj": {
+        "snr": 18.176361083984375,
+        "type": "self_attn.v_proj"
+    },
+    "model.layers.14.self_attn.v_proj": {
+        "snr": 1.5695856809616089,
+        "type": "self_attn.v_proj"
+    },
+    "model.layers.15.self_attn.v_proj": {
+        "snr": 2.7235565185546875,
+        "type": "self_attn.v_proj"
+    }
+}
--- a/src/axolotl/integrations/spectrum/model_snr_results/snr_results_meta-llama-Llama-3.2-1B.json
+++ b/src/axolotl/integrations/spectrum/model_snr_results/snr_results_meta-llama-Llama-3.2-1B.json
@@ -0,0 +1,590 @@
+{
+    "model.layers.0.input_layernorm": {
+        "snr": Infinity,
+        "type": "input_layernorm"
+    },
+    "model.layers.1.input_layernorm": {
+        "snr": Infinity,
+        "type": "input_layernorm"
+    },
+    "model.layers.2.input_layernorm": {
+        "snr": Infinity,
+        "type": "input_layernorm"
+    },
+    "model.layers.3.input_layernorm": {
+        "snr": Infinity,
+        "type": "input_layernorm"
+    },
+    "model.layers.4.input_layernorm": {
+        "snr": Infinity,
+        "type": "input_layernorm"
+    },
+    "model.layers.5.input_layernorm": {
+        "snr": Infinity,
+        "type": "input_layernorm"
+    },
+    "model.layers.6.input_layernorm": {
+        "snr": Infinity,
+        "type": "input_layernorm"
+    },
+    "model.layers.7.input_layernorm": {
+        "snr": Infinity,
+        "type": "input_layernorm"
+    },
+    "model.layers.8.input_layernorm": {
+        "snr": Infinity,
+        "type": "input_layernorm"
+    },
+    "model.layers.9.input_layernorm": {
+        "snr": Infinity,
+        "type": "input_layernorm"
+    },
+    "model.layers.10.input_layernorm": {
+        "snr": Infinity,
+        "type": "input_layernorm"
+    },
+    "model.layers.11.input_layernorm": {
+        "snr": Infinity,
+        "type": "input_layernorm"
+    },
+    "model.layers.12.input_layernorm": {
+        "snr": Infinity,
+        "type": "input_layernorm"
+    },
+    "model.layers.13.input_layernorm": {
+        "snr": Infinity,
+        "type": "input_layernorm"
+    },
+    "model.layers.14.input_layernorm": {
+        "snr": Infinity,
+        "type": "input_layernorm"
+    },
+    "model.layers.15.input_layernorm": {
+        "snr": Infinity,
+        "type": "input_layernorm"
+    },
+    "lm_head": {
+        "snr": Infinity,
+        "type": "lm_head"
+    },
+    "model.layers.0.mlp.down_proj": {
+        "snr": 57.09797286987305,
+        "type": "mlp.down_proj"
+    },
+    "model.layers.1.mlp.down_proj": {
+        "snr": 9.538983345031738,
+        "type": "mlp.down_proj"
+    },
+    "model.layers.2.mlp.down_proj": {
+        "snr": 6.227016925811768,
+        "type": "mlp.down_proj"
+    },
+    "model.layers.3.mlp.down_proj": {
+        "snr": 5.660686492919922,
+        "type": "mlp.down_proj"
+    },
+    "model.layers.4.mlp.down_proj": {
+        "snr": 5.178432464599609,
+        "type": "mlp.down_proj"
+    },
+    "model.layers.5.mlp.down_proj": {
+        "snr": 3.5638349056243896,
+        "type": "mlp.down_proj"
+    },
+    "model.layers.6.mlp.down_proj": {
+        "snr": 3.0918056964874268,
+        "type": "mlp.down_proj"
+    },
+    "model.layers.7.mlp.down_proj": {
+        "snr": 2.456392288208008,
+        "type": "mlp.down_proj"
+    },
+    "model.layers.8.mlp.down_proj": {
+        "snr": 4.525328636169434,
+        "type": "mlp.down_proj"
+    },
+    "model.layers.9.mlp.down_proj": {
+        "snr": 3.9409055709838867,
+        "type": "mlp.down_proj"
+    },
+    "model.layers.10.mlp.down_proj": {
+        "snr": 5.447249412536621,
+        "type": "mlp.down_proj"
+    },
+    "model.layers.11.mlp.down_proj": {
+        "snr": 4.807600975036621,
+        "type": "mlp.down_proj"
+    },
+    "model.layers.12.mlp.down_proj": {
+        "snr": 3.915374517440796,
+        "type": "mlp.down_proj"
+    },
+    "model.layers.13.mlp.down_proj": {
+        "snr": 3.4820363521575928,
+        "type": "mlp.down_proj"
+    },
+    "model.layers.14.mlp.down_proj": {
+        "snr": 2.6045074462890625,
+        "type": "mlp.down_proj"
+    },
+    "model.layers.15.mlp.down_proj": {
+        "snr": 3.7237701416015625,
+        "type": "mlp.down_proj"
+    },
+    "model.layers.0.mlp.gate_proj": {
+        "snr": 22.160131454467773,
+        "type": "mlp.gate_proj"
+    },
+    "model.layers.1.mlp.gate_proj": {
+        "snr": 6.072206020355225,
+        "type": "mlp.gate_proj"
+    },
+    "model.layers.2.mlp.gate_proj": {
+        "snr": 3.2467362880706787,
+        "type": "mlp.gate_proj"
+    },
+    "model.layers.3.mlp.gate_proj": {
+        "snr": 1.4111896753311157,
+        "type": "mlp.gate_proj"
+    },
+    "model.layers.4.mlp.gate_proj": {
+        "snr": 0.7405938506126404,
+        "type": "mlp.gate_proj"
+    },
+    "model.layers.5.mlp.gate_proj": {
+        "snr": 0.5916463136672974,
+        "type": "mlp.gate_proj"
+    },
+    "model.layers.6.mlp.gate_proj": {
+        "snr": 0.6149423718452454,
+        "type": "mlp.gate_proj"
+    },
+    "model.layers.7.mlp.gate_proj": {
+        "snr": 0.48369669914245605,
+        "type": "mlp.gate_proj"
+    },
+    "model.layers.8.mlp.gate_proj": {
+        "snr": 0.6047574877738953,
+        "type": "mlp.gate_proj"
+    },
+    "model.layers.9.mlp.gate_proj": {
+        "snr": 0.5092479586601257,
+        "type": "mlp.gate_proj"
+    },
+    "model.layers.10.mlp.gate_proj": {
+        "snr": 0.5999670624732971,
+        "type": "mlp.gate_proj"
+    },
+    "model.layers.11.mlp.gate_proj": {
+        "snr": 0.8980127573013306,
+        "type": "mlp.gate_proj"
+    },
+    "model.layers.12.mlp.gate_proj": {
+        "snr": 1.4252448081970215,
+        "type": "mlp.gate_proj"
+    },
+    "model.layers.13.mlp.gate_proj": {
+        "snr": 1.509937047958374,
+        "type": "mlp.gate_proj"
+    },
+    "model.layers.14.mlp.gate_proj": {
+        "snr": 1.0066585540771484,
+        "type": "mlp.gate_proj"
+    },
+    "model.layers.15.mlp.gate_proj": {
+        "snr": 0.6413647532463074,
+        "type": "mlp.gate_proj"
+    },
+    "model.layers.0.mlp.up_proj": {
+        "snr": 26.08852195739746,
+        "type": "mlp.up_proj"
+    },
+    "model.layers.1.mlp.up_proj": {
+        "snr": 13.382951736450195,
+        "type": "mlp.up_proj"
+    },
+    "model.layers.2.mlp.up_proj": {
+        "snr": 20.088768005371094,
+        "type": "mlp.up_proj"
+    },
+    "model.layers.3.mlp.up_proj": {
+        "snr": 23.0632381439209,
+        "type": "mlp.up_proj"
+    },
+    "model.layers.4.mlp.up_proj": {
+        "snr": 16.07433319091797,
+        "type": "mlp.up_proj"
+    },
+    "model.layers.5.mlp.up_proj": {
+        "snr": 8.00507640838623,
+        "type": "mlp.up_proj"
+    },
+    "model.layers.6.mlp.up_proj": {
+        "snr": 9.538354873657227,
+        "type": "mlp.up_proj"
+    },
+    "model.layers.7.mlp.up_proj": {
+        "snr": 6.286602973937988,
+        "type": "mlp.up_proj"
+    },
+    "model.layers.8.mlp.up_proj": {
+        "snr": 10.092820167541504,
+        "type": "mlp.up_proj"
+    },
+    "model.layers.9.mlp.up_proj": {
+        "snr": 7.193963527679443,
+        "type": "mlp.up_proj"
+    },
+    "model.layers.10.mlp.up_proj": {
+        "snr": 7.320116996765137,
+        "type": "mlp.up_proj"
+    },
+    "model.layers.11.mlp.up_proj": {
+        "snr": 4.8728532791137695,
+        "type": "mlp.up_proj"
+    },
+    "model.layers.12.mlp.up_proj": {
+        "snr": 3.596583366394043,
+        "type": "mlp.up_proj"
+    },
+    "model.layers.13.mlp.up_proj": {
+        "snr": 3.166161298751831,
+        "type": "mlp.up_proj"
+    },
+    "model.layers.14.mlp.up_proj": {
+        "snr": 1.5600818395614624,
+        "type": "mlp.up_proj"
+    },
+    "model.layers.15.mlp.up_proj": {
+        "snr": 0.8726214170455933,
+        "type": "mlp.up_proj"
+    },
+    "model.embed_tokens": {
+        "snr": Infinity,
+        "type": "model.embed_tokens"
+    },
+    "model.norm": {
+        "snr": Infinity,
+        "type": "model.norm"
+    },
+    "model.layers.0.post_attention_layernorm": {
+        "snr": Infinity,
+        "type": "post_attention_layernorm"
+    },
+    "model.layers.1.post_attention_layernorm": {
+        "snr": Infinity,
+        "type": "post_attention_layernorm"
+    },
+    "model.layers.2.post_attention_layernorm": {
+        "snr": Infinity,
+        "type": "post_attention_layernorm"
+    },
+    "model.layers.3.post_attention_layernorm": {
+        "snr": Infinity,
+        "type": "post_attention_layernorm"
+    },
+    "model.layers.4.post_attention_layernorm": {
+        "snr": Infinity,
+        "type": "post_attention_layernorm"
+    },
+    "model.layers.5.post_attention_layernorm": {
+        "snr": Infinity,
+        "type": "post_attention_layernorm"
+    },
+    "model.layers.6.post_attention_layernorm": {
+        "snr": Infinity,
+        "type": "post_attention_layernorm"
+    },
+    "model.layers.7.post_attention_layernorm": {
+        "snr": Infinity,
+        "type": "post_attention_layernorm"
+    },
+    "model.layers.8.post_attention_layernorm": {
+        "snr": Infinity,
+        "type": "post_attention_layernorm"
+    },
+    "model.layers.9.post_attention_layernorm": {
+        "snr": Infinity,
+        "type": "post_attention_layernorm"
+    },
+    "model.layers.10.post_attention_layernorm": {
+        "snr": Infinity,
+        "type": "post_attention_layernorm"
+    },
+    "model.layers.11.post_attention_layernorm": {
+        "snr": Infinity,
+        "type": "post_attention_layernorm"
+    },
+    "model.layers.12.post_attention_layernorm": {
+        "snr": Infinity,
+        "type": "post_attention_layernorm"
+    },
+    "model.layers.13.post_attention_layernorm": {
+        "snr": Infinity,
+        "type": "post_attention_layernorm"
+    },
+    "model.layers.14.post_attention_layernorm": {
+        "snr": Infinity,
+        "type": "post_attention_layernorm"
+    },
+    "model.layers.15.post_attention_layernorm": {
+        "snr": Infinity,
+        "type": "post_attention_layernorm"
+    },
+    "model.layers.0.self_attn.k_proj": {
+        "snr": 0.1154392883181572,
+        "type": "self_attn.k_proj"
+    },
+    "model.layers.1.self_attn.k_proj": {
+        "snr": 0.24299409985542297,
+        "type": "self_attn.k_proj"
+    },
+    "model.layers.2.self_attn.k_proj": {
+        "snr": 0.3624322712421417,
+        "type": "self_attn.k_proj"
+    },
+    "model.layers.3.self_attn.k_proj": {
+        "snr": 0.29509487748146057,
+        "type": "self_attn.k_proj"
+    },
+    "model.layers.4.self_attn.k_proj": {
+        "snr": 0.32953736186027527,
+        "type": "self_attn.k_proj"
+    },
+    "model.layers.5.self_attn.k_proj": {
+        "snr": 0.2908833622932434,
+        "type": "self_attn.k_proj"
+    },
+    "model.layers.6.self_attn.k_proj": {
+        "snr": 0.2488437294960022,
+        "type": "self_attn.k_proj"
+    },
+    "model.layers.7.self_attn.k_proj": {
+        "snr": 0.27847856283187866,
+        "type": "self_attn.k_proj"
+    },
+    "model.layers.8.self_attn.k_proj": {
+        "snr": 0.27143892645835876,
+        "type": "self_attn.k_proj"
+    },
+    "model.layers.9.self_attn.k_proj": {
+        "snr": 0.28804272413253784,
+        "type": "self_attn.k_proj"
+    },
+    "model.layers.10.self_attn.k_proj": {
+        "snr": 0.31197959184646606,
+        "type": "self_attn.k_proj"
+    },
+    "model.layers.11.self_attn.k_proj": {
+        "snr": 0.3203586935997009,
+        "type": "self_attn.k_proj"
+    },
+    "model.layers.12.self_attn.k_proj": {
+        "snr": 0.30905747413635254,
+        "type": "self_attn.k_proj"
+    },
+    "model.layers.13.self_attn.k_proj": {
+        "snr": 0.46828722953796387,
+        "type": "self_attn.k_proj"
+    },
+    "model.layers.14.self_attn.k_proj": {
+        "snr": 0.24205778539180756,
+        "type": "self_attn.k_proj"
+    },
+    "model.layers.15.self_attn.k_proj": {
+        "snr": 0.2559327781200409,
+        "type": "self_attn.k_proj"
+    },
+    "model.layers.0.self_attn.o_proj": {
+        "snr": 0.2638678550720215,
+        "type": "self_attn.o_proj"
+    },
+    "model.layers.1.self_attn.o_proj": {
+        "snr": 0.21109595894813538,
+        "type": "self_attn.o_proj"
+    },
+    "model.layers.2.self_attn.o_proj": {
+        "snr": 0.24751724302768707,
+        "type": "self_attn.o_proj"
+    },
+    "model.layers.3.self_attn.o_proj": {
+        "snr": 0.2728094160556793,
+        "type": "self_attn.o_proj"
+    },
+    "model.layers.4.self_attn.o_proj": {
+        "snr": 0.3001374304294586,
+        "type": "self_attn.o_proj"
+    },
+    "model.layers.5.self_attn.o_proj": {
+        "snr": 0.33903488516807556,
+        "type": "self_attn.o_proj"
+    },
+    "model.layers.6.self_attn.o_proj": {
+        "snr": 0.3530929982662201,
+        "type": "self_attn.o_proj"
+    },
+    "model.layers.7.self_attn.o_proj": {
+        "snr": 0.36753255128860474,
+        "type": "self_attn.o_proj"
+    },
+    "model.layers.8.self_attn.o_proj": {
+        "snr": 0.3373180329799652,
+        "type": "self_attn.o_proj"
+    },
+    "model.layers.9.self_attn.o_proj": {
+        "snr": 0.2970578670501709,
+        "type": "self_attn.o_proj"
+    },
+    "model.layers.10.self_attn.o_proj": {
+        "snr": 0.3076324760913849,
+        "type": "self_attn.o_proj"
+    },
+    "model.layers.11.self_attn.o_proj": {
+        "snr": 0.2766900658607483,
+        "type": "self_attn.o_proj"
+    },
+    "model.layers.12.self_attn.o_proj": {
+        "snr": 0.20973259210586548,
+        "type": "self_attn.o_proj"
+    },
+    "model.layers.13.self_attn.o_proj": {
+        "snr": 0.18185566365718842,
+        "type": "self_attn.o_proj"
+    },
+    "model.layers.14.self_attn.o_proj": {
+        "snr": 0.18329747021198273,
+        "type": "self_attn.o_proj"
+    },
+    "model.layers.15.self_attn.o_proj": {
+        "snr": 0.2437991499900818,
+        "type": "self_attn.o_proj"
+    },
+    "model.layers.0.self_attn.q_proj": {
+        "snr": 0.038040731102228165,
+        "type": "self_attn.q_proj"
+    },
+    "model.layers.1.self_attn.q_proj": {
+        "snr": 0.0707998052239418,
+        "type": "self_attn.q_proj"
+    },
+    "model.layers.2.self_attn.q_proj": {
+        "snr": 0.0787411704659462,
+        "type": "self_attn.q_proj"
+    },
+    "model.layers.3.self_attn.q_proj": {
+        "snr": 0.08089710026979446,
+        "type": "self_attn.q_proj"
+    },
+    "model.layers.4.self_attn.q_proj": {
+        "snr": 0.08591937273740768,
+        "type": "self_attn.q_proj"
+    },
+    "model.layers.5.self_attn.q_proj": {
+        "snr": 0.09852176159620285,
+        "type": "self_attn.q_proj"
+    },
+    "model.layers.6.self_attn.q_proj": {
+        "snr": 0.09690654277801514,
+        "type": "self_attn.q_proj"
+    },
+    "model.layers.7.self_attn.q_proj": {
+        "snr": 0.11181341856718063,
+        "type": "self_attn.q_proj"
+    },
+    "model.layers.8.self_attn.q_proj": {
+        "snr": 0.12042108923196793,
+        "type": "self_attn.q_proj"
+    },
+    "model.layers.9.self_attn.q_proj": {
+        "snr": 0.09799323976039886,
+        "type": "self_attn.q_proj"
+    },
+    "model.layers.10.self_attn.q_proj": {
+        "snr": 0.10901063680648804,
+        "type": "self_attn.q_proj"
+    },
+    "model.layers.11.self_attn.q_proj": {
+        "snr": 0.09307146072387695,
+        "type": "self_attn.q_proj"
+    },
+    "model.layers.12.self_attn.q_proj": {
+        "snr": 0.0880950540304184,
+        "type": "self_attn.q_proj"
+    },
+    "model.layers.13.self_attn.q_proj": {
+        "snr": 0.08886399120092392,
+        "type": "self_attn.q_proj"
+    },
+    "model.layers.14.self_attn.q_proj": {
+        "snr": 0.09955056011676788,
+        "type": "self_attn.q_proj"
+    },
+    "model.layers.15.self_attn.q_proj": {
+        "snr": 0.08929339051246643,
+        "type": "self_attn.q_proj"
+    },
+    "model.layers.0.self_attn.v_proj": {
+        "snr": 2.5501928329467773,
+        "type": "self_attn.v_proj"
+    },
+    "model.layers.1.self_attn.v_proj": {
+        "snr": 9.449499130249023,
+        "type": "self_attn.v_proj"
+    },
+    "model.layers.2.self_attn.v_proj": {
+        "snr": 7.9920830726623535,
+        "type": "self_attn.v_proj"
+    },
+    "model.layers.3.self_attn.v_proj": {
+        "snr": 50.69462585449219,
+        "type": "self_attn.v_proj"
+    },
+    "model.layers.4.self_attn.v_proj": {
+        "snr": 19.083511352539062,
+        "type": "self_attn.v_proj"
+    },
+    "model.layers.5.self_attn.v_proj": {
+        "snr": 7.21597146987915,
+        "type": "self_attn.v_proj"
+    },
+    "model.layers.6.self_attn.v_proj": {
+        "snr": 11.27744197845459,
+        "type": "self_attn.v_proj"
+    },
+    "model.layers.7.self_attn.v_proj": {
+        "snr": 4.579711437225342,
+        "type": "self_attn.v_proj"
+    },
+    "model.layers.8.self_attn.v_proj": {
+        "snr": 10.940719604492188,
+        "type": "self_attn.v_proj"
+    },
+    "model.layers.9.self_attn.v_proj": {
+        "snr": 553.4417724609375,
+        "type": "self_attn.v_proj"
+    },
+    "model.layers.10.self_attn.v_proj": {
+        "snr": 20.59434700012207,
+        "type": "self_attn.v_proj"
+    },
+    "model.layers.11.self_attn.v_proj": {
+        "snr": 26.636865615844727,
+        "type": "self_attn.v_proj"
+    },
+    "model.layers.12.self_attn.v_proj": {
+        "snr": 8.614749908447266,
+        "type": "self_attn.v_proj"
+    },
+    "model.layers.13.self_attn.v_proj": {
+        "snr": 17.722007751464844,
+        "type": "self_attn.v_proj"
+    },
+    "model.layers.14.self_attn.v_proj": {
+        "snr": 1.48500657081604,
+        "type": "self_attn.v_proj"
+    },
+    "model.layers.15.self_attn.v_proj": {
+        "snr": 2.5776851177215576,
+        "type": "self_attn.v_proj"
+    }
+}
--- a/src/axolotl/integrations/spectrum/model_snr_results/snr_results_meta-llama-Llama-3.2-3B-Instruct.json
+++ b/src/axolotl/integrations/spectrum/model_snr_results/snr_results_meta-llama-Llama-3.2-3B-Instruct.json
--- a/src/axolotl/integrations/spectrum/model_snr_results/snr_results_meta-llama-Llama-3.2-3B.json
+++ b/src/axolotl/integrations/spectrum/model_snr_results/snr_results_meta-llama-Llama-3.2-3B.json
--- a/src/axolotl/monkeypatch/flex_attn.py
+++ b/src/axolotl/monkeypatch/flex_attn.py
@@ -0,0 +1,146 @@
+"""
+Taken from https://github.com/pytorch/torchtune/blob/main/torchtune/modules/attention_utils.py
+"""
+from typing import Union
+
+import torch
+from torch.nn.attention.flex_attention import BlockMask
+from torch.nn.attention.flex_attention import (
+    create_block_mask as create_block_causal_mask_flex,
+)
+
+_MaskType = Union[torch.Tensor, BlockMask]
+
+
+def create_block_causal_mask(
+    seq_lens: list[torch.Tensor], max_seq_len: int
+) -> torch.Tensor:
+    """
+    Build a dense block causal attention mask for every pack in the batch.
+
+    Each sample inside a pack gets its own lower-triangular (causal) block and
+    the blocks are placed along the diagonal, so positions never attend across
+    sample boundaries. For example, if a single pack's seq_lens is [3, 2, 1],
+    the mask would be::
+
+        mask = [
+            [1, 0, 0, 0, 0, 0],
+            [1, 1, 0, 0, 0, 0],
+            [1, 1, 1, 0, 0, 0],
+            [0, 0, 0, 1, 0, 0],
+            [0, 0, 0, 1, 1, 0],
+            [0, 0, 0, 0, 0, 1],
+        ]
+
+    Args:
+        seq_lens (List[torch.Tensor]): Sequence lengths of samples in each pack in the batch,
+            one 1D integer tensor per pack. The number of samples may vary across packs,
+            but every pack must sum to the same total length, otherwise the per-pack
+            masks cannot be stacked.
+        max_seq_len (int): Currently unused. No residue block is appended to pad a pack
+            up to this length; the parameter is kept so existing callers keep working.
+
+    Returns:
+        Tensor: Boolean block causal mask of shape (batch_size, 1, total_len, total_len),
+            where total_len is the sum of a pack's sequence lengths.
+    """
+    batch_block_attn_masks = []
+    for pack_seq_lens in seq_lens:
+        block_attn_masks = [
+            torch.tril(
+                torch.ones(seq_len, seq_len, dtype=torch.bool, device=seq_len.device)
+            )
+            for seq_len in pack_seq_lens
+        ]
+        batch_block_attn_masks.append(torch.block_diag(*block_attn_masks))
+
+    # Add a singleton dimension at index 1 (presumably the attention-heads axis,
+    # so the mask broadcasts over heads -- confirm against callers).
+    return torch.stack(batch_block_attn_masks)[:, None, :, :]
+
+
+def _get_document_ids_from_seq_lens(
+    seq_lens: list[torch.Tensor],
+) -> torch.Tensor:
+    """
+    Expand per-pack sequence lengths into per-position document IDs.
+
+    Every position is labelled with the index of the sample it belongs to
+    within its pack, e.g. seq_lens = [2, 3, 1] yields [0, 0, 1, 1, 1, 2].
+
+    Args:
+        seq_lens (List[torch.Tensor]): Sequence lengths of samples in each pack in the batch,
+            one 1D integer tensor per pack. The number of samples may differ between packs.
+
+    Returns:
+        Tensor: Document IDs (dtype long) of shape (batch_size, total_len). All packs
+            must sum to the same total length for the final stack to succeed.
+    """
+    per_pack_ids = []
+    for pack_seq_lens in seq_lens:
+        segments = []
+        for doc_idx, doc_len in enumerate(pack_seq_lens):
+            # one run of `doc_len` copies of this document's index
+            segments.append(
+                torch.full((doc_len,), doc_idx, dtype=torch.long, device=doc_len.device)
+            )
+        per_pack_ids.append(torch.cat(segments))
+    return torch.stack(per_pack_ids)
+
+
+def packed_block_causal_mask(
+    seq_lens: list[torch.Tensor], totalseqlens: Union[torch.Tensor, list[int]]
+) -> _MaskType:
+    """
+    Create a block causal document mask for a batch of packed sequences.
+
+    The block causal logic is expressed as a ``mask_mod`` function and passed to
+    :func:`torch.nn.attention.flex_attention.create_block_mask`. The resulting
+    BlockMask is a compressed representation of the full block causal mask.
+
+    Args:
+        seq_lens (List[torch.Tensor]): Sequence lengths of samples in each pack in the batch,
+            shape (batch_size, n), where n is the max number of sequences in a pack and can vary
+            across packs.
+        totalseqlens (torch.Tensor | list[int]): One entry per pack; query positions at or
+            beyond this index are masked out entirely. Presumably the un-padded length of
+            each pack -- confirm against the caller.
+
+    Returns:
+        _MaskType: The BlockMask built on the "cuda" device.
+    """
+
+    document_ids = _get_document_ids_from_seq_lens(seq_lens)
+    batch_size, max_seq_len = document_ids.shape
+    document_ids = document_ids.to("cuda")
+    # as_tensor accepts both a tensor and a plain list of ints, and places the
+    # result on the same device the mask is built on.
+    totalseqlens = torch.as_tensor(totalseqlens, device="cuda")
+
+    # Instead of passing a tensor mask, flex attention requires a mask_mod function
+    # that determines which elements of QK^T should be included in the attention
+    # computation prior to the softmax. For sample packing, we need the logic for
+    # both the causal mask and the document mask. See PyTorch's official
+    # blog post for more details: https://pytorch.org/blog/flexattention/#mask-mods
+    def mask_mod(b, h, q_idx, kv_idx):
+        """
+        Defines the logic of a block causal mask by combining a standard causal mask,
+        a block diagonal document mask and a cut-off at ``totalseqlens[b]``.
+
+        See :func:`~torchtune.modules.attention_utils.create_block_causal_mask`
+        for an illustration.
+        """
+        causal_mask = q_idx >= kv_idx
+        document_mask = document_ids[b, q_idx] == document_ids[b, kv_idx]
+        finite_mask = q_idx < totalseqlens[b]
+        return causal_mask & document_mask & finite_mask
+
+    return create_block_causal_mask_flex(
+        mask_mod,
+        batch_size,
+        None,
+        max_seq_len,
+        max_seq_len,
+        device="cuda",
+        BLOCK_SIZE=512,
+    )
--- a/src/axolotl/monkeypatch/utils.py
+++ b/src/axolotl/monkeypatch/utils.py
@@ -95,6 +95,103 @@ def get_cu_seqlens(attn_mask):
    return torch.stack(results).to(dtype=torch.int32), torch.stack(max_seq_lens)


+def get_packed_mask_from_pos_ids(position_ids):
+    """
+    Build a per-position document-id mask for packed sequences from position ids.
+
+    A new sequence is assumed to start at the first position of a row and
+    wherever the position id resets to 0. Positions of the k-th sequence
+    (1-based) are set to k, and the trailing run of zeros in a row is treated as
+    padding and set to 0. Note that a single-token sequence at the very end of
+    a row is indistinguishable from padding and is therefore also set to 0.
+
+    Args:
+        position_ids: integer tensor of shape (seq_len,) or (batch_size, seq_len).
+
+    Returns:
+        Tensor: int32 tensor of shape (batch_size, seq_len) on the same device
+            as `position_ids`.
+    """
+    if len(position_ids.shape) == 1:
+        position_ids = position_ids.unsqueeze(0)
+
+    device = position_ids.device
+    results = []
+
+    for row in position_ids:
+        # Count the number of consecutive zeros from the right side
+        padding_length = (row == 0).int().flip(dims=[0]).cumprod(dim=0).sum().item()
+        content_length = len(row) - padding_length
+
+        # Padding positions keep the initial value 0
+        doc_mask = torch.zeros(len(row), dtype=torch.int32, device=device)
+        if content_length:
+            # A sequence starts at the first position and at every reset to 0
+            seq_starts = row[:content_length] == 0
+            seq_starts[0] = True
+            # The running count of starts is the 1-based id of the sequence
+            doc_mask[:content_length] = torch.cumsum(
+                seq_starts, dim=0, dtype=torch.int32
+            )
+
+        results.append(doc_mask)
+
+    return torch.stack(results)
+
+
+def get_seqlens_from_pos_ids(position_ids):
+    """
+    generate a sequence length set using pos ids for doc mask creation in flex attention
+
+    A new sequence is assumed to start at the first position of a row and
+    wherever the position id resets to 0. The trailing run of zeros in a row is
+    treated as padding and, when present, reported as one extra final "sequence"
+    so that the lengths of every row sum to the row length.
+
+    Args:
+        position_ids: integer tensor of shape (seq_len,) or (batch_size, seq_len).
+
+    Returns:
+        tuple: `(seq_lens, totalseqlens)` where `seq_lens` is a list with one 1D
+            tensor of sequence lengths per row, and `totalseqlens` is an int32
+            tensor of shape (batch_size,) holding each row's length without padding.
+    """
+    if len(position_ids.shape) == 1:
+        position_ids = position_ids.unsqueeze(0)
+
+    device = position_ids.device
+    results = []
+    totalseqlens = []
+
+    for row in position_ids:
+        # Count the number of consecutive zeros from the right side
+        padding_length = (row == 0).int().flip(dims=[0]).cumprod(dim=0).sum().item()
+
+        # Adjust the row to exclude padding
+        content_length = len(row) - padding_length
+        adjusted_row = row[:content_length]
+
+        # Find where the position resets to 0 (indicating a new sequence)
+        seq_starts = torch.cat(
+            [
+                torch.tensor([True], dtype=torch.bool, device=device),
+                adjusted_row[1:] == 0,
+            ]
+        )
+        # Get the indices where the sequence starts, plus the end of the content
+        start_indices = torch.cat(
+            [
+                torch.nonzero(seq_starts).unbind(dim=1)[0],
+                torch.tensor([content_length], dtype=torch.int32, device=device),
+            ]
+        )
+        # Calculate the sequence lengths
+        seq_lengths = start_indices[1:] - start_indices[:-1]
+        # Append the padding length to the sequence lengths; the lengths above
+        # sum to content_length, so the remainder is exactly padding_length
+        if padding_length:
+            seq_lengths = torch.cat(
+                [
+                    seq_lengths,
+                    torch.tensor([padding_length], dtype=torch.int32, device=device),
+                ]
+            )
+
+        results.append(seq_lengths)
+        totalseqlens.append(content_length)
+
+    return results, torch.tensor(totalseqlens, dtype=torch.int32, device=device)
+
+
 def get_cu_seqlens_from_pos_ids(position_ids):
    """generate a cumulative sequence length mask for flash attention using pos ids"""
    if len(position_ids.shape) == 1:
@@ -176,7 +273,10 @@ def mask_2d_to_4d(
    when they attend to each other within that sequence.
    This expansion transforms the mask to lower triangular form to prevent future peeking.
    """
-    bsz, src_len = mask.size()
+
+    if len(mask.size()) == 4:
+        return mask
+    bsz, src_len = int(mask.size()[0]), int(mask.size()[1])
    tgt_len = tgt_len if tgt_len is not None else src_len

    mask = mask.unsqueeze(1).unsqueeze(2)
--- a/src/axolotl/prompt_strategies/bradley_terry/chat_template.py
+++ b/src/axolotl/prompt_strategies/bradley_terry/chat_template.py
@@ -47,7 +47,7 @@ class BTChatTemplateStrategy(ChatTemplateStrategy):

        if len(chosen_tokenized["input_ids"]) > max_length:
            LOG.warning(
-                f"Chosen sequence exceeds max sequence length: {len(chosen_tokenized['input_ids'])}",
+                f"To-be-trimmed chosen sequence exceeds max sequence length: {len(chosen_tokenized['input_ids'])}",
            )

            chosen_tokenized["input_ids"] = chosen_tokenized["input_ids"][:max_length]
@@ -70,7 +70,7 @@ class BTChatTemplateStrategy(ChatTemplateStrategy):

        if len(rejected_tokenized["input_ids"]) > max_length:
            LOG.warning(
-                f"Rejected sequence exceeds max sequence length: {len(rejected_tokenized['input_ids'])}",
+                f"To-be-trimmed rejected sequence exceeds max sequence length: {len(rejected_tokenized['input_ids'])}",
            )

            rejected_tokenized["input_ids"] = rejected_tokenized["input_ids"][
--- a/src/axolotl/utils/collators/init.py
+++ b/src/axolotl/utils/collators/init.py
@@ -4,6 +4,7 @@ shared axolotl collators for multipack, mamba, multimodal
 from .batching import (  # noqa: F401
    BatchSamplerDataCollatorForSeq2Seq,
    DataCollatorForSeq2Seq,
+    FlexBatchSamplerDataCollatorForSeq2Seq,
    PretrainingBatchSamplerDataCollatorForSeq2Seq,
    V2BatchSamplerDataCollatorForSeq2Seq,
 )
--- a/src/axolotl/utils/collators/batching.py
+++ b/src/axolotl/utils/collators/batching.py
@@ -3,12 +3,21 @@ DataCollator for axolotl to pad labels and position_ids for packed sequences
 """

 from dataclasses import dataclass
-from typing import Any, Optional, Union
+from typing import Any, List, Optional, Union

 import numpy as np
+import torch
 from transformers import PreTrainedTokenizerBase
 from transformers.utils import PaddingStrategy

+from axolotl.monkeypatch.flex_attn import (
+    create_block_causal_mask,
+    packed_block_causal_mask,
+)
+from axolotl.monkeypatch.utils import (
+    get_packed_mask_from_pos_ids,
+)
+

@dataclass
 class DataCollatorForSeq2Seq:
@@ -151,6 +160,42 @@ class BatchSamplerDataCollatorForSeq2Seq(DataCollatorForSeq2Seq):
        return super().__call__(out_features, return_tensors=return_tensors)


+@dataclass
+class FlexBatchSamplerDataCollatorForSeq2Seq(DataCollatorForSeq2Seq):
+    """
+    Multipack collator for Flex Attention, for use with the BatchSampler.
+
+    The samples of each pack are concatenated feature by feature, the parent
+    collator is applied, and the attention mask is then rebuilt from the
+    collated position ids.
+    """
+
+    def __call__(self, features, return_tensors=None):
+        # a single pack is wrapped so it is handled like a batch of one pack
+        if not isinstance(features[0], list):
+            features = [features]
+
+        # "length" is dropped; "attention_mask" is regenerated after collation
+        skipped_keys = ("length", "attention_mask")
+
+        out_features = []
+        for pack in features:
+            merged = {}
+            for key in pack[0].keys():
+                if key in skipped_keys:
+                    continue
+                merged[key] = np.concatenate(
+                    [np.array(item[key]) for item in pack if key in item]
+                )
+            out_features.append(merged)
+
+        out = super().__call__(out_features, return_tensors=return_tensors)
+        out["attention_mask"] = get_packed_mask_from_pos_ids(out["position_ids"])
+        return out
+
+
@dataclass
 class V2BatchSamplerDataCollatorForSeq2Seq(DataCollatorForSeq2Seq):
    """
--- a/src/axolotl/utils/config/models/input/v0_4_1/init.py
+++ b/src/axolotl/utils/config/models/input/v0_4_1/init.py
@@ -115,6 +115,9 @@ class RemappedParameters(BaseModel):
    overrides_of_model_config: Optional[Dict[str, Any]] = Field(
        default=None, alias="model_config"
    )
+    overrides_of_model_kwargs: Optional[Dict[str, Any]] = Field(
+        default=None, alias="model_kwargs"
+    )
    type_of_model: Optional[str] = Field(default=None, alias="model_type")
    revision_of_model: Optional[str] = Field(default=None, alias="model_revision")

@@ -426,8 +429,6 @@ class ModelInputConfig(BaseModel):
    )
    trust_remote_code: Optional[bool] = None

-    model_kwargs: Optional[Dict[str, Any]] = None
-
    @field_validator("trust_remote_code")
    @classmethod
    def hint_trust_remote_code(cls, trust_remote_code):
@@ -783,6 +784,7 @@ class AxolotlInputConfig(
    xformers_attention: Optional[bool] = None
    sdp_attention: Optional[bool] = None
    s2_attention: Optional[bool] = None
+    flex_attention: Optional[bool] = None
    flash_attention: Optional[bool] = None
    flash_attn_cross_entropy: Optional[bool] = None
    flash_attn_rms_norm: Optional[bool] = None
@@ -1678,6 +1680,26 @@ class AxolotlConfigWCapabilities(AxolotlInputConfig):
                )
        return data

+    @model_validator(mode="before")
+    @classmethod
+    def check_flex_torch_version(cls, data):
+        """
+        Reject `flex_attention: true` when torch is older than 2.5.1.
+
+        The torch version is read from `env_capabilities["torch_version"]`
+        when available; otherwise it is taken from the installed torch
+        package with any `+...` suffix stripped.
+
+        Raises:
+            ValueError: if flex attention is enabled and torch < 2.5.1.
+        """
+        if data.get("flex_attention") is True:
+            # `or {}` also covers an explicit null `env_capabilities`, which
+            # a `.get(..., {})` default would pass through as None.
+            env_capabilities = data.get("env_capabilities") or {}
+            torch_version = env_capabilities.get("torch_version")
+
+            if torch_version is None:
+                import torch
+
+                torch_version = str(torch.__version__).split("+", maxsplit=1)[0]
+
+            if version.parse(torch_version) < version.parse("2.5.1"):
+                raise ValueError(
+                    "Flex attention is not supported on torch version < 2.5.1"
+                )
+        return data
+
    @model_validator(mode="before")
    @classmethod
    def check_torch_compile_auto(cls, data):
--- a/src/axolotl/utils/data/sft.py
+++ b/src/axolotl/utils/data/sft.py
@@ -46,6 +46,7 @@ from axolotl.utils.data.pretraining import wrap_pretraining_dataset
 from axolotl.utils.data.shared import load_dataset_w_config
 from axolotl.utils.data.utils import (
    deduplicate_and_log_datasets,
+    drop_long_seq_in_dataset,
    md5,
    retry_on_request_exceptions,
 )
@@ -56,7 +57,7 @@ from axolotl.utils.trainer import (
    process_datasets_for_packing,
 )

-LOG = logging.getLogger("axolotl")
+LOG = logging.getLogger(__name__)


@retry_on_request_exceptions(max_retries=3, delay=5)
@@ -339,8 +340,11 @@ def load_tokenized_prepared_datasets(
            else:
                LOG.debug("NOT shuffling merged datasets")

-        if cfg.sample_packing and not cfg.skip_prepare_dataset:
-            dataset, _ = process_datasets_for_packing(cfg, dataset, None)
+        if not cfg.skip_prepare_dataset:
+            dataset = drop_long_seq_in_dataset(dataset, cfg)
+
+            if cfg.sample_packing:
+                dataset, _ = process_datasets_for_packing(cfg, dataset, None)

        if cfg.local_rank == 0 and not cfg.skip_prepare_dataset:
            LOG.info(f"Saving merged prepared dataset to disk... {prepared_ds_path}")
--- a/src/axolotl/utils/data/utils.py
+++ b/src/axolotl/utils/data/utils.py
@@ -1,4 +1,5 @@
 """data handling helpers"""
+
 import functools
 import hashlib
 import logging
@@ -6,10 +7,15 @@ import time
 from enum import Enum

 import huggingface_hub
+import numpy as np
 import requests
-from datasets import Dataset
+from datasets import Dataset, IterableDataset

-LOG = logging.getLogger("axolotl")
+from axolotl.utils.dict import DictDefault
+from axolotl.utils.samplers.utils import get_dataset_lengths
+from axolotl.utils.trainer import drop_long_seq
+
+LOG = logging.getLogger(__name__)


 class RetryStrategy(Enum):
@@ -150,3 +156,53 @@ def deduplicate_and_log_datasets(
        )

    return train_dataset, eval_dataset, dataset
+
+
+def drop_long_seq_in_dataset(dataset: Dataset, cfg: DictDefault):
+    """
+    Filter `dataset` with `drop_long_seq`, bound to `cfg.sequence_len` and
+    `cfg.min_sample_len`.
+
+    Args:
+        dataset: dataset to filter; iterable datasets are handled without
+            multiprocessing/cache kwargs and without a dropped-sample count.
+        cfg: config providing `sequence_len`, `min_sample_len`,
+            `dataset_processes` and `is_preprocess`.
+
+    Returns:
+        The filtered dataset, or the input unchanged when its known columns
+        do not include `input_ids`.
+    """
+    column_names = dataset.column_names
+    # NOTE(review): column_names can be None for an IterableDataset whose
+    # features are unknown; only skip when the columns are actually known.
+    if column_names is not None and "input_ids" not in column_names:
+        LOG.warning(
+            "Dataset does not contain 'input_ids' column. Skip drop long seq. This is expected for RewardModeling."
+        )
+        return dataset
+
+    drop_long = functools.partial(
+        drop_long_seq,
+        sequence_len=cfg.sequence_len,
+        min_sequence_len=cfg.min_sample_len,
+    )
+
+    try:
+        # Lengths are only needed for debug logging, so compute them once.
+        lengths = get_dataset_lengths(dataset)
+        LOG.debug(f"min_input_len: {np.min(lengths)}")
+        LOG.debug(f"max_input_len: {np.max(lengths)}")
+    except AttributeError:
+        # Best-effort statistics: skipped when lengths cannot be computed.
+        pass
+
+    try:
+        prior_len = len(dataset)
+    except TypeError:
+        # handle iterable datasets case
+        prior_len = None
+
+    # num_proc / load_from_cache_file / desc are only passed for
+    # non-iterable datasets.
+    filter_map_kwargs = {}
+    if not isinstance(dataset, IterableDataset):
+        filter_map_kwargs["num_proc"] = cfg.dataset_processes
+        filter_map_kwargs["load_from_cache_file"] = not cfg.is_preprocess
+
+    drop_long_kwargs = {}
+    if filter_map_kwargs:
+        drop_long_kwargs["desc"] = "Dropping Long Sequences"
+
+    dataset = dataset.filter(
+        drop_long,
+        batched=True,
+        **filter_map_kwargs,
+        **drop_long_kwargs,
+    )
+    if prior_len:
+        dropped = prior_len - len(dataset)
+        if dropped:
+            LOG.warning(f"Dropped {dropped} long samples from dataset")
+
+    return dataset
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -357,8 +357,8 @@ class ModelLoader:

        # init model kwargs
        self.model_kwargs: Dict[str, Any] = {}
-        if cfg.model_kwargs:
-            for key, val in cfg.model_kwargs.items():
+        if cfg.overrides_of_model_kwargs:
+            for key, val in cfg.overrides_of_model_kwargs.items():
                self.model_kwargs[key] = val

        # init model
@@ -403,7 +403,7 @@ class ModelLoader:

        if (
            self.cfg.model_config_type in SUPPORTED_MULTIPACK_MODEL_TYPES
-            and self.cfg.flash_attention
+            and (self.cfg.flash_attention or self.cfg.flex_attention)
            and self.cfg.sample_packing
        ):
            if "auto_map" in self.model_config:
@@ -708,7 +708,13 @@ class ModelLoader:
        """
        sample packing uses custom FA2 patch
        """
-        if self.cfg.flash_attention:
+
+        if self.cfg.flex_attention:
+            self.model_kwargs["attn_implementation"] = "flex_attention"
+            self.model_config._attn_implementation = (  # pylint: disable=protected-access
+                "flex_attention"
+            )
+        elif self.cfg.flash_attention:
            if not self.cfg.sample_packing and self.cfg.s2_attention:
                pass
            self.model_kwargs["attn_implementation"] = "flash_attention_2"
@@ -1100,7 +1106,7 @@ class ModelLoader:
        should_convert = (
            # LlamaRMSNorm layers are in fp32 after kbit_training or full finetune, so we need to
            # convert them back to fp16/bf16 for flash-attn compatibility.
-            ((needs_fa2_dtype or self.cfg.flash_attention) and not qlora_fsdp)
+            ((needs_fa2_dtype or self.cfg.flash_attention or self.cfg.flex_attention) and not qlora_fsdp)
            or self.cfg.cut_cross_entropy  # Cut cross entropy requires embedding layers to be in fp16/bf16 for backward pass
        )

--- a/src/axolotl/utils/samplers/utils.py
+++ b/src/axolotl/utils/samplers/utils.py
@@ -13,5 +13,4 @@ def get_dataset_lengths(dataset):
    else:
        input_ids = dataset.data.column("input_ids")
        lengths = np.vectorize(len)(np.array(input_ids, dtype=object))
-        return lengths
    return lengths
--- a/src/axolotl/utils/trainer.py
+++ b/src/axolotl/utils/trainer.py
@@ -1,4 +1,5 @@
 """Module containing the Trainer class and related functions"""
+
 import json
 import math
 import os
@@ -210,6 +211,8 @@ def drop_long_seq(sample, sequence_len=2048, min_sequence_len=2):

    Works for both single-example (list[int]) or batched (list[list[int]]).
    """
+    min_sequence_len = min_sequence_len or 2
+
    input_ids = sample["input_ids"]

    # Edge case: if input_ids is empty
@@ -232,20 +235,6 @@ def drop_long_seq(sample, sequence_len=2048, min_sequence_len=2):


 def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
-    drop_long = partial(
-        drop_long_seq,
-        sequence_len=cfg.sequence_len,
-        min_sequence_len=cfg.min_sample_len or 2,
-    )
-
-    try:
-        min_input_len = np.min(get_dataset_lengths(train_dataset))
-        LOG.debug(f"min_input_len: {min_input_len}", main_process_only=True)
-        max_input_len = np.max(get_dataset_lengths(train_dataset))
-        LOG.debug(f"max_input_len: {max_input_len}", main_process_only=True)
-    except AttributeError:
-        pass
-
    if cfg.model_config_type == "mamba":
        LOG.info("dropping attention_mask column")
        train_dataset = train_dataset.remove_columns("attention_mask")
@@ -259,46 +248,6 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
        if eval_dataset and "token_type_ids" in eval_dataset.column_names:
            eval_dataset = eval_dataset.remove_columns("token_type_ids")

-    filter_map_kwargs = {}
-    if not isinstance(train_dataset, IterableDataset):
-        filter_map_kwargs["num_proc"] = cfg.dataset_processes
-        filter_map_kwargs["load_from_cache_file"] = not cfg.is_preprocess
-
-    try:
-        prior_len = len(train_dataset)
-    except TypeError:
-        # handle iterable datasets case
-        prior_len = None
-    drop_long_kwargs = {}
-    if filter_map_kwargs:
-        drop_long_kwargs["desc"] = "Dropping Long Sequences"
-    train_dataset = train_dataset.filter(
-        drop_long,
-        batched=True,
-        **filter_map_kwargs,
-        **drop_long_kwargs,
-    )
-    if prior_len:
-        dropped = prior_len - len(train_dataset)
-        if dropped:
-            LOG.warning(f"Dropped {dropped} long samples from train dataset")
-
-    if eval_dataset:
-        try:
-            prior_len = len(eval_dataset)
-        except TypeError:
-            # handle iterable datasets case
-            prior_len = None
-        eval_dataset = eval_dataset.filter(
-            drop_long,
-            **filter_map_kwargs,
-            **drop_long_kwargs,
-        )
-        if prior_len:
-            dropped = prior_len - len(eval_dataset)
-            if dropped:
-                LOG.warning(f"Dropped {dropped} long samples from eval dataset")
-
    def drop_no_trainable_tokens(sample):
        """
        Drop samples if all labels are -100 (i.e., zero trainable tokens).
@@ -325,6 +274,11 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
    except TypeError:
        # handle iterable datasets case
        prior_len = None
+    filter_map_kwargs = {}
+    if not isinstance(train_dataset, IterableDataset):
+        filter_map_kwargs["num_proc"] = cfg.dataset_processes
+        filter_map_kwargs["load_from_cache_file"] = not cfg.is_preprocess
+
    drop_long_kwargs = {}
    if filter_map_kwargs:
        drop_long_kwargs["desc"] = "Drop Samples with Zero Trainable Tokens"
--- a/tests/e2e/test_reward_model_smollm2.py
+++ b/tests/e2e/test_reward_model_smollm2.py
@@ -33,7 +33,7 @@ class TestRewardModelLoraSmolLM2(unittest.TestCase):
                "num_labels": 1,
                "chat_template": "alpaca",
                "reward_model": True,
-                "sequence_len": 1024,
+                "sequence_len": 2048,
                "pad_to_sequence_len": True,
                "adapter": "lora",
                "lora_r": 8,
Author	SHA1	Message	Date
bursteratom	82d04ea060	test v2batch w/ flex attn	2025-02-13 00:11:45 -05:00
Sung Ching Liu	0ef1f011fe	Merge branch 'main' into flx_attn_support	2025-02-11 23:31:56 -05:00
Sung Ching Liu	44f64ab627	Update faq.qmd (#2319 ) * Update faq.qmd Added Q&A for being stuck on saving preprocessed datasets * Update faq.qmd added details on preprocessing on cpu * Update faq.qmd * Update faq.qmd	2025-02-11 13:18:31 -05:00
NanoCode012	826f1b1494	feat(doc): Add multi-node torchrun info (#2304 )	2025-02-08 06:02:02 -05:00
NanoCode012	526e5ee8b8	fix(config): missing config not being documented and fix model_ override (#2317 ) * fix(config): missing config not being documented and fix model_ space override * fix: delete redundant field	2025-02-08 06:01:48 -05:00
NanoCode012	fd8cb32547	chore: remove redundant py310 from tests (#2316 )	2025-02-07 21:34:16 -05:00
NanoCode012	e48e2df4dd	feat: update FA to 2.7.4.post1 which includes torch2.6 binary (#2315 )	2025-02-07 21:34:01 -05:00
Wing Lian	b7616022ab	bump transformers to 4.48.3 (#2318 )	2025-02-07 21:33:44 -05:00
Wing Lian	1faf1a5c5a	batch add of spectrum snr results (#2320 )	2025-02-07 21:33:14 -05:00
Sunny Liu	c0a1d205c7	packed doc mask starts at 1, 0 means masked out	2025-02-07 14:44:52 -05:00
NanoCode012	5bbad5ef93	feat: add torch2.6 to ci (#2311 )	2025-02-07 07:28:54 -05:00
Wing Lian	a971eb4ce6	Torch 2.6 support for base docker image (#2312 )	2025-02-05 09:24:02 -05:00
Sunny Liu	d0e739da24	attempt at getting around bf16 error	2025-02-04 21:57:21 -05:00
Sunny Liu	3f6be519d5	stack	2025-02-04 21:25:13 -05:00
Sunny Liu	adcbc7459b	misc	2025-02-04 21:17:50 -05:00
Sunny Liu	470ba65c44	make doc mask instead of the whole block mask in collator	2025-02-04 20:27:39 -05:00
NanoCode012	a620d481e2	fix: drop long seq even if not sample packing (#2211 ) * fix: drop long seq even if not sample packing * fix: logging import * fix: cfg passed being none * fix: try to fix logging * fix: refactor call to not use accelerate log * fix: try to fix circular import issue * fix: don't drop when skip prepare * chore: remove duplicate line * fix: update warning to mention that sequences will be trimmed * fix: do not drop seq if input_ids don't exist * fix: increase RM unittest sequence length to reduce trim warnings * fix: solve conflicts * fix: default min_seq_len in case of None	2025-02-04 09:43:35 -05:00
Sunny Liu	8e1adc154d	stuff	2025-02-02 20:36:14 -05:00
Sunny Liu	e5b36900e4	misc	2025-02-02 20:32:03 -05:00
Sunny Liu	9f6c89b12b	undo my stupidity	2025-02-02 20:25:53 -05:00
Sunny Liu	b0871c8d3b	attempt - mask padding	2025-02-02 20:18:49 -05:00
bursteratom	d3ea379a23	figure out slight diff from flash result	2025-02-02 01:45:54 -05:00
bursteratom	0ebab63309	test	2025-02-02 01:27:15 -05:00
bursteratom	e98581f6f5	BLOCK SIZE	2025-02-02 01:22:23 -05:00
bursteratom	b832b11c8f	stuff	2025-02-02 00:51:43 -05:00
bursteratom	b692d394b1	more test	2025-02-02 00:48:57 -05:00
bursteratom	2319e5276d	more test	2025-02-02 00:48:15 -05:00
bursteratom	9a43a0925d	more test	2025-02-02 00:45:30 -05:00
bursteratom	10de67e8ea	more test	2025-02-02 00:43:41 -05:00
bursteratom	fa7355404c	test	2025-02-02 00:38:35 -05:00
bursteratom	907424a2e8	stuff	2025-02-02 00:29:09 -05:00
Sunny Liu	3f4fd3c1eb	remove padding self attention	2025-02-01 22:47:10 -05:00
Sunny Liu	48c3c47071	vanills mask	2025-02-01 14:23:37 -05:00
Sunny Liu	3ed9c117fb	try vanilla mask	2025-02-01 14:09:13 -05:00
Sunny Liu	84960003ed	reset llama_patch_multipack.py	2025-01-30 14:40:18 -05:00
Sunny Liu	93a268e43d	--no-verify fixes silly mistake	2025-01-30 14:08:26 -05:00
Sunny Liu	065f6d477e	flex batching WIP	2025-01-30 14:04:59 -05:00
Sunny Liu	96ad741cd5	flex batching WIP	2025-01-30 12:35:25 -05:00
bursteratom	ba88bc7840	wip flex block mask creation	2025-01-29 00:25:25 -05:00
Sung Ching Liu	b31796a681	Merge branch 'main' into flx_attn_support	2025-01-28 14:20:43 -05:00
Sunny Liu	5ca57cb55a	undo bool conversion	2025-01-23 17:56:13 -05:00
Sunny Liu	0149de7fb0	mask to bool	2025-01-23 15:30:08 -05:00
Sunny Liu	8c34c65181	dummy	2025-01-23 14:56:26 -05:00
Sunny Liu	555aa5772a	skip mask conversion if already 4d	2025-01-23 14:01:53 -05:00
Sunny Liu	e8b2789086	revert mask expand	2025-01-23 11:20:38 -05:00
Sunny Liu	85752cdfc9	mask expansion	2025-01-22 21:33:38 -05:00
Sunny Liu	f2f23c8041	mask expansion	2025-01-22 21:31:42 -05:00
Sunny Liu	8b3eec7f6e	mask expansion	2025-01-22 21:29:52 -05:00
Sunny Liu	bb9bea3110	mask expansion	2025-01-22 21:27:25 -05:00
Sunny Liu	0dd18a3681	llama sdpa patching WIP - static class function import	2025-01-22 21:10:05 -05:00
Sunny Liu	152e988d3c	llama sdpa patching WIP - static class function import	2025-01-22 21:02:26 -05:00
Sunny Liu	27532825a9	llama sdpa patching WIP - static class function import	2025-01-22 21:00:34 -05:00
Sunny Liu	06f83a54a5	llama sdpa patching WIP - static class function import	2025-01-22 20:45:44 -05:00
Sunny Liu	d7b133dc1f	llama sdpa patching WIP - static class function import	2025-01-22 20:33:13 -05:00
Sunny Liu	f3bec17917	llama sdpa patching WIP - static class function import	2025-01-22 20:25:26 -05:00
Sunny Liu	b7deb5241c	llama sdpa patching WIP	2025-01-22 20:16:27 -05:00
Sunny Liu	cee310dcfa	llama sdpa patching WIP	2025-01-22 20:15:23 -05:00
Sunny Liu	d1be6e228d	llama sdpa patching WIP	2025-01-22 20:14:20 -05:00
Sunny Liu	5f9f77f384	llama patch	2025-01-22 11:29:28 -05:00
bursteratom	b2a34380b3	sample packing doc mask creation WIP	2025-01-21 09:18:38 -05:00
Sunny Liu	80bfc50d1f	get seqlens from position ids for foc masking	2025-01-17 17:22:04 -05:00
Sunny Liu	a5360c172c	llama hijacking	2025-01-17 15:54:03 -05:00
Sunny Liu	013a9b73fc	fix transformers version for testing	2025-01-16 15:32:57 -05:00
Sunny	aad62428e0	not sure if this is necessary actually	2025-01-16 15:08:34 -05:00
Sunny	a6f2c5d583	flex sample packing WIP	2025-01-15 21:12:33 -05:00
Sunny	dbcd11e533	revert seq len in multipack sampler	2025-01-14 11:45:35 -05:00
Sunny	c06a6be915	flex_attn sample packing WIP	2025-01-14 00:22:05 -05:00
bursteratom	d3a0cb5edb	transformers version	2025-01-13 10:33:00 -05:00
bursteratom	8b47e456b0	revert to transformers 4.47.1	2025-01-13 10:29:27 -05:00
Sunny Liu	2319ac729c	Merge branch 'main' into flx_attn_support	2025-01-13 09:42:58 -05:00
Sunny	f99cae0e7b	llama test	2025-01-12 17:30:19 -05:00
Wing Lian	888cd9407f	use 2.5.1 docker images as latest tag as it seems stable (#2198 )	2025-01-12 13:34:17 -05:00
Wing Lian	bd62d6e10a	rename liger test so it properly runs in ci (#2246 )	2025-01-12 13:34:17 -05:00
NanoCode012	5eae134110	feat: add support for data_files in pretraining (#2238 )	2025-01-12 13:34:17 -05:00
Wing Lian	b7d27bdfa4	update upstream HF deps (#2239 ) * bump axolotl contribs for upstream main conflicts: * bump datasets, tokenizer, trl * remove log workarounds in trl * bump lm-eval * remove unsloth_ import from critical path * remove llama fa2 from conftest * unsloth breaks with latest upstream	2025-01-12 13:34:17 -05:00
Vincenzo di Cicco	da97a21bdc	Use SequentialSampler if curriculum_sampling is enabled with sample_packing (#2235 )	2025-01-12 13:34:17 -05:00
Wing Lian	e0d4b88598	update modal version for ci (#2242 )	2025-01-12 13:34:17 -05:00
NanoCode012	fac059a209	fix: mistral nemo does not recognize token_type_ids in forward (#2233 )	2025-01-12 13:34:17 -05:00
Wing Lian	9c9ac1cf0b	add hf cache caching for GHA (#2247 ) * add hf cache caching for GHA * use modal volume to cache hf data * make sure to update the cache as we add new fixtures in conftest	2025-01-12 13:34:17 -05:00
Wing Lian	2346f21b2b	Merge group queue (#2248 ) * add support for merge groups * also lint merge groups	2025-01-12 13:34:17 -05:00
salman	0b47281f51	Fixing OSX installation (#2231 ) * bumping version, removing non-osx compatible deps * updating pylintrc * fixing linters * reverting changes	2025-01-12 13:34:17 -05:00
Sunny	543daaf46f	llama test	2025-01-09 16:08:24 -05:00
Sunny	bcd9ad44e0	flex attention support	2025-01-06 19:54:11 -05:00
bursteratom	61ad375bf4	config validation for flex attention	2025-01-05 23:27:49 -05:00