Compare commits

...

11 Commits

Author SHA1 Message Date
Wing Lian
f24efd77a1 lint docs 2025-02-12 10:04:01 -05:00
Sung Ching Liu
44f64ab627 Update faq.qmd (#2319)
* Update faq.qmd

Added Q&A for being stuck on saving preprocessed datasets

* Update faq.qmd

added details on preprocessing on cpu

* Update faq.qmd

* Update faq.qmd
2025-02-11 13:18:31 -05:00
NanoCode012
826f1b1494 feat(doc): Add multi-node torchrun info (#2304) 2025-02-08 06:02:02 -05:00
NanoCode012
526e5ee8b8 fix(config): missing config not being documented and fix model_ override (#2317)
* fix(config): missing config not being documented and fix model_ space override

* fix: delete redundant field
2025-02-08 06:01:48 -05:00
NanoCode012
fd8cb32547 chore: remove redundant py310 from tests (#2316) 2025-02-07 21:34:16 -05:00
NanoCode012
e48e2df4dd feat: update FA to 2.7.4.post1 which includes torch2.6 binary (#2315) 2025-02-07 21:34:01 -05:00
Wing Lian
b7616022ab bump transformers to 4.48.3 (#2318) 2025-02-07 21:33:44 -05:00
Wing Lian
1faf1a5c5a batch add of spectrum snr results (#2320) 2025-02-07 21:33:14 -05:00
NanoCode012
5bbad5ef93 feat: add torch2.6 to ci (#2311) 2025-02-07 07:28:54 -05:00
Wing Lian
a971eb4ce6 Torch 2.6 support for base docker image (#2312) 2025-02-05 09:24:02 -05:00
NanoCode012
a620d481e2 fix: drop long seq even if not sample packing (#2211)
* fix: drop long seq even if not sample packing

* fix: logging import

* fix: cfg passed being none

* fix: try to fix logging

* fix: refactor call to not use accelerate log

* fix: try to fix circular import issue

* fix: don't drop when skip prepare

* chore: remove duplicate line

* fix: update warning to mention that sequences will be trimmed

* fix: do not drop seq if input_ids don't exist

* fix: increase RM unittest sequence length to reduce trim warnings

* fix: solve conflicts

* fix: default min_seq_len in case of None
2025-02-04 09:43:35 -05:00
34 changed files with 11287 additions and 112 deletions

View File

@@ -22,12 +22,6 @@ jobs:
fail-fast: false
matrix:
include:
- cuda: "124"
cuda_version: 12.4.1
cudnn_version: ""
python_version: "3.10"
pytorch: 2.4.1
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
- cuda: "124"
cuda_version: 12.4.1
cudnn_version: ""
@@ -40,6 +34,12 @@ jobs:
python_version: "3.11"
pytorch: 2.5.1
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
- cuda: "124"
cuda_version: 12.4.1
cudnn_version: ""
python_version: "3.11"
pytorch: 2.6.0
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
steps:
- name: Checkout
uses: actions/checkout@v4

View File

@@ -19,7 +19,7 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: '3.10'
python-version: '3.11'
- name: install dependencies
run: |
python3 -m pip install jupyter

View File

@@ -19,6 +19,6 @@ jobs:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: "3.10"
python-version: "3.11"
cache: 'pip' # caching pip dependencies
- uses: pre-commit/action@v3.0.1

View File

@@ -26,6 +26,11 @@ jobs:
pytorch: 2.5.1
axolotl_extras:
is_latest: true
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.6.0
axolotl_extras:
runs-on: axolotl-gpu-runner
steps:
- name: Checkout

View File

@@ -34,6 +34,13 @@ jobs:
axolotl_extras:
num_gpus: 2
nightly_build: "true"
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.6.0
axolotl_extras:
num_gpus: 2
nightly_build: "true"
runs-on: [self-hosted, modal]
timeout-minutes: 120
steps:
@@ -42,7 +49,7 @@ jobs:
- name: Install Python
uses: actions/setup-python@v5
with:
python-version: "3.10"
python-version: "3.11"
- name: Install Modal
run: |
python -m pip install --upgrade pip

View File

@@ -22,6 +22,11 @@ jobs:
python_version: "3.11"
pytorch: 2.5.1
axolotl_extras:
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.6.0
axolotl_extras:
runs-on: axolotl-gpu-runner
steps:
- name: Checkout

View File

@@ -36,7 +36,7 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: "3.10"
python-version: "3.11"
- name: Install dependencies
run: |

View File

@@ -12,7 +12,7 @@ jobs:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: "3.10"
python-version: "3.11"
cache: 'pip' # caching pip dependencies
- uses: pre-commit/action@v3.0.1
env:
@@ -25,13 +25,8 @@ jobs:
fail-fast: false
max-parallel: 2
matrix:
python_version: ["3.10", "3.11"]
pytorch_version: ["2.4.1", "2.5.1"]
exclude:
- python_version: "3.10"
pytorch_version: "2.4.1"
- python_version: "3.10"
pytorch_version: "2.5.1"
python_version: ["3.11"]
pytorch_version: ["2.4.1", "2.5.1", "2.6.0"]
timeout-minutes: 20
steps:
@@ -112,13 +107,20 @@ jobs:
num_gpus: 1
axolotl_extras:
nightly_build: "true"
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.6.0
num_gpus: 1
axolotl_extras:
nightly_build: "true"
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Install Python
uses: actions/setup-python@v5
with:
python-version: "3.10"
python-version: "3.11"
- name: Install Modal
run: |
python -m pip install --upgrade pip

View File

@@ -35,7 +35,7 @@ jobs:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: "3.10"
python-version: "3.11"
cache: 'pip' # caching pip dependencies
- uses: pre-commit/action@v3.0.1
env:
@@ -48,13 +48,8 @@ jobs:
fail-fast: false
max-parallel: 2
matrix:
python_version: ["3.10", "3.11"]
pytorch_version: ["2.4.1", "2.5.1"]
exclude:
- python_version: "3.10"
pytorch_version: "2.4.1"
- python_version: "3.10"
pytorch_version: "2.5.1"
python_version: ["3.11"]
pytorch_version: ["2.4.1", "2.5.1", "2.6.0"]
timeout-minutes: 20
steps:
@@ -127,7 +122,7 @@ jobs:
max-parallel: 1
matrix:
python_version: ["3.11"]
pytorch_version: ["2.4.1", "2.5.1"]
pytorch_version: ["2.4.1", "2.5.1", "2.6.0"]
timeout-minutes: 20
steps:
@@ -216,7 +211,7 @@ jobs:
- name: Install Python
uses: actions/setup-python@v5
with:
python-version: "3.10"
python-version: "3.11"
- name: Install Modal
run: |
python -m pip install --upgrade pip
@@ -251,13 +246,19 @@ jobs:
pytorch: 2.4.1
num_gpus: 1
axolotl_extras:
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.6.0
num_gpus: 1
axolotl_extras:
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Install Python
uses: actions/setup-python@v5
with:
python-version: "3.10"
python-version: "3.11"
- name: Install Modal
run: |
python -m pip install --upgrade pip

View File

@@ -51,7 +51,7 @@ Features:
**Requirements**:
- NVIDIA GPU (Ampere or newer for `bf16` and Flash Attention) or AMD GPU
- Python 3.10
- Python 3.11
- PyTorch ≥2.4.1
### Installation

View File

@@ -46,6 +46,10 @@ overrides_of_model_config:
type: # linear | dynamic
factor: # float
# optional overrides to the base model loading from_pretrained
overrides_of_model_kwargs:
# use_cache: False
# optional overrides to the bnb 4bit quantization configuration
# https://huggingface.co/docs/transformers/main/main_classes/quantization#transformers.BitsAndBytesConfig
bnb_config_kwargs:

View File

@@ -19,3 +19,7 @@ description: Frequently asked questions
**Q: AttributeError: 'DummyOptim' object has no attribute 'step'**
> A: You may be using deepspeed with single gpu. Please don't set `deepspeed:` in yaml or cli.
**Q: The code is stuck on saving preprocessed datasets.**
> A: This is usually a GPU issue. It can often be resolved by setting the environment variable `CUDA_VISIBLE_DEVICES=0`. If you are on RunPod, this is usually a pod issue; starting a new pod should take care of it.
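A minimal sketch of the workaround, assuming the standard preprocess entry point (`config.yaml` is a placeholder path):

```python
# Hedged sketch: restrict preprocessing to a single GPU. The variable must be
# set before torch/CUDA is initialized.
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Equivalent from the shell (placeholder config path):
#   CUDA_VISIBLE_DEVICES=0 python -m axolotl.cli.preprocess config.yaml
```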

View File

@@ -3,6 +3,18 @@ title: Multi Node
description: How to use Axolotl on multiple machines
---
Below are three ways to train multi-node in Axolotl.
::: {.callout-important}
Each machine needs a copy of Axolotl; we suggest using the same commit to ensure compatibility.
You will also need to have the same configuration file for your model on each machine.
Make sure the main machine is reachable by other machines.
:::
# Accelerate
You will need to create a configuration for accelerate, either by running `accelerate config` and following the instructions, or by using one of the presets below:
~/.cache/huggingface/accelerate/default_config.yaml
@@ -26,7 +38,7 @@ tpu_use_sudo: false
use_cpu: false
```
Configure your model to use FSDP with for example:
Configure your model to use FSDP in the Axolotl yaml. For example:
```yaml
fsdp:
- full_shard
@@ -37,12 +49,40 @@ fsdp_config:
fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
```
## Machine configuration
On each machine you need a copy of Axolotl, we suggest using the same commit to ensure compatibility.
You will also need to have the same configuration file for your model on each machine.
On the main machine only, make sure the port you set as `main_process_port` is open in TCP and reachable by other machines.
All you have to do now is launch with accelerate on each machine as you normally would; the processes will start once you have launched accelerate on every machine.
# Ray Train
Please see the Ray Train doc [here](ray-integration.qmd).
# Torchrun
If you are using InfiniBand, we recommend using torchrun to utilize the full bandwidth.
Set the following environment variables (change the buffer size and socket interface names depending on your system):
```bash
export NCCL_IB_DISABLE=0
export NCCL_SOCKET_IFNAME="eth0,en,eth,em,bond"
export NCCL_BUFFSIZE=2097152
```
Run the following on each node:
```bash
torchrun --nnodes $num_nodes --nproc_per_node $gpu_per_node --rdzv_id $rdzv_id --rdzv_backend c10d --rdzv_endpoint "$head_node_ip:$head_node_port" -m axolotl.cli.train config.yaml
```
Please make sure to substitute the placeholder variables.
- `num_nodes`: Number of nodes (containing GPUs)
- `gpu_per_node`: Number of GPUs per node
- `head_node_ip`: IP of the head node (make sure other machines can connect to this)
- `head_node_port`: Port of the head node (make sure other machines can connect to this. Default 29400)
- `rdzv_id`: A unique job ID that is used by the job across nodes.
::: {.callout-note}
You need to call `axolotl.cli.train` instead of `axolotl train`, as the latter invokes accelerate under the hood.
:::
More info on the available configs can be found in the PyTorch docs [here](https://pytorch.org/docs/stable/elastic/run.html).
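For illustration only, a hypothetical sketch of substituting the placeholders above and launching the same command from Python (all values are examples for a 2-node, 8-GPU-per-node cluster):

```python
# Hypothetical placeholder substitution for the torchrun command above.
# Run this (or the equivalent shell command) on every node.
import subprocess

num_nodes = 2               # nodes containing GPUs
gpu_per_node = 8            # GPUs per node
head_node_ip = "10.0.0.1"   # must be reachable from the other machines
head_node_port = 29400      # default rendezvous port
rdzv_id = "axolotl-job-1"   # any ID shared by all nodes of this job

subprocess.run(
    [
        "torchrun",
        "--nnodes", str(num_nodes),
        "--nproc_per_node", str(gpu_per_node),
        "--rdzv_id", rdzv_id,
        "--rdzv_backend", "c10d",
        "--rdzv_endpoint", f"{head_node_ip}:{head_node_port}",
        "-m", "axolotl.cli.train", "config.yaml",
    ],
    check=True,
)
```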

View File

@@ -1,10 +1,10 @@
--extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
# START section of dependencies that don't install on Darwin/MacOS
bitsandbytes==0.45.1
bitsandbytes==0.45.2
triton>=3.0.0
mamba-ssm==1.2.0.post1
flash-attn==2.7.0.post2
flash-attn==2.7.4.post1
xformers>=0.0.23.post1
autoawq==0.2.7.post3
liger-kernel==0.5.2
@@ -13,7 +13,7 @@ liger-kernel==0.5.2
packaging==23.2
peft==0.14.0
transformers==4.48.1
transformers==4.48.3
tokenizers>=0.21.0
accelerate==1.3.0
datasets==3.2.0

View File

@@ -71,12 +71,15 @@ def parse_requirements():
else:
raise ValueError("Invalid version format")
if (major, minor) >= (2, 5):
if (major, minor) >= (2, 6):
_install_requires.pop(_install_requires.index(xformers_version))
_install_requires.append("xformers==0.0.29.post2")
elif (major, minor) >= (2, 5):
_install_requires.pop(_install_requires.index(xformers_version))
if patch == 0:
_install_requires.append("xformers==0.0.28.post2")
else:
_install_requires.append("xformers==0.0.28.post3")
_install_requires.append("xformers==0.0.29")
_install_requires.pop(_install_requires.index(autoawq_version))
elif (major, minor) >= (2, 4):
if patch == 0:
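Condensed for readability, a hedged sketch of the xformers pin selection this hunk introduces (the function name is made up; the pins are taken from the diff above):

```python
# Hedged sketch of the torch-version to xformers-pin mapping added above.
def pick_xformers_pin(major: int, minor: int):
    """Return the xformers pin for a given torch (major, minor) version."""
    if (major, minor) >= (2, 6):
        return "xformers==0.0.29.post2"
    if (major, minor) >= (2, 5):
        return "xformers==0.0.29"
    return None  # torch < 2.5 keeps the pins handled elsewhere in setup.py

print(pick_xformers_pin(2, 6))  # xformers==0.0.29.post2
print(pick_xformers_pin(2, 5))  # xformers==0.0.29
```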

View File

@@ -0,0 +1,590 @@
{
"model.layers.0.input_layernorm": {
"snr": Infinity,
"type": "input_layernorm"
},
"model.layers.1.input_layernorm": {
"snr": Infinity,
"type": "input_layernorm"
},
"model.layers.2.input_layernorm": {
"snr": Infinity,
"type": "input_layernorm"
},
"model.layers.3.input_layernorm": {
"snr": Infinity,
"type": "input_layernorm"
},
"model.layers.4.input_layernorm": {
"snr": Infinity,
"type": "input_layernorm"
},
"model.layers.5.input_layernorm": {
"snr": Infinity,
"type": "input_layernorm"
},
"model.layers.6.input_layernorm": {
"snr": Infinity,
"type": "input_layernorm"
},
"model.layers.7.input_layernorm": {
"snr": Infinity,
"type": "input_layernorm"
},
"model.layers.8.input_layernorm": {
"snr": Infinity,
"type": "input_layernorm"
},
"model.layers.9.input_layernorm": {
"snr": Infinity,
"type": "input_layernorm"
},
"model.layers.10.input_layernorm": {
"snr": Infinity,
"type": "input_layernorm"
},
"model.layers.11.input_layernorm": {
"snr": Infinity,
"type": "input_layernorm"
},
"model.layers.12.input_layernorm": {
"snr": Infinity,
"type": "input_layernorm"
},
"model.layers.13.input_layernorm": {
"snr": Infinity,
"type": "input_layernorm"
},
"model.layers.14.input_layernorm": {
"snr": Infinity,
"type": "input_layernorm"
},
"model.layers.15.input_layernorm": {
"snr": Infinity,
"type": "input_layernorm"
},
"lm_head": {
"snr": Infinity,
"type": "lm_head"
},
"model.layers.0.mlp.down_proj": {
"snr": 70.0594253540039,
"type": "mlp.down_proj"
},
"model.layers.1.mlp.down_proj": {
"snr": 11.135851860046387,
"type": "mlp.down_proj"
},
"model.layers.2.mlp.down_proj": {
"snr": 7.035482883453369,
"type": "mlp.down_proj"
},
"model.layers.3.mlp.down_proj": {
"snr": 6.422532081604004,
"type": "mlp.down_proj"
},
"model.layers.4.mlp.down_proj": {
"snr": 5.748020172119141,
"type": "mlp.down_proj"
},
"model.layers.5.mlp.down_proj": {
"snr": 3.885556697845459,
"type": "mlp.down_proj"
},
"model.layers.6.mlp.down_proj": {
"snr": 3.4336745738983154,
"type": "mlp.down_proj"
},
"model.layers.7.mlp.down_proj": {
"snr": 2.791595935821533,
"type": "mlp.down_proj"
},
"model.layers.8.mlp.down_proj": {
"snr": 5.36277961730957,
"type": "mlp.down_proj"
},
"model.layers.9.mlp.down_proj": {
"snr": 4.459208011627197,
"type": "mlp.down_proj"
},
"model.layers.10.mlp.down_proj": {
"snr": 6.272170066833496,
"type": "mlp.down_proj"
},
"model.layers.11.mlp.down_proj": {
"snr": 5.264761447906494,
"type": "mlp.down_proj"
},
"model.layers.12.mlp.down_proj": {
"snr": 4.324735641479492,
"type": "mlp.down_proj"
},
"model.layers.13.mlp.down_proj": {
"snr": 3.878648042678833,
"type": "mlp.down_proj"
},
"model.layers.14.mlp.down_proj": {
"snr": 2.9773054122924805,
"type": "mlp.down_proj"
},
"model.layers.15.mlp.down_proj": {
"snr": 4.471445560455322,
"type": "mlp.down_proj"
},
"model.layers.0.mlp.gate_proj": {
"snr": 25.227100372314453,
"type": "mlp.gate_proj"
},
"model.layers.1.mlp.gate_proj": {
"snr": 6.58299446105957,
"type": "mlp.gate_proj"
},
"model.layers.2.mlp.gate_proj": {
"snr": 3.4688243865966797,
"type": "mlp.gate_proj"
},
"model.layers.3.mlp.gate_proj": {
"snr": 1.555246114730835,
"type": "mlp.gate_proj"
},
"model.layers.4.mlp.gate_proj": {
"snr": 0.7770601511001587,
"type": "mlp.gate_proj"
},
"model.layers.5.mlp.gate_proj": {
"snr": 0.6239906549453735,
"type": "mlp.gate_proj"
},
"model.layers.6.mlp.gate_proj": {
"snr": 0.6440379023551941,
"type": "mlp.gate_proj"
},
"model.layers.7.mlp.gate_proj": {
"snr": 0.5120116472244263,
"type": "mlp.gate_proj"
},
"model.layers.8.mlp.gate_proj": {
"snr": 0.6544050574302673,
"type": "mlp.gate_proj"
},
"model.layers.9.mlp.gate_proj": {
"snr": 0.5381016731262207,
"type": "mlp.gate_proj"
},
"model.layers.10.mlp.gate_proj": {
"snr": 0.622873842716217,
"type": "mlp.gate_proj"
},
"model.layers.11.mlp.gate_proj": {
"snr": 0.9361700415611267,
"type": "mlp.gate_proj"
},
"model.layers.12.mlp.gate_proj": {
"snr": 1.475605845451355,
"type": "mlp.gate_proj"
},
"model.layers.13.mlp.gate_proj": {
"snr": 1.608325719833374,
"type": "mlp.gate_proj"
},
"model.layers.14.mlp.gate_proj": {
"snr": 1.0720024108886719,
"type": "mlp.gate_proj"
},
"model.layers.15.mlp.gate_proj": {
"snr": 0.7111338973045349,
"type": "mlp.gate_proj"
},
"model.layers.0.mlp.up_proj": {
"snr": 28.431896209716797,
"type": "mlp.up_proj"
},
"model.layers.1.mlp.up_proj": {
"snr": 15.546019554138184,
"type": "mlp.up_proj"
},
"model.layers.2.mlp.up_proj": {
"snr": 23.048023223876953,
"type": "mlp.up_proj"
},
"model.layers.3.mlp.up_proj": {
"snr": 25.790977478027344,
"type": "mlp.up_proj"
},
"model.layers.4.mlp.up_proj": {
"snr": 18.552549362182617,
"type": "mlp.up_proj"
},
"model.layers.5.mlp.up_proj": {
"snr": 8.85106372833252,
"type": "mlp.up_proj"
},
"model.layers.6.mlp.up_proj": {
"snr": 10.653799057006836,
"type": "mlp.up_proj"
},
"model.layers.7.mlp.up_proj": {
"snr": 7.365357875823975,
"type": "mlp.up_proj"
},
"model.layers.8.mlp.up_proj": {
"snr": 11.98373794555664,
"type": "mlp.up_proj"
},
"model.layers.9.mlp.up_proj": {
"snr": 8.04493236541748,
"type": "mlp.up_proj"
},
"model.layers.10.mlp.up_proj": {
"snr": 8.523039817810059,
"type": "mlp.up_proj"
},
"model.layers.11.mlp.up_proj": {
"snr": 5.381742477416992,
"type": "mlp.up_proj"
},
"model.layers.12.mlp.up_proj": {
"snr": 3.9845118522644043,
"type": "mlp.up_proj"
},
"model.layers.13.mlp.up_proj": {
"snr": 3.4893221855163574,
"type": "mlp.up_proj"
},
"model.layers.14.mlp.up_proj": {
"snr": 1.764201045036316,
"type": "mlp.up_proj"
},
"model.layers.15.mlp.up_proj": {
"snr": 0.9730708599090576,
"type": "mlp.up_proj"
},
"model.embed_tokens": {
"snr": Infinity,
"type": "model.embed_tokens"
},
"model.norm": {
"snr": Infinity,
"type": "model.norm"
},
"model.layers.0.post_attention_layernorm": {
"snr": Infinity,
"type": "post_attention_layernorm"
},
"model.layers.1.post_attention_layernorm": {
"snr": Infinity,
"type": "post_attention_layernorm"
},
"model.layers.2.post_attention_layernorm": {
"snr": Infinity,
"type": "post_attention_layernorm"
},
"model.layers.3.post_attention_layernorm": {
"snr": Infinity,
"type": "post_attention_layernorm"
},
"model.layers.4.post_attention_layernorm": {
"snr": Infinity,
"type": "post_attention_layernorm"
},
"model.layers.5.post_attention_layernorm": {
"snr": Infinity,
"type": "post_attention_layernorm"
},
"model.layers.6.post_attention_layernorm": {
"snr": Infinity,
"type": "post_attention_layernorm"
},
"model.layers.7.post_attention_layernorm": {
"snr": Infinity,
"type": "post_attention_layernorm"
},
"model.layers.8.post_attention_layernorm": {
"snr": Infinity,
"type": "post_attention_layernorm"
},
"model.layers.9.post_attention_layernorm": {
"snr": Infinity,
"type": "post_attention_layernorm"
},
"model.layers.10.post_attention_layernorm": {
"snr": Infinity,
"type": "post_attention_layernorm"
},
"model.layers.11.post_attention_layernorm": {
"snr": Infinity,
"type": "post_attention_layernorm"
},
"model.layers.12.post_attention_layernorm": {
"snr": Infinity,
"type": "post_attention_layernorm"
},
"model.layers.13.post_attention_layernorm": {
"snr": Infinity,
"type": "post_attention_layernorm"
},
"model.layers.14.post_attention_layernorm": {
"snr": Infinity,
"type": "post_attention_layernorm"
},
"model.layers.15.post_attention_layernorm": {
"snr": Infinity,
"type": "post_attention_layernorm"
},
"model.layers.0.self_attn.k_proj": {
"snr": 0.11727584153413773,
"type": "self_attn.k_proj"
},
"model.layers.1.self_attn.k_proj": {
"snr": 0.24786807596683502,
"type": "self_attn.k_proj"
},
"model.layers.2.self_attn.k_proj": {
"snr": 0.36378130316734314,
"type": "self_attn.k_proj"
},
"model.layers.3.self_attn.k_proj": {
"snr": 0.2983120381832123,
"type": "self_attn.k_proj"
},
"model.layers.4.self_attn.k_proj": {
"snr": 0.33789733052253723,
"type": "self_attn.k_proj"
},
"model.layers.5.self_attn.k_proj": {
"snr": 0.29155924916267395,
"type": "self_attn.k_proj"
},
"model.layers.6.self_attn.k_proj": {
"snr": 0.2537297010421753,
"type": "self_attn.k_proj"
},
"model.layers.7.self_attn.k_proj": {
"snr": 0.28204113245010376,
"type": "self_attn.k_proj"
},
"model.layers.8.self_attn.k_proj": {
"snr": 0.2776711583137512,
"type": "self_attn.k_proj"
},
"model.layers.9.self_attn.k_proj": {
"snr": 0.2927376627922058,
"type": "self_attn.k_proj"
},
"model.layers.10.self_attn.k_proj": {
"snr": 0.31486213207244873,
"type": "self_attn.k_proj"
},
"model.layers.11.self_attn.k_proj": {
"snr": 0.32363659143447876,
"type": "self_attn.k_proj"
},
"model.layers.12.self_attn.k_proj": {
"snr": 0.31382912397384644,
"type": "self_attn.k_proj"
},
"model.layers.13.self_attn.k_proj": {
"snr": 0.4635234773159027,
"type": "self_attn.k_proj"
},
"model.layers.14.self_attn.k_proj": {
"snr": 0.25379249453544617,
"type": "self_attn.k_proj"
},
"model.layers.15.self_attn.k_proj": {
"snr": 0.2628238797187805,
"type": "self_attn.k_proj"
},
"model.layers.0.self_attn.o_proj": {
"snr": 0.27602291107177734,
"type": "self_attn.o_proj"
},
"model.layers.1.self_attn.o_proj": {
"snr": 0.2149604707956314,
"type": "self_attn.o_proj"
},
"model.layers.2.self_attn.o_proj": {
"snr": 0.2540294826030731,
"type": "self_attn.o_proj"
},
"model.layers.3.self_attn.o_proj": {
"snr": 0.27978822588920593,
"type": "self_attn.o_proj"
},
"model.layers.4.self_attn.o_proj": {
"snr": 0.3121289908885956,
"type": "self_attn.o_proj"
},
"model.layers.5.self_attn.o_proj": {
"snr": 0.35037684440612793,
"type": "self_attn.o_proj"
},
"model.layers.6.self_attn.o_proj": {
"snr": 0.366205096244812,
"type": "self_attn.o_proj"
},
"model.layers.7.self_attn.o_proj": {
"snr": 0.3692712187767029,
"type": "self_attn.o_proj"
},
"model.layers.8.self_attn.o_proj": {
"snr": 0.3301038146018982,
"type": "self_attn.o_proj"
},
"model.layers.9.self_attn.o_proj": {
"snr": 0.3003396987915039,
"type": "self_attn.o_proj"
},
"model.layers.10.self_attn.o_proj": {
"snr": 0.30804169178009033,
"type": "self_attn.o_proj"
},
"model.layers.11.self_attn.o_proj": {
"snr": 0.28501132130622864,
"type": "self_attn.o_proj"
},
"model.layers.12.self_attn.o_proj": {
"snr": 0.2171541005373001,
"type": "self_attn.o_proj"
},
"model.layers.13.self_attn.o_proj": {
"snr": 0.19183959066867828,
"type": "self_attn.o_proj"
},
"model.layers.14.self_attn.o_proj": {
"snr": 0.19215913116931915,
"type": "self_attn.o_proj"
},
"model.layers.15.self_attn.o_proj": {
"snr": 0.25486502051353455,
"type": "self_attn.o_proj"
},
"model.layers.0.self_attn.q_proj": {
"snr": 0.03850084915757179,
"type": "self_attn.q_proj"
},
"model.layers.1.self_attn.q_proj": {
"snr": 0.0713055431842804,
"type": "self_attn.q_proj"
},
"model.layers.2.self_attn.q_proj": {
"snr": 0.07948919385671616,
"type": "self_attn.q_proj"
},
"model.layers.3.self_attn.q_proj": {
"snr": 0.08047746121883392,
"type": "self_attn.q_proj"
},
"model.layers.4.self_attn.q_proj": {
"snr": 0.0852593332529068,
"type": "self_attn.q_proj"
},
"model.layers.5.self_attn.q_proj": {
"snr": 0.09794823825359344,
"type": "self_attn.q_proj"
},
"model.layers.6.self_attn.q_proj": {
"snr": 0.09627152234315872,
"type": "self_attn.q_proj"
},
"model.layers.7.self_attn.q_proj": {
"snr": 0.11065381020307541,
"type": "self_attn.q_proj"
},
"model.layers.8.self_attn.q_proj": {
"snr": 0.12031875550746918,
"type": "self_attn.q_proj"
},
"model.layers.9.self_attn.q_proj": {
"snr": 0.09804573655128479,
"type": "self_attn.q_proj"
},
"model.layers.10.self_attn.q_proj": {
"snr": 0.10897502303123474,
"type": "self_attn.q_proj"
},
"model.layers.11.self_attn.q_proj": {
"snr": 0.09267337620258331,
"type": "self_attn.q_proj"
},
"model.layers.12.self_attn.q_proj": {
"snr": 0.08803492039442062,
"type": "self_attn.q_proj"
},
"model.layers.13.self_attn.q_proj": {
"snr": 0.0902542844414711,
"type": "self_attn.q_proj"
},
"model.layers.14.self_attn.q_proj": {
"snr": 0.10154066979885101,
"type": "self_attn.q_proj"
},
"model.layers.15.self_attn.q_proj": {
"snr": 0.09083802253007889,
"type": "self_attn.q_proj"
},
"model.layers.0.self_attn.v_proj": {
"snr": 2.842210054397583,
"type": "self_attn.v_proj"
},
"model.layers.1.self_attn.v_proj": {
"snr": 10.59461498260498,
"type": "self_attn.v_proj"
},
"model.layers.2.self_attn.v_proj": {
"snr": 8.993025779724121,
"type": "self_attn.v_proj"
},
"model.layers.3.self_attn.v_proj": {
"snr": 62.567787170410156,
"type": "self_attn.v_proj"
},
"model.layers.4.self_attn.v_proj": {
"snr": 23.80082893371582,
"type": "self_attn.v_proj"
},
"model.layers.5.self_attn.v_proj": {
"snr": 7.957369804382324,
"type": "self_attn.v_proj"
},
"model.layers.6.self_attn.v_proj": {
"snr": 12.01815414428711,
"type": "self_attn.v_proj"
},
"model.layers.7.self_attn.v_proj": {
"snr": 5.095500469207764,
"type": "self_attn.v_proj"
},
"model.layers.8.self_attn.v_proj": {
"snr": 11.719332695007324,
"type": "self_attn.v_proj"
},
"model.layers.9.self_attn.v_proj": {
"snr": 555.0869750976562,
"type": "self_attn.v_proj"
},
"model.layers.10.self_attn.v_proj": {
"snr": 22.95538330078125,
"type": "self_attn.v_proj"
},
"model.layers.11.self_attn.v_proj": {
"snr": 30.042158126831055,
"type": "self_attn.v_proj"
},
"model.layers.12.self_attn.v_proj": {
"snr": 9.577271461486816,
"type": "self_attn.v_proj"
},
"model.layers.13.self_attn.v_proj": {
"snr": 18.176361083984375,
"type": "self_attn.v_proj"
},
"model.layers.14.self_attn.v_proj": {
"snr": 1.5695856809616089,
"type": "self_attn.v_proj"
},
"model.layers.15.self_attn.v_proj": {
"snr": 2.7235565185546875,
"type": "self_attn.v_proj"
}
}

View File

@@ -0,0 +1,590 @@
{
"model.layers.0.input_layernorm": {
"snr": Infinity,
"type": "input_layernorm"
},
"model.layers.1.input_layernorm": {
"snr": Infinity,
"type": "input_layernorm"
},
"model.layers.2.input_layernorm": {
"snr": Infinity,
"type": "input_layernorm"
},
"model.layers.3.input_layernorm": {
"snr": Infinity,
"type": "input_layernorm"
},
"model.layers.4.input_layernorm": {
"snr": Infinity,
"type": "input_layernorm"
},
"model.layers.5.input_layernorm": {
"snr": Infinity,
"type": "input_layernorm"
},
"model.layers.6.input_layernorm": {
"snr": Infinity,
"type": "input_layernorm"
},
"model.layers.7.input_layernorm": {
"snr": Infinity,
"type": "input_layernorm"
},
"model.layers.8.input_layernorm": {
"snr": Infinity,
"type": "input_layernorm"
},
"model.layers.9.input_layernorm": {
"snr": Infinity,
"type": "input_layernorm"
},
"model.layers.10.input_layernorm": {
"snr": Infinity,
"type": "input_layernorm"
},
"model.layers.11.input_layernorm": {
"snr": Infinity,
"type": "input_layernorm"
},
"model.layers.12.input_layernorm": {
"snr": Infinity,
"type": "input_layernorm"
},
"model.layers.13.input_layernorm": {
"snr": Infinity,
"type": "input_layernorm"
},
"model.layers.14.input_layernorm": {
"snr": Infinity,
"type": "input_layernorm"
},
"model.layers.15.input_layernorm": {
"snr": Infinity,
"type": "input_layernorm"
},
"lm_head": {
"snr": Infinity,
"type": "lm_head"
},
"model.layers.0.mlp.down_proj": {
"snr": 57.09797286987305,
"type": "mlp.down_proj"
},
"model.layers.1.mlp.down_proj": {
"snr": 9.538983345031738,
"type": "mlp.down_proj"
},
"model.layers.2.mlp.down_proj": {
"snr": 6.227016925811768,
"type": "mlp.down_proj"
},
"model.layers.3.mlp.down_proj": {
"snr": 5.660686492919922,
"type": "mlp.down_proj"
},
"model.layers.4.mlp.down_proj": {
"snr": 5.178432464599609,
"type": "mlp.down_proj"
},
"model.layers.5.mlp.down_proj": {
"snr": 3.5638349056243896,
"type": "mlp.down_proj"
},
"model.layers.6.mlp.down_proj": {
"snr": 3.0918056964874268,
"type": "mlp.down_proj"
},
"model.layers.7.mlp.down_proj": {
"snr": 2.456392288208008,
"type": "mlp.down_proj"
},
"model.layers.8.mlp.down_proj": {
"snr": 4.525328636169434,
"type": "mlp.down_proj"
},
"model.layers.9.mlp.down_proj": {
"snr": 3.9409055709838867,
"type": "mlp.down_proj"
},
"model.layers.10.mlp.down_proj": {
"snr": 5.447249412536621,
"type": "mlp.down_proj"
},
"model.layers.11.mlp.down_proj": {
"snr": 4.807600975036621,
"type": "mlp.down_proj"
},
"model.layers.12.mlp.down_proj": {
"snr": 3.915374517440796,
"type": "mlp.down_proj"
},
"model.layers.13.mlp.down_proj": {
"snr": 3.4820363521575928,
"type": "mlp.down_proj"
},
"model.layers.14.mlp.down_proj": {
"snr": 2.6045074462890625,
"type": "mlp.down_proj"
},
"model.layers.15.mlp.down_proj": {
"snr": 3.7237701416015625,
"type": "mlp.down_proj"
},
"model.layers.0.mlp.gate_proj": {
"snr": 22.160131454467773,
"type": "mlp.gate_proj"
},
"model.layers.1.mlp.gate_proj": {
"snr": 6.072206020355225,
"type": "mlp.gate_proj"
},
"model.layers.2.mlp.gate_proj": {
"snr": 3.2467362880706787,
"type": "mlp.gate_proj"
},
"model.layers.3.mlp.gate_proj": {
"snr": 1.4111896753311157,
"type": "mlp.gate_proj"
},
"model.layers.4.mlp.gate_proj": {
"snr": 0.7405938506126404,
"type": "mlp.gate_proj"
},
"model.layers.5.mlp.gate_proj": {
"snr": 0.5916463136672974,
"type": "mlp.gate_proj"
},
"model.layers.6.mlp.gate_proj": {
"snr": 0.6149423718452454,
"type": "mlp.gate_proj"
},
"model.layers.7.mlp.gate_proj": {
"snr": 0.48369669914245605,
"type": "mlp.gate_proj"
},
"model.layers.8.mlp.gate_proj": {
"snr": 0.6047574877738953,
"type": "mlp.gate_proj"
},
"model.layers.9.mlp.gate_proj": {
"snr": 0.5092479586601257,
"type": "mlp.gate_proj"
},
"model.layers.10.mlp.gate_proj": {
"snr": 0.5999670624732971,
"type": "mlp.gate_proj"
},
"model.layers.11.mlp.gate_proj": {
"snr": 0.8980127573013306,
"type": "mlp.gate_proj"
},
"model.layers.12.mlp.gate_proj": {
"snr": 1.4252448081970215,
"type": "mlp.gate_proj"
},
"model.layers.13.mlp.gate_proj": {
"snr": 1.509937047958374,
"type": "mlp.gate_proj"
},
"model.layers.14.mlp.gate_proj": {
"snr": 1.0066585540771484,
"type": "mlp.gate_proj"
},
"model.layers.15.mlp.gate_proj": {
"snr": 0.6413647532463074,
"type": "mlp.gate_proj"
},
"model.layers.0.mlp.up_proj": {
"snr": 26.08852195739746,
"type": "mlp.up_proj"
},
"model.layers.1.mlp.up_proj": {
"snr": 13.382951736450195,
"type": "mlp.up_proj"
},
"model.layers.2.mlp.up_proj": {
"snr": 20.088768005371094,
"type": "mlp.up_proj"
},
"model.layers.3.mlp.up_proj": {
"snr": 23.0632381439209,
"type": "mlp.up_proj"
},
"model.layers.4.mlp.up_proj": {
"snr": 16.07433319091797,
"type": "mlp.up_proj"
},
"model.layers.5.mlp.up_proj": {
"snr": 8.00507640838623,
"type": "mlp.up_proj"
},
"model.layers.6.mlp.up_proj": {
"snr": 9.538354873657227,
"type": "mlp.up_proj"
},
"model.layers.7.mlp.up_proj": {
"snr": 6.286602973937988,
"type": "mlp.up_proj"
},
"model.layers.8.mlp.up_proj": {
"snr": 10.092820167541504,
"type": "mlp.up_proj"
},
"model.layers.9.mlp.up_proj": {
"snr": 7.193963527679443,
"type": "mlp.up_proj"
},
"model.layers.10.mlp.up_proj": {
"snr": 7.320116996765137,
"type": "mlp.up_proj"
},
"model.layers.11.mlp.up_proj": {
"snr": 4.8728532791137695,
"type": "mlp.up_proj"
},
"model.layers.12.mlp.up_proj": {
"snr": 3.596583366394043,
"type": "mlp.up_proj"
},
"model.layers.13.mlp.up_proj": {
"snr": 3.166161298751831,
"type": "mlp.up_proj"
},
"model.layers.14.mlp.up_proj": {
"snr": 1.5600818395614624,
"type": "mlp.up_proj"
},
"model.layers.15.mlp.up_proj": {
"snr": 0.8726214170455933,
"type": "mlp.up_proj"
},
"model.embed_tokens": {
"snr": Infinity,
"type": "model.embed_tokens"
},
"model.norm": {
"snr": Infinity,
"type": "model.norm"
},
"model.layers.0.post_attention_layernorm": {
"snr": Infinity,
"type": "post_attention_layernorm"
},
"model.layers.1.post_attention_layernorm": {
"snr": Infinity,
"type": "post_attention_layernorm"
},
"model.layers.2.post_attention_layernorm": {
"snr": Infinity,
"type": "post_attention_layernorm"
},
"model.layers.3.post_attention_layernorm": {
"snr": Infinity,
"type": "post_attention_layernorm"
},
"model.layers.4.post_attention_layernorm": {
"snr": Infinity,
"type": "post_attention_layernorm"
},
"model.layers.5.post_attention_layernorm": {
"snr": Infinity,
"type": "post_attention_layernorm"
},
"model.layers.6.post_attention_layernorm": {
"snr": Infinity,
"type": "post_attention_layernorm"
},
"model.layers.7.post_attention_layernorm": {
"snr": Infinity,
"type": "post_attention_layernorm"
},
"model.layers.8.post_attention_layernorm": {
"snr": Infinity,
"type": "post_attention_layernorm"
},
"model.layers.9.post_attention_layernorm": {
"snr": Infinity,
"type": "post_attention_layernorm"
},
"model.layers.10.post_attention_layernorm": {
"snr": Infinity,
"type": "post_attention_layernorm"
},
"model.layers.11.post_attention_layernorm": {
"snr": Infinity,
"type": "post_attention_layernorm"
},
"model.layers.12.post_attention_layernorm": {
"snr": Infinity,
"type": "post_attention_layernorm"
},
"model.layers.13.post_attention_layernorm": {
"snr": Infinity,
"type": "post_attention_layernorm"
},
"model.layers.14.post_attention_layernorm": {
"snr": Infinity,
"type": "post_attention_layernorm"
},
"model.layers.15.post_attention_layernorm": {
"snr": Infinity,
"type": "post_attention_layernorm"
},
"model.layers.0.self_attn.k_proj": {
"snr": 0.1154392883181572,
"type": "self_attn.k_proj"
},
"model.layers.1.self_attn.k_proj": {
"snr": 0.24299409985542297,
"type": "self_attn.k_proj"
},
"model.layers.2.self_attn.k_proj": {
"snr": 0.3624322712421417,
"type": "self_attn.k_proj"
},
"model.layers.3.self_attn.k_proj": {
"snr": 0.29509487748146057,
"type": "self_attn.k_proj"
},
"model.layers.4.self_attn.k_proj": {
"snr": 0.32953736186027527,
"type": "self_attn.k_proj"
},
"model.layers.5.self_attn.k_proj": {
"snr": 0.2908833622932434,
"type": "self_attn.k_proj"
},
"model.layers.6.self_attn.k_proj": {
"snr": 0.2488437294960022,
"type": "self_attn.k_proj"
},
"model.layers.7.self_attn.k_proj": {
"snr": 0.27847856283187866,
"type": "self_attn.k_proj"
},
"model.layers.8.self_attn.k_proj": {
"snr": 0.27143892645835876,
"type": "self_attn.k_proj"
},
"model.layers.9.self_attn.k_proj": {
"snr": 0.28804272413253784,
"type": "self_attn.k_proj"
},
"model.layers.10.self_attn.k_proj": {
"snr": 0.31197959184646606,
"type": "self_attn.k_proj"
},
"model.layers.11.self_attn.k_proj": {
"snr": 0.3203586935997009,
"type": "self_attn.k_proj"
},
"model.layers.12.self_attn.k_proj": {
"snr": 0.30905747413635254,
"type": "self_attn.k_proj"
},
"model.layers.13.self_attn.k_proj": {
"snr": 0.46828722953796387,
"type": "self_attn.k_proj"
},
"model.layers.14.self_attn.k_proj": {
"snr": 0.24205778539180756,
"type": "self_attn.k_proj"
},
"model.layers.15.self_attn.k_proj": {
"snr": 0.2559327781200409,
"type": "self_attn.k_proj"
},
"model.layers.0.self_attn.o_proj": {
"snr": 0.2638678550720215,
"type": "self_attn.o_proj"
},
"model.layers.1.self_attn.o_proj": {
"snr": 0.21109595894813538,
"type": "self_attn.o_proj"
},
"model.layers.2.self_attn.o_proj": {
"snr": 0.24751724302768707,
"type": "self_attn.o_proj"
},
"model.layers.3.self_attn.o_proj": {
"snr": 0.2728094160556793,
"type": "self_attn.o_proj"
},
"model.layers.4.self_attn.o_proj": {
"snr": 0.3001374304294586,
"type": "self_attn.o_proj"
},
"model.layers.5.self_attn.o_proj": {
"snr": 0.33903488516807556,
"type": "self_attn.o_proj"
},
"model.layers.6.self_attn.o_proj": {
"snr": 0.3530929982662201,
"type": "self_attn.o_proj"
},
"model.layers.7.self_attn.o_proj": {
"snr": 0.36753255128860474,
"type": "self_attn.o_proj"
},
"model.layers.8.self_attn.o_proj": {
"snr": 0.3373180329799652,
"type": "self_attn.o_proj"
},
"model.layers.9.self_attn.o_proj": {
"snr": 0.2970578670501709,
"type": "self_attn.o_proj"
},
"model.layers.10.self_attn.o_proj": {
"snr": 0.3076324760913849,
"type": "self_attn.o_proj"
},
"model.layers.11.self_attn.o_proj": {
"snr": 0.2766900658607483,
"type": "self_attn.o_proj"
},
"model.layers.12.self_attn.o_proj": {
"snr": 0.20973259210586548,
"type": "self_attn.o_proj"
},
"model.layers.13.self_attn.o_proj": {
"snr": 0.18185566365718842,
"type": "self_attn.o_proj"
},
"model.layers.14.self_attn.o_proj": {
"snr": 0.18329747021198273,
"type": "self_attn.o_proj"
},
"model.layers.15.self_attn.o_proj": {
"snr": 0.2437991499900818,
"type": "self_attn.o_proj"
},
"model.layers.0.self_attn.q_proj": {
"snr": 0.038040731102228165,
"type": "self_attn.q_proj"
},
"model.layers.1.self_attn.q_proj": {
"snr": 0.0707998052239418,
"type": "self_attn.q_proj"
},
"model.layers.2.self_attn.q_proj": {
"snr": 0.0787411704659462,
"type": "self_attn.q_proj"
},
"model.layers.3.self_attn.q_proj": {
"snr": 0.08089710026979446,
"type": "self_attn.q_proj"
},
"model.layers.4.self_attn.q_proj": {
"snr": 0.08591937273740768,
"type": "self_attn.q_proj"
},
"model.layers.5.self_attn.q_proj": {
"snr": 0.09852176159620285,
"type": "self_attn.q_proj"
},
"model.layers.6.self_attn.q_proj": {
"snr": 0.09690654277801514,
"type": "self_attn.q_proj"
},
"model.layers.7.self_attn.q_proj": {
"snr": 0.11181341856718063,
"type": "self_attn.q_proj"
},
"model.layers.8.self_attn.q_proj": {
"snr": 0.12042108923196793,
"type": "self_attn.q_proj"
},
"model.layers.9.self_attn.q_proj": {
"snr": 0.09799323976039886,
"type": "self_attn.q_proj"
},
"model.layers.10.self_attn.q_proj": {
"snr": 0.10901063680648804,
"type": "self_attn.q_proj"
},
"model.layers.11.self_attn.q_proj": {
"snr": 0.09307146072387695,
"type": "self_attn.q_proj"
},
"model.layers.12.self_attn.q_proj": {
"snr": 0.0880950540304184,
"type": "self_attn.q_proj"
},
"model.layers.13.self_attn.q_proj": {
"snr": 0.08886399120092392,
"type": "self_attn.q_proj"
},
"model.layers.14.self_attn.q_proj": {
"snr": 0.09955056011676788,
"type": "self_attn.q_proj"
},
"model.layers.15.self_attn.q_proj": {
"snr": 0.08929339051246643,
"type": "self_attn.q_proj"
},
"model.layers.0.self_attn.v_proj": {
"snr": 2.5501928329467773,
"type": "self_attn.v_proj"
},
"model.layers.1.self_attn.v_proj": {
"snr": 9.449499130249023,
"type": "self_attn.v_proj"
},
"model.layers.2.self_attn.v_proj": {
"snr": 7.9920830726623535,
"type": "self_attn.v_proj"
},
"model.layers.3.self_attn.v_proj": {
"snr": 50.69462585449219,
"type": "self_attn.v_proj"
},
"model.layers.4.self_attn.v_proj": {
"snr": 19.083511352539062,
"type": "self_attn.v_proj"
},
"model.layers.5.self_attn.v_proj": {
"snr": 7.21597146987915,
"type": "self_attn.v_proj"
},
"model.layers.6.self_attn.v_proj": {
"snr": 11.27744197845459,
"type": "self_attn.v_proj"
},
"model.layers.7.self_attn.v_proj": {
"snr": 4.579711437225342,
"type": "self_attn.v_proj"
},
"model.layers.8.self_attn.v_proj": {
"snr": 10.940719604492188,
"type": "self_attn.v_proj"
},
"model.layers.9.self_attn.v_proj": {
"snr": 553.4417724609375,
"type": "self_attn.v_proj"
},
"model.layers.10.self_attn.v_proj": {
"snr": 20.59434700012207,
"type": "self_attn.v_proj"
},
"model.layers.11.self_attn.v_proj": {
"snr": 26.636865615844727,
"type": "self_attn.v_proj"
},
"model.layers.12.self_attn.v_proj": {
"snr": 8.614749908447266,
"type": "self_attn.v_proj"
},
"model.layers.13.self_attn.v_proj": {
"snr": 17.722007751464844,
"type": "self_attn.v_proj"
},
"model.layers.14.self_attn.v_proj": {
"snr": 1.48500657081604,
"type": "self_attn.v_proj"
},
"model.layers.15.self_attn.v_proj": {
"snr": 2.5776851177215576,
"type": "self_attn.v_proj"
}
}

View File

@@ -47,7 +47,7 @@ class BTChatTemplateStrategy(ChatTemplateStrategy):
if len(chosen_tokenized["input_ids"]) > max_length:
LOG.warning(
f"Chosen sequence exceeds max sequence length: {len(chosen_tokenized['input_ids'])}",
f"To-be-trimmed chosen sequence exceeds max sequence length: {len(chosen_tokenized['input_ids'])}",
)
chosen_tokenized["input_ids"] = chosen_tokenized["input_ids"][:max_length]
@@ -70,7 +70,7 @@ class BTChatTemplateStrategy(ChatTemplateStrategy):
if len(rejected_tokenized["input_ids"]) > max_length:
LOG.warning(
f"Rejected sequence exceeds max sequence length: {len(rejected_tokenized['input_ids'])}",
f"To-be-trimmed rejected sequence exceeds max sequence length: {len(rejected_tokenized['input_ids'])}",
)
rejected_tokenized["input_ids"] = rejected_tokenized["input_ids"][

View File

@@ -115,6 +115,9 @@ class RemappedParameters(BaseModel):
overrides_of_model_config: Optional[Dict[str, Any]] = Field(
default=None, alias="model_config"
)
overrides_of_model_kwargs: Optional[Dict[str, Any]] = Field(
default=None, alias="model_kwargs"
)
type_of_model: Optional[str] = Field(default=None, alias="model_type")
revision_of_model: Optional[str] = Field(default=None, alias="model_revision")
@@ -426,8 +429,6 @@ class ModelInputConfig(BaseModel):
)
trust_remote_code: Optional[bool] = None
model_kwargs: Optional[Dict[str, Any]] = None
@field_validator("trust_remote_code")
@classmethod
def hint_trust_remote_code(cls, trust_remote_code):

View File

@@ -46,6 +46,7 @@ from axolotl.utils.data.pretraining import wrap_pretraining_dataset
from axolotl.utils.data.shared import load_dataset_w_config
from axolotl.utils.data.utils import (
deduplicate_and_log_datasets,
drop_long_seq_in_dataset,
md5,
retry_on_request_exceptions,
)
@@ -56,7 +57,7 @@ from axolotl.utils.trainer import (
process_datasets_for_packing,
)
LOG = logging.getLogger("axolotl")
LOG = logging.getLogger(__name__)
@retry_on_request_exceptions(max_retries=3, delay=5)
@@ -339,8 +340,11 @@ def load_tokenized_prepared_datasets(
else:
LOG.debug("NOT shuffling merged datasets")
if cfg.sample_packing and not cfg.skip_prepare_dataset:
dataset, _ = process_datasets_for_packing(cfg, dataset, None)
if not cfg.skip_prepare_dataset:
dataset = drop_long_seq_in_dataset(dataset, cfg)
if cfg.sample_packing:
dataset, _ = process_datasets_for_packing(cfg, dataset, None)
if cfg.local_rank == 0 and not cfg.skip_prepare_dataset:
LOG.info(f"Saving merged prepared dataset to disk... {prepared_ds_path}")

View File

@@ -1,4 +1,5 @@
"""data handling helpers"""
import functools
import hashlib
import logging
@@ -6,10 +7,15 @@ import time
from enum import Enum
import huggingface_hub
import numpy as np
import requests
from datasets import Dataset
from datasets import Dataset, IterableDataset
LOG = logging.getLogger("axolotl")
from axolotl.utils.dict import DictDefault
from axolotl.utils.samplers.utils import get_dataset_lengths
from axolotl.utils.trainer import drop_long_seq
LOG = logging.getLogger(__name__)
class RetryStrategy(Enum):
@@ -150,3 +156,53 @@ def deduplicate_and_log_datasets(
)
return train_dataset, eval_dataset, dataset
def drop_long_seq_in_dataset(dataset: Dataset, cfg: DictDefault):
if "input_ids" not in dataset.column_names:
LOG.warning(
"Dataset does not contain 'input_ids' column. Skip drop long seq. This is expected for RewardModeling."
)
return dataset
drop_long = functools.partial(
drop_long_seq,
sequence_len=cfg.sequence_len,
min_sequence_len=cfg.min_sample_len,
)
try:
min_input_len = np.min(get_dataset_lengths(dataset))
LOG.debug(f"min_input_len: {min_input_len}")
max_input_len = np.max(get_dataset_lengths(dataset))
LOG.debug(f"max_input_len: {max_input_len}")
except AttributeError:
pass
try:
prior_len = len(dataset)
except TypeError:
# handle iterable datasets case
prior_len = None
filter_map_kwargs = {}
if not isinstance(dataset, IterableDataset):
filter_map_kwargs["num_proc"] = cfg.dataset_processes
filter_map_kwargs["load_from_cache_file"] = not cfg.is_preprocess
drop_long_kwargs = {}
if filter_map_kwargs:
drop_long_kwargs["desc"] = "Dropping Long Sequences"
dataset = dataset.filter(
drop_long,
batched=True,
**filter_map_kwargs,
**drop_long_kwargs,
)
if prior_len:
dropped = prior_len - len(dataset)
if dropped:
LOG.warning(f"Dropped {dropped} long samples from dataset")
return dataset
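For reference, a small self-contained usage sketch of the new helper (the dataset and config values are toy examples, not from this PR):

```python
# Hedged usage sketch of drop_long_seq_in_dataset with illustrative values.
from datasets import Dataset

from axolotl.utils.data.utils import drop_long_seq_in_dataset
from axolotl.utils.dict import DictDefault

cfg = DictDefault(
    {
        "sequence_len": 8,      # maximum tokens kept per sample
        "min_sample_len": 2,    # samples shorter than this are dropped
        "dataset_processes": 1,
        "is_preprocess": True,
    }
)

ds = Dataset.from_dict({"input_ids": [[1, 2, 3], [4] * 16, [5, 6, 7, 8]]})

filtered = drop_long_seq_in_dataset(ds, cfg)
print(len(filtered))  # expected: 2 (the 16-token sample exceeds sequence_len)
```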

View File

@@ -357,8 +357,8 @@ class ModelLoader:
# init model kwargs
self.model_kwargs: Dict[str, Any] = {}
if cfg.model_kwargs:
for key, val in cfg.model_kwargs.items():
if cfg.overrides_of_model_kwargs:
for key, val in cfg.overrides_of_model_kwargs.items():
self.model_kwargs[key] = val
# init model
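For context, a minimal hypothetical sketch of where these kwargs end up; the merge mirrors the loop above, and the model id and override value are placeholders:

```python
# Hypothetical illustration (not from this PR): with the rename, a YAML setting
# like `overrides_of_model_kwargs: {use_cache: false}` is merged into
# model_kwargs and forwarded to from_pretrained.
from transformers import AutoModelForCausalLM

model_kwargs = {}
overrides_of_model_kwargs = {"use_cache": False}  # illustrative override
for key, val in overrides_of_model_kwargs.items():
    model_kwargs[key] = val

model = AutoModelForCausalLM.from_pretrained(
    "HuggingFaceTB/SmolLM2-135M",  # placeholder model id
    **model_kwargs,
)
print(model.config.use_cache)  # expected: False
```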

View File

@@ -13,5 +13,4 @@ def get_dataset_lengths(dataset):
else:
input_ids = dataset.data.column("input_ids")
lengths = np.vectorize(len)(np.array(input_ids, dtype=object))
return lengths
return lengths

View File

@@ -1,4 +1,5 @@
"""Module containing the Trainer class and related functions"""
import json
import math
import os
@@ -210,6 +211,8 @@ def drop_long_seq(sample, sequence_len=2048, min_sequence_len=2):
Works for both single-example (list[int]) or batched (list[list[int]]).
"""
min_sequence_len = min_sequence_len or 2
input_ids = sample["input_ids"]
# Edge case: if input_ids is empty
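A hedged illustration of the predicate on a batched sample; the boolean output is inferred from its use with `dataset.filter(..., batched=True)`, and the values are toy examples:

```python
# Hedged illustration: drop_long_seq as a batched filter predicate.
from axolotl.utils.trainer import drop_long_seq

batch = {"input_ids": [[1, 2, 3], [1] * 4096]}
keep = drop_long_seq(batch, sequence_len=2048, min_sequence_len=2)
print(keep)  # expected: [True, False] (second sample exceeds sequence_len)
```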
@@ -232,20 +235,6 @@ def drop_long_seq(sample, sequence_len=2048, min_sequence_len=2):
def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
drop_long = partial(
drop_long_seq,
sequence_len=cfg.sequence_len,
min_sequence_len=cfg.min_sample_len or 2,
)
try:
min_input_len = np.min(get_dataset_lengths(train_dataset))
LOG.debug(f"min_input_len: {min_input_len}", main_process_only=True)
max_input_len = np.max(get_dataset_lengths(train_dataset))
LOG.debug(f"max_input_len: {max_input_len}", main_process_only=True)
except AttributeError:
pass
if cfg.model_config_type == "mamba":
LOG.info("dropping attention_mask column")
train_dataset = train_dataset.remove_columns("attention_mask")
@@ -259,46 +248,6 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
if eval_dataset and "token_type_ids" in eval_dataset.column_names:
eval_dataset = eval_dataset.remove_columns("token_type_ids")
filter_map_kwargs = {}
if not isinstance(train_dataset, IterableDataset):
filter_map_kwargs["num_proc"] = cfg.dataset_processes
filter_map_kwargs["load_from_cache_file"] = not cfg.is_preprocess
try:
prior_len = len(train_dataset)
except TypeError:
# handle iterable datasets case
prior_len = None
drop_long_kwargs = {}
if filter_map_kwargs:
drop_long_kwargs["desc"] = "Dropping Long Sequences"
train_dataset = train_dataset.filter(
drop_long,
batched=True,
**filter_map_kwargs,
**drop_long_kwargs,
)
if prior_len:
dropped = prior_len - len(train_dataset)
if dropped:
LOG.warning(f"Dropped {dropped} long samples from train dataset")
if eval_dataset:
try:
prior_len = len(eval_dataset)
except TypeError:
# handle iterable datasets case
prior_len = None
eval_dataset = eval_dataset.filter(
drop_long,
**filter_map_kwargs,
**drop_long_kwargs,
)
if prior_len:
dropped = prior_len - len(eval_dataset)
if dropped:
LOG.warning(f"Dropped {dropped} long samples from eval dataset")
def drop_no_trainable_tokens(sample):
"""
Drop samples if all labels are -100 (i.e., zero trainable tokens).
@@ -325,6 +274,11 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
except TypeError:
# handle iterable datasets case
prior_len = None
filter_map_kwargs = {}
if not isinstance(train_dataset, IterableDataset):
filter_map_kwargs["num_proc"] = cfg.dataset_processes
filter_map_kwargs["load_from_cache_file"] = not cfg.is_preprocess
drop_long_kwargs = {}
if filter_map_kwargs:
drop_long_kwargs["desc"] = "Drop Samples with Zero Trainable Tokens"

View File

@@ -33,7 +33,7 @@ class TestRewardModelLoraSmolLM2(unittest.TestCase):
"num_labels": 1,
"chat_template": "alpaca",
"reward_model": True,
"sequence_len": 1024,
"sequence_len": 2048,
"pad_to_sequence_len": True,
"adapter": "lora",
"lora_r": 8,