Compare commits
84 Commits
feat/linea
...
82d04ea060
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
82d04ea060 | ||
|
|
0ef1f011fe | ||
|
|
44f64ab627 | ||
|
|
826f1b1494 | ||
|
|
526e5ee8b8 | ||
|
|
fd8cb32547 | ||
|
|
e48e2df4dd | ||
|
|
b7616022ab | ||
|
|
1faf1a5c5a | ||
|
|
c0a1d205c7 | ||
|
|
5bbad5ef93 | ||
|
|
a971eb4ce6 | ||
|
|
d0e739da24 | ||
|
|
3f6be519d5 | ||
|
|
adcbc7459b | ||
|
|
470ba65c44 | ||
|
|
a620d481e2 | ||
|
|
8e1adc154d | ||
|
|
e5b36900e4 | ||
|
|
9f6c89b12b | ||
|
|
b0871c8d3b | ||
|
|
d3ea379a23 | ||
|
|
0ebab63309 | ||
|
|
e98581f6f5 | ||
|
|
b832b11c8f | ||
|
|
b692d394b1 | ||
|
|
2319e5276d | ||
|
|
9a43a0925d | ||
|
|
10de67e8ea | ||
|
|
fa7355404c | ||
|
|
907424a2e8 | ||
|
|
3f4fd3c1eb | ||
|
|
48c3c47071 | ||
|
|
3ed9c117fb | ||
|
|
84960003ed | ||
|
|
93a268e43d | ||
|
|
065f6d477e | ||
|
|
96ad741cd5 | ||
|
|
ba88bc7840 | ||
|
|
b31796a681 | ||
|
|
5ca57cb55a | ||
|
|
0149de7fb0 | ||
|
|
8c34c65181 | ||
|
|
555aa5772a | ||
|
|
e8b2789086 | ||
|
|
85752cdfc9 | ||
|
|
f2f23c8041 | ||
|
|
8b3eec7f6e | ||
|
|
bb9bea3110 | ||
|
|
0dd18a3681 | ||
|
|
152e988d3c | ||
|
|
27532825a9 | ||
|
|
06f83a54a5 | ||
|
|
d7b133dc1f | ||
|
|
f3bec17917 | ||
|
|
b7deb5241c | ||
|
|
cee310dcfa | ||
|
|
d1be6e228d | ||
|
|
5f9f77f384 | ||
|
|
b2a34380b3 | ||
|
|
80bfc50d1f | ||
|
|
a5360c172c | ||
|
|
013a9b73fc | ||
|
|
aad62428e0 | ||
|
|
a6f2c5d583 | ||
|
|
dbcd11e533 | ||
|
|
c06a6be915 | ||
|
|
d3a0cb5edb | ||
|
|
8b47e456b0 | ||
|
|
2319ac729c | ||
|
|
f99cae0e7b | ||
|
|
888cd9407f | ||
|
|
bd62d6e10a | ||
|
|
5eae134110 | ||
|
|
b7d27bdfa4 | ||
|
|
da97a21bdc | ||
|
|
e0d4b88598 | ||
|
|
fac059a209 | ||
|
|
9c9ac1cf0b | ||
|
|
2346f21b2b | ||
|
|
0b47281f51 | ||
|
|
543daaf46f | ||
|
|
bcd9ad44e0 | ||
|
|
61ad375bf4 |
12
.github/workflows/base.yml
vendored
12
.github/workflows/base.yml
vendored
@@ -22,12 +22,6 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- cuda: "124"
|
||||
cuda_version: 12.4.1
|
||||
cudnn_version: ""
|
||||
python_version: "3.10"
|
||||
pytorch: 2.4.1
|
||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||
- cuda: "124"
|
||||
cuda_version: 12.4.1
|
||||
cudnn_version: ""
|
||||
@@ -40,6 +34,12 @@ jobs:
|
||||
python_version: "3.11"
|
||||
pytorch: 2.5.1
|
||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||
- cuda: "124"
|
||||
cuda_version: 12.4.1
|
||||
cudnn_version: ""
|
||||
python_version: "3.11"
|
||||
pytorch: 2.6.0
|
||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
2
.github/workflows/docs.yml
vendored
2
.github/workflows/docs.yml
vendored
@@ -19,7 +19,7 @@ jobs:
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.10'
|
||||
python-version: '3.11'
|
||||
- name: install dependencies
|
||||
run: |
|
||||
python3 -m pip install jupyter
|
||||
|
||||
2
.github/workflows/lint.yml
vendored
2
.github/workflows/lint.yml
vendored
@@ -19,6 +19,6 @@ jobs:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.10"
|
||||
python-version: "3.11"
|
||||
cache: 'pip' # caching pip dependencies
|
||||
- uses: pre-commit/action@v3.0.1
|
||||
|
||||
5
.github/workflows/main.yml
vendored
5
.github/workflows/main.yml
vendored
@@ -26,6 +26,11 @@ jobs:
|
||||
pytorch: 2.5.1
|
||||
axolotl_extras:
|
||||
is_latest: true
|
||||
- cuda: 124
|
||||
cuda_version: 12.4.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.6.0
|
||||
axolotl_extras:
|
||||
runs-on: axolotl-gpu-runner
|
||||
steps:
|
||||
- name: Checkout
|
||||
|
||||
9
.github/workflows/multi-gpu-e2e.yml
vendored
9
.github/workflows/multi-gpu-e2e.yml
vendored
@@ -34,6 +34,13 @@ jobs:
|
||||
axolotl_extras:
|
||||
num_gpus: 2
|
||||
nightly_build: "true"
|
||||
- cuda: 124
|
||||
cuda_version: 12.4.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.6.0
|
||||
axolotl_extras:
|
||||
num_gpus: 2
|
||||
nightly_build: "true"
|
||||
runs-on: [self-hosted, modal]
|
||||
timeout-minutes: 120
|
||||
steps:
|
||||
@@ -42,7 +49,7 @@ jobs:
|
||||
- name: Install Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.10"
|
||||
python-version: "3.11"
|
||||
- name: Install Modal
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
|
||||
5
.github/workflows/nightlies.yml
vendored
5
.github/workflows/nightlies.yml
vendored
@@ -22,6 +22,11 @@ jobs:
|
||||
python_version: "3.11"
|
||||
pytorch: 2.5.1
|
||||
axolotl_extras:
|
||||
- cuda: 124
|
||||
cuda_version: 12.4.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.6.0
|
||||
axolotl_extras:
|
||||
runs-on: axolotl-gpu-runner
|
||||
steps:
|
||||
- name: Checkout
|
||||
|
||||
2
.github/workflows/pypi.yml
vendored
2
.github/workflows/pypi.yml
vendored
@@ -36,7 +36,7 @@ jobs:
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.10"
|
||||
python-version: "3.11"
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
|
||||
20
.github/workflows/tests-nightly.yml
vendored
20
.github/workflows/tests-nightly.yml
vendored
@@ -12,7 +12,7 @@ jobs:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.10"
|
||||
python-version: "3.11"
|
||||
cache: 'pip' # caching pip dependencies
|
||||
- uses: pre-commit/action@v3.0.1
|
||||
env:
|
||||
@@ -25,13 +25,8 @@ jobs:
|
||||
fail-fast: false
|
||||
max-parallel: 2
|
||||
matrix:
|
||||
python_version: ["3.10", "3.11"]
|
||||
pytorch_version: ["2.4.1", "2.5.1"]
|
||||
exclude:
|
||||
- python_version: "3.10"
|
||||
pytorch_version: "2.4.1"
|
||||
- python_version: "3.10"
|
||||
pytorch_version: "2.5.1"
|
||||
python_version: ["3.11"]
|
||||
pytorch_version: ["2.4.1", "2.5.1", "2.6.0"]
|
||||
timeout-minutes: 20
|
||||
|
||||
steps:
|
||||
@@ -112,13 +107,20 @@ jobs:
|
||||
num_gpus: 1
|
||||
axolotl_extras:
|
||||
nightly_build: "true"
|
||||
- cuda: 124
|
||||
cuda_version: 12.4.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.6.0
|
||||
num_gpus: 1
|
||||
axolotl_extras:
|
||||
nightly_build: "true"
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
- name: Install Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.10"
|
||||
python-version: "3.11"
|
||||
- name: Install Modal
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
|
||||
23
.github/workflows/tests.yml
vendored
23
.github/workflows/tests.yml
vendored
@@ -35,7 +35,7 @@ jobs:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.10"
|
||||
python-version: "3.11"
|
||||
cache: 'pip' # caching pip dependencies
|
||||
- uses: pre-commit/action@v3.0.1
|
||||
env:
|
||||
@@ -48,13 +48,8 @@ jobs:
|
||||
fail-fast: false
|
||||
max-parallel: 2
|
||||
matrix:
|
||||
python_version: ["3.10", "3.11"]
|
||||
pytorch_version: ["2.4.1", "2.5.1"]
|
||||
exclude:
|
||||
- python_version: "3.10"
|
||||
pytorch_version: "2.4.1"
|
||||
- python_version: "3.10"
|
||||
pytorch_version: "2.5.1"
|
||||
python_version: ["3.11"]
|
||||
pytorch_version: ["2.4.1", "2.5.1", "2.6.0"]
|
||||
timeout-minutes: 20
|
||||
|
||||
steps:
|
||||
@@ -127,7 +122,7 @@ jobs:
|
||||
max-parallel: 1
|
||||
matrix:
|
||||
python_version: ["3.11"]
|
||||
pytorch_version: ["2.4.1", "2.5.1"]
|
||||
pytorch_version: ["2.4.1", "2.5.1", "2.6.0"]
|
||||
timeout-minutes: 20
|
||||
|
||||
steps:
|
||||
@@ -216,7 +211,7 @@ jobs:
|
||||
- name: Install Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.10"
|
||||
python-version: "3.11"
|
||||
- name: Install Modal
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
@@ -251,13 +246,19 @@ jobs:
|
||||
pytorch: 2.4.1
|
||||
num_gpus: 1
|
||||
axolotl_extras:
|
||||
- cuda: 124
|
||||
cuda_version: 12.4.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.6.0
|
||||
num_gpus: 1
|
||||
axolotl_extras:
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
- name: Install Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.10"
|
||||
python-version: "3.11"
|
||||
- name: Install Modal
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
|
||||
@@ -51,7 +51,7 @@ Features:
|
||||
|
||||
**Requirements**:
|
||||
- NVIDIA GPU (Ampere or newer for `bf16` and Flash Attention) or AMD GPU
|
||||
- Python ≥3.10
|
||||
- Python 3.11
|
||||
- PyTorch ≥2.4.1
|
||||
|
||||
### Installation
|
||||
|
||||
@@ -46,6 +46,10 @@ overrides_of_model_config:
|
||||
type: # linear | dynamic
|
||||
factor: # float
|
||||
|
||||
# optional overrides the base model loading from_pretrained
|
||||
overrides_of_model_kwargs:
|
||||
# use_cache: False
|
||||
|
||||
# optional overrides to the bnb 4bit quantization configuration
|
||||
# https://huggingface.co/docs/transformers/main/main_classes/quantization#transformers.BitsAndBytesConfig
|
||||
bnb_config_kwargs:
|
||||
|
||||
@@ -19,3 +19,7 @@ description: Frequently asked questions
|
||||
**Q: AttributeError: 'DummyOptim' object has no attribute 'step'**
|
||||
|
||||
> A: You may be using deepspeed with single gpu. Please don't set `deepspeed:` in yaml or cli.
|
||||
|
||||
**Q: The codes is stuck on saving preprocessed datasets.**
|
||||
|
||||
> A: This is usually an issue with the GPU. This can be resolved through setting the os environment variable `CUDA_VISIBLE_DEVICES=0`. If you are on runpod, this is usually a pod issue. Starting a new pod should take care of it.
|
||||
|
||||
@@ -3,6 +3,18 @@ title: Multi Node
|
||||
description: How to use Axolotl on multiple machines
|
||||
---
|
||||
|
||||
The below are three ways to train multi-node in Axolotl.
|
||||
|
||||
::: {.callout-important}
|
||||
Each machine needs a copy of Axolotl, we suggest using the same commit to ensure compatibility.
|
||||
|
||||
You will also need to have the same configuration file for your model on each machine.
|
||||
|
||||
Make sure the main machine is reachable by other machines.
|
||||
:::
|
||||
|
||||
# Accelerate
|
||||
|
||||
You will need to create a configuration for accelerate, either by using `accelerate config` and follow the instructions or you can use one of the preset below:
|
||||
|
||||
~/.cache/huggingface/accelerate/default_config.yaml
|
||||
@@ -26,7 +38,7 @@ tpu_use_sudo: false
|
||||
use_cpu: false
|
||||
```
|
||||
|
||||
Configure your model to use FSDP with for example:
|
||||
Configure your model to use FSDP in the Axolotl yaml. For example:
|
||||
```yaml
|
||||
fsdp:
|
||||
- full_shard
|
||||
@@ -37,12 +49,40 @@ fsdp_config:
|
||||
fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
|
||||
```
|
||||
|
||||
## Machine configuration
|
||||
|
||||
On each machine you need a copy of Axolotl, we suggest using the same commit to ensure compatibility.
|
||||
|
||||
You will also need to have the same configuration file for your model on each machine.
|
||||
|
||||
On the main machine only, make sure the port you set as `main_process_port` is open in TCP and reachable by other machines.
|
||||
|
||||
All you have to do now is launch using accelerate as you would usually do on each machine and voila, the processes will start once you have launched accelerate on every machine.
|
||||
|
||||
# Raytrain
|
||||
|
||||
Please see ray train doc [here](ray-integration.qmd).
|
||||
|
||||
# Torchrun
|
||||
|
||||
If you are using Infiniband, we recommend torchrun to utilize the full bandwidth.
|
||||
|
||||
Set the following env (change buffersize/socketname depending on your system):
|
||||
|
||||
```yaml
|
||||
export NCCL_IB_DISABLE=0
|
||||
export NCCL_SOCKET_IFNAME="eth0,en,eth,em,bond"
|
||||
export NCCL_BUFFSIZE=2097152
|
||||
```
|
||||
|
||||
Run the following on each node:
|
||||
|
||||
```bash
|
||||
torchrun --nnodes $num_nodes --nproc_per_node $gpu_per_node --rdzv_id $rdzv_id --rdzv_backend c10d --rdzv_endpoint "$head_node_ip:$head_node_port" -m axolotl.cli.train config.yaml
|
||||
```
|
||||
|
||||
Please make sure to substitute the placeholder variables.
|
||||
|
||||
- `num_nodes`: Number of nodes (containing GPUs)
|
||||
- `gpu_per_node`: Number of gpus per node
|
||||
- `head_node_ip`: IP of the head node (make sure other machines can connect to this)
|
||||
- `head_node_port`: Port of the head node (make sure other machines can connect to this. Default 29400)
|
||||
- `rdzv_id`: A unique job ID that is used by the job across nodes.
|
||||
|
||||
::: {.callout-note}
|
||||
You need to call `axolotl.cli.train` instead of `axolotl train` as the latter calls accelerate under the hood
|
||||
:::
|
||||
|
||||
More info on the available configs can be found on the Pytorch docs [here](https://pytorch.org/docs/stable/elastic/run.html)
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
--extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
|
||||
|
||||
# START section of dependencies that don't install on Darwin/MacOS
|
||||
bitsandbytes==0.45.1
|
||||
bitsandbytes==0.45.2
|
||||
triton>=3.0.0
|
||||
mamba-ssm==1.2.0.post1
|
||||
flash-attn==2.7.0.post2
|
||||
flash-attn==2.7.4.post1
|
||||
xformers>=0.0.23.post1
|
||||
autoawq==0.2.7.post3
|
||||
liger-kernel==0.5.2
|
||||
@@ -13,7 +13,7 @@ liger-kernel==0.5.2
|
||||
packaging==23.2
|
||||
|
||||
peft==0.14.0
|
||||
transformers==4.48.1
|
||||
transformers==4.48.3
|
||||
tokenizers>=0.21.0
|
||||
accelerate==1.3.0
|
||||
datasets==3.2.0
|
||||
|
||||
7
setup.py
7
setup.py
@@ -71,12 +71,15 @@ def parse_requirements():
|
||||
else:
|
||||
raise ValueError("Invalid version format")
|
||||
|
||||
if (major, minor) >= (2, 5):
|
||||
if (major, minor) >= (2, 6):
|
||||
_install_requires.pop(_install_requires.index(xformers_version))
|
||||
_install_requires.append("xformers==0.0.29.post2")
|
||||
elif (major, minor) >= (2, 5):
|
||||
_install_requires.pop(_install_requires.index(xformers_version))
|
||||
if patch == 0:
|
||||
_install_requires.append("xformers==0.0.28.post2")
|
||||
else:
|
||||
_install_requires.append("xformers==0.0.28.post3")
|
||||
_install_requires.append("xformers==0.0.29")
|
||||
_install_requires.pop(_install_requires.index(autoawq_version))
|
||||
elif (major, minor) >= (2, 4):
|
||||
if patch == 0:
|
||||
|
||||
@@ -79,6 +79,7 @@ from axolotl.utils.chat_templates import get_chat_template_from_config
|
||||
from axolotl.utils.collators import (
|
||||
BatchSamplerDataCollatorForSeq2Seq,
|
||||
DataCollatorForSeq2Seq,
|
||||
FlexBatchSamplerDataCollatorForSeq2Seq,
|
||||
MambaDataCollator,
|
||||
V2BatchSamplerDataCollatorForSeq2Seq,
|
||||
)
|
||||
@@ -816,6 +817,7 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
|
||||
Union[
|
||||
V2BatchSamplerDataCollatorForSeq2Seq,
|
||||
BatchSamplerDataCollatorForSeq2Seq,
|
||||
FlexBatchSamplerDataCollatorForSeq2Seq,
|
||||
DataCollatorForSeq2Seq,
|
||||
DataCollatorWithFlattening,
|
||||
RewardDataCollatorWithPadding,
|
||||
@@ -827,7 +829,9 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
|
||||
if "max_length" in kwargs:
|
||||
kwargs.pop("max_length")
|
||||
elif use_batch_sampler_collator:
|
||||
if self.cfg.model_config_type in SUPPORTED_MULTIPACK_MODEL_TYPES:
|
||||
if self.cfg.flex_attention is True:
|
||||
collator = V2BatchSamplerDataCollatorForSeq2Seq
|
||||
elif self.cfg.model_config_type in SUPPORTED_MULTIPACK_MODEL_TYPES:
|
||||
collator = V2BatchSamplerDataCollatorForSeq2Seq
|
||||
elif (
|
||||
self.cfg.model_config_type in ["llama"]
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,590 @@
|
||||
{
|
||||
"model.layers.0.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.1.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.2.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.3.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.4.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.5.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.6.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.7.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.8.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.9.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.10.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.11.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.12.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.13.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.14.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.15.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"lm_head": {
|
||||
"snr": Infinity,
|
||||
"type": "lm_head"
|
||||
},
|
||||
"model.layers.0.mlp.down_proj": {
|
||||
"snr": 70.0594253540039,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.1.mlp.down_proj": {
|
||||
"snr": 11.135851860046387,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.2.mlp.down_proj": {
|
||||
"snr": 7.035482883453369,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.3.mlp.down_proj": {
|
||||
"snr": 6.422532081604004,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.4.mlp.down_proj": {
|
||||
"snr": 5.748020172119141,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.5.mlp.down_proj": {
|
||||
"snr": 3.885556697845459,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.6.mlp.down_proj": {
|
||||
"snr": 3.4336745738983154,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.7.mlp.down_proj": {
|
||||
"snr": 2.791595935821533,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.8.mlp.down_proj": {
|
||||
"snr": 5.36277961730957,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.9.mlp.down_proj": {
|
||||
"snr": 4.459208011627197,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.10.mlp.down_proj": {
|
||||
"snr": 6.272170066833496,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.11.mlp.down_proj": {
|
||||
"snr": 5.264761447906494,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.12.mlp.down_proj": {
|
||||
"snr": 4.324735641479492,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.13.mlp.down_proj": {
|
||||
"snr": 3.878648042678833,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.14.mlp.down_proj": {
|
||||
"snr": 2.9773054122924805,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.15.mlp.down_proj": {
|
||||
"snr": 4.471445560455322,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.0.mlp.gate_proj": {
|
||||
"snr": 25.227100372314453,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.1.mlp.gate_proj": {
|
||||
"snr": 6.58299446105957,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.2.mlp.gate_proj": {
|
||||
"snr": 3.4688243865966797,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.3.mlp.gate_proj": {
|
||||
"snr": 1.555246114730835,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.4.mlp.gate_proj": {
|
||||
"snr": 0.7770601511001587,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.5.mlp.gate_proj": {
|
||||
"snr": 0.6239906549453735,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.6.mlp.gate_proj": {
|
||||
"snr": 0.6440379023551941,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.7.mlp.gate_proj": {
|
||||
"snr": 0.5120116472244263,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.8.mlp.gate_proj": {
|
||||
"snr": 0.6544050574302673,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.9.mlp.gate_proj": {
|
||||
"snr": 0.5381016731262207,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.10.mlp.gate_proj": {
|
||||
"snr": 0.622873842716217,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.11.mlp.gate_proj": {
|
||||
"snr": 0.9361700415611267,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.12.mlp.gate_proj": {
|
||||
"snr": 1.475605845451355,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.13.mlp.gate_proj": {
|
||||
"snr": 1.608325719833374,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.14.mlp.gate_proj": {
|
||||
"snr": 1.0720024108886719,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.15.mlp.gate_proj": {
|
||||
"snr": 0.7111338973045349,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.0.mlp.up_proj": {
|
||||
"snr": 28.431896209716797,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.1.mlp.up_proj": {
|
||||
"snr": 15.546019554138184,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.2.mlp.up_proj": {
|
||||
"snr": 23.048023223876953,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.3.mlp.up_proj": {
|
||||
"snr": 25.790977478027344,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.4.mlp.up_proj": {
|
||||
"snr": 18.552549362182617,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.5.mlp.up_proj": {
|
||||
"snr": 8.85106372833252,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.6.mlp.up_proj": {
|
||||
"snr": 10.653799057006836,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.7.mlp.up_proj": {
|
||||
"snr": 7.365357875823975,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.8.mlp.up_proj": {
|
||||
"snr": 11.98373794555664,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.9.mlp.up_proj": {
|
||||
"snr": 8.04493236541748,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.10.mlp.up_proj": {
|
||||
"snr": 8.523039817810059,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.11.mlp.up_proj": {
|
||||
"snr": 5.381742477416992,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.12.mlp.up_proj": {
|
||||
"snr": 3.9845118522644043,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.13.mlp.up_proj": {
|
||||
"snr": 3.4893221855163574,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.14.mlp.up_proj": {
|
||||
"snr": 1.764201045036316,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.15.mlp.up_proj": {
|
||||
"snr": 0.9730708599090576,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.embed_tokens": {
|
||||
"snr": Infinity,
|
||||
"type": "model.embed_tokens"
|
||||
},
|
||||
"model.norm": {
|
||||
"snr": Infinity,
|
||||
"type": "model.norm"
|
||||
},
|
||||
"model.layers.0.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.1.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.2.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.3.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.4.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.5.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.6.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.7.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.8.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.9.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.10.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.11.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.12.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.13.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.14.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.15.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.0.self_attn.k_proj": {
|
||||
"snr": 0.11727584153413773,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.1.self_attn.k_proj": {
|
||||
"snr": 0.24786807596683502,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.2.self_attn.k_proj": {
|
||||
"snr": 0.36378130316734314,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.3.self_attn.k_proj": {
|
||||
"snr": 0.2983120381832123,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.4.self_attn.k_proj": {
|
||||
"snr": 0.33789733052253723,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.5.self_attn.k_proj": {
|
||||
"snr": 0.29155924916267395,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.6.self_attn.k_proj": {
|
||||
"snr": 0.2537297010421753,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.7.self_attn.k_proj": {
|
||||
"snr": 0.28204113245010376,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.8.self_attn.k_proj": {
|
||||
"snr": 0.2776711583137512,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.9.self_attn.k_proj": {
|
||||
"snr": 0.2927376627922058,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.10.self_attn.k_proj": {
|
||||
"snr": 0.31486213207244873,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.11.self_attn.k_proj": {
|
||||
"snr": 0.32363659143447876,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.12.self_attn.k_proj": {
|
||||
"snr": 0.31382912397384644,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.13.self_attn.k_proj": {
|
||||
"snr": 0.4635234773159027,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.14.self_attn.k_proj": {
|
||||
"snr": 0.25379249453544617,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.15.self_attn.k_proj": {
|
||||
"snr": 0.2628238797187805,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.0.self_attn.o_proj": {
|
||||
"snr": 0.27602291107177734,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.1.self_attn.o_proj": {
|
||||
"snr": 0.2149604707956314,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.2.self_attn.o_proj": {
|
||||
"snr": 0.2540294826030731,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.3.self_attn.o_proj": {
|
||||
"snr": 0.27978822588920593,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.4.self_attn.o_proj": {
|
||||
"snr": 0.3121289908885956,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.5.self_attn.o_proj": {
|
||||
"snr": 0.35037684440612793,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.6.self_attn.o_proj": {
|
||||
"snr": 0.366205096244812,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.7.self_attn.o_proj": {
|
||||
"snr": 0.3692712187767029,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.8.self_attn.o_proj": {
|
||||
"snr": 0.3301038146018982,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.9.self_attn.o_proj": {
|
||||
"snr": 0.3003396987915039,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.10.self_attn.o_proj": {
|
||||
"snr": 0.30804169178009033,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.11.self_attn.o_proj": {
|
||||
"snr": 0.28501132130622864,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.12.self_attn.o_proj": {
|
||||
"snr": 0.2171541005373001,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.13.self_attn.o_proj": {
|
||||
"snr": 0.19183959066867828,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.14.self_attn.o_proj": {
|
||||
"snr": 0.19215913116931915,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.15.self_attn.o_proj": {
|
||||
"snr": 0.25486502051353455,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.0.self_attn.q_proj": {
|
||||
"snr": 0.03850084915757179,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.1.self_attn.q_proj": {
|
||||
"snr": 0.0713055431842804,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.2.self_attn.q_proj": {
|
||||
"snr": 0.07948919385671616,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.3.self_attn.q_proj": {
|
||||
"snr": 0.08047746121883392,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.4.self_attn.q_proj": {
|
||||
"snr": 0.0852593332529068,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.5.self_attn.q_proj": {
|
||||
"snr": 0.09794823825359344,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.6.self_attn.q_proj": {
|
||||
"snr": 0.09627152234315872,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.7.self_attn.q_proj": {
|
||||
"snr": 0.11065381020307541,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.8.self_attn.q_proj": {
|
||||
"snr": 0.12031875550746918,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.9.self_attn.q_proj": {
|
||||
"snr": 0.09804573655128479,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.10.self_attn.q_proj": {
|
||||
"snr": 0.10897502303123474,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.11.self_attn.q_proj": {
|
||||
"snr": 0.09267337620258331,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.12.self_attn.q_proj": {
|
||||
"snr": 0.08803492039442062,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.13.self_attn.q_proj": {
|
||||
"snr": 0.0902542844414711,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.14.self_attn.q_proj": {
|
||||
"snr": 0.10154066979885101,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.15.self_attn.q_proj": {
|
||||
"snr": 0.09083802253007889,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.0.self_attn.v_proj": {
|
||||
"snr": 2.842210054397583,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.1.self_attn.v_proj": {
|
||||
"snr": 10.59461498260498,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.2.self_attn.v_proj": {
|
||||
"snr": 8.993025779724121,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.3.self_attn.v_proj": {
|
||||
"snr": 62.567787170410156,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.4.self_attn.v_proj": {
|
||||
"snr": 23.80082893371582,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.5.self_attn.v_proj": {
|
||||
"snr": 7.957369804382324,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.6.self_attn.v_proj": {
|
||||
"snr": 12.01815414428711,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.7.self_attn.v_proj": {
|
||||
"snr": 5.095500469207764,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.8.self_attn.v_proj": {
|
||||
"snr": 11.719332695007324,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.9.self_attn.v_proj": {
|
||||
"snr": 555.0869750976562,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.10.self_attn.v_proj": {
|
||||
"snr": 22.95538330078125,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.11.self_attn.v_proj": {
|
||||
"snr": 30.042158126831055,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.12.self_attn.v_proj": {
|
||||
"snr": 9.577271461486816,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.13.self_attn.v_proj": {
|
||||
"snr": 18.176361083984375,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.14.self_attn.v_proj": {
|
||||
"snr": 1.5695856809616089,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.15.self_attn.v_proj": {
|
||||
"snr": 2.7235565185546875,
|
||||
"type": "self_attn.v_proj"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,590 @@
|
||||
{
|
||||
"model.layers.0.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.1.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.2.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.3.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.4.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.5.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.6.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.7.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.8.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.9.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.10.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.11.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.12.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.13.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.14.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.15.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"lm_head": {
|
||||
"snr": Infinity,
|
||||
"type": "lm_head"
|
||||
},
|
||||
"model.layers.0.mlp.down_proj": {
|
||||
"snr": 57.09797286987305,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.1.mlp.down_proj": {
|
||||
"snr": 9.538983345031738,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.2.mlp.down_proj": {
|
||||
"snr": 6.227016925811768,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.3.mlp.down_proj": {
|
||||
"snr": 5.660686492919922,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.4.mlp.down_proj": {
|
||||
"snr": 5.178432464599609,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.5.mlp.down_proj": {
|
||||
"snr": 3.5638349056243896,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.6.mlp.down_proj": {
|
||||
"snr": 3.0918056964874268,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.7.mlp.down_proj": {
|
||||
"snr": 2.456392288208008,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.8.mlp.down_proj": {
|
||||
"snr": 4.525328636169434,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.9.mlp.down_proj": {
|
||||
"snr": 3.9409055709838867,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.10.mlp.down_proj": {
|
||||
"snr": 5.447249412536621,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.11.mlp.down_proj": {
|
||||
"snr": 4.807600975036621,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.12.mlp.down_proj": {
|
||||
"snr": 3.915374517440796,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.13.mlp.down_proj": {
|
||||
"snr": 3.4820363521575928,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.14.mlp.down_proj": {
|
||||
"snr": 2.6045074462890625,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.15.mlp.down_proj": {
|
||||
"snr": 3.7237701416015625,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.0.mlp.gate_proj": {
|
||||
"snr": 22.160131454467773,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.1.mlp.gate_proj": {
|
||||
"snr": 6.072206020355225,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.2.mlp.gate_proj": {
|
||||
"snr": 3.2467362880706787,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.3.mlp.gate_proj": {
|
||||
"snr": 1.4111896753311157,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.4.mlp.gate_proj": {
|
||||
"snr": 0.7405938506126404,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.5.mlp.gate_proj": {
|
||||
"snr": 0.5916463136672974,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.6.mlp.gate_proj": {
|
||||
"snr": 0.6149423718452454,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.7.mlp.gate_proj": {
|
||||
"snr": 0.48369669914245605,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.8.mlp.gate_proj": {
|
||||
"snr": 0.6047574877738953,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.9.mlp.gate_proj": {
|
||||
"snr": 0.5092479586601257,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.10.mlp.gate_proj": {
|
||||
"snr": 0.5999670624732971,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.11.mlp.gate_proj": {
|
||||
"snr": 0.8980127573013306,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.12.mlp.gate_proj": {
|
||||
"snr": 1.4252448081970215,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.13.mlp.gate_proj": {
|
||||
"snr": 1.509937047958374,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.14.mlp.gate_proj": {
|
||||
"snr": 1.0066585540771484,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.15.mlp.gate_proj": {
|
||||
"snr": 0.6413647532463074,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.0.mlp.up_proj": {
|
||||
"snr": 26.08852195739746,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.1.mlp.up_proj": {
|
||||
"snr": 13.382951736450195,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.2.mlp.up_proj": {
|
||||
"snr": 20.088768005371094,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.3.mlp.up_proj": {
|
||||
"snr": 23.0632381439209,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.4.mlp.up_proj": {
|
||||
"snr": 16.07433319091797,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.5.mlp.up_proj": {
|
||||
"snr": 8.00507640838623,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.6.mlp.up_proj": {
|
||||
"snr": 9.538354873657227,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.7.mlp.up_proj": {
|
||||
"snr": 6.286602973937988,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.8.mlp.up_proj": {
|
||||
"snr": 10.092820167541504,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.9.mlp.up_proj": {
|
||||
"snr": 7.193963527679443,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.10.mlp.up_proj": {
|
||||
"snr": 7.320116996765137,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.11.mlp.up_proj": {
|
||||
"snr": 4.8728532791137695,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.12.mlp.up_proj": {
|
||||
"snr": 3.596583366394043,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.13.mlp.up_proj": {
|
||||
"snr": 3.166161298751831,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.14.mlp.up_proj": {
|
||||
"snr": 1.5600818395614624,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.15.mlp.up_proj": {
|
||||
"snr": 0.8726214170455933,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.embed_tokens": {
|
||||
"snr": Infinity,
|
||||
"type": "model.embed_tokens"
|
||||
},
|
||||
"model.norm": {
|
||||
"snr": Infinity,
|
||||
"type": "model.norm"
|
||||
},
|
||||
"model.layers.0.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.1.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.2.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.3.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.4.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.5.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.6.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.7.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.8.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.9.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.10.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.11.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.12.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.13.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.14.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.15.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.0.self_attn.k_proj": {
|
||||
"snr": 0.1154392883181572,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.1.self_attn.k_proj": {
|
||||
"snr": 0.24299409985542297,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.2.self_attn.k_proj": {
|
||||
"snr": 0.3624322712421417,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.3.self_attn.k_proj": {
|
||||
"snr": 0.29509487748146057,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.4.self_attn.k_proj": {
|
||||
"snr": 0.32953736186027527,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.5.self_attn.k_proj": {
|
||||
"snr": 0.2908833622932434,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.6.self_attn.k_proj": {
|
||||
"snr": 0.2488437294960022,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.7.self_attn.k_proj": {
|
||||
"snr": 0.27847856283187866,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.8.self_attn.k_proj": {
|
||||
"snr": 0.27143892645835876,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.9.self_attn.k_proj": {
|
||||
"snr": 0.28804272413253784,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.10.self_attn.k_proj": {
|
||||
"snr": 0.31197959184646606,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.11.self_attn.k_proj": {
|
||||
"snr": 0.3203586935997009,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.12.self_attn.k_proj": {
|
||||
"snr": 0.30905747413635254,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.13.self_attn.k_proj": {
|
||||
"snr": 0.46828722953796387,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.14.self_attn.k_proj": {
|
||||
"snr": 0.24205778539180756,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.15.self_attn.k_proj": {
|
||||
"snr": 0.2559327781200409,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.0.self_attn.o_proj": {
|
||||
"snr": 0.2638678550720215,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.1.self_attn.o_proj": {
|
||||
"snr": 0.21109595894813538,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.2.self_attn.o_proj": {
|
||||
"snr": 0.24751724302768707,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.3.self_attn.o_proj": {
|
||||
"snr": 0.2728094160556793,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.4.self_attn.o_proj": {
|
||||
"snr": 0.3001374304294586,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.5.self_attn.o_proj": {
|
||||
"snr": 0.33903488516807556,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.6.self_attn.o_proj": {
|
||||
"snr": 0.3530929982662201,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.7.self_attn.o_proj": {
|
||||
"snr": 0.36753255128860474,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.8.self_attn.o_proj": {
|
||||
"snr": 0.3373180329799652,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.9.self_attn.o_proj": {
|
||||
"snr": 0.2970578670501709,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.10.self_attn.o_proj": {
|
||||
"snr": 0.3076324760913849,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.11.self_attn.o_proj": {
|
||||
"snr": 0.2766900658607483,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.12.self_attn.o_proj": {
|
||||
"snr": 0.20973259210586548,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.13.self_attn.o_proj": {
|
||||
"snr": 0.18185566365718842,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.14.self_attn.o_proj": {
|
||||
"snr": 0.18329747021198273,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.15.self_attn.o_proj": {
|
||||
"snr": 0.2437991499900818,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.0.self_attn.q_proj": {
|
||||
"snr": 0.038040731102228165,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.1.self_attn.q_proj": {
|
||||
"snr": 0.0707998052239418,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.2.self_attn.q_proj": {
|
||||
"snr": 0.0787411704659462,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.3.self_attn.q_proj": {
|
||||
"snr": 0.08089710026979446,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.4.self_attn.q_proj": {
|
||||
"snr": 0.08591937273740768,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.5.self_attn.q_proj": {
|
||||
"snr": 0.09852176159620285,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.6.self_attn.q_proj": {
|
||||
"snr": 0.09690654277801514,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.7.self_attn.q_proj": {
|
||||
"snr": 0.11181341856718063,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.8.self_attn.q_proj": {
|
||||
"snr": 0.12042108923196793,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.9.self_attn.q_proj": {
|
||||
"snr": 0.09799323976039886,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.10.self_attn.q_proj": {
|
||||
"snr": 0.10901063680648804,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.11.self_attn.q_proj": {
|
||||
"snr": 0.09307146072387695,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.12.self_attn.q_proj": {
|
||||
"snr": 0.0880950540304184,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.13.self_attn.q_proj": {
|
||||
"snr": 0.08886399120092392,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.14.self_attn.q_proj": {
|
||||
"snr": 0.09955056011676788,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.15.self_attn.q_proj": {
|
||||
"snr": 0.08929339051246643,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.0.self_attn.v_proj": {
|
||||
"snr": 2.5501928329467773,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.1.self_attn.v_proj": {
|
||||
"snr": 9.449499130249023,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.2.self_attn.v_proj": {
|
||||
"snr": 7.9920830726623535,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.3.self_attn.v_proj": {
|
||||
"snr": 50.69462585449219,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.4.self_attn.v_proj": {
|
||||
"snr": 19.083511352539062,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.5.self_attn.v_proj": {
|
||||
"snr": 7.21597146987915,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.6.self_attn.v_proj": {
|
||||
"snr": 11.27744197845459,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.7.self_attn.v_proj": {
|
||||
"snr": 4.579711437225342,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.8.self_attn.v_proj": {
|
||||
"snr": 10.940719604492188,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.9.self_attn.v_proj": {
|
||||
"snr": 553.4417724609375,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.10.self_attn.v_proj": {
|
||||
"snr": 20.59434700012207,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.11.self_attn.v_proj": {
|
||||
"snr": 26.636865615844727,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.12.self_attn.v_proj": {
|
||||
"snr": 8.614749908447266,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.13.self_attn.v_proj": {
|
||||
"snr": 17.722007751464844,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.14.self_attn.v_proj": {
|
||||
"snr": 1.48500657081604,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.15.self_attn.v_proj": {
|
||||
"snr": 2.5776851177215576,
|
||||
"type": "self_attn.v_proj"
|
||||
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
146
src/axolotl/monkeypatch/flex_attn.py
Normal file
146
src/axolotl/monkeypatch/flex_attn.py
Normal file
@@ -0,0 +1,146 @@
|
||||
"""
|
||||
Taken from https://github.com/pytorch/torchtune/blob/main/torchtune/modules/attention_utils.py
|
||||
"""
|
||||
from typing import Union
|
||||
|
||||
import torch
|
||||
from torch.nn.attention.flex_attention import BlockMask
|
||||
from torch.nn.attention.flex_attention import (
|
||||
create_block_mask as create_block_causal_mask_flex,
|
||||
)
|
||||
|
||||
_MaskType = Union[torch.Tensor, BlockMask]
|
||||
|
||||
|
||||
def create_block_causal_mask(
|
||||
seq_lens: list[torch.Tensor], max_seq_len: int
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Given a batch tensor of seq lens defining the lengths of samples in each pack,
|
||||
Construct a 2D block causal mask for each pack in the batch. For example, if
|
||||
a single sample's seq_lens is [3, 2, 1], the mask would be::
|
||||
|
||||
mask = [
|
||||
[1, 0, 0, 0, 0, 0],
|
||||
[1, 1, 0, 0, 0, 0],
|
||||
[1, 1, 1, 0, 0, 0],
|
||||
[0, 0, 0, 1, 0, 0],
|
||||
[0, 0, 0, 1, 1, 0],
|
||||
[0, 0, 0, 0, 0, 1],
|
||||
]
|
||||
|
||||
Args:
|
||||
seq_lens (List[torch.Tensor]): Sequence lengths of samples in each pack in the batch,
|
||||
shape (batch_size, n), where n is the max number of sequences in a pack and can vary
|
||||
across packs.
|
||||
|
||||
|
||||
Returns:
|
||||
Tensor: Block causal mask of shape (batch_size, max_seq_len, max_seq_len).
|
||||
"""
|
||||
batch_block_attn_masks = []
|
||||
batch_size = len(seq_lens)
|
||||
for sample_idx in range(batch_size):
|
||||
block_attn_masks = [
|
||||
torch.trilu( # torch.tril(
|
||||
torch.ones(seq_len, seq_len, dtype=torch.bool, device=seq_len.device)
|
||||
)
|
||||
for seq_len in seq_lens[sample_idx]
|
||||
]
|
||||
|
||||
"""residue_len = max_seq_len - torch.sum(seq_lens[sample_idx])
|
||||
block_attn_masks.append(
|
||||
torch.tril(
|
||||
torch.ones(
|
||||
residue_len, residue_len, dtype=torch.bool, device=seq_lens[sample_idx].device
|
||||
)
|
||||
)
|
||||
)"""
|
||||
|
||||
batch_block_attn_masks.append(torch.block_diag(*block_attn_masks))
|
||||
|
||||
return torch.stack(batch_block_attn_masks)[:, None, :, :]
|
||||
|
||||
|
||||
def _get_document_ids_from_seq_lens(
|
||||
seq_lens: list[torch.Tensor],
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Convert a batch tensor of seq lens into integer IDs denoting sample ownership.
|
||||
For example, seq_lens = [2, 3, 1] would return [0, 0, 1, 1, 1, 2].
|
||||
|
||||
Args:
|
||||
seq_lens (List[torch.Tensor]): Sequence lengths of samples in each pack in the batch,
|
||||
shape (batch_size, n), where n is the max number of sequences in a pack and can vary
|
||||
across packs.
|
||||
|
||||
Returns:
|
||||
Tensor: Document IDs of shape (batch_size, max_seq_len).
|
||||
"""
|
||||
batch_size = len(seq_lens)
|
||||
batch_document_ids = []
|
||||
for sample_idx in range(batch_size):
|
||||
# We assume seq lens sum to max seq lens, so document_ids should be of
|
||||
# shape (max_seq_len, )
|
||||
document_ids = torch.cat(
|
||||
[
|
||||
torch.full((seq_len,), i, dtype=torch.long, device=seq_len.device)
|
||||
for i, seq_len in enumerate(seq_lens[sample_idx])
|
||||
]
|
||||
)
|
||||
batch_document_ids.append(document_ids)
|
||||
batch_document_ids = torch.stack(batch_document_ids)
|
||||
return batch_document_ids
|
||||
|
||||
|
||||
def packed_block_causal_mask(
|
||||
seq_lens: list[torch.Tensor], totalseqlens: list[int]
|
||||
) -> _MaskType:
|
||||
"""
|
||||
Create a block causal document mask for a batch of packed sequences. If
|
||||
flex attention is supported by the current hardware, block causal logic and
|
||||
passing this into :func:`torch.nn.attention.flex_attention.create_block_mask`.
|
||||
The resultant BlockMask is a compressed representation of the full block causal
|
||||
mask. If on an older version, a standard 2D block causal mask is created and returned.
|
||||
|
||||
Args:
|
||||
seq_lens (List[torch.Tensor]): Sequence lengths of samples in each pack in the batch,
|
||||
shape (batch_size, n), where n is the max number of sequences in a pack and can vary
|
||||
across packs.
|
||||
|
||||
Returns:
|
||||
_MaskType: BlockMask or Tensor if torch version < 2.5.0.
|
||||
"""
|
||||
|
||||
document_ids = _get_document_ids_from_seq_lens(seq_lens)
|
||||
batch_size , max_seq_len = document_ids.shape
|
||||
document_ids = document_ids.to("cuda")
|
||||
totalseqlens = totalseqlens.to("cuda")
|
||||
|
||||
# Instead of passing a tensor mask, flex attention requires a mask_mod function
|
||||
# that determines which elements of QK^T should be included in the attention
|
||||
# computation prior to the softmax. For sample packing, we need both the
|
||||
# logic for both causal mask and document mask. See PyTorch's official
|
||||
# blog post for more details: https://pytorch.org/blog/flexattention/#mask-mods
|
||||
def mask_mod(b, h, q_idx, kv_idx):
|
||||
"""
|
||||
Defines the logic of a block causal mask by combining both a standard causal mask
|
||||
and a block diagonal document mask.
|
||||
|
||||
See :func:`~torchtune.modules.attention_utils.create_block_causal_mask`
|
||||
for an illustration.
|
||||
"""
|
||||
causal_mask = q_idx >= kv_idx
|
||||
document_mask = document_ids[b, q_idx] == document_ids[b, kv_idx]
|
||||
finite_mask = q_idx < totalseqlens[b]
|
||||
return causal_mask & document_mask & finite_mask
|
||||
|
||||
return create_block_causal_mask_flex(
|
||||
mask_mod,
|
||||
batch_size,
|
||||
None,
|
||||
max_seq_len,
|
||||
max_seq_len,
|
||||
device="cuda",
|
||||
BLOCK_SIZE=512,
|
||||
)
|
||||
@@ -95,6 +95,103 @@ def get_cu_seqlens(attn_mask):
|
||||
return torch.stack(results).to(dtype=torch.int32), torch.stack(max_seq_lens)
|
||||
|
||||
|
||||
def get_packed_mask_from_pos_ids(position_ids):
|
||||
if len(position_ids.shape) == 1:
|
||||
position_ids = position_ids.unsqueeze(0)
|
||||
|
||||
device = position_ids.device
|
||||
results = []
|
||||
|
||||
for i, row in enumerate(position_ids):
|
||||
# Count the number of consecutive zeros from the right side
|
||||
padding_length = (row == 0).int().flip(dims=[0]).cumprod(dim=0).sum().item()
|
||||
|
||||
# Adjust the row to exclude padding
|
||||
adjusted_row = row[:-padding_length] if padding_length else row.clone()
|
||||
|
||||
# Find where the position resets to 0 (indicating a new sequence)
|
||||
seq_starts = torch.cat(
|
||||
[
|
||||
torch.tensor([True], dtype=torch.bool, device=device),
|
||||
adjusted_row[1:] == 0,
|
||||
]
|
||||
)
|
||||
# Get the indices where the sequence starts
|
||||
start_indices = torch.cat(
|
||||
[
|
||||
torch.nonzero(seq_starts).unbind(dim=1)[0],
|
||||
torch.tensor([len(adjusted_row)], dtype=torch.int32, device=device),
|
||||
]
|
||||
)
|
||||
# Calculate the sequence lengths
|
||||
seq_lengths = start_indices[1:] - start_indices[:-1]
|
||||
# Append the padding length to the sequence lengths
|
||||
doc_mask = torch.ones(len(row), dtype=torch.int32, device=device)
|
||||
for i, seq_len in enumerate(seq_lengths):
|
||||
start_id = start_indices[i]
|
||||
doc_mask[start_id : start_id + seq_len] = (
|
||||
(i+1) * doc_mask[start_id : start_id + seq_len]
|
||||
)
|
||||
if padding_length:
|
||||
doc_mask[len(adjusted_row) :] = 0 * doc_mask[len(adjusted_row) :]
|
||||
|
||||
results.append(doc_mask)
|
||||
|
||||
return torch.stack(results)
|
||||
|
||||
|
||||
def get_seqlens_from_pos_ids(position_ids):
|
||||
"""generate a sequence length set using pos ids for doc mask creation in flex attention"""
|
||||
if len(position_ids.shape) == 1:
|
||||
position_ids = position_ids.unsqueeze(0)
|
||||
max_seq_len = position_ids.shape[1]
|
||||
|
||||
device = position_ids.device
|
||||
results = []
|
||||
totalseqlens = []
|
||||
|
||||
for row in position_ids:
|
||||
# Count the number of consecutive zeros from the right side
|
||||
padding_length = (row == 0).int().flip(dims=[0]).cumprod(dim=0).sum().item()
|
||||
|
||||
# Adjust the row to exclude padding
|
||||
adjusted_row = row[:-padding_length] if padding_length else row.clone()
|
||||
|
||||
# Find where the position resets to 0 (indicating a new sequence)
|
||||
seq_starts = torch.cat(
|
||||
[
|
||||
torch.tensor([True], dtype=torch.bool, device=device),
|
||||
adjusted_row[1:] == 0,
|
||||
]
|
||||
)
|
||||
# Get the indices where the sequence starts
|
||||
start_indices = torch.cat(
|
||||
[
|
||||
torch.nonzero(seq_starts).unbind(dim=1)[0],
|
||||
torch.tensor([len(adjusted_row)], dtype=torch.int32, device=device),
|
||||
]
|
||||
)
|
||||
# Calculate the sequence lengths
|
||||
seq_lengths = start_indices[1:] - start_indices[:-1]
|
||||
# Append the padding length to the sequence lengths
|
||||
if padding_length:
|
||||
seq_lengths = torch.cat(
|
||||
[
|
||||
seq_lengths,
|
||||
torch.tensor(
|
||||
[len(row) - torch.sum(seq_lengths)],
|
||||
dtype=torch.int32,
|
||||
device=device,
|
||||
),
|
||||
]
|
||||
)
|
||||
|
||||
results.append(seq_lengths)
|
||||
totalseqlens.append(len(adjusted_row))
|
||||
|
||||
return results, torch.tensor(totalseqlens, dtype=torch.int32, device=device)
|
||||
|
||||
|
||||
def get_cu_seqlens_from_pos_ids(position_ids):
|
||||
"""generate a cumulative sequence length mask for flash attention using pos ids"""
|
||||
if len(position_ids.shape) == 1:
|
||||
@@ -176,7 +273,10 @@ def mask_2d_to_4d(
|
||||
when they attend to each other within that sequence.
|
||||
This expansion transforms the mask to lower triangular form to prevent future peeking.
|
||||
"""
|
||||
bsz, src_len = mask.size()
|
||||
|
||||
if len(mask.size()) == 4:
|
||||
return mask
|
||||
bsz, src_len = int(mask.size()[0]), int(mask.size()[1])
|
||||
tgt_len = tgt_len if tgt_len is not None else src_len
|
||||
|
||||
mask = mask.unsqueeze(1).unsqueeze(2)
|
||||
|
||||
@@ -47,7 +47,7 @@ class BTChatTemplateStrategy(ChatTemplateStrategy):
|
||||
|
||||
if len(chosen_tokenized["input_ids"]) > max_length:
|
||||
LOG.warning(
|
||||
f"Chosen sequence exceeds max sequence length: {len(chosen_tokenized['input_ids'])}",
|
||||
f"To-be-trimmed chosen sequence exceeds max sequence length: {len(chosen_tokenized['input_ids'])}",
|
||||
)
|
||||
|
||||
chosen_tokenized["input_ids"] = chosen_tokenized["input_ids"][:max_length]
|
||||
@@ -70,7 +70,7 @@ class BTChatTemplateStrategy(ChatTemplateStrategy):
|
||||
|
||||
if len(rejected_tokenized["input_ids"]) > max_length:
|
||||
LOG.warning(
|
||||
f"Rejected sequence exceeds max sequence length: {len(rejected_tokenized['input_ids'])}",
|
||||
f"To-be-trimmed rejected sequence exceeds max sequence length: {len(rejected_tokenized['input_ids'])}",
|
||||
)
|
||||
|
||||
rejected_tokenized["input_ids"] = rejected_tokenized["input_ids"][
|
||||
|
||||
@@ -4,6 +4,7 @@ shared axolotl collators for multipack, mamba, multimodal
|
||||
from .batching import ( # noqa: F401
|
||||
BatchSamplerDataCollatorForSeq2Seq,
|
||||
DataCollatorForSeq2Seq,
|
||||
FlexBatchSamplerDataCollatorForSeq2Seq,
|
||||
PretrainingBatchSamplerDataCollatorForSeq2Seq,
|
||||
V2BatchSamplerDataCollatorForSeq2Seq,
|
||||
)
|
||||
|
||||
@@ -3,12 +3,21 @@ DataCollator for axolotl to pad labels and position_ids for packed sequences
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Optional, Union
|
||||
from typing import Any, List, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import PreTrainedTokenizerBase
|
||||
from transformers.utils import PaddingStrategy
|
||||
|
||||
from axolotl.monkeypatch.flex_attn import (
|
||||
create_block_causal_mask,
|
||||
packed_block_causal_mask,
|
||||
)
|
||||
from axolotl.monkeypatch.utils import (
|
||||
get_packed_mask_from_pos_ids,
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class DataCollatorForSeq2Seq:
|
||||
@@ -151,6 +160,42 @@ class BatchSamplerDataCollatorForSeq2Seq(DataCollatorForSeq2Seq):
|
||||
return super().__call__(out_features, return_tensors=return_tensors)
|
||||
|
||||
|
||||
@dataclass
|
||||
class FlexBatchSamplerDataCollatorForSeq2Seq(DataCollatorForSeq2Seq):
|
||||
"""
|
||||
Collator for multipack specific to Flex Attention using the BatchSampler
|
||||
"""
|
||||
|
||||
def __call__(self, features, return_tensors=None):
|
||||
if not isinstance(features[0], list):
|
||||
features = [features]
|
||||
out_features = [{} for _ in features]
|
||||
for i, features_ in enumerate(features):
|
||||
for feature in features_[0].keys():
|
||||
if feature == "length":
|
||||
continue
|
||||
elif feature == "attention_mask":
|
||||
"""arrays = [
|
||||
i * np.array(item[feature])
|
||||
for i, item in enumerate(features_)
|
||||
if feature in item
|
||||
]
|
||||
out_features[i][feature] = np.concatenate(arrays)"""
|
||||
continue
|
||||
else:
|
||||
arrays = [
|
||||
np.array(item[feature]) for item in features_ if feature in item
|
||||
]
|
||||
out_features[i][feature] = np.concatenate(arrays)
|
||||
out = super().__call__(out_features, return_tensors=return_tensors)
|
||||
|
||||
# collated_seq_lens, totalseqlens = get_seqlens_from_pos_ids(out["position_ids"])
|
||||
# out["attention_mask"] = packed_block_causal_mask(collated_seq_lens, totalseqlens)
|
||||
out["attention_mask"] = get_packed_mask_from_pos_ids(out["position_ids"])
|
||||
# out["attention_mask"] = create_block_causal_mask(collated_seq_lens, max_seq_len)
|
||||
return out
|
||||
|
||||
|
||||
@dataclass
|
||||
class V2BatchSamplerDataCollatorForSeq2Seq(DataCollatorForSeq2Seq):
|
||||
"""
|
||||
|
||||
@@ -115,6 +115,9 @@ class RemappedParameters(BaseModel):
|
||||
overrides_of_model_config: Optional[Dict[str, Any]] = Field(
|
||||
default=None, alias="model_config"
|
||||
)
|
||||
overrides_of_model_kwargs: Optional[Dict[str, Any]] = Field(
|
||||
default=None, alias="model_kwargs"
|
||||
)
|
||||
type_of_model: Optional[str] = Field(default=None, alias="model_type")
|
||||
revision_of_model: Optional[str] = Field(default=None, alias="model_revision")
|
||||
|
||||
@@ -426,8 +429,6 @@ class ModelInputConfig(BaseModel):
|
||||
)
|
||||
trust_remote_code: Optional[bool] = None
|
||||
|
||||
model_kwargs: Optional[Dict[str, Any]] = None
|
||||
|
||||
@field_validator("trust_remote_code")
|
||||
@classmethod
|
||||
def hint_trust_remote_code(cls, trust_remote_code):
|
||||
@@ -783,6 +784,7 @@ class AxolotlInputConfig(
|
||||
xformers_attention: Optional[bool] = None
|
||||
sdp_attention: Optional[bool] = None
|
||||
s2_attention: Optional[bool] = None
|
||||
flex_attention: Optional[bool] = None
|
||||
flash_attention: Optional[bool] = None
|
||||
flash_attn_cross_entropy: Optional[bool] = None
|
||||
flash_attn_rms_norm: Optional[bool] = None
|
||||
@@ -1678,6 +1680,26 @@ class AxolotlConfigWCapabilities(AxolotlInputConfig):
|
||||
)
|
||||
return data
|
||||
|
||||
@model_validator(mode="before")
|
||||
@classmethod
|
||||
def check_flex_torch_version(cls, data):
|
||||
if (data.get("flex_attention") is not None) and (
|
||||
data.get("flex_attention") is True
|
||||
):
|
||||
env_capabilities = data.get("env_capabilities", {})
|
||||
torch_version = env_capabilities.get("torch_version")
|
||||
|
||||
if torch_version is None:
|
||||
import torch
|
||||
|
||||
torch_version = str(torch.__version__).split("+", maxsplit=1)[0]
|
||||
|
||||
if version.parse(torch_version) < version.parse("2.5.1"):
|
||||
raise ValueError(
|
||||
"Flex attention is not supported on torch version < 2.5.1"
|
||||
)
|
||||
return data
|
||||
|
||||
@model_validator(mode="before")
|
||||
@classmethod
|
||||
def check_torch_compile_auto(cls, data):
|
||||
|
||||
@@ -46,6 +46,7 @@ from axolotl.utils.data.pretraining import wrap_pretraining_dataset
|
||||
from axolotl.utils.data.shared import load_dataset_w_config
|
||||
from axolotl.utils.data.utils import (
|
||||
deduplicate_and_log_datasets,
|
||||
drop_long_seq_in_dataset,
|
||||
md5,
|
||||
retry_on_request_exceptions,
|
||||
)
|
||||
@@ -56,7 +57,7 @@ from axolotl.utils.trainer import (
|
||||
process_datasets_for_packing,
|
||||
)
|
||||
|
||||
LOG = logging.getLogger("axolotl")
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@retry_on_request_exceptions(max_retries=3, delay=5)
|
||||
@@ -339,8 +340,11 @@ def load_tokenized_prepared_datasets(
|
||||
else:
|
||||
LOG.debug("NOT shuffling merged datasets")
|
||||
|
||||
if cfg.sample_packing and not cfg.skip_prepare_dataset:
|
||||
dataset, _ = process_datasets_for_packing(cfg, dataset, None)
|
||||
if not cfg.skip_prepare_dataset:
|
||||
dataset = drop_long_seq_in_dataset(dataset, cfg)
|
||||
|
||||
if cfg.sample_packing:
|
||||
dataset, _ = process_datasets_for_packing(cfg, dataset, None)
|
||||
|
||||
if cfg.local_rank == 0 and not cfg.skip_prepare_dataset:
|
||||
LOG.info(f"Saving merged prepared dataset to disk... {prepared_ds_path}")
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
"""data handling helpers"""
|
||||
|
||||
import functools
|
||||
import hashlib
|
||||
import logging
|
||||
@@ -6,10 +7,15 @@ import time
|
||||
from enum import Enum
|
||||
|
||||
import huggingface_hub
|
||||
import numpy as np
|
||||
import requests
|
||||
from datasets import Dataset
|
||||
from datasets import Dataset, IterableDataset
|
||||
|
||||
LOG = logging.getLogger("axolotl")
|
||||
from axolotl.utils.dict import DictDefault
|
||||
from axolotl.utils.samplers.utils import get_dataset_lengths
|
||||
from axolotl.utils.trainer import drop_long_seq
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class RetryStrategy(Enum):
|
||||
@@ -150,3 +156,53 @@ def deduplicate_and_log_datasets(
|
||||
)
|
||||
|
||||
return train_dataset, eval_dataset, dataset
|
||||
|
||||
|
||||
def drop_long_seq_in_dataset(dataset: Dataset, cfg: DictDefault):
|
||||
if "input_ids" not in dataset.column_names:
|
||||
LOG.warning(
|
||||
"Dataset does not contain 'input_ids' column. Skip drop long seq. This is expected for RewardModeling."
|
||||
)
|
||||
return dataset
|
||||
|
||||
drop_long = functools.partial(
|
||||
drop_long_seq,
|
||||
sequence_len=cfg.sequence_len,
|
||||
min_sequence_len=cfg.min_sample_len,
|
||||
)
|
||||
|
||||
try:
|
||||
min_input_len = np.min(get_dataset_lengths(dataset))
|
||||
LOG.debug(f"min_input_len: {min_input_len}")
|
||||
max_input_len = np.max(get_dataset_lengths(dataset))
|
||||
LOG.debug(f"max_input_len: {max_input_len}")
|
||||
except AttributeError:
|
||||
pass
|
||||
|
||||
try:
|
||||
prior_len = len(dataset)
|
||||
except TypeError:
|
||||
# handle iterable datasets case
|
||||
prior_len = None
|
||||
|
||||
filter_map_kwargs = {}
|
||||
if not isinstance(dataset, IterableDataset):
|
||||
filter_map_kwargs["num_proc"] = cfg.dataset_processes
|
||||
filter_map_kwargs["load_from_cache_file"] = not cfg.is_preprocess
|
||||
|
||||
drop_long_kwargs = {}
|
||||
if filter_map_kwargs:
|
||||
drop_long_kwargs["desc"] = "Dropping Long Sequences"
|
||||
|
||||
dataset = dataset.filter(
|
||||
drop_long,
|
||||
batched=True,
|
||||
**filter_map_kwargs,
|
||||
**drop_long_kwargs,
|
||||
)
|
||||
if prior_len:
|
||||
dropped = prior_len - len(dataset)
|
||||
if dropped:
|
||||
LOG.warning(f"Dropped {dropped} long samples from dataset")
|
||||
|
||||
return dataset
|
||||
|
||||
@@ -357,8 +357,8 @@ class ModelLoader:
|
||||
|
||||
# init model kwargs
|
||||
self.model_kwargs: Dict[str, Any] = {}
|
||||
if cfg.model_kwargs:
|
||||
for key, val in cfg.model_kwargs.items():
|
||||
if cfg.overrides_of_model_kwargs:
|
||||
for key, val in cfg.overrides_of_model_kwargs.items():
|
||||
self.model_kwargs[key] = val
|
||||
|
||||
# init model
|
||||
@@ -403,7 +403,7 @@ class ModelLoader:
|
||||
|
||||
if (
|
||||
self.cfg.model_config_type in SUPPORTED_MULTIPACK_MODEL_TYPES
|
||||
and self.cfg.flash_attention
|
||||
and (self.cfg.flash_attention or self.cfg.flex_attention)
|
||||
and self.cfg.sample_packing
|
||||
):
|
||||
if "auto_map" in self.model_config:
|
||||
@@ -708,7 +708,13 @@ class ModelLoader:
|
||||
"""
|
||||
sample packing uses custom FA2 patch
|
||||
"""
|
||||
if self.cfg.flash_attention:
|
||||
|
||||
if self.cfg.flex_attention:
|
||||
self.model_kwargs["attn_implementation"] = "flex_attention"
|
||||
self.model_config._attn_implementation = ( # pylint: disable=protected-access
|
||||
"flex_attention"
|
||||
)
|
||||
elif self.cfg.flash_attention:
|
||||
if not self.cfg.sample_packing and self.cfg.s2_attention:
|
||||
pass
|
||||
self.model_kwargs["attn_implementation"] = "flash_attention_2"
|
||||
@@ -1100,7 +1106,7 @@ class ModelLoader:
|
||||
should_convert = (
|
||||
# LlamaRMSNorm layers are in fp32 after kbit_training or full finetune, so we need to
|
||||
# convert them back to fp16/bf16 for flash-attn compatibility.
|
||||
((needs_fa2_dtype or self.cfg.flash_attention) and not qlora_fsdp)
|
||||
((needs_fa2_dtype or self.cfg.flash_attention or self.cfg.flex_attention) and not qlora_fsdp)
|
||||
or self.cfg.cut_cross_entropy # Cut cross entropy requires embedding layers to be in fp16/bf16 for backward pass
|
||||
)
|
||||
|
||||
|
||||
@@ -13,5 +13,4 @@ def get_dataset_lengths(dataset):
|
||||
else:
|
||||
input_ids = dataset.data.column("input_ids")
|
||||
lengths = np.vectorize(len)(np.array(input_ids, dtype=object))
|
||||
return lengths
|
||||
return lengths
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
"""Module containing the Trainer class and related functions"""
|
||||
|
||||
import json
|
||||
import math
|
||||
import os
|
||||
@@ -210,6 +211,8 @@ def drop_long_seq(sample, sequence_len=2048, min_sequence_len=2):
|
||||
|
||||
Works for both single-example (list[int]) or batched (list[list[int]]).
|
||||
"""
|
||||
min_sequence_len = min_sequence_len or 2
|
||||
|
||||
input_ids = sample["input_ids"]
|
||||
|
||||
# Edge case: if input_ids is empty
|
||||
@@ -232,20 +235,6 @@ def drop_long_seq(sample, sequence_len=2048, min_sequence_len=2):
|
||||
|
||||
|
||||
def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
|
||||
drop_long = partial(
|
||||
drop_long_seq,
|
||||
sequence_len=cfg.sequence_len,
|
||||
min_sequence_len=cfg.min_sample_len or 2,
|
||||
)
|
||||
|
||||
try:
|
||||
min_input_len = np.min(get_dataset_lengths(train_dataset))
|
||||
LOG.debug(f"min_input_len: {min_input_len}", main_process_only=True)
|
||||
max_input_len = np.max(get_dataset_lengths(train_dataset))
|
||||
LOG.debug(f"max_input_len: {max_input_len}", main_process_only=True)
|
||||
except AttributeError:
|
||||
pass
|
||||
|
||||
if cfg.model_config_type == "mamba":
|
||||
LOG.info("dropping attention_mask column")
|
||||
train_dataset = train_dataset.remove_columns("attention_mask")
|
||||
@@ -259,46 +248,6 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
|
||||
if eval_dataset and "token_type_ids" in eval_dataset.column_names:
|
||||
eval_dataset = eval_dataset.remove_columns("token_type_ids")
|
||||
|
||||
filter_map_kwargs = {}
|
||||
if not isinstance(train_dataset, IterableDataset):
|
||||
filter_map_kwargs["num_proc"] = cfg.dataset_processes
|
||||
filter_map_kwargs["load_from_cache_file"] = not cfg.is_preprocess
|
||||
|
||||
try:
|
||||
prior_len = len(train_dataset)
|
||||
except TypeError:
|
||||
# handle iterable datasets case
|
||||
prior_len = None
|
||||
drop_long_kwargs = {}
|
||||
if filter_map_kwargs:
|
||||
drop_long_kwargs["desc"] = "Dropping Long Sequences"
|
||||
train_dataset = train_dataset.filter(
|
||||
drop_long,
|
||||
batched=True,
|
||||
**filter_map_kwargs,
|
||||
**drop_long_kwargs,
|
||||
)
|
||||
if prior_len:
|
||||
dropped = prior_len - len(train_dataset)
|
||||
if dropped:
|
||||
LOG.warning(f"Dropped {dropped} long samples from train dataset")
|
||||
|
||||
if eval_dataset:
|
||||
try:
|
||||
prior_len = len(eval_dataset)
|
||||
except TypeError:
|
||||
# handle iterable datasets case
|
||||
prior_len = None
|
||||
eval_dataset = eval_dataset.filter(
|
||||
drop_long,
|
||||
**filter_map_kwargs,
|
||||
**drop_long_kwargs,
|
||||
)
|
||||
if prior_len:
|
||||
dropped = prior_len - len(eval_dataset)
|
||||
if dropped:
|
||||
LOG.warning(f"Dropped {dropped} long samples from eval dataset")
|
||||
|
||||
def drop_no_trainable_tokens(sample):
|
||||
"""
|
||||
Drop samples if all labels are -100 (i.e., zero trainable tokens).
|
||||
@@ -325,6 +274,11 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
|
||||
except TypeError:
|
||||
# handle iterable datasets case
|
||||
prior_len = None
|
||||
filter_map_kwargs = {}
|
||||
if not isinstance(train_dataset, IterableDataset):
|
||||
filter_map_kwargs["num_proc"] = cfg.dataset_processes
|
||||
filter_map_kwargs["load_from_cache_file"] = not cfg.is_preprocess
|
||||
|
||||
drop_long_kwargs = {}
|
||||
if filter_map_kwargs:
|
||||
drop_long_kwargs["desc"] = "Drop Samples with Zero Trainable Tokens"
|
||||
|
||||
@@ -33,7 +33,7 @@ class TestRewardModelLoraSmolLM2(unittest.TestCase):
|
||||
"num_labels": 1,
|
||||
"chat_template": "alpaca",
|
||||
"reward_model": True,
|
||||
"sequence_len": 1024,
|
||||
"sequence_len": 2048,
|
||||
"pad_to_sequence_len": True,
|
||||
"adapter": "lora",
|
||||
"lora_r": 8,
|
||||
|
||||
Reference in New Issue
Block a user