Compare commits
11 Commits
kd-logits-
...
docs-lint-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f24efd77a1 | ||
|
|
44f64ab627 | ||
|
|
826f1b1494 | ||
|
|
526e5ee8b8 | ||
|
|
fd8cb32547 | ||
|
|
e48e2df4dd | ||
|
|
b7616022ab | ||
|
|
1faf1a5c5a | ||
|
|
5bbad5ef93 | ||
|
|
a971eb4ce6 | ||
|
|
a620d481e2 |
12
.github/workflows/base.yml
vendored
12
.github/workflows/base.yml
vendored
@@ -22,12 +22,6 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- cuda: "124"
|
||||
cuda_version: 12.4.1
|
||||
cudnn_version: ""
|
||||
python_version: "3.10"
|
||||
pytorch: 2.4.1
|
||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||
- cuda: "124"
|
||||
cuda_version: 12.4.1
|
||||
cudnn_version: ""
|
||||
@@ -40,6 +34,12 @@ jobs:
|
||||
python_version: "3.11"
|
||||
pytorch: 2.5.1
|
||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||
- cuda: "124"
|
||||
cuda_version: 12.4.1
|
||||
cudnn_version: ""
|
||||
python_version: "3.11"
|
||||
pytorch: 2.6.0
|
||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
2
.github/workflows/docs.yml
vendored
2
.github/workflows/docs.yml
vendored
@@ -19,7 +19,7 @@ jobs:
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.10'
|
||||
python-version: '3.11'
|
||||
- name: install dependencies
|
||||
run: |
|
||||
python3 -m pip install jupyter
|
||||
|
||||
2
.github/workflows/lint.yml
vendored
2
.github/workflows/lint.yml
vendored
@@ -19,6 +19,6 @@ jobs:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.10"
|
||||
python-version: "3.11"
|
||||
cache: 'pip' # caching pip dependencies
|
||||
- uses: pre-commit/action@v3.0.1
|
||||
|
||||
5
.github/workflows/main.yml
vendored
5
.github/workflows/main.yml
vendored
@@ -26,6 +26,11 @@ jobs:
|
||||
pytorch: 2.5.1
|
||||
axolotl_extras:
|
||||
is_latest: true
|
||||
- cuda: 124
|
||||
cuda_version: 12.4.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.6.0
|
||||
axolotl_extras:
|
||||
runs-on: axolotl-gpu-runner
|
||||
steps:
|
||||
- name: Checkout
|
||||
|
||||
9
.github/workflows/multi-gpu-e2e.yml
vendored
9
.github/workflows/multi-gpu-e2e.yml
vendored
@@ -34,6 +34,13 @@ jobs:
|
||||
axolotl_extras:
|
||||
num_gpus: 2
|
||||
nightly_build: "true"
|
||||
- cuda: 124
|
||||
cuda_version: 12.4.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.6.0
|
||||
axolotl_extras:
|
||||
num_gpus: 2
|
||||
nightly_build: "true"
|
||||
runs-on: [self-hosted, modal]
|
||||
timeout-minutes: 120
|
||||
steps:
|
||||
@@ -42,7 +49,7 @@ jobs:
|
||||
- name: Install Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.10"
|
||||
python-version: "3.11"
|
||||
- name: Install Modal
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
|
||||
5
.github/workflows/nightlies.yml
vendored
5
.github/workflows/nightlies.yml
vendored
@@ -22,6 +22,11 @@ jobs:
|
||||
python_version: "3.11"
|
||||
pytorch: 2.5.1
|
||||
axolotl_extras:
|
||||
- cuda: 124
|
||||
cuda_version: 12.4.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.6.0
|
||||
axolotl_extras:
|
||||
runs-on: axolotl-gpu-runner
|
||||
steps:
|
||||
- name: Checkout
|
||||
|
||||
2
.github/workflows/pypi.yml
vendored
2
.github/workflows/pypi.yml
vendored
@@ -36,7 +36,7 @@ jobs:
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.10"
|
||||
python-version: "3.11"
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
|
||||
20
.github/workflows/tests-nightly.yml
vendored
20
.github/workflows/tests-nightly.yml
vendored
@@ -12,7 +12,7 @@ jobs:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.10"
|
||||
python-version: "3.11"
|
||||
cache: 'pip' # caching pip dependencies
|
||||
- uses: pre-commit/action@v3.0.1
|
||||
env:
|
||||
@@ -25,13 +25,8 @@ jobs:
|
||||
fail-fast: false
|
||||
max-parallel: 2
|
||||
matrix:
|
||||
python_version: ["3.10", "3.11"]
|
||||
pytorch_version: ["2.4.1", "2.5.1"]
|
||||
exclude:
|
||||
- python_version: "3.10"
|
||||
pytorch_version: "2.4.1"
|
||||
- python_version: "3.10"
|
||||
pytorch_version: "2.5.1"
|
||||
python_version: ["3.11"]
|
||||
pytorch_version: ["2.4.1", "2.5.1", "2.6.0"]
|
||||
timeout-minutes: 20
|
||||
|
||||
steps:
|
||||
@@ -112,13 +107,20 @@ jobs:
|
||||
num_gpus: 1
|
||||
axolotl_extras:
|
||||
nightly_build: "true"
|
||||
- cuda: 124
|
||||
cuda_version: 12.4.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.6.0
|
||||
num_gpus: 1
|
||||
axolotl_extras:
|
||||
nightly_build: "true"
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
- name: Install Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.10"
|
||||
python-version: "3.11"
|
||||
- name: Install Modal
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
|
||||
23
.github/workflows/tests.yml
vendored
23
.github/workflows/tests.yml
vendored
@@ -35,7 +35,7 @@ jobs:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.10"
|
||||
python-version: "3.11"
|
||||
cache: 'pip' # caching pip dependencies
|
||||
- uses: pre-commit/action@v3.0.1
|
||||
env:
|
||||
@@ -48,13 +48,8 @@ jobs:
|
||||
fail-fast: false
|
||||
max-parallel: 2
|
||||
matrix:
|
||||
python_version: ["3.10", "3.11"]
|
||||
pytorch_version: ["2.4.1", "2.5.1"]
|
||||
exclude:
|
||||
- python_version: "3.10"
|
||||
pytorch_version: "2.4.1"
|
||||
- python_version: "3.10"
|
||||
pytorch_version: "2.5.1"
|
||||
python_version: ["3.11"]
|
||||
pytorch_version: ["2.4.1", "2.5.1", "2.6.0"]
|
||||
timeout-minutes: 20
|
||||
|
||||
steps:
|
||||
@@ -127,7 +122,7 @@ jobs:
|
||||
max-parallel: 1
|
||||
matrix:
|
||||
python_version: ["3.11"]
|
||||
pytorch_version: ["2.4.1", "2.5.1"]
|
||||
pytorch_version: ["2.4.1", "2.5.1", "2.6.0"]
|
||||
timeout-minutes: 20
|
||||
|
||||
steps:
|
||||
@@ -216,7 +211,7 @@ jobs:
|
||||
- name: Install Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.10"
|
||||
python-version: "3.11"
|
||||
- name: Install Modal
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
@@ -251,13 +246,19 @@ jobs:
|
||||
pytorch: 2.4.1
|
||||
num_gpus: 1
|
||||
axolotl_extras:
|
||||
- cuda: 124
|
||||
cuda_version: 12.4.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.6.0
|
||||
num_gpus: 1
|
||||
axolotl_extras:
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
- name: Install Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.10"
|
||||
python-version: "3.11"
|
||||
- name: Install Modal
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
|
||||
@@ -51,7 +51,7 @@ Features:
|
||||
|
||||
**Requirements**:
|
||||
- NVIDIA GPU (Ampere or newer for `bf16` and Flash Attention) or AMD GPU
|
||||
- Python ≥3.10
|
||||
- Python 3.11
|
||||
- PyTorch ≥2.4.1
|
||||
|
||||
### Installation
|
||||
|
||||
@@ -46,6 +46,10 @@ overrides_of_model_config:
|
||||
type: # linear | dynamic
|
||||
factor: # float
|
||||
|
||||
# optional overrides the base model loading from_pretrained
|
||||
overrides_of_model_kwargs:
|
||||
# use_cache: False
|
||||
|
||||
# optional overrides to the bnb 4bit quantization configuration
|
||||
# https://huggingface.co/docs/transformers/main/main_classes/quantization#transformers.BitsAndBytesConfig
|
||||
bnb_config_kwargs:
|
||||
|
||||
@@ -19,3 +19,7 @@ description: Frequently asked questions
|
||||
**Q: AttributeError: 'DummyOptim' object has no attribute 'step'**
|
||||
|
||||
> A: You may be using deepspeed with single gpu. Please don't set `deepspeed:` in yaml or cli.
|
||||
|
||||
**Q: The codes is stuck on saving preprocessed datasets.**
|
||||
|
||||
> A: This is usually an issue with the GPU. This can be resolved through setting the os environment variable `CUDA_VISIBLE_DEVICES=0`. If you are on runpod, this is usually a pod issue. Starting a new pod should take care of it.
|
||||
|
||||
@@ -3,6 +3,18 @@ title: Multi Node
|
||||
description: How to use Axolotl on multiple machines
|
||||
---
|
||||
|
||||
The below are three ways to train multi-node in Axolotl.
|
||||
|
||||
::: {.callout-important}
|
||||
Each machine needs a copy of Axolotl, we suggest using the same commit to ensure compatibility.
|
||||
|
||||
You will also need to have the same configuration file for your model on each machine.
|
||||
|
||||
Make sure the main machine is reachable by other machines.
|
||||
:::
|
||||
|
||||
# Accelerate
|
||||
|
||||
You will need to create a configuration for accelerate, either by using `accelerate config` and follow the instructions or you can use one of the preset below:
|
||||
|
||||
~/.cache/huggingface/accelerate/default_config.yaml
|
||||
@@ -26,7 +38,7 @@ tpu_use_sudo: false
|
||||
use_cpu: false
|
||||
```
|
||||
|
||||
Configure your model to use FSDP with for example:
|
||||
Configure your model to use FSDP in the Axolotl yaml. For example:
|
||||
```yaml
|
||||
fsdp:
|
||||
- full_shard
|
||||
@@ -37,12 +49,40 @@ fsdp_config:
|
||||
fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
|
||||
```
|
||||
|
||||
## Machine configuration
|
||||
|
||||
On each machine you need a copy of Axolotl, we suggest using the same commit to ensure compatibility.
|
||||
|
||||
You will also need to have the same configuration file for your model on each machine.
|
||||
|
||||
On the main machine only, make sure the port you set as `main_process_port` is open in TCP and reachable by other machines.
|
||||
|
||||
All you have to do now is launch using accelerate as you would usually do on each machine and voila, the processes will start once you have launched accelerate on every machine.
|
||||
|
||||
# Raytrain
|
||||
|
||||
Please see ray train doc [here](ray-integration.qmd).
|
||||
|
||||
# Torchrun
|
||||
|
||||
If you are using Infiniband, we recommend torchrun to utilize the full bandwidth.
|
||||
|
||||
Set the following env (change buffersize/socketname depending on your system):
|
||||
|
||||
```yaml
|
||||
export NCCL_IB_DISABLE=0
|
||||
export NCCL_SOCKET_IFNAME="eth0,en,eth,em,bond"
|
||||
export NCCL_BUFFSIZE=2097152
|
||||
```
|
||||
|
||||
Run the following on each node:
|
||||
|
||||
```bash
|
||||
torchrun --nnodes $num_nodes --nproc_per_node $gpu_per_node --rdzv_id $rdzv_id --rdzv_backend c10d --rdzv_endpoint "$head_node_ip:$head_node_port" -m axolotl.cli.train config.yaml
|
||||
```
|
||||
|
||||
Please make sure to substitute the placeholder variables.
|
||||
|
||||
- `num_nodes`: Number of nodes (containing GPUs)
|
||||
- `gpu_per_node`: Number of gpus per node
|
||||
- `head_node_ip`: IP of the head node (make sure other machines can connect to this)
|
||||
- `head_node_port`: Port of the head node (make sure other machines can connect to this. Default 29400)
|
||||
- `rdzv_id`: A unique job ID that is used by the job across nodes.
|
||||
|
||||
::: {.callout-note}
|
||||
You need to call `axolotl.cli.train` instead of `axolotl train` as the latter calls accelerate under the hood
|
||||
:::
|
||||
|
||||
More info on the available configs can be found on the Pytorch docs [here](https://pytorch.org/docs/stable/elastic/run.html)
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
--extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
|
||||
|
||||
# START section of dependencies that don't install on Darwin/MacOS
|
||||
bitsandbytes==0.45.1
|
||||
bitsandbytes==0.45.2
|
||||
triton>=3.0.0
|
||||
mamba-ssm==1.2.0.post1
|
||||
flash-attn==2.7.0.post2
|
||||
flash-attn==2.7.4.post1
|
||||
xformers>=0.0.23.post1
|
||||
autoawq==0.2.7.post3
|
||||
liger-kernel==0.5.2
|
||||
@@ -13,7 +13,7 @@ liger-kernel==0.5.2
|
||||
packaging==23.2
|
||||
|
||||
peft==0.14.0
|
||||
transformers==4.48.1
|
||||
transformers==4.48.3
|
||||
tokenizers>=0.21.0
|
||||
accelerate==1.3.0
|
||||
datasets==3.2.0
|
||||
|
||||
7
setup.py
7
setup.py
@@ -71,12 +71,15 @@ def parse_requirements():
|
||||
else:
|
||||
raise ValueError("Invalid version format")
|
||||
|
||||
if (major, minor) >= (2, 5):
|
||||
if (major, minor) >= (2, 6):
|
||||
_install_requires.pop(_install_requires.index(xformers_version))
|
||||
_install_requires.append("xformers==0.0.29.post2")
|
||||
elif (major, minor) >= (2, 5):
|
||||
_install_requires.pop(_install_requires.index(xformers_version))
|
||||
if patch == 0:
|
||||
_install_requires.append("xformers==0.0.28.post2")
|
||||
else:
|
||||
_install_requires.append("xformers==0.0.28.post3")
|
||||
_install_requires.append("xformers==0.0.29")
|
||||
_install_requires.pop(_install_requires.index(autoawq_version))
|
||||
elif (major, minor) >= (2, 4):
|
||||
if patch == 0:
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,590 @@
|
||||
{
|
||||
"model.layers.0.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.1.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.2.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.3.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.4.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.5.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.6.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.7.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.8.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.9.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.10.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.11.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.12.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.13.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.14.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.15.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"lm_head": {
|
||||
"snr": Infinity,
|
||||
"type": "lm_head"
|
||||
},
|
||||
"model.layers.0.mlp.down_proj": {
|
||||
"snr": 70.0594253540039,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.1.mlp.down_proj": {
|
||||
"snr": 11.135851860046387,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.2.mlp.down_proj": {
|
||||
"snr": 7.035482883453369,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.3.mlp.down_proj": {
|
||||
"snr": 6.422532081604004,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.4.mlp.down_proj": {
|
||||
"snr": 5.748020172119141,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.5.mlp.down_proj": {
|
||||
"snr": 3.885556697845459,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.6.mlp.down_proj": {
|
||||
"snr": 3.4336745738983154,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.7.mlp.down_proj": {
|
||||
"snr": 2.791595935821533,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.8.mlp.down_proj": {
|
||||
"snr": 5.36277961730957,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.9.mlp.down_proj": {
|
||||
"snr": 4.459208011627197,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.10.mlp.down_proj": {
|
||||
"snr": 6.272170066833496,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.11.mlp.down_proj": {
|
||||
"snr": 5.264761447906494,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.12.mlp.down_proj": {
|
||||
"snr": 4.324735641479492,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.13.mlp.down_proj": {
|
||||
"snr": 3.878648042678833,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.14.mlp.down_proj": {
|
||||
"snr": 2.9773054122924805,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.15.mlp.down_proj": {
|
||||
"snr": 4.471445560455322,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.0.mlp.gate_proj": {
|
||||
"snr": 25.227100372314453,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.1.mlp.gate_proj": {
|
||||
"snr": 6.58299446105957,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.2.mlp.gate_proj": {
|
||||
"snr": 3.4688243865966797,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.3.mlp.gate_proj": {
|
||||
"snr": 1.555246114730835,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.4.mlp.gate_proj": {
|
||||
"snr": 0.7770601511001587,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.5.mlp.gate_proj": {
|
||||
"snr": 0.6239906549453735,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.6.mlp.gate_proj": {
|
||||
"snr": 0.6440379023551941,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.7.mlp.gate_proj": {
|
||||
"snr": 0.5120116472244263,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.8.mlp.gate_proj": {
|
||||
"snr": 0.6544050574302673,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.9.mlp.gate_proj": {
|
||||
"snr": 0.5381016731262207,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.10.mlp.gate_proj": {
|
||||
"snr": 0.622873842716217,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.11.mlp.gate_proj": {
|
||||
"snr": 0.9361700415611267,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.12.mlp.gate_proj": {
|
||||
"snr": 1.475605845451355,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.13.mlp.gate_proj": {
|
||||
"snr": 1.608325719833374,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.14.mlp.gate_proj": {
|
||||
"snr": 1.0720024108886719,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.15.mlp.gate_proj": {
|
||||
"snr": 0.7111338973045349,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.0.mlp.up_proj": {
|
||||
"snr": 28.431896209716797,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.1.mlp.up_proj": {
|
||||
"snr": 15.546019554138184,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.2.mlp.up_proj": {
|
||||
"snr": 23.048023223876953,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.3.mlp.up_proj": {
|
||||
"snr": 25.790977478027344,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.4.mlp.up_proj": {
|
||||
"snr": 18.552549362182617,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.5.mlp.up_proj": {
|
||||
"snr": 8.85106372833252,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.6.mlp.up_proj": {
|
||||
"snr": 10.653799057006836,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.7.mlp.up_proj": {
|
||||
"snr": 7.365357875823975,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.8.mlp.up_proj": {
|
||||
"snr": 11.98373794555664,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.9.mlp.up_proj": {
|
||||
"snr": 8.04493236541748,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.10.mlp.up_proj": {
|
||||
"snr": 8.523039817810059,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.11.mlp.up_proj": {
|
||||
"snr": 5.381742477416992,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.12.mlp.up_proj": {
|
||||
"snr": 3.9845118522644043,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.13.mlp.up_proj": {
|
||||
"snr": 3.4893221855163574,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.14.mlp.up_proj": {
|
||||
"snr": 1.764201045036316,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.15.mlp.up_proj": {
|
||||
"snr": 0.9730708599090576,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.embed_tokens": {
|
||||
"snr": Infinity,
|
||||
"type": "model.embed_tokens"
|
||||
},
|
||||
"model.norm": {
|
||||
"snr": Infinity,
|
||||
"type": "model.norm"
|
||||
},
|
||||
"model.layers.0.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.1.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.2.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.3.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.4.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.5.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.6.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.7.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.8.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.9.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.10.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.11.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.12.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.13.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.14.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.15.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.0.self_attn.k_proj": {
|
||||
"snr": 0.11727584153413773,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.1.self_attn.k_proj": {
|
||||
"snr": 0.24786807596683502,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.2.self_attn.k_proj": {
|
||||
"snr": 0.36378130316734314,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.3.self_attn.k_proj": {
|
||||
"snr": 0.2983120381832123,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.4.self_attn.k_proj": {
|
||||
"snr": 0.33789733052253723,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.5.self_attn.k_proj": {
|
||||
"snr": 0.29155924916267395,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.6.self_attn.k_proj": {
|
||||
"snr": 0.2537297010421753,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.7.self_attn.k_proj": {
|
||||
"snr": 0.28204113245010376,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.8.self_attn.k_proj": {
|
||||
"snr": 0.2776711583137512,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.9.self_attn.k_proj": {
|
||||
"snr": 0.2927376627922058,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.10.self_attn.k_proj": {
|
||||
"snr": 0.31486213207244873,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.11.self_attn.k_proj": {
|
||||
"snr": 0.32363659143447876,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.12.self_attn.k_proj": {
|
||||
"snr": 0.31382912397384644,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.13.self_attn.k_proj": {
|
||||
"snr": 0.4635234773159027,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.14.self_attn.k_proj": {
|
||||
"snr": 0.25379249453544617,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.15.self_attn.k_proj": {
|
||||
"snr": 0.2628238797187805,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.0.self_attn.o_proj": {
|
||||
"snr": 0.27602291107177734,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.1.self_attn.o_proj": {
|
||||
"snr": 0.2149604707956314,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.2.self_attn.o_proj": {
|
||||
"snr": 0.2540294826030731,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.3.self_attn.o_proj": {
|
||||
"snr": 0.27978822588920593,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.4.self_attn.o_proj": {
|
||||
"snr": 0.3121289908885956,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.5.self_attn.o_proj": {
|
||||
"snr": 0.35037684440612793,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.6.self_attn.o_proj": {
|
||||
"snr": 0.366205096244812,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.7.self_attn.o_proj": {
|
||||
"snr": 0.3692712187767029,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.8.self_attn.o_proj": {
|
||||
"snr": 0.3301038146018982,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.9.self_attn.o_proj": {
|
||||
"snr": 0.3003396987915039,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.10.self_attn.o_proj": {
|
||||
"snr": 0.30804169178009033,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.11.self_attn.o_proj": {
|
||||
"snr": 0.28501132130622864,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.12.self_attn.o_proj": {
|
||||
"snr": 0.2171541005373001,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.13.self_attn.o_proj": {
|
||||
"snr": 0.19183959066867828,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.14.self_attn.o_proj": {
|
||||
"snr": 0.19215913116931915,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.15.self_attn.o_proj": {
|
||||
"snr": 0.25486502051353455,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.0.self_attn.q_proj": {
|
||||
"snr": 0.03850084915757179,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.1.self_attn.q_proj": {
|
||||
"snr": 0.0713055431842804,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.2.self_attn.q_proj": {
|
||||
"snr": 0.07948919385671616,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.3.self_attn.q_proj": {
|
||||
"snr": 0.08047746121883392,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.4.self_attn.q_proj": {
|
||||
"snr": 0.0852593332529068,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.5.self_attn.q_proj": {
|
||||
"snr": 0.09794823825359344,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.6.self_attn.q_proj": {
|
||||
"snr": 0.09627152234315872,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.7.self_attn.q_proj": {
|
||||
"snr": 0.11065381020307541,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.8.self_attn.q_proj": {
|
||||
"snr": 0.12031875550746918,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.9.self_attn.q_proj": {
|
||||
"snr": 0.09804573655128479,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.10.self_attn.q_proj": {
|
||||
"snr": 0.10897502303123474,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.11.self_attn.q_proj": {
|
||||
"snr": 0.09267337620258331,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.12.self_attn.q_proj": {
|
||||
"snr": 0.08803492039442062,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.13.self_attn.q_proj": {
|
||||
"snr": 0.0902542844414711,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.14.self_attn.q_proj": {
|
||||
"snr": 0.10154066979885101,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.15.self_attn.q_proj": {
|
||||
"snr": 0.09083802253007889,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.0.self_attn.v_proj": {
|
||||
"snr": 2.842210054397583,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.1.self_attn.v_proj": {
|
||||
"snr": 10.59461498260498,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.2.self_attn.v_proj": {
|
||||
"snr": 8.993025779724121,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.3.self_attn.v_proj": {
|
||||
"snr": 62.567787170410156,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.4.self_attn.v_proj": {
|
||||
"snr": 23.80082893371582,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.5.self_attn.v_proj": {
|
||||
"snr": 7.957369804382324,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.6.self_attn.v_proj": {
|
||||
"snr": 12.01815414428711,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.7.self_attn.v_proj": {
|
||||
"snr": 5.095500469207764,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.8.self_attn.v_proj": {
|
||||
"snr": 11.719332695007324,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.9.self_attn.v_proj": {
|
||||
"snr": 555.0869750976562,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.10.self_attn.v_proj": {
|
||||
"snr": 22.95538330078125,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.11.self_attn.v_proj": {
|
||||
"snr": 30.042158126831055,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.12.self_attn.v_proj": {
|
||||
"snr": 9.577271461486816,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.13.self_attn.v_proj": {
|
||||
"snr": 18.176361083984375,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.14.self_attn.v_proj": {
|
||||
"snr": 1.5695856809616089,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.15.self_attn.v_proj": {
|
||||
"snr": 2.7235565185546875,
|
||||
"type": "self_attn.v_proj"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,590 @@
|
||||
{
|
||||
"model.layers.0.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.1.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.2.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.3.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.4.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.5.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.6.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.7.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.8.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.9.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.10.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.11.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.12.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.13.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.14.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.15.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"lm_head": {
|
||||
"snr": Infinity,
|
||||
"type": "lm_head"
|
||||
},
|
||||
"model.layers.0.mlp.down_proj": {
|
||||
"snr": 57.09797286987305,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.1.mlp.down_proj": {
|
||||
"snr": 9.538983345031738,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.2.mlp.down_proj": {
|
||||
"snr": 6.227016925811768,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.3.mlp.down_proj": {
|
||||
"snr": 5.660686492919922,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.4.mlp.down_proj": {
|
||||
"snr": 5.178432464599609,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.5.mlp.down_proj": {
|
||||
"snr": 3.5638349056243896,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.6.mlp.down_proj": {
|
||||
"snr": 3.0918056964874268,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.7.mlp.down_proj": {
|
||||
"snr": 2.456392288208008,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.8.mlp.down_proj": {
|
||||
"snr": 4.525328636169434,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.9.mlp.down_proj": {
|
||||
"snr": 3.9409055709838867,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.10.mlp.down_proj": {
|
||||
"snr": 5.447249412536621,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.11.mlp.down_proj": {
|
||||
"snr": 4.807600975036621,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.12.mlp.down_proj": {
|
||||
"snr": 3.915374517440796,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.13.mlp.down_proj": {
|
||||
"snr": 3.4820363521575928,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.14.mlp.down_proj": {
|
||||
"snr": 2.6045074462890625,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.15.mlp.down_proj": {
|
||||
"snr": 3.7237701416015625,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.0.mlp.gate_proj": {
|
||||
"snr": 22.160131454467773,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.1.mlp.gate_proj": {
|
||||
"snr": 6.072206020355225,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.2.mlp.gate_proj": {
|
||||
"snr": 3.2467362880706787,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.3.mlp.gate_proj": {
|
||||
"snr": 1.4111896753311157,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.4.mlp.gate_proj": {
|
||||
"snr": 0.7405938506126404,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.5.mlp.gate_proj": {
|
||||
"snr": 0.5916463136672974,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.6.mlp.gate_proj": {
|
||||
"snr": 0.6149423718452454,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.7.mlp.gate_proj": {
|
||||
"snr": 0.48369669914245605,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.8.mlp.gate_proj": {
|
||||
"snr": 0.6047574877738953,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.9.mlp.gate_proj": {
|
||||
"snr": 0.5092479586601257,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.10.mlp.gate_proj": {
|
||||
"snr": 0.5999670624732971,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.11.mlp.gate_proj": {
|
||||
"snr": 0.8980127573013306,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.12.mlp.gate_proj": {
|
||||
"snr": 1.4252448081970215,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.13.mlp.gate_proj": {
|
||||
"snr": 1.509937047958374,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.14.mlp.gate_proj": {
|
||||
"snr": 1.0066585540771484,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.15.mlp.gate_proj": {
|
||||
"snr": 0.6413647532463074,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.0.mlp.up_proj": {
|
||||
"snr": 26.08852195739746,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.1.mlp.up_proj": {
|
||||
"snr": 13.382951736450195,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.2.mlp.up_proj": {
|
||||
"snr": 20.088768005371094,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.3.mlp.up_proj": {
|
||||
"snr": 23.0632381439209,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.4.mlp.up_proj": {
|
||||
"snr": 16.07433319091797,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.5.mlp.up_proj": {
|
||||
"snr": 8.00507640838623,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.6.mlp.up_proj": {
|
||||
"snr": 9.538354873657227,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.7.mlp.up_proj": {
|
||||
"snr": 6.286602973937988,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.8.mlp.up_proj": {
|
||||
"snr": 10.092820167541504,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.9.mlp.up_proj": {
|
||||
"snr": 7.193963527679443,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.10.mlp.up_proj": {
|
||||
"snr": 7.320116996765137,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.11.mlp.up_proj": {
|
||||
"snr": 4.8728532791137695,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.12.mlp.up_proj": {
|
||||
"snr": 3.596583366394043,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.13.mlp.up_proj": {
|
||||
"snr": 3.166161298751831,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.14.mlp.up_proj": {
|
||||
"snr": 1.5600818395614624,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.15.mlp.up_proj": {
|
||||
"snr": 0.8726214170455933,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.embed_tokens": {
|
||||
"snr": Infinity,
|
||||
"type": "model.embed_tokens"
|
||||
},
|
||||
"model.norm": {
|
||||
"snr": Infinity,
|
||||
"type": "model.norm"
|
||||
},
|
||||
"model.layers.0.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.1.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.2.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.3.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.4.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.5.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.6.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.7.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.8.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.9.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.10.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.11.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.12.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.13.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.14.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.15.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.0.self_attn.k_proj": {
|
||||
"snr": 0.1154392883181572,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.1.self_attn.k_proj": {
|
||||
"snr": 0.24299409985542297,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.2.self_attn.k_proj": {
|
||||
"snr": 0.3624322712421417,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.3.self_attn.k_proj": {
|
||||
"snr": 0.29509487748146057,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.4.self_attn.k_proj": {
|
||||
"snr": 0.32953736186027527,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.5.self_attn.k_proj": {
|
||||
"snr": 0.2908833622932434,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.6.self_attn.k_proj": {
|
||||
"snr": 0.2488437294960022,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.7.self_attn.k_proj": {
|
||||
"snr": 0.27847856283187866,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.8.self_attn.k_proj": {
|
||||
"snr": 0.27143892645835876,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.9.self_attn.k_proj": {
|
||||
"snr": 0.28804272413253784,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.10.self_attn.k_proj": {
|
||||
"snr": 0.31197959184646606,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.11.self_attn.k_proj": {
|
||||
"snr": 0.3203586935997009,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.12.self_attn.k_proj": {
|
||||
"snr": 0.30905747413635254,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.13.self_attn.k_proj": {
|
||||
"snr": 0.46828722953796387,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.14.self_attn.k_proj": {
|
||||
"snr": 0.24205778539180756,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.15.self_attn.k_proj": {
|
||||
"snr": 0.2559327781200409,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.0.self_attn.o_proj": {
|
||||
"snr": 0.2638678550720215,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.1.self_attn.o_proj": {
|
||||
"snr": 0.21109595894813538,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.2.self_attn.o_proj": {
|
||||
"snr": 0.24751724302768707,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.3.self_attn.o_proj": {
|
||||
"snr": 0.2728094160556793,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.4.self_attn.o_proj": {
|
||||
"snr": 0.3001374304294586,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.5.self_attn.o_proj": {
|
||||
"snr": 0.33903488516807556,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.6.self_attn.o_proj": {
|
||||
"snr": 0.3530929982662201,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.7.self_attn.o_proj": {
|
||||
"snr": 0.36753255128860474,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.8.self_attn.o_proj": {
|
||||
"snr": 0.3373180329799652,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.9.self_attn.o_proj": {
|
||||
"snr": 0.2970578670501709,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.10.self_attn.o_proj": {
|
||||
"snr": 0.3076324760913849,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.11.self_attn.o_proj": {
|
||||
"snr": 0.2766900658607483,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.12.self_attn.o_proj": {
|
||||
"snr": 0.20973259210586548,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.13.self_attn.o_proj": {
|
||||
"snr": 0.18185566365718842,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.14.self_attn.o_proj": {
|
||||
"snr": 0.18329747021198273,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.15.self_attn.o_proj": {
|
||||
"snr": 0.2437991499900818,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.0.self_attn.q_proj": {
|
||||
"snr": 0.038040731102228165,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.1.self_attn.q_proj": {
|
||||
"snr": 0.0707998052239418,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.2.self_attn.q_proj": {
|
||||
"snr": 0.0787411704659462,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.3.self_attn.q_proj": {
|
||||
"snr": 0.08089710026979446,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.4.self_attn.q_proj": {
|
||||
"snr": 0.08591937273740768,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.5.self_attn.q_proj": {
|
||||
"snr": 0.09852176159620285,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.6.self_attn.q_proj": {
|
||||
"snr": 0.09690654277801514,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.7.self_attn.q_proj": {
|
||||
"snr": 0.11181341856718063,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.8.self_attn.q_proj": {
|
||||
"snr": 0.12042108923196793,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.9.self_attn.q_proj": {
|
||||
"snr": 0.09799323976039886,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.10.self_attn.q_proj": {
|
||||
"snr": 0.10901063680648804,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.11.self_attn.q_proj": {
|
||||
"snr": 0.09307146072387695,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.12.self_attn.q_proj": {
|
||||
"snr": 0.0880950540304184,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.13.self_attn.q_proj": {
|
||||
"snr": 0.08886399120092392,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.14.self_attn.q_proj": {
|
||||
"snr": 0.09955056011676788,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.15.self_attn.q_proj": {
|
||||
"snr": 0.08929339051246643,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.0.self_attn.v_proj": {
|
||||
"snr": 2.5501928329467773,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.1.self_attn.v_proj": {
|
||||
"snr": 9.449499130249023,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.2.self_attn.v_proj": {
|
||||
"snr": 7.9920830726623535,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.3.self_attn.v_proj": {
|
||||
"snr": 50.69462585449219,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.4.self_attn.v_proj": {
|
||||
"snr": 19.083511352539062,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.5.self_attn.v_proj": {
|
||||
"snr": 7.21597146987915,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.6.self_attn.v_proj": {
|
||||
"snr": 11.27744197845459,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.7.self_attn.v_proj": {
|
||||
"snr": 4.579711437225342,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.8.self_attn.v_proj": {
|
||||
"snr": 10.940719604492188,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.9.self_attn.v_proj": {
|
||||
"snr": 553.4417724609375,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.10.self_attn.v_proj": {
|
||||
"snr": 20.59434700012207,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.11.self_attn.v_proj": {
|
||||
"snr": 26.636865615844727,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.12.self_attn.v_proj": {
|
||||
"snr": 8.614749908447266,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.13.self_attn.v_proj": {
|
||||
"snr": 17.722007751464844,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.14.self_attn.v_proj": {
|
||||
"snr": 1.48500657081604,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.15.self_attn.v_proj": {
|
||||
"snr": 2.5776851177215576,
|
||||
"type": "self_attn.v_proj"
|
||||
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -47,7 +47,7 @@ class BTChatTemplateStrategy(ChatTemplateStrategy):
|
||||
|
||||
if len(chosen_tokenized["input_ids"]) > max_length:
|
||||
LOG.warning(
|
||||
f"Chosen sequence exceeds max sequence length: {len(chosen_tokenized['input_ids'])}",
|
||||
f"To-be-trimmed chosen sequence exceeds max sequence length: {len(chosen_tokenized['input_ids'])}",
|
||||
)
|
||||
|
||||
chosen_tokenized["input_ids"] = chosen_tokenized["input_ids"][:max_length]
|
||||
@@ -70,7 +70,7 @@ class BTChatTemplateStrategy(ChatTemplateStrategy):
|
||||
|
||||
if len(rejected_tokenized["input_ids"]) > max_length:
|
||||
LOG.warning(
|
||||
f"Rejected sequence exceeds max sequence length: {len(rejected_tokenized['input_ids'])}",
|
||||
f"To-be-trimmed rejected sequence exceeds max sequence length: {len(rejected_tokenized['input_ids'])}",
|
||||
)
|
||||
|
||||
rejected_tokenized["input_ids"] = rejected_tokenized["input_ids"][
|
||||
|
||||
@@ -115,6 +115,9 @@ class RemappedParameters(BaseModel):
|
||||
overrides_of_model_config: Optional[Dict[str, Any]] = Field(
|
||||
default=None, alias="model_config"
|
||||
)
|
||||
overrides_of_model_kwargs: Optional[Dict[str, Any]] = Field(
|
||||
default=None, alias="model_kwargs"
|
||||
)
|
||||
type_of_model: Optional[str] = Field(default=None, alias="model_type")
|
||||
revision_of_model: Optional[str] = Field(default=None, alias="model_revision")
|
||||
|
||||
@@ -426,8 +429,6 @@ class ModelInputConfig(BaseModel):
|
||||
)
|
||||
trust_remote_code: Optional[bool] = None
|
||||
|
||||
model_kwargs: Optional[Dict[str, Any]] = None
|
||||
|
||||
@field_validator("trust_remote_code")
|
||||
@classmethod
|
||||
def hint_trust_remote_code(cls, trust_remote_code):
|
||||
|
||||
@@ -46,6 +46,7 @@ from axolotl.utils.data.pretraining import wrap_pretraining_dataset
|
||||
from axolotl.utils.data.shared import load_dataset_w_config
|
||||
from axolotl.utils.data.utils import (
|
||||
deduplicate_and_log_datasets,
|
||||
drop_long_seq_in_dataset,
|
||||
md5,
|
||||
retry_on_request_exceptions,
|
||||
)
|
||||
@@ -56,7 +57,7 @@ from axolotl.utils.trainer import (
|
||||
process_datasets_for_packing,
|
||||
)
|
||||
|
||||
LOG = logging.getLogger("axolotl")
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@retry_on_request_exceptions(max_retries=3, delay=5)
|
||||
@@ -339,8 +340,11 @@ def load_tokenized_prepared_datasets(
|
||||
else:
|
||||
LOG.debug("NOT shuffling merged datasets")
|
||||
|
||||
if cfg.sample_packing and not cfg.skip_prepare_dataset:
|
||||
dataset, _ = process_datasets_for_packing(cfg, dataset, None)
|
||||
if not cfg.skip_prepare_dataset:
|
||||
dataset = drop_long_seq_in_dataset(dataset, cfg)
|
||||
|
||||
if cfg.sample_packing:
|
||||
dataset, _ = process_datasets_for_packing(cfg, dataset, None)
|
||||
|
||||
if cfg.local_rank == 0 and not cfg.skip_prepare_dataset:
|
||||
LOG.info(f"Saving merged prepared dataset to disk... {prepared_ds_path}")
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
"""data handling helpers"""
|
||||
|
||||
import functools
|
||||
import hashlib
|
||||
import logging
|
||||
@@ -6,10 +7,15 @@ import time
|
||||
from enum import Enum
|
||||
|
||||
import huggingface_hub
|
||||
import numpy as np
|
||||
import requests
|
||||
from datasets import Dataset
|
||||
from datasets import Dataset, IterableDataset
|
||||
|
||||
LOG = logging.getLogger("axolotl")
|
||||
from axolotl.utils.dict import DictDefault
|
||||
from axolotl.utils.samplers.utils import get_dataset_lengths
|
||||
from axolotl.utils.trainer import drop_long_seq
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class RetryStrategy(Enum):
|
||||
@@ -150,3 +156,53 @@ def deduplicate_and_log_datasets(
|
||||
)
|
||||
|
||||
return train_dataset, eval_dataset, dataset
|
||||
|
||||
|
||||
def drop_long_seq_in_dataset(dataset: Dataset, cfg: DictDefault):
|
||||
if "input_ids" not in dataset.column_names:
|
||||
LOG.warning(
|
||||
"Dataset does not contain 'input_ids' column. Skip drop long seq. This is expected for RewardModeling."
|
||||
)
|
||||
return dataset
|
||||
|
||||
drop_long = functools.partial(
|
||||
drop_long_seq,
|
||||
sequence_len=cfg.sequence_len,
|
||||
min_sequence_len=cfg.min_sample_len,
|
||||
)
|
||||
|
||||
try:
|
||||
min_input_len = np.min(get_dataset_lengths(dataset))
|
||||
LOG.debug(f"min_input_len: {min_input_len}")
|
||||
max_input_len = np.max(get_dataset_lengths(dataset))
|
||||
LOG.debug(f"max_input_len: {max_input_len}")
|
||||
except AttributeError:
|
||||
pass
|
||||
|
||||
try:
|
||||
prior_len = len(dataset)
|
||||
except TypeError:
|
||||
# handle iterable datasets case
|
||||
prior_len = None
|
||||
|
||||
filter_map_kwargs = {}
|
||||
if not isinstance(dataset, IterableDataset):
|
||||
filter_map_kwargs["num_proc"] = cfg.dataset_processes
|
||||
filter_map_kwargs["load_from_cache_file"] = not cfg.is_preprocess
|
||||
|
||||
drop_long_kwargs = {}
|
||||
if filter_map_kwargs:
|
||||
drop_long_kwargs["desc"] = "Dropping Long Sequences"
|
||||
|
||||
dataset = dataset.filter(
|
||||
drop_long,
|
||||
batched=True,
|
||||
**filter_map_kwargs,
|
||||
**drop_long_kwargs,
|
||||
)
|
||||
if prior_len:
|
||||
dropped = prior_len - len(dataset)
|
||||
if dropped:
|
||||
LOG.warning(f"Dropped {dropped} long samples from dataset")
|
||||
|
||||
return dataset
|
||||
|
||||
@@ -357,8 +357,8 @@ class ModelLoader:
|
||||
|
||||
# init model kwargs
|
||||
self.model_kwargs: Dict[str, Any] = {}
|
||||
if cfg.model_kwargs:
|
||||
for key, val in cfg.model_kwargs.items():
|
||||
if cfg.overrides_of_model_kwargs:
|
||||
for key, val in cfg.overrides_of_model_kwargs.items():
|
||||
self.model_kwargs[key] = val
|
||||
|
||||
# init model
|
||||
|
||||
@@ -13,5 +13,4 @@ def get_dataset_lengths(dataset):
|
||||
else:
|
||||
input_ids = dataset.data.column("input_ids")
|
||||
lengths = np.vectorize(len)(np.array(input_ids, dtype=object))
|
||||
return lengths
|
||||
return lengths
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
"""Module containing the Trainer class and related functions"""
|
||||
|
||||
import json
|
||||
import math
|
||||
import os
|
||||
@@ -210,6 +211,8 @@ def drop_long_seq(sample, sequence_len=2048, min_sequence_len=2):
|
||||
|
||||
Works for both single-example (list[int]) or batched (list[list[int]]).
|
||||
"""
|
||||
min_sequence_len = min_sequence_len or 2
|
||||
|
||||
input_ids = sample["input_ids"]
|
||||
|
||||
# Edge case: if input_ids is empty
|
||||
@@ -232,20 +235,6 @@ def drop_long_seq(sample, sequence_len=2048, min_sequence_len=2):
|
||||
|
||||
|
||||
def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
|
||||
drop_long = partial(
|
||||
drop_long_seq,
|
||||
sequence_len=cfg.sequence_len,
|
||||
min_sequence_len=cfg.min_sample_len or 2,
|
||||
)
|
||||
|
||||
try:
|
||||
min_input_len = np.min(get_dataset_lengths(train_dataset))
|
||||
LOG.debug(f"min_input_len: {min_input_len}", main_process_only=True)
|
||||
max_input_len = np.max(get_dataset_lengths(train_dataset))
|
||||
LOG.debug(f"max_input_len: {max_input_len}", main_process_only=True)
|
||||
except AttributeError:
|
||||
pass
|
||||
|
||||
if cfg.model_config_type == "mamba":
|
||||
LOG.info("dropping attention_mask column")
|
||||
train_dataset = train_dataset.remove_columns("attention_mask")
|
||||
@@ -259,46 +248,6 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
|
||||
if eval_dataset and "token_type_ids" in eval_dataset.column_names:
|
||||
eval_dataset = eval_dataset.remove_columns("token_type_ids")
|
||||
|
||||
filter_map_kwargs = {}
|
||||
if not isinstance(train_dataset, IterableDataset):
|
||||
filter_map_kwargs["num_proc"] = cfg.dataset_processes
|
||||
filter_map_kwargs["load_from_cache_file"] = not cfg.is_preprocess
|
||||
|
||||
try:
|
||||
prior_len = len(train_dataset)
|
||||
except TypeError:
|
||||
# handle iterable datasets case
|
||||
prior_len = None
|
||||
drop_long_kwargs = {}
|
||||
if filter_map_kwargs:
|
||||
drop_long_kwargs["desc"] = "Dropping Long Sequences"
|
||||
train_dataset = train_dataset.filter(
|
||||
drop_long,
|
||||
batched=True,
|
||||
**filter_map_kwargs,
|
||||
**drop_long_kwargs,
|
||||
)
|
||||
if prior_len:
|
||||
dropped = prior_len - len(train_dataset)
|
||||
if dropped:
|
||||
LOG.warning(f"Dropped {dropped} long samples from train dataset")
|
||||
|
||||
if eval_dataset:
|
||||
try:
|
||||
prior_len = len(eval_dataset)
|
||||
except TypeError:
|
||||
# handle iterable datasets case
|
||||
prior_len = None
|
||||
eval_dataset = eval_dataset.filter(
|
||||
drop_long,
|
||||
**filter_map_kwargs,
|
||||
**drop_long_kwargs,
|
||||
)
|
||||
if prior_len:
|
||||
dropped = prior_len - len(eval_dataset)
|
||||
if dropped:
|
||||
LOG.warning(f"Dropped {dropped} long samples from eval dataset")
|
||||
|
||||
def drop_no_trainable_tokens(sample):
|
||||
"""
|
||||
Drop samples if all labels are -100 (i.e., zero trainable tokens).
|
||||
@@ -325,6 +274,11 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
|
||||
except TypeError:
|
||||
# handle iterable datasets case
|
||||
prior_len = None
|
||||
filter_map_kwargs = {}
|
||||
if not isinstance(train_dataset, IterableDataset):
|
||||
filter_map_kwargs["num_proc"] = cfg.dataset_processes
|
||||
filter_map_kwargs["load_from_cache_file"] = not cfg.is_preprocess
|
||||
|
||||
drop_long_kwargs = {}
|
||||
if filter_map_kwargs:
|
||||
drop_long_kwargs["desc"] = "Drop Samples with Zero Trainable Tokens"
|
||||
|
||||
@@ -33,7 +33,7 @@ class TestRewardModelLoraSmolLM2(unittest.TestCase):
|
||||
"num_labels": 1,
|
||||
"chat_template": "alpaca",
|
||||
"reward_model": True,
|
||||
"sequence_len": 1024,
|
||||
"sequence_len": 2048,
|
||||
"pad_to_sequence_len": True,
|
||||
"adapter": "lora",
|
||||
"lora_r": 8,
|
||||
|
||||
Reference in New Issue
Block a user