Compare commits
13 Commits
grpo-path
...
bursterato
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a44f398b14 | ||
|
|
356d9b6458 | ||
|
|
4393d8e649 | ||
|
|
826f1b1494 | ||
|
|
526e5ee8b8 | ||
|
|
fd8cb32547 | ||
|
|
e48e2df4dd | ||
|
|
b7616022ab | ||
|
|
1faf1a5c5a | ||
|
|
f7bf883470 | ||
|
|
7591331d0c | ||
|
|
5bbad5ef93 | ||
|
|
a971eb4ce6 |
12
.github/workflows/base.yml
vendored
12
.github/workflows/base.yml
vendored
@@ -22,12 +22,6 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- cuda: "124"
|
||||
cuda_version: 12.4.1
|
||||
cudnn_version: ""
|
||||
python_version: "3.10"
|
||||
pytorch: 2.4.1
|
||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||
- cuda: "124"
|
||||
cuda_version: 12.4.1
|
||||
cudnn_version: ""
|
||||
@@ -40,6 +34,12 @@ jobs:
|
||||
python_version: "3.11"
|
||||
pytorch: 2.5.1
|
||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||
- cuda: "124"
|
||||
cuda_version: 12.4.1
|
||||
cudnn_version: ""
|
||||
python_version: "3.11"
|
||||
pytorch: 2.6.0
|
||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
2
.github/workflows/docs.yml
vendored
2
.github/workflows/docs.yml
vendored
@@ -19,7 +19,7 @@ jobs:
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.10'
|
||||
python-version: '3.11'
|
||||
- name: install dependencies
|
||||
run: |
|
||||
python3 -m pip install jupyter
|
||||
|
||||
2
.github/workflows/lint.yml
vendored
2
.github/workflows/lint.yml
vendored
@@ -19,6 +19,6 @@ jobs:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.10"
|
||||
python-version: "3.11"
|
||||
cache: 'pip' # caching pip dependencies
|
||||
- uses: pre-commit/action@v3.0.1
|
||||
|
||||
5
.github/workflows/main.yml
vendored
5
.github/workflows/main.yml
vendored
@@ -26,6 +26,11 @@ jobs:
|
||||
pytorch: 2.5.1
|
||||
axolotl_extras:
|
||||
is_latest: true
|
||||
- cuda: 124
|
||||
cuda_version: 12.4.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.6.0
|
||||
axolotl_extras:
|
||||
runs-on: axolotl-gpu-runner
|
||||
steps:
|
||||
- name: Checkout
|
||||
|
||||
9
.github/workflows/multi-gpu-e2e.yml
vendored
9
.github/workflows/multi-gpu-e2e.yml
vendored
@@ -34,6 +34,13 @@ jobs:
|
||||
axolotl_extras:
|
||||
num_gpus: 2
|
||||
nightly_build: "true"
|
||||
- cuda: 124
|
||||
cuda_version: 12.4.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.6.0
|
||||
axolotl_extras:
|
||||
num_gpus: 2
|
||||
nightly_build: "true"
|
||||
runs-on: [self-hosted, modal]
|
||||
timeout-minutes: 120
|
||||
steps:
|
||||
@@ -42,7 +49,7 @@ jobs:
|
||||
- name: Install Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.10"
|
||||
python-version: "3.11"
|
||||
- name: Install Modal
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
|
||||
5
.github/workflows/nightlies.yml
vendored
5
.github/workflows/nightlies.yml
vendored
@@ -22,6 +22,11 @@ jobs:
|
||||
python_version: "3.11"
|
||||
pytorch: 2.5.1
|
||||
axolotl_extras:
|
||||
- cuda: 124
|
||||
cuda_version: 12.4.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.6.0
|
||||
axolotl_extras:
|
||||
runs-on: axolotl-gpu-runner
|
||||
steps:
|
||||
- name: Checkout
|
||||
|
||||
2
.github/workflows/pypi.yml
vendored
2
.github/workflows/pypi.yml
vendored
@@ -36,7 +36,7 @@ jobs:
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.10"
|
||||
python-version: "3.11"
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
|
||||
20
.github/workflows/tests-nightly.yml
vendored
20
.github/workflows/tests-nightly.yml
vendored
@@ -12,7 +12,7 @@ jobs:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.10"
|
||||
python-version: "3.11"
|
||||
cache: 'pip' # caching pip dependencies
|
||||
- uses: pre-commit/action@v3.0.1
|
||||
env:
|
||||
@@ -25,13 +25,8 @@ jobs:
|
||||
fail-fast: false
|
||||
max-parallel: 2
|
||||
matrix:
|
||||
python_version: ["3.10", "3.11"]
|
||||
pytorch_version: ["2.4.1", "2.5.1"]
|
||||
exclude:
|
||||
- python_version: "3.10"
|
||||
pytorch_version: "2.4.1"
|
||||
- python_version: "3.10"
|
||||
pytorch_version: "2.5.1"
|
||||
python_version: ["3.11"]
|
||||
pytorch_version: ["2.4.1", "2.5.1", "2.6.0"]
|
||||
timeout-minutes: 20
|
||||
|
||||
steps:
|
||||
@@ -112,13 +107,20 @@ jobs:
|
||||
num_gpus: 1
|
||||
axolotl_extras:
|
||||
nightly_build: "true"
|
||||
- cuda: 124
|
||||
cuda_version: 12.4.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.6.0
|
||||
num_gpus: 1
|
||||
axolotl_extras:
|
||||
nightly_build: "true"
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
- name: Install Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.10"
|
||||
python-version: "3.11"
|
||||
- name: Install Modal
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
|
||||
23
.github/workflows/tests.yml
vendored
23
.github/workflows/tests.yml
vendored
@@ -35,7 +35,7 @@ jobs:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.10"
|
||||
python-version: "3.11"
|
||||
cache: 'pip' # caching pip dependencies
|
||||
- uses: pre-commit/action@v3.0.1
|
||||
env:
|
||||
@@ -48,13 +48,8 @@ jobs:
|
||||
fail-fast: false
|
||||
max-parallel: 2
|
||||
matrix:
|
||||
python_version: ["3.10", "3.11"]
|
||||
pytorch_version: ["2.4.1", "2.5.1"]
|
||||
exclude:
|
||||
- python_version: "3.10"
|
||||
pytorch_version: "2.4.1"
|
||||
- python_version: "3.10"
|
||||
pytorch_version: "2.5.1"
|
||||
python_version: ["3.11"]
|
||||
pytorch_version: ["2.4.1", "2.5.1", "2.6.0"]
|
||||
timeout-minutes: 20
|
||||
|
||||
steps:
|
||||
@@ -127,7 +122,7 @@ jobs:
|
||||
max-parallel: 1
|
||||
matrix:
|
||||
python_version: ["3.11"]
|
||||
pytorch_version: ["2.4.1", "2.5.1"]
|
||||
pytorch_version: ["2.4.1", "2.5.1", "2.6.0"]
|
||||
timeout-minutes: 20
|
||||
|
||||
steps:
|
||||
@@ -216,7 +211,7 @@ jobs:
|
||||
- name: Install Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.10"
|
||||
python-version: "3.11"
|
||||
- name: Install Modal
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
@@ -251,13 +246,19 @@ jobs:
|
||||
pytorch: 2.4.1
|
||||
num_gpus: 1
|
||||
axolotl_extras:
|
||||
- cuda: 124
|
||||
cuda_version: 12.4.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.6.0
|
||||
num_gpus: 1
|
||||
axolotl_extras:
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
- name: Install Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.10"
|
||||
python-version: "3.11"
|
||||
- name: Install Modal
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
|
||||
@@ -51,7 +51,7 @@ Features:
|
||||
|
||||
**Requirements**:
|
||||
- NVIDIA GPU (Ampere or newer for `bf16` and Flash Attention) or AMD GPU
|
||||
- Python ≥3.10
|
||||
- Python 3.11
|
||||
- PyTorch ≥2.4.1
|
||||
|
||||
### Installation
|
||||
|
||||
@@ -46,6 +46,10 @@ overrides_of_model_config:
|
||||
type: # linear | dynamic
|
||||
factor: # float
|
||||
|
||||
# optional overrides for the kwargs passed to from_pretrained when loading the base model
|
||||
overrides_of_model_kwargs:
|
||||
# use_cache: False
|
||||
|
||||
# optional overrides to the bnb 4bit quantization configuration
|
||||
# https://huggingface.co/docs/transformers/main/main_classes/quantization#transformers.BitsAndBytesConfig
|
||||
bnb_config_kwargs:
|
||||
|
||||
@@ -19,3 +19,7 @@ description: Frequently asked questions
|
||||
**Q: AttributeError: 'DummyOptim' object has no attribute 'step'**
|
||||
|
||||
> A: You may be using deepspeed with single gpu. Please don't set `deepspeed:` in yaml or cli.
|
||||
|
||||
**Q: The code is stuck on saving preprocessed datasets.**
|
||||
|
||||
> A: This is usually an issue with the GPU. This can be resolved through setting the os environment variable `CUDA_VISIBLE_DEVICES=0`. If you are on runpod, this is usually a pod issue. Starting a new pod should take care of it.
|
||||
|
||||
@@ -3,6 +3,18 @@ title: Multi Node
|
||||
description: How to use Axolotl on multiple machines
|
||||
---
|
||||
|
||||
Below are three ways to run multi-node training in Axolotl.
|
||||
|
||||
::: {.callout-important}
|
||||
Each machine needs a copy of Axolotl; we suggest using the same commit to ensure compatibility.
|
||||
|
||||
You will also need to have the same configuration file for your model on each machine.
|
||||
|
||||
Make sure the main machine is reachable by other machines.
|
||||
:::
|
||||
|
||||
# Accelerate
|
||||
|
||||
You will need to create a configuration for accelerate, either by running `accelerate config` and following the instructions, or by using one of the presets below:
|
||||
|
||||
~/.cache/huggingface/accelerate/default_config.yaml
|
||||
@@ -26,7 +38,7 @@ tpu_use_sudo: false
|
||||
use_cpu: false
|
||||
```
|
||||
|
||||
Configure your model to use FSDP with for example:
|
||||
Configure your model to use FSDP in the Axolotl yaml. For example:
|
||||
```yaml
|
||||
fsdp:
|
||||
- full_shard
|
||||
@@ -37,12 +49,40 @@ fsdp_config:
|
||||
fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
|
||||
```
|
||||
|
||||
## Machine configuration
|
||||
|
||||
On each machine you need a copy of Axolotl; we suggest using the same commit to ensure compatibility.
|
||||
|
||||
You will also need to have the same configuration file for your model on each machine.
|
||||
|
||||
On the main machine only, make sure the port you set as `main_process_port` is open in TCP and reachable by other machines.
|
||||
|
||||
All you have to do now is launch using accelerate as you would usually do on each machine and voila, the processes will start once you have launched accelerate on every machine.
|
||||
|
||||
# Raytrain
|
||||
|
||||
Please see ray train doc [here](ray-integration.qmd).
|
||||
|
||||
# Torchrun
|
||||
|
||||
If you are using Infiniband, we recommend torchrun to utilize the full bandwidth.
|
||||
|
||||
Set the following env (change buffersize/socketname depending on your system):
|
||||
|
||||
```bash
|
||||
export NCCL_IB_DISABLE=0
|
||||
export NCCL_SOCKET_IFNAME="eth0,en,eth,em,bond"
|
||||
export NCCL_BUFFSIZE=2097152
|
||||
```
|
||||
|
||||
Run the following on each node:
|
||||
|
||||
```bash
|
||||
torchrun --nnodes $num_nodes --nproc_per_node $gpu_per_node --rdzv_id $rdzv_id --rdzv_backend c10d --rdzv_endpoint "$head_node_ip:$head_node_port" -m axolotl.cli.train config.yaml
|
||||
```
|
||||
|
||||
Please make sure to substitute the placeholder variables.
|
||||
|
||||
- `num_nodes`: Number of nodes (containing GPUs)
|
||||
- `gpu_per_node`: Number of GPUs per node
|
||||
- `head_node_ip`: IP of the head node (make sure other machines can connect to this)
|
||||
- `head_node_port`: Port of the head node (make sure other machines can connect to this. Default 29400)
|
||||
- `rdzv_id`: A unique job ID that is used by the job across nodes.
|
||||
|
||||
::: {.callout-note}
|
||||
You need to call `axolotl.cli.train` instead of `axolotl train`, as the latter calls accelerate under the hood.
|
||||
:::
|
||||
|
||||
More info on the available configs can be found in the PyTorch docs [here](https://pytorch.org/docs/stable/elastic/run.html).
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
--extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
|
||||
|
||||
# START section of dependencies that don't install on Darwin/MacOS
|
||||
bitsandbytes==0.45.1
|
||||
bitsandbytes==0.45.2
|
||||
triton>=3.0.0
|
||||
mamba-ssm==1.2.0.post1
|
||||
flash-attn==2.7.0.post2
|
||||
flash-attn==2.7.4.post1
|
||||
xformers>=0.0.23.post1
|
||||
autoawq==0.2.7.post3
|
||||
liger-kernel==0.5.2
|
||||
@@ -13,7 +13,7 @@ liger-kernel==0.5.2
|
||||
packaging==23.2
|
||||
|
||||
peft==0.14.0
|
||||
transformers==4.48.1
|
||||
transformers==4.48.3
|
||||
tokenizers>=0.21.0
|
||||
accelerate==1.3.0
|
||||
datasets==3.2.0
|
||||
|
||||
7
setup.py
7
setup.py
@@ -71,12 +71,15 @@ def parse_requirements():
|
||||
else:
|
||||
raise ValueError("Invalid version format")
|
||||
|
||||
if (major, minor) >= (2, 5):
|
||||
if (major, minor) >= (2, 6):
|
||||
_install_requires.pop(_install_requires.index(xformers_version))
|
||||
_install_requires.append("xformers==0.0.29.post2")
|
||||
elif (major, minor) >= (2, 5):
|
||||
_install_requires.pop(_install_requires.index(xformers_version))
|
||||
if patch == 0:
|
||||
_install_requires.append("xformers==0.0.28.post2")
|
||||
else:
|
||||
_install_requires.append("xformers==0.0.28.post3")
|
||||
_install_requires.append("xformers==0.0.29")
|
||||
_install_requires.pop(_install_requires.index(autoawq_version))
|
||||
elif (major, minor) >= (2, 4):
|
||||
if patch == 0:
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,590 @@
|
||||
{
|
||||
"model.layers.0.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.1.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.2.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.3.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.4.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.5.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.6.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.7.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.8.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.9.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.10.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.11.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.12.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.13.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.14.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.15.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"lm_head": {
|
||||
"snr": Infinity,
|
||||
"type": "lm_head"
|
||||
},
|
||||
"model.layers.0.mlp.down_proj": {
|
||||
"snr": 70.0594253540039,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.1.mlp.down_proj": {
|
||||
"snr": 11.135851860046387,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.2.mlp.down_proj": {
|
||||
"snr": 7.035482883453369,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.3.mlp.down_proj": {
|
||||
"snr": 6.422532081604004,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.4.mlp.down_proj": {
|
||||
"snr": 5.748020172119141,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.5.mlp.down_proj": {
|
||||
"snr": 3.885556697845459,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.6.mlp.down_proj": {
|
||||
"snr": 3.4336745738983154,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.7.mlp.down_proj": {
|
||||
"snr": 2.791595935821533,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.8.mlp.down_proj": {
|
||||
"snr": 5.36277961730957,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.9.mlp.down_proj": {
|
||||
"snr": 4.459208011627197,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.10.mlp.down_proj": {
|
||||
"snr": 6.272170066833496,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.11.mlp.down_proj": {
|
||||
"snr": 5.264761447906494,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.12.mlp.down_proj": {
|
||||
"snr": 4.324735641479492,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.13.mlp.down_proj": {
|
||||
"snr": 3.878648042678833,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.14.mlp.down_proj": {
|
||||
"snr": 2.9773054122924805,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.15.mlp.down_proj": {
|
||||
"snr": 4.471445560455322,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.0.mlp.gate_proj": {
|
||||
"snr": 25.227100372314453,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.1.mlp.gate_proj": {
|
||||
"snr": 6.58299446105957,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.2.mlp.gate_proj": {
|
||||
"snr": 3.4688243865966797,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.3.mlp.gate_proj": {
|
||||
"snr": 1.555246114730835,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.4.mlp.gate_proj": {
|
||||
"snr": 0.7770601511001587,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.5.mlp.gate_proj": {
|
||||
"snr": 0.6239906549453735,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.6.mlp.gate_proj": {
|
||||
"snr": 0.6440379023551941,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.7.mlp.gate_proj": {
|
||||
"snr": 0.5120116472244263,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.8.mlp.gate_proj": {
|
||||
"snr": 0.6544050574302673,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.9.mlp.gate_proj": {
|
||||
"snr": 0.5381016731262207,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.10.mlp.gate_proj": {
|
||||
"snr": 0.622873842716217,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.11.mlp.gate_proj": {
|
||||
"snr": 0.9361700415611267,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.12.mlp.gate_proj": {
|
||||
"snr": 1.475605845451355,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.13.mlp.gate_proj": {
|
||||
"snr": 1.608325719833374,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.14.mlp.gate_proj": {
|
||||
"snr": 1.0720024108886719,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.15.mlp.gate_proj": {
|
||||
"snr": 0.7111338973045349,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.0.mlp.up_proj": {
|
||||
"snr": 28.431896209716797,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.1.mlp.up_proj": {
|
||||
"snr": 15.546019554138184,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.2.mlp.up_proj": {
|
||||
"snr": 23.048023223876953,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.3.mlp.up_proj": {
|
||||
"snr": 25.790977478027344,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.4.mlp.up_proj": {
|
||||
"snr": 18.552549362182617,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.5.mlp.up_proj": {
|
||||
"snr": 8.85106372833252,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.6.mlp.up_proj": {
|
||||
"snr": 10.653799057006836,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.7.mlp.up_proj": {
|
||||
"snr": 7.365357875823975,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.8.mlp.up_proj": {
|
||||
"snr": 11.98373794555664,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.9.mlp.up_proj": {
|
||||
"snr": 8.04493236541748,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.10.mlp.up_proj": {
|
||||
"snr": 8.523039817810059,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.11.mlp.up_proj": {
|
||||
"snr": 5.381742477416992,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.12.mlp.up_proj": {
|
||||
"snr": 3.9845118522644043,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.13.mlp.up_proj": {
|
||||
"snr": 3.4893221855163574,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.14.mlp.up_proj": {
|
||||
"snr": 1.764201045036316,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.15.mlp.up_proj": {
|
||||
"snr": 0.9730708599090576,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.embed_tokens": {
|
||||
"snr": Infinity,
|
||||
"type": "model.embed_tokens"
|
||||
},
|
||||
"model.norm": {
|
||||
"snr": Infinity,
|
||||
"type": "model.norm"
|
||||
},
|
||||
"model.layers.0.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.1.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.2.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.3.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.4.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.5.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.6.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.7.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.8.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.9.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.10.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.11.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.12.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.13.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.14.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.15.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.0.self_attn.k_proj": {
|
||||
"snr": 0.11727584153413773,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.1.self_attn.k_proj": {
|
||||
"snr": 0.24786807596683502,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.2.self_attn.k_proj": {
|
||||
"snr": 0.36378130316734314,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.3.self_attn.k_proj": {
|
||||
"snr": 0.2983120381832123,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.4.self_attn.k_proj": {
|
||||
"snr": 0.33789733052253723,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.5.self_attn.k_proj": {
|
||||
"snr": 0.29155924916267395,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.6.self_attn.k_proj": {
|
||||
"snr": 0.2537297010421753,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.7.self_attn.k_proj": {
|
||||
"snr": 0.28204113245010376,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.8.self_attn.k_proj": {
|
||||
"snr": 0.2776711583137512,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.9.self_attn.k_proj": {
|
||||
"snr": 0.2927376627922058,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.10.self_attn.k_proj": {
|
||||
"snr": 0.31486213207244873,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.11.self_attn.k_proj": {
|
||||
"snr": 0.32363659143447876,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.12.self_attn.k_proj": {
|
||||
"snr": 0.31382912397384644,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.13.self_attn.k_proj": {
|
||||
"snr": 0.4635234773159027,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.14.self_attn.k_proj": {
|
||||
"snr": 0.25379249453544617,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.15.self_attn.k_proj": {
|
||||
"snr": 0.2628238797187805,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.0.self_attn.o_proj": {
|
||||
"snr": 0.27602291107177734,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.1.self_attn.o_proj": {
|
||||
"snr": 0.2149604707956314,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.2.self_attn.o_proj": {
|
||||
"snr": 0.2540294826030731,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.3.self_attn.o_proj": {
|
||||
"snr": 0.27978822588920593,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.4.self_attn.o_proj": {
|
||||
"snr": 0.3121289908885956,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.5.self_attn.o_proj": {
|
||||
"snr": 0.35037684440612793,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.6.self_attn.o_proj": {
|
||||
"snr": 0.366205096244812,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.7.self_attn.o_proj": {
|
||||
"snr": 0.3692712187767029,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.8.self_attn.o_proj": {
|
||||
"snr": 0.3301038146018982,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.9.self_attn.o_proj": {
|
||||
"snr": 0.3003396987915039,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.10.self_attn.o_proj": {
|
||||
"snr": 0.30804169178009033,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.11.self_attn.o_proj": {
|
||||
"snr": 0.28501132130622864,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.12.self_attn.o_proj": {
|
||||
"snr": 0.2171541005373001,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.13.self_attn.o_proj": {
|
||||
"snr": 0.19183959066867828,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.14.self_attn.o_proj": {
|
||||
"snr": 0.19215913116931915,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.15.self_attn.o_proj": {
|
||||
"snr": 0.25486502051353455,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.0.self_attn.q_proj": {
|
||||
"snr": 0.03850084915757179,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.1.self_attn.q_proj": {
|
||||
"snr": 0.0713055431842804,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.2.self_attn.q_proj": {
|
||||
"snr": 0.07948919385671616,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.3.self_attn.q_proj": {
|
||||
"snr": 0.08047746121883392,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.4.self_attn.q_proj": {
|
||||
"snr": 0.0852593332529068,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.5.self_attn.q_proj": {
|
||||
"snr": 0.09794823825359344,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.6.self_attn.q_proj": {
|
||||
"snr": 0.09627152234315872,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.7.self_attn.q_proj": {
|
||||
"snr": 0.11065381020307541,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.8.self_attn.q_proj": {
|
||||
"snr": 0.12031875550746918,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.9.self_attn.q_proj": {
|
||||
"snr": 0.09804573655128479,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.10.self_attn.q_proj": {
|
||||
"snr": 0.10897502303123474,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.11.self_attn.q_proj": {
|
||||
"snr": 0.09267337620258331,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.12.self_attn.q_proj": {
|
||||
"snr": 0.08803492039442062,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.13.self_attn.q_proj": {
|
||||
"snr": 0.0902542844414711,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.14.self_attn.q_proj": {
|
||||
"snr": 0.10154066979885101,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.15.self_attn.q_proj": {
|
||||
"snr": 0.09083802253007889,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.0.self_attn.v_proj": {
|
||||
"snr": 2.842210054397583,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.1.self_attn.v_proj": {
|
||||
"snr": 10.59461498260498,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.2.self_attn.v_proj": {
|
||||
"snr": 8.993025779724121,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.3.self_attn.v_proj": {
|
||||
"snr": 62.567787170410156,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.4.self_attn.v_proj": {
|
||||
"snr": 23.80082893371582,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.5.self_attn.v_proj": {
|
||||
"snr": 7.957369804382324,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.6.self_attn.v_proj": {
|
||||
"snr": 12.01815414428711,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.7.self_attn.v_proj": {
|
||||
"snr": 5.095500469207764,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.8.self_attn.v_proj": {
|
||||
"snr": 11.719332695007324,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.9.self_attn.v_proj": {
|
||||
"snr": 555.0869750976562,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.10.self_attn.v_proj": {
|
||||
"snr": 22.95538330078125,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.11.self_attn.v_proj": {
|
||||
"snr": 30.042158126831055,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.12.self_attn.v_proj": {
|
||||
"snr": 9.577271461486816,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.13.self_attn.v_proj": {
|
||||
"snr": 18.176361083984375,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.14.self_attn.v_proj": {
|
||||
"snr": 1.5695856809616089,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.15.self_attn.v_proj": {
|
||||
"snr": 2.7235565185546875,
|
||||
"type": "self_attn.v_proj"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,590 @@
|
||||
{
|
||||
"model.layers.0.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.1.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.2.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.3.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.4.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.5.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.6.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.7.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.8.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.9.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.10.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.11.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.12.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.13.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.14.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"model.layers.15.input_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "input_layernorm"
|
||||
},
|
||||
"lm_head": {
|
||||
"snr": Infinity,
|
||||
"type": "lm_head"
|
||||
},
|
||||
"model.layers.0.mlp.down_proj": {
|
||||
"snr": 57.09797286987305,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.1.mlp.down_proj": {
|
||||
"snr": 9.538983345031738,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.2.mlp.down_proj": {
|
||||
"snr": 6.227016925811768,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.3.mlp.down_proj": {
|
||||
"snr": 5.660686492919922,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.4.mlp.down_proj": {
|
||||
"snr": 5.178432464599609,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.5.mlp.down_proj": {
|
||||
"snr": 3.5638349056243896,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.6.mlp.down_proj": {
|
||||
"snr": 3.0918056964874268,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.7.mlp.down_proj": {
|
||||
"snr": 2.456392288208008,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.8.mlp.down_proj": {
|
||||
"snr": 4.525328636169434,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.9.mlp.down_proj": {
|
||||
"snr": 3.9409055709838867,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.10.mlp.down_proj": {
|
||||
"snr": 5.447249412536621,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.11.mlp.down_proj": {
|
||||
"snr": 4.807600975036621,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.12.mlp.down_proj": {
|
||||
"snr": 3.915374517440796,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.13.mlp.down_proj": {
|
||||
"snr": 3.4820363521575928,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.14.mlp.down_proj": {
|
||||
"snr": 2.6045074462890625,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.15.mlp.down_proj": {
|
||||
"snr": 3.7237701416015625,
|
||||
"type": "mlp.down_proj"
|
||||
},
|
||||
"model.layers.0.mlp.gate_proj": {
|
||||
"snr": 22.160131454467773,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.1.mlp.gate_proj": {
|
||||
"snr": 6.072206020355225,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.2.mlp.gate_proj": {
|
||||
"snr": 3.2467362880706787,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.3.mlp.gate_proj": {
|
||||
"snr": 1.4111896753311157,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.4.mlp.gate_proj": {
|
||||
"snr": 0.7405938506126404,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.5.mlp.gate_proj": {
|
||||
"snr": 0.5916463136672974,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.6.mlp.gate_proj": {
|
||||
"snr": 0.6149423718452454,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.7.mlp.gate_proj": {
|
||||
"snr": 0.48369669914245605,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.8.mlp.gate_proj": {
|
||||
"snr": 0.6047574877738953,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.9.mlp.gate_proj": {
|
||||
"snr": 0.5092479586601257,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.10.mlp.gate_proj": {
|
||||
"snr": 0.5999670624732971,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.11.mlp.gate_proj": {
|
||||
"snr": 0.8980127573013306,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.12.mlp.gate_proj": {
|
||||
"snr": 1.4252448081970215,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.13.mlp.gate_proj": {
|
||||
"snr": 1.509937047958374,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.14.mlp.gate_proj": {
|
||||
"snr": 1.0066585540771484,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.15.mlp.gate_proj": {
|
||||
"snr": 0.6413647532463074,
|
||||
"type": "mlp.gate_proj"
|
||||
},
|
||||
"model.layers.0.mlp.up_proj": {
|
||||
"snr": 26.08852195739746,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.1.mlp.up_proj": {
|
||||
"snr": 13.382951736450195,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.2.mlp.up_proj": {
|
||||
"snr": 20.088768005371094,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.3.mlp.up_proj": {
|
||||
"snr": 23.0632381439209,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.4.mlp.up_proj": {
|
||||
"snr": 16.07433319091797,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.5.mlp.up_proj": {
|
||||
"snr": 8.00507640838623,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.6.mlp.up_proj": {
|
||||
"snr": 9.538354873657227,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.7.mlp.up_proj": {
|
||||
"snr": 6.286602973937988,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.8.mlp.up_proj": {
|
||||
"snr": 10.092820167541504,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.9.mlp.up_proj": {
|
||||
"snr": 7.193963527679443,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.10.mlp.up_proj": {
|
||||
"snr": 7.320116996765137,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.11.mlp.up_proj": {
|
||||
"snr": 4.8728532791137695,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.12.mlp.up_proj": {
|
||||
"snr": 3.596583366394043,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.13.mlp.up_proj": {
|
||||
"snr": 3.166161298751831,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.14.mlp.up_proj": {
|
||||
"snr": 1.5600818395614624,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.layers.15.mlp.up_proj": {
|
||||
"snr": 0.8726214170455933,
|
||||
"type": "mlp.up_proj"
|
||||
},
|
||||
"model.embed_tokens": {
|
||||
"snr": Infinity,
|
||||
"type": "model.embed_tokens"
|
||||
},
|
||||
"model.norm": {
|
||||
"snr": Infinity,
|
||||
"type": "model.norm"
|
||||
},
|
||||
"model.layers.0.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.1.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.2.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.3.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.4.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.5.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.6.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.7.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.8.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.9.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.10.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.11.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.12.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.13.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.14.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.15.post_attention_layernorm": {
|
||||
"snr": Infinity,
|
||||
"type": "post_attention_layernorm"
|
||||
},
|
||||
"model.layers.0.self_attn.k_proj": {
|
||||
"snr": 0.1154392883181572,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.1.self_attn.k_proj": {
|
||||
"snr": 0.24299409985542297,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.2.self_attn.k_proj": {
|
||||
"snr": 0.3624322712421417,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.3.self_attn.k_proj": {
|
||||
"snr": 0.29509487748146057,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.4.self_attn.k_proj": {
|
||||
"snr": 0.32953736186027527,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.5.self_attn.k_proj": {
|
||||
"snr": 0.2908833622932434,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.6.self_attn.k_proj": {
|
||||
"snr": 0.2488437294960022,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.7.self_attn.k_proj": {
|
||||
"snr": 0.27847856283187866,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.8.self_attn.k_proj": {
|
||||
"snr": 0.27143892645835876,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.9.self_attn.k_proj": {
|
||||
"snr": 0.28804272413253784,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.10.self_attn.k_proj": {
|
||||
"snr": 0.31197959184646606,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.11.self_attn.k_proj": {
|
||||
"snr": 0.3203586935997009,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.12.self_attn.k_proj": {
|
||||
"snr": 0.30905747413635254,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.13.self_attn.k_proj": {
|
||||
"snr": 0.46828722953796387,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.14.self_attn.k_proj": {
|
||||
"snr": 0.24205778539180756,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.15.self_attn.k_proj": {
|
||||
"snr": 0.2559327781200409,
|
||||
"type": "self_attn.k_proj"
|
||||
},
|
||||
"model.layers.0.self_attn.o_proj": {
|
||||
"snr": 0.2638678550720215,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.1.self_attn.o_proj": {
|
||||
"snr": 0.21109595894813538,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.2.self_attn.o_proj": {
|
||||
"snr": 0.24751724302768707,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.3.self_attn.o_proj": {
|
||||
"snr": 0.2728094160556793,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.4.self_attn.o_proj": {
|
||||
"snr": 0.3001374304294586,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.5.self_attn.o_proj": {
|
||||
"snr": 0.33903488516807556,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.6.self_attn.o_proj": {
|
||||
"snr": 0.3530929982662201,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.7.self_attn.o_proj": {
|
||||
"snr": 0.36753255128860474,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.8.self_attn.o_proj": {
|
||||
"snr": 0.3373180329799652,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.9.self_attn.o_proj": {
|
||||
"snr": 0.2970578670501709,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.10.self_attn.o_proj": {
|
||||
"snr": 0.3076324760913849,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.11.self_attn.o_proj": {
|
||||
"snr": 0.2766900658607483,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.12.self_attn.o_proj": {
|
||||
"snr": 0.20973259210586548,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.13.self_attn.o_proj": {
|
||||
"snr": 0.18185566365718842,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.14.self_attn.o_proj": {
|
||||
"snr": 0.18329747021198273,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.15.self_attn.o_proj": {
|
||||
"snr": 0.2437991499900818,
|
||||
"type": "self_attn.o_proj"
|
||||
},
|
||||
"model.layers.0.self_attn.q_proj": {
|
||||
"snr": 0.038040731102228165,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.1.self_attn.q_proj": {
|
||||
"snr": 0.0707998052239418,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.2.self_attn.q_proj": {
|
||||
"snr": 0.0787411704659462,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.3.self_attn.q_proj": {
|
||||
"snr": 0.08089710026979446,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.4.self_attn.q_proj": {
|
||||
"snr": 0.08591937273740768,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.5.self_attn.q_proj": {
|
||||
"snr": 0.09852176159620285,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.6.self_attn.q_proj": {
|
||||
"snr": 0.09690654277801514,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.7.self_attn.q_proj": {
|
||||
"snr": 0.11181341856718063,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.8.self_attn.q_proj": {
|
||||
"snr": 0.12042108923196793,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.9.self_attn.q_proj": {
|
||||
"snr": 0.09799323976039886,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.10.self_attn.q_proj": {
|
||||
"snr": 0.10901063680648804,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.11.self_attn.q_proj": {
|
||||
"snr": 0.09307146072387695,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.12.self_attn.q_proj": {
|
||||
"snr": 0.0880950540304184,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.13.self_attn.q_proj": {
|
||||
"snr": 0.08886399120092392,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.14.self_attn.q_proj": {
|
||||
"snr": 0.09955056011676788,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.15.self_attn.q_proj": {
|
||||
"snr": 0.08929339051246643,
|
||||
"type": "self_attn.q_proj"
|
||||
},
|
||||
"model.layers.0.self_attn.v_proj": {
|
||||
"snr": 2.5501928329467773,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.1.self_attn.v_proj": {
|
||||
"snr": 9.449499130249023,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.2.self_attn.v_proj": {
|
||||
"snr": 7.9920830726623535,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.3.self_attn.v_proj": {
|
||||
"snr": 50.69462585449219,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.4.self_attn.v_proj": {
|
||||
"snr": 19.083511352539062,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.5.self_attn.v_proj": {
|
||||
"snr": 7.21597146987915,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.6.self_attn.v_proj": {
|
||||
"snr": 11.27744197845459,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.7.self_attn.v_proj": {
|
||||
"snr": 4.579711437225342,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.8.self_attn.v_proj": {
|
||||
"snr": 10.940719604492188,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.9.self_attn.v_proj": {
|
||||
"snr": 553.4417724609375,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.10.self_attn.v_proj": {
|
||||
"snr": 20.59434700012207,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.11.self_attn.v_proj": {
|
||||
"snr": 26.636865615844727,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.12.self_attn.v_proj": {
|
||||
"snr": 8.614749908447266,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.13.self_attn.v_proj": {
|
||||
"snr": 17.722007751464844,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.14.self_attn.v_proj": {
|
||||
"snr": 1.48500657081604,
|
||||
"type": "self_attn.v_proj"
|
||||
},
|
||||
"model.layers.15.self_attn.v_proj": {
|
||||
"snr": 2.5776851177215576,
|
||||
"type": "self_attn.v_proj"
|
||||
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -115,6 +115,9 @@ class RemappedParameters(BaseModel):
|
||||
overrides_of_model_config: Optional[Dict[str, Any]] = Field(
|
||||
default=None, alias="model_config"
|
||||
)
|
||||
overrides_of_model_kwargs: Optional[Dict[str, Any]] = Field(
|
||||
default=None, alias="model_kwargs"
|
||||
)
|
||||
type_of_model: Optional[str] = Field(default=None, alias="model_type")
|
||||
revision_of_model: Optional[str] = Field(default=None, alias="model_revision")
|
||||
|
||||
@@ -426,8 +429,6 @@ class ModelInputConfig(BaseModel):
|
||||
)
|
||||
trust_remote_code: Optional[bool] = None
|
||||
|
||||
model_kwargs: Optional[Dict[str, Any]] = None
|
||||
|
||||
@field_validator("trust_remote_code")
|
||||
@classmethod
|
||||
def hint_trust_remote_code(cls, trust_remote_code):
|
||||
|
||||
@@ -357,8 +357,8 @@ class ModelLoader:
|
||||
|
||||
# init model kwargs
|
||||
self.model_kwargs: Dict[str, Any] = {}
|
||||
if cfg.model_kwargs:
|
||||
for key, val in cfg.model_kwargs.items():
|
||||
if cfg.overrides_of_model_kwargs:
|
||||
for key, val in cfg.overrides_of_model_kwargs.items():
|
||||
self.model_kwargs[key] = val
|
||||
|
||||
# init model
|
||||
|
||||
Reference in New Issue
Block a user