Compare commits
74 Commits
latent-spa
...
benchmark-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c3de28942c | ||
|
|
45848a9285 | ||
|
|
d6cea18034 | ||
|
|
606846e0a5 | ||
|
|
a6c9223114 | ||
|
|
8b16ecd448 | ||
|
|
f5db88a10d | ||
|
|
99d844f215 | ||
|
|
aefd4d74fa | ||
|
|
24b0e93235 | ||
|
|
2455254b92 | ||
|
|
918e040601 | ||
|
|
ef062d8fcb | ||
|
|
d4c8b66f3d | ||
|
|
64e9824d3e | ||
|
|
1134654c98 | ||
|
|
2fc756c289 | ||
|
|
943b84c490 | ||
|
|
6f166464d8 | ||
|
|
e3b07402a7 | ||
|
|
8d3c8a3eab | ||
|
|
c30120e684 | ||
|
|
9aed60fa54 | ||
|
|
98bf76e236 | ||
|
|
4c37bd0b54 | ||
|
|
f144e98a32 | ||
|
|
3a011ea1ef | ||
|
|
1f613e5aa7 | ||
|
|
f319b0bc67 | ||
|
|
7fd662dd89 | ||
|
|
9e699683d7 | ||
|
|
35130711d6 | ||
|
|
3fc9006298 | ||
|
|
ad8be435ad | ||
|
|
fe4d6baf92 | ||
|
|
f31301063d | ||
|
|
868530c39c | ||
|
|
d03887fad5 | ||
|
|
17605b85d8 | ||
|
|
a184549e4c | ||
|
|
f311df9462 | ||
|
|
c500d02517 | ||
|
|
31f3e71764 | ||
|
|
56c4a94caf | ||
|
|
c29117a0d7 | ||
|
|
0b7ba57ec4 | ||
|
|
71bd06243c | ||
|
|
cb9797ef5a | ||
|
|
bde3c5a478 | ||
|
|
55c23c7bcb | ||
|
|
c69faee7a7 | ||
|
|
d5dcf9c350 | ||
|
|
f4746507f6 | ||
|
|
96deb6bd67 | ||
|
|
50682a3c06 | ||
|
|
5a1985ba24 | ||
|
|
5e9c6afa10 | ||
|
|
a213d9972a | ||
|
|
fbf49a4770 | ||
|
|
58cf7e7fed | ||
|
|
04a42b6db1 | ||
|
|
919f4cac90 | ||
|
|
ee262818ef | ||
|
|
9d629d8bff | ||
|
|
d2e7f27240 | ||
|
|
d21318dfb9 | ||
|
|
f733d0f31e | ||
|
|
008505c8ae | ||
|
|
b3f5e00ff5 | ||
|
|
5247c5004e | ||
|
|
cf6654769a | ||
|
|
06edf175ac | ||
|
|
0a228479b3 | ||
|
|
82e111aba9 |
11
.github/workflows/main.yml
vendored
11
.github/workflows/main.yml
vendored
@@ -13,17 +13,17 @@ jobs:
|
|||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
include:
|
include:
|
||||||
- cuda: cu118
|
- cuda: 118
|
||||||
cuda_version: 11.8.0
|
cuda_version: 11.8.0
|
||||||
python_version: "3.9"
|
python_version: "3.9"
|
||||||
pytorch: 2.0.1
|
pytorch: 2.0.1
|
||||||
axolotl_extras:
|
axolotl_extras:
|
||||||
- cuda: cu118
|
- cuda: 118
|
||||||
cuda_version: 11.8.0
|
cuda_version: 11.8.0
|
||||||
python_version: "3.10"
|
python_version: "3.10"
|
||||||
pytorch: 2.0.1
|
pytorch: 2.0.1
|
||||||
axolotl_extras:
|
axolotl_extras:
|
||||||
- cuda: cu118
|
- cuda: 118
|
||||||
cuda_version: 11.8.0
|
cuda_version: 11.8.0
|
||||||
python_version: "3.9"
|
python_version: "3.9"
|
||||||
pytorch: 2.0.1
|
pytorch: 2.0.1
|
||||||
@@ -49,10 +49,11 @@ jobs:
|
|||||||
with:
|
with:
|
||||||
context: .
|
context: .
|
||||||
build-args: |
|
build-args: |
|
||||||
BASE_TAG=${{ github.ref_name }}-base-py${{ matrix.python_version }}-${{ matrix.cuda }}-${{ matrix.pytorch }}
|
BASE_TAG=${{ github.ref_name }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
|
||||||
|
CUDA=${{ matrix.cuda }}
|
||||||
file: ./docker/Dockerfile
|
file: ./docker/Dockerfile
|
||||||
push: ${{ github.event_name != 'pull_request' }}
|
push: ${{ github.event_name != 'pull_request' }}
|
||||||
tags: ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
|
tags: ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
|
||||||
labels: ${{ steps.metadata.outputs.labels }}
|
labels: ${{ steps.metadata.outputs.labels }}
|
||||||
build-axolotl-runpod:
|
build-axolotl-runpod:
|
||||||
needs: build-axolotl
|
needs: build-axolotl
|
||||||
|
|||||||
2
.github/workflows/tests.yml
vendored
2
.github/workflows/tests.yml
vendored
@@ -24,7 +24,7 @@ jobs:
|
|||||||
|
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
run: |
|
run: |
|
||||||
pip install -e .
|
pip install -e .[peft]
|
||||||
pip install -r requirements-tests.txt
|
pip install -r requirements-tests.txt
|
||||||
|
|
||||||
- name: Run tests
|
- name: Run tests
|
||||||
|
|||||||
78
README.md
78
README.md
@@ -16,6 +16,7 @@ Axolotl is a tool designed to streamline the fine-tuning of various AI models, o
|
|||||||
- [LambdaLabs Installation](#lambdalabs)
|
- [LambdaLabs Installation](#lambdalabs)
|
||||||
- [Dataset](#dataset)
|
- [Dataset](#dataset)
|
||||||
- [How to Add Custom Prompts](#how-to-add-custom-prompts)
|
- [How to Add Custom Prompts](#how-to-add-custom-prompts)
|
||||||
|
- [How to Use Custom Pretokenized Dataset](#how-to-use-your-custom-pretokenized-dataset)
|
||||||
- [Config](#config)
|
- [Config](#config)
|
||||||
- [Train](#train)
|
- [Train](#train)
|
||||||
- [Inference](#inference)
|
- [Inference](#inference)
|
||||||
@@ -68,8 +69,9 @@ Get started with Axolotl in just a few steps! This quickstart guide will walk yo
|
|||||||
|
|
||||||
```bash
|
```bash
|
||||||
git clone https://github.com/OpenAccess-AI-Collective/axolotl
|
git clone https://github.com/OpenAccess-AI-Collective/axolotl
|
||||||
|
cd axolotl
|
||||||
|
|
||||||
pip3 install -e .
|
pip3 install -e .[flash-attn]
|
||||||
pip3 install -U git+https://github.com/huggingface/peft.git
|
pip3 install -U git+https://github.com/huggingface/peft.git
|
||||||
|
|
||||||
# finetune lora
|
# finetune lora
|
||||||
@@ -98,7 +100,7 @@ accelerate launch scripts/finetune.py examples/openllama-3b/lora.yml \
|
|||||||
```
|
```
|
||||||
|
|
||||||
- Conda/Pip venv
|
- Conda/Pip venv
|
||||||
1. Install python **3.9**
|
1. Install python >=**3.9**
|
||||||
|
|
||||||
2. Install pytorch stable https://pytorch.org/get-started/locally/
|
2. Install pytorch stable https://pytorch.org/get-started/locally/
|
||||||
|
|
||||||
@@ -151,9 +153,7 @@ accelerate launch scripts/finetune.py examples/openllama-3b/lora.yml \
|
|||||||
|
|
||||||
pip3 install -e . # change depend on needs
|
pip3 install -e . # change depend on needs
|
||||||
pip3 install protobuf==3.20.3
|
pip3 install protobuf==3.20.3
|
||||||
pip3 install -U requests
|
pip3 install -U --ignore-installed requests Pillow psutil scipy
|
||||||
pip3 install -U --ignore-installed psutil
|
|
||||||
pip3 install -U scipy
|
|
||||||
pip3 install git+https://github.com/huggingface/peft.git # not for gptq
|
pip3 install git+https://github.com/huggingface/peft.git # not for gptq
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -257,6 +257,10 @@ Have dataset(s) in one of the following format (JSONL recommended):
|
|||||||
```json
|
```json
|
||||||
{"conversations": [{"role": "...", "value": "..."}]}
|
{"conversations": [{"role": "...", "value": "..."}]}
|
||||||
```
|
```
|
||||||
|
- `metharme`: instruction, adds additional eos tokens
|
||||||
|
```json
|
||||||
|
{"prompt": "...", "generation": "..."}
|
||||||
|
```
|
||||||
- `sharegpt_simple.load_role`: conversations where `role` is used instead of `from`
|
- `sharegpt_simple.load_role`: conversations where `role` is used instead of `from`
|
||||||
```json
|
```json
|
||||||
{"conversations": [{"role": "...", "value": "..."}]}
|
{"conversations": [{"role": "...", "value": "..."}]}
|
||||||
@@ -274,11 +278,29 @@ Have dataset(s) in one of the following format (JSONL recommended):
|
|||||||
|
|
||||||
#### How to add custom prompts
|
#### How to add custom prompts
|
||||||
|
|
||||||
1. Add your method to a file in [prompt_strategies](src/axolotl/prompt_strategies). Please see other files as example.
|
Using yaml. Example:
|
||||||
2. Use your custom file name as the dataset type `<prompt_strategies_file>.load_<load_fn>`.
|
```yaml
|
||||||
|
datasets:
|
||||||
|
- path: repo
|
||||||
|
type:
|
||||||
|
system_prompt: ""
|
||||||
|
no_input_format: |-
|
||||||
|
User: {instruction}<|end_of_turn|>
|
||||||
|
Assistant:
|
||||||
|
format: |-
|
||||||
|
User: {instruction}
|
||||||
|
{input}<|end_of_turn|>
|
||||||
|
Assistant:
|
||||||
|
```
|
||||||
|
|
||||||
Optionally, download some datasets, see [data/README.md](data/README.md)
|
Using file:
|
||||||
|
1. Add your method to a file in [prompt_strategies](src/axolotl/prompt_strategies). Please see other files as example.
|
||||||
|
2. Use your custom file name as the dataset type `<prompt_strategies_file>.load_<load_fn>`.
|
||||||
|
|
||||||
|
#### How to use your custom pretokenized dataset
|
||||||
|
|
||||||
|
- Do not pass a `type:`
|
||||||
|
- The dataset must contain `input_ids`, `attention_mask`, and `labels` columns
|
||||||
|
|
||||||
|
|
||||||
### Config
|
### Config
|
||||||
@@ -308,9 +330,9 @@ See [examples](examples) for quick start. It is recommended to duplicate and mod
|
|||||||
|
|
||||||
# local
|
# local
|
||||||
datasets:
|
datasets:
|
||||||
- path: json
|
- path: data.jsonl # or json
|
||||||
data_files: data.jsonl # or json
|
ds_type: json # see other options below
|
||||||
type: alpaca # format from earlier
|
type: alpaca
|
||||||
```
|
```
|
||||||
|
|
||||||
- loading
|
- loading
|
||||||
@@ -391,10 +413,29 @@ datasets:
|
|||||||
- path: vicgalle/alpaca-gpt4
|
- path: vicgalle/alpaca-gpt4
|
||||||
# The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
|
# The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
|
||||||
type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
|
type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
|
||||||
|
ds_type: # Optional[str] (json|arrow|parquet) defines the datatype when path is a file
|
||||||
data_files: # path to source data files
|
data_files: # path to source data files
|
||||||
shards: # number of shards to split data into
|
shards: # number of shards to split data into
|
||||||
name: # name of dataset configuration to load
|
name: # name of dataset configuration to load
|
||||||
|
|
||||||
|
# custom user prompt
|
||||||
|
- path: repo
|
||||||
|
type:
|
||||||
|
# the below are defaults. only set what's needed.
|
||||||
|
system_prompt: ""
|
||||||
|
field_system: system
|
||||||
|
field_instruction: instruction
|
||||||
|
field_output: input
|
||||||
|
|
||||||
|
# customizable to be single line or multi-line
|
||||||
|
system_format: "{system}"
|
||||||
|
# 'format' can include {input}
|
||||||
|
format: |-
|
||||||
|
User: {instruction} {input}
|
||||||
|
Assistant:
|
||||||
|
# 'no_input_format' cannot include {input}
|
||||||
|
no_input_format: "{instruction} "
|
||||||
|
|
||||||
# axolotl attempts to save the dataset as an arrow after packing the data together so
|
# axolotl attempts to save the dataset as an arrow after packing the data together so
|
||||||
# subsequent training attempts load faster, relative path
|
# subsequent training attempts load faster, relative path
|
||||||
dataset_prepared_path: data/last_run_prepared
|
dataset_prepared_path: data/last_run_prepared
|
||||||
@@ -452,6 +493,12 @@ lora_modules_to_save:
|
|||||||
lora_out_dir:
|
lora_out_dir:
|
||||||
lora_fan_in_fan_out: false
|
lora_fan_in_fan_out: false
|
||||||
|
|
||||||
|
# ReLoRA configuration
|
||||||
|
# must use either 'lora' or 'qlora' adapter, and does not support fsdp or deepspeed
|
||||||
|
relora_steps: # number of steps per ReLoRA restart
|
||||||
|
relora_warmup_steps: # number of per-restart warmup steps
|
||||||
|
relora_cpu_offload: # true to perform lora weight merges on cpu during restarts, for modest gpu memory savings
|
||||||
|
|
||||||
# wandb configuration if you're using it
|
# wandb configuration if you're using it
|
||||||
wandb_mode: # "offline" to save run metadata locally and not sync to the server, "disabled" to turn off wandb
|
wandb_mode: # "offline" to save run metadata locally and not sync to the server, "disabled" to turn off wandb
|
||||||
wandb_project: # your wandb project name
|
wandb_project: # your wandb project name
|
||||||
@@ -472,8 +519,9 @@ warmup_steps: 100
|
|||||||
learning_rate: 0.00003
|
learning_rate: 0.00003
|
||||||
lr_quadratic_warmup:
|
lr_quadratic_warmup:
|
||||||
logging_steps:
|
logging_steps:
|
||||||
|
save_strategy: # set to `no` to skip checkpoint saves
|
||||||
save_steps: # leave empty to save at each epoch
|
save_steps: # leave empty to save at each epoch
|
||||||
eval_steps:
|
eval_steps: # leave empty to eval at each epoch
|
||||||
save_total_limit: # checkpoints saved at a time
|
save_total_limit: # checkpoints saved at a time
|
||||||
max_steps:
|
max_steps:
|
||||||
|
|
||||||
@@ -584,7 +632,7 @@ strict:
|
|||||||
|
|
||||||
Run
|
Run
|
||||||
```bash
|
```bash
|
||||||
accelerate launch scripts/finetune.py configs/your_config.yml
|
accelerate launch scripts/finetune.py your_config.yml
|
||||||
```
|
```
|
||||||
|
|
||||||
#### Multi-GPU
|
#### Multi-GPU
|
||||||
@@ -666,7 +714,9 @@ Please reduce any below
|
|||||||
- `gradient_accumulation_steps`
|
- `gradient_accumulation_steps`
|
||||||
- `sequence_len`
|
- `sequence_len`
|
||||||
|
|
||||||
> `failed (exitcode: -9)` usually means your system has run out of system memory.
|
> `failed (exitcode: -9)`
|
||||||
|
|
||||||
|
Usually means your system has run out of system memory.
|
||||||
Similarly, you should consider reducing the same settings as when you run out of VRAM.
|
Similarly, you should consider reducing the same settings as when you run out of VRAM.
|
||||||
Additionally, look into upgrading your system RAM which should be simpler than GPU upgrades.
|
Additionally, look into upgrading your system RAM which should be simpler than GPU upgrades.
|
||||||
|
|
||||||
|
|||||||
@@ -1,24 +0,0 @@
|
|||||||
|
|
||||||
## Download some datasets
|
|
||||||
```shell
|
|
||||||
curl https://raw.githubusercontent.com/tloen/alpaca-lora/main/alpaca_data_gpt4.json -o data/raw/alpaca_data_gpt4.json
|
|
||||||
curl https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json -L -o data/raw/vicuna_cleaned.json
|
|
||||||
curl https://github.com/teknium1/GPTeacher/blob/main/Instruct/gpt4-instruct-similarity-0.6-dataset.json?raw=true -L -o data/raw/gpt4-instruct-similarity-0.6-dataset.json
|
|
||||||
curl https://github.com/teknium1/GPTeacher/blob/main/Roleplay/roleplay-similarity_0.6-instruct-dataset.json?raw=true -L -o data/raw/roleplay-similarity_0.6-instruct-dataset.json
|
|
||||||
```
|
|
||||||
|
|
||||||
## Convert the JSON data files to JSONL.
|
|
||||||
|
|
||||||
```shell
|
|
||||||
python3 ./scripts/alpaca_json_to_jsonl.py --file data/alpaca_data_gpt4.json --output data/alpaca_data_gpt4.jsonl
|
|
||||||
python3 ./scripts/alpaca_json_to_jsonl.py --file data/raw/vicuna_cleaned.json --output data/vicuna_cleaned.jsonl
|
|
||||||
python3 ./scripts/alpaca_json_to_jsonl.py --file data/raw/roleplay-similarity_0.6-instruct-dataset.json --output data/roleplay-similarity_0.6-instruct-dataset.jsonl
|
|
||||||
python3 ./scripts/alpaca_json_to_jsonl.py --file data/raw/gpt4-instruct-similarity-0.6-dataset.json --output data/gpt4-instruct-similarity-0.6-dataset.jsonl
|
|
||||||
```
|
|
||||||
---
|
|
||||||
|
|
||||||
Using JSONL makes it easier to subset the data if you want a smaller training set, i.e get 2000 random examples.
|
|
||||||
|
|
||||||
```shell
|
|
||||||
shuf -n2000 data/vicuna_cleaned.jsonl > data/vicuna_cleaned.subset0.jsonl
|
|
||||||
```
|
|
||||||
1
data/raw/.gitignore
vendored
1
data/raw/.gitignore
vendored
@@ -1 +0,0 @@
|
|||||||
**
|
|
||||||
46
deepspeed/zero2.json
Normal file
46
deepspeed/zero2.json
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
{
|
||||||
|
"zero_optimization": {
|
||||||
|
"stage": 2,
|
||||||
|
"offload_optimizer": {
|
||||||
|
"device": "cpu"
|
||||||
|
},
|
||||||
|
"contiguous_gradients": true,
|
||||||
|
"overlap_comm": true
|
||||||
|
},
|
||||||
|
"bf16": {
|
||||||
|
"enabled": "auto"
|
||||||
|
},
|
||||||
|
"fp16": {
|
||||||
|
"enabled": "auto",
|
||||||
|
"auto_cast": false,
|
||||||
|
"loss_scale": 0,
|
||||||
|
"initial_scale_power": 32,
|
||||||
|
"loss_scale_window": 1000,
|
||||||
|
"hysteresis": 2,
|
||||||
|
"min_loss_scale": 1
|
||||||
|
},
|
||||||
|
"optimizer": {
|
||||||
|
"type": "AdamW",
|
||||||
|
"params": {
|
||||||
|
"lr": "auto",
|
||||||
|
"betas": [
|
||||||
|
0.9,
|
||||||
|
0.999
|
||||||
|
],
|
||||||
|
"eps": 1e-8,
|
||||||
|
"weight_decay": "auto"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"scheduler": {
|
||||||
|
"type": "WarmupDecayLR",
|
||||||
|
"params": {
|
||||||
|
"warmup_min_lr": "auto",
|
||||||
|
"warmup_max_lr": "auto",
|
||||||
|
"warmup_num_steps": "auto",
|
||||||
|
"total_num_steps": "auto"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"train_batch_size": "auto",
|
||||||
|
"train_micro_batch_size_per_gpu": "auto",
|
||||||
|
"wall_clock_breakdown": false
|
||||||
|
}
|
||||||
@@ -16,9 +16,9 @@ RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git
|
|||||||
# If AXOLOTL_EXTRAS is set, append it in brackets
|
# If AXOLOTL_EXTRAS is set, append it in brackets
|
||||||
RUN cd axolotl && \
|
RUN cd axolotl && \
|
||||||
if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
|
if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
|
||||||
pip install -e .[$AXOLOTL_EXTRAS]; \
|
pip install -e .[flash-attn,$AXOLOTL_EXTRAS]; \
|
||||||
else \
|
else \
|
||||||
pip install -e .; \
|
pip install -e .[flash-attn]; \
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# fix so that git fetch/pull from remote works
|
# fix so that git fetch/pull from remote works
|
||||||
|
|||||||
@@ -31,26 +31,6 @@ WORKDIR /workspace
|
|||||||
RUN python3 -m pip install --upgrade pip && pip3 install packaging && \
|
RUN python3 -m pip install --upgrade pip && pip3 install packaging && \
|
||||||
python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} --extra-index-url https://download.pytorch.org/whl/cu$CUDA
|
python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} --extra-index-url https://download.pytorch.org/whl/cu$CUDA
|
||||||
|
|
||||||
|
|
||||||
FROM base-builder AS flash-attn-builder
|
|
||||||
|
|
||||||
WORKDIR /workspace
|
|
||||||
|
|
||||||
ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
|
|
||||||
|
|
||||||
RUN git clone https://github.com/Dao-AILab/flash-attention.git && \
|
|
||||||
cd flash-attention && \
|
|
||||||
git checkout v2.0.4 && \
|
|
||||||
python3 setup.py bdist_wheel && \
|
|
||||||
cd csrc/fused_dense_lib && \
|
|
||||||
python3 setup.py bdist_wheel && \
|
|
||||||
cd ../xentropy && \
|
|
||||||
python3 setup.py bdist_wheel && \
|
|
||||||
cd ../rotary && \
|
|
||||||
python3 setup.py bdist_wheel && \
|
|
||||||
cd ../layer_norm && \
|
|
||||||
python3 setup.py bdist_wheel
|
|
||||||
|
|
||||||
FROM base-builder AS deepspeed-builder
|
FROM base-builder AS deepspeed-builder
|
||||||
|
|
||||||
ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
|
ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
|
||||||
@@ -90,13 +70,8 @@ RUN mkdir -p /workspace/wheels/bitsandbytes
|
|||||||
COPY --from=deepspeed-builder /workspace/DeepSpeed/dist/deepspeed-*.whl wheels
|
COPY --from=deepspeed-builder /workspace/DeepSpeed/dist/deepspeed-*.whl wheels
|
||||||
COPY --from=bnb-builder /workspace/bitsandbytes/dist/bitsandbytes-*.whl wheels
|
COPY --from=bnb-builder /workspace/bitsandbytes/dist/bitsandbytes-*.whl wheels
|
||||||
COPY --from=bnb-builder /workspace/bitsandbytes/bitsandbytes/libbitsandbytes*.so wheels/bitsandbytes
|
COPY --from=bnb-builder /workspace/bitsandbytes/bitsandbytes/libbitsandbytes*.so wheels/bitsandbytes
|
||||||
COPY --from=flash-attn-builder /workspace/flash-attention/dist/flash_attn-*.whl wheels
|
|
||||||
COPY --from=flash-attn-builder /workspace/flash-attention/csrc/fused_dense_lib/dist/fused_dense_lib-*.whl wheels
|
|
||||||
COPY --from=flash-attn-builder /workspace/flash-attention/csrc/xentropy/dist/xentropy_cuda_lib-*.whl wheels
|
|
||||||
COPY --from=flash-attn-builder /workspace/flash-attention/csrc/rotary/dist/rotary_emb-*.whl wheels
|
|
||||||
COPY --from=flash-attn-builder /workspace/flash-attention/csrc/layer_norm/dist/dropout_layer_norm-*.whl wheels
|
|
||||||
|
|
||||||
RUN pip3 install wheels/deepspeed-*.whl wheels/flash_attn-*.whl wheels/fused_dense_lib-*.whl wheels/xentropy_cuda_lib-*.whl wheels/rotary_emb-*.whl wheels/dropout_layer_norm-*.whl
|
RUN pip3 install wheels/deepspeed-*.whl
|
||||||
RUN cd /workspace/builds/bitsandbytes && python3 setup.py install
|
RUN cd /workspace/builds/bitsandbytes && python3 setup.py install
|
||||||
RUN git lfs install --skip-repo
|
RUN git lfs install --skip-repo
|
||||||
RUN pip3 install awscli && \
|
RUN pip3 install awscli && \
|
||||||
|
|||||||
67
examples/code-llama/13b/lora.yml
Normal file
67
examples/code-llama/13b/lora.yml
Normal file
@@ -0,0 +1,67 @@
|
|||||||
|
base_model: codellama/CodeLlama-13b-hf
|
||||||
|
base_model_config: codellama/CodeLlama-13b-hf
|
||||||
|
model_type: LlamaForCausalLM
|
||||||
|
tokenizer_type: CodeLlamaTokenizer
|
||||||
|
is_llama_derived_model: true
|
||||||
|
|
||||||
|
load_in_8bit: true
|
||||||
|
load_in_4bit: false
|
||||||
|
strict: false
|
||||||
|
|
||||||
|
datasets:
|
||||||
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
|
type: alpaca
|
||||||
|
dataset_prepared_path: last_run_prepared
|
||||||
|
val_set_size: 0.01
|
||||||
|
output_dir: ./lora-out
|
||||||
|
|
||||||
|
sequence_len: 100000
|
||||||
|
sample_packing: true
|
||||||
|
|
||||||
|
adapter: lora
|
||||||
|
lora_model_dir:
|
||||||
|
lora_r: 32
|
||||||
|
lora_alpha: 16
|
||||||
|
lora_dropout: 0.05
|
||||||
|
lora_target_linear: true
|
||||||
|
lora_fan_in_fan_out:
|
||||||
|
|
||||||
|
wandb_project:
|
||||||
|
wandb_entity:
|
||||||
|
wandb_watch:
|
||||||
|
wandb_run_id:
|
||||||
|
wandb_log_model:
|
||||||
|
|
||||||
|
gradient_accumulation_steps: 4
|
||||||
|
micro_batch_size: 2
|
||||||
|
num_epochs: 3
|
||||||
|
optimizer: adamw_bnb_8bit
|
||||||
|
lr_scheduler: cosine
|
||||||
|
learning_rate: 0.0002
|
||||||
|
|
||||||
|
train_on_inputs: false
|
||||||
|
group_by_length: false
|
||||||
|
bf16: true
|
||||||
|
fp16: false
|
||||||
|
tf32: false
|
||||||
|
|
||||||
|
gradient_checkpointing: true
|
||||||
|
early_stopping_patience:
|
||||||
|
resume_from_checkpoint:
|
||||||
|
local_rank:
|
||||||
|
logging_steps: 1
|
||||||
|
xformers_attention:
|
||||||
|
flash_attention: true
|
||||||
|
|
||||||
|
warmup_steps: 10
|
||||||
|
eval_steps: 20
|
||||||
|
save_steps:
|
||||||
|
debug:
|
||||||
|
deepspeed:
|
||||||
|
weight_decay: 0.0
|
||||||
|
fsdp:
|
||||||
|
fsdp_config:
|
||||||
|
special_tokens:
|
||||||
|
bos_token: "<s>"
|
||||||
|
eos_token: "</s>"
|
||||||
|
unk_token: "<unk>"
|
||||||
69
examples/code-llama/13b/qlora.yml
Normal file
69
examples/code-llama/13b/qlora.yml
Normal file
@@ -0,0 +1,69 @@
|
|||||||
|
base_model: codellama/CodeLlama-13b-hf
|
||||||
|
base_model_config: codellama/CodeLlama-13b-hf
|
||||||
|
model_type: LlamaForCausalLM
|
||||||
|
tokenizer_type: CodeLlamaTokenizer
|
||||||
|
is_llama_derived_model: true
|
||||||
|
|
||||||
|
load_in_8bit: false
|
||||||
|
load_in_4bit: true
|
||||||
|
strict: false
|
||||||
|
|
||||||
|
datasets:
|
||||||
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
|
type: alpaca
|
||||||
|
dataset_prepared_path: last_run_prepared
|
||||||
|
val_set_size: 0.01
|
||||||
|
output_dir: ./qlora-out
|
||||||
|
|
||||||
|
adapter: qlora
|
||||||
|
lora_model_dir:
|
||||||
|
|
||||||
|
sequence_len: 100000
|
||||||
|
sample_packing: true
|
||||||
|
|
||||||
|
lora_r: 32
|
||||||
|
lora_alpha: 16
|
||||||
|
lora_dropout: 0.05
|
||||||
|
lora_target_modules:
|
||||||
|
lora_target_linear: true
|
||||||
|
lora_fan_in_fan_out:
|
||||||
|
|
||||||
|
wandb_project:
|
||||||
|
wandb_entity:
|
||||||
|
wandb_watch:
|
||||||
|
wandb_run_id:
|
||||||
|
wandb_log_model:
|
||||||
|
|
||||||
|
gradient_accumulation_steps: 4
|
||||||
|
micro_batch_size: 2
|
||||||
|
num_epochs: 3
|
||||||
|
optimizer: paged_adamw_32bit
|
||||||
|
lr_scheduler: cosine
|
||||||
|
learning_rate: 0.0002
|
||||||
|
|
||||||
|
train_on_inputs: false
|
||||||
|
group_by_length: false
|
||||||
|
bf16: true
|
||||||
|
fp16: false
|
||||||
|
tf32: false
|
||||||
|
|
||||||
|
gradient_checkpointing: true
|
||||||
|
early_stopping_patience:
|
||||||
|
resume_from_checkpoint:
|
||||||
|
local_rank:
|
||||||
|
logging_steps: 1
|
||||||
|
xformers_attention:
|
||||||
|
flash_attention: true
|
||||||
|
|
||||||
|
warmup_steps: 10
|
||||||
|
eval_steps: 20
|
||||||
|
save_steps:
|
||||||
|
debug:
|
||||||
|
deepspeed:
|
||||||
|
weight_decay: 0.0
|
||||||
|
fsdp:
|
||||||
|
fsdp_config:
|
||||||
|
special_tokens:
|
||||||
|
bos_token: "<s>"
|
||||||
|
eos_token: "</s>"
|
||||||
|
unk_token: "<unk>"
|
||||||
67
examples/code-llama/34b/lora.yml
Normal file
67
examples/code-llama/34b/lora.yml
Normal file
@@ -0,0 +1,67 @@
|
|||||||
|
base_model: codellama/CodeLlama-34b-hf
|
||||||
|
base_model_config: codellama/CodeLlama-34b-hf
|
||||||
|
model_type: LlamaForCausalLM
|
||||||
|
tokenizer_type: CodeLlamaTokenizer
|
||||||
|
is_llama_derived_model: true
|
||||||
|
|
||||||
|
load_in_8bit: true
|
||||||
|
load_in_4bit: false
|
||||||
|
strict: false
|
||||||
|
|
||||||
|
datasets:
|
||||||
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
|
type: alpaca
|
||||||
|
dataset_prepared_path: last_run_prepared
|
||||||
|
val_set_size: 0.01
|
||||||
|
output_dir: ./lora-out
|
||||||
|
|
||||||
|
sequence_len: 100000
|
||||||
|
sample_packing: true
|
||||||
|
|
||||||
|
adapter: lora
|
||||||
|
lora_model_dir:
|
||||||
|
lora_r: 32
|
||||||
|
lora_alpha: 16
|
||||||
|
lora_dropout: 0.05
|
||||||
|
lora_target_linear: true
|
||||||
|
lora_fan_in_fan_out:
|
||||||
|
|
||||||
|
wandb_project:
|
||||||
|
wandb_entity:
|
||||||
|
wandb_watch:
|
||||||
|
wandb_run_id:
|
||||||
|
wandb_log_model:
|
||||||
|
|
||||||
|
gradient_accumulation_steps: 4
|
||||||
|
micro_batch_size: 2
|
||||||
|
num_epochs: 3
|
||||||
|
optimizer: adamw_bnb_8bit
|
||||||
|
lr_scheduler: cosine
|
||||||
|
learning_rate: 0.0002
|
||||||
|
|
||||||
|
train_on_inputs: false
|
||||||
|
group_by_length: false
|
||||||
|
bf16: true
|
||||||
|
fp16: false
|
||||||
|
tf32: false
|
||||||
|
|
||||||
|
gradient_checkpointing: true
|
||||||
|
early_stopping_patience:
|
||||||
|
resume_from_checkpoint:
|
||||||
|
local_rank:
|
||||||
|
logging_steps: 1
|
||||||
|
xformers_attention:
|
||||||
|
flash_attention: true
|
||||||
|
|
||||||
|
warmup_steps: 10
|
||||||
|
eval_steps: 20
|
||||||
|
save_steps:
|
||||||
|
debug:
|
||||||
|
deepspeed:
|
||||||
|
weight_decay: 0.0
|
||||||
|
fsdp:
|
||||||
|
fsdp_config:
|
||||||
|
special_tokens:
|
||||||
|
bos_token: "<s>"
|
||||||
|
eos_token: "</s>"
|
||||||
|
unk_token: "<unk>"
|
||||||
69
examples/code-llama/34b/qlora.yml
Normal file
69
examples/code-llama/34b/qlora.yml
Normal file
@@ -0,0 +1,69 @@
|
|||||||
|
base_model: codellama/CodeLlama-34b-hf
|
||||||
|
base_model_config: codellama/CodeLlama-34b-hf
|
||||||
|
model_type: LlamaForCausalLM
|
||||||
|
tokenizer_type: CodeLlamaTokenizer
|
||||||
|
is_llama_derived_model: true
|
||||||
|
|
||||||
|
load_in_8bit: false
|
||||||
|
load_in_4bit: true
|
||||||
|
strict: false
|
||||||
|
|
||||||
|
datasets:
|
||||||
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
|
type: alpaca
|
||||||
|
dataset_prepared_path: last_run_prepared
|
||||||
|
val_set_size: 0.01
|
||||||
|
output_dir: ./qlora-out
|
||||||
|
|
||||||
|
adapter: qlora
|
||||||
|
lora_model_dir:
|
||||||
|
|
||||||
|
sequence_len: 100000
|
||||||
|
sample_packing: true
|
||||||
|
|
||||||
|
lora_r: 32
|
||||||
|
lora_alpha: 16
|
||||||
|
lora_dropout: 0.05
|
||||||
|
lora_target_modules:
|
||||||
|
lora_target_linear: true
|
||||||
|
lora_fan_in_fan_out:
|
||||||
|
|
||||||
|
wandb_project:
|
||||||
|
wandb_entity:
|
||||||
|
wandb_watch:
|
||||||
|
wandb_run_id:
|
||||||
|
wandb_log_model:
|
||||||
|
|
||||||
|
gradient_accumulation_steps: 4
|
||||||
|
micro_batch_size: 2
|
||||||
|
num_epochs: 3
|
||||||
|
optimizer: paged_adamw_32bit
|
||||||
|
lr_scheduler: cosine
|
||||||
|
learning_rate: 0.0002
|
||||||
|
|
||||||
|
train_on_inputs: false
|
||||||
|
group_by_length: false
|
||||||
|
bf16: true
|
||||||
|
fp16: false
|
||||||
|
tf32: false
|
||||||
|
|
||||||
|
gradient_checkpointing: true
|
||||||
|
early_stopping_patience:
|
||||||
|
resume_from_checkpoint:
|
||||||
|
local_rank:
|
||||||
|
logging_steps: 1
|
||||||
|
xformers_attention:
|
||||||
|
flash_attention: true
|
||||||
|
|
||||||
|
warmup_steps: 10
|
||||||
|
eval_steps: 20
|
||||||
|
save_steps:
|
||||||
|
debug:
|
||||||
|
deepspeed:
|
||||||
|
weight_decay: 0.0
|
||||||
|
fsdp:
|
||||||
|
fsdp_config:
|
||||||
|
special_tokens:
|
||||||
|
bos_token: "<s>"
|
||||||
|
eos_token: "</s>"
|
||||||
|
unk_token: "<unk>"
|
||||||
67
examples/code-llama/7b/lora.yml
Normal file
67
examples/code-llama/7b/lora.yml
Normal file
@@ -0,0 +1,67 @@
|
|||||||
|
base_model: codellama/CodeLlama-7b-hf
|
||||||
|
base_model_config: codellama/CodeLlama-7b-hf
|
||||||
|
model_type: LlamaForCausalLM
|
||||||
|
tokenizer_type: CodeLlamaTokenizer
|
||||||
|
is_llama_derived_model: true
|
||||||
|
|
||||||
|
load_in_8bit: true
|
||||||
|
load_in_4bit: false
|
||||||
|
strict: false
|
||||||
|
|
||||||
|
datasets:
|
||||||
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
|
type: alpaca
|
||||||
|
dataset_prepared_path: last_run_prepared
|
||||||
|
val_set_size: 0.01
|
||||||
|
output_dir: ./lora-out
|
||||||
|
|
||||||
|
sequence_len: 100000
|
||||||
|
sample_packing: true
|
||||||
|
|
||||||
|
adapter: lora
|
||||||
|
lora_model_dir:
|
||||||
|
lora_r: 32
|
||||||
|
lora_alpha: 16
|
||||||
|
lora_dropout: 0.05
|
||||||
|
lora_target_linear: true
|
||||||
|
lora_fan_in_fan_out:
|
||||||
|
|
||||||
|
wandb_project:
|
||||||
|
wandb_entity:
|
||||||
|
wandb_watch:
|
||||||
|
wandb_run_id:
|
||||||
|
wandb_log_model:
|
||||||
|
|
||||||
|
gradient_accumulation_steps: 4
|
||||||
|
micro_batch_size: 2
|
||||||
|
num_epochs: 3
|
||||||
|
optimizer: adamw_bnb_8bit
|
||||||
|
lr_scheduler: cosine
|
||||||
|
learning_rate: 0.0002
|
||||||
|
|
||||||
|
train_on_inputs: false
|
||||||
|
group_by_length: false
|
||||||
|
bf16: true
|
||||||
|
fp16: false
|
||||||
|
tf32: false
|
||||||
|
|
||||||
|
gradient_checkpointing: true
|
||||||
|
early_stopping_patience:
|
||||||
|
resume_from_checkpoint:
|
||||||
|
local_rank:
|
||||||
|
logging_steps: 1
|
||||||
|
xformers_attention:
|
||||||
|
flash_attention: true
|
||||||
|
|
||||||
|
warmup_steps: 10
|
||||||
|
eval_steps: 20
|
||||||
|
save_steps:
|
||||||
|
debug:
|
||||||
|
deepspeed:
|
||||||
|
weight_decay: 0.0
|
||||||
|
fsdp:
|
||||||
|
fsdp_config:
|
||||||
|
special_tokens:
|
||||||
|
bos_token: "<s>"
|
||||||
|
eos_token: "</s>"
|
||||||
|
unk_token: "<unk>"
|
||||||
69
examples/code-llama/7b/qlora.yml
Normal file
69
examples/code-llama/7b/qlora.yml
Normal file
@@ -0,0 +1,69 @@
|
|||||||
|
base_model: codellama/CodeLlama-7b-hf
|
||||||
|
base_model_config: codellama/CodeLlama-7b-hf
|
||||||
|
model_type: LlamaForCausalLM
|
||||||
|
tokenizer_type: CodeLlamaTokenizer
|
||||||
|
is_llama_derived_model: true
|
||||||
|
|
||||||
|
load_in_8bit: false
|
||||||
|
load_in_4bit: true
|
||||||
|
strict: false
|
||||||
|
|
||||||
|
datasets:
|
||||||
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
|
type: alpaca
|
||||||
|
dataset_prepared_path: last_run_prepared
|
||||||
|
val_set_size: 0.01
|
||||||
|
output_dir: ./qlora-out
|
||||||
|
|
||||||
|
adapter: qlora
|
||||||
|
lora_model_dir:
|
||||||
|
|
||||||
|
sequence_len: 100000
|
||||||
|
sample_packing: true
|
||||||
|
|
||||||
|
lora_r: 32
|
||||||
|
lora_alpha: 16
|
||||||
|
lora_dropout: 0.05
|
||||||
|
lora_target_modules:
|
||||||
|
lora_target_linear: true
|
||||||
|
lora_fan_in_fan_out:
|
||||||
|
|
||||||
|
wandb_project:
|
||||||
|
wandb_entity:
|
||||||
|
wandb_watch:
|
||||||
|
wandb_run_id:
|
||||||
|
wandb_log_model:
|
||||||
|
|
||||||
|
gradient_accumulation_steps: 4
|
||||||
|
micro_batch_size: 2
|
||||||
|
num_epochs: 3
|
||||||
|
optimizer: paged_adamw_32bit
|
||||||
|
lr_scheduler: cosine
|
||||||
|
learning_rate: 0.0002
|
||||||
|
|
||||||
|
train_on_inputs: false
|
||||||
|
group_by_length: false
|
||||||
|
bf16: true
|
||||||
|
fp16: false
|
||||||
|
tf32: false
|
||||||
|
|
||||||
|
gradient_checkpointing: true
|
||||||
|
early_stopping_patience:
|
||||||
|
resume_from_checkpoint:
|
||||||
|
local_rank:
|
||||||
|
logging_steps: 1
|
||||||
|
xformers_attention:
|
||||||
|
flash_attention: true
|
||||||
|
|
||||||
|
warmup_steps: 10
|
||||||
|
eval_steps: 20
|
||||||
|
save_steps:
|
||||||
|
debug:
|
||||||
|
deepspeed:
|
||||||
|
weight_decay: 0.0
|
||||||
|
fsdp:
|
||||||
|
fsdp_config:
|
||||||
|
special_tokens:
|
||||||
|
bos_token: "<s>"
|
||||||
|
eos_token: "</s>"
|
||||||
|
unk_token: "<unk>"
|
||||||
22
examples/code-llama/README.md
Normal file
22
examples/code-llama/README.md
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
# Overview
|
||||||
|
|
||||||
|
This is an example CodeLlama configuration for the 7b, 13b and 34b models.
|
||||||
|
|
||||||
|
The 7b variant fits on any 24GB VRAM GPU and will take up about 17 GB of VRAM during training if using qlora and 20 GB if using lora. On an RTX 4090 it trains 3 epochs of the default dataset in about 15 minutes.
|
||||||
|
|
||||||
|
The 13b variant will fit if you change these settings to these values:
|
||||||
|
gradient_accumulation_steps: 2
|
||||||
|
micro_batch_size: 1
|
||||||
|
|
||||||
|
The 34b variant does not fit in 24 GB of VRAM - you will need a GPU with 40+ GB of VRAM that also supports flash attention v2 - an A6000 or A100 is a good choice.
|
||||||
|
|
||||||
|
```shell
|
||||||
|
accelerate launch scripts/finetune.py examples/code-llama/[MODEL_SIZE]/qlora.yml
|
||||||
|
|
||||||
|
```
|
||||||
|
or
|
||||||
|
|
||||||
|
```shell
|
||||||
|
accelerate launch scripts/finetune.py examples/code-llama/[MODEL_SIZE]/lora.yml
|
||||||
|
|
||||||
|
```
|
||||||
@@ -57,7 +57,7 @@ weight_decay: 0.0001
|
|||||||
fsdp:
|
fsdp:
|
||||||
fsdp_config:
|
fsdp_config:
|
||||||
tokens:
|
tokens:
|
||||||
pad_token: "[PAD]"
|
pad_token: "<pad>"
|
||||||
bos_token: "<s>"
|
bos_token: "<s>"
|
||||||
eos_token: "</s>"
|
eos_token: "</s>"
|
||||||
unk_token: "<unk>"
|
unk_token: "<unk>"
|
||||||
|
|||||||
73
examples/llama-2/relora.yml
Normal file
73
examples/llama-2/relora.yml
Normal file
@@ -0,0 +1,73 @@
|
|||||||
|
base_model: meta-llama/Llama-2-7b-hf
|
||||||
|
base_model_config: meta-llama/Llama-2-7b-hf
|
||||||
|
model_type: LlamaForCausalLM
|
||||||
|
tokenizer_type: LlamaTokenizer
|
||||||
|
is_llama_derived_model: true
|
||||||
|
|
||||||
|
load_in_8bit: false
|
||||||
|
load_in_4bit: true
|
||||||
|
strict: false
|
||||||
|
|
||||||
|
datasets:
|
||||||
|
- path: teknium/GPT4-LLM-Cleaned
|
||||||
|
type: alpaca
|
||||||
|
dataset_prepared_path: last_run_prepared
|
||||||
|
val_set_size: 0.01
|
||||||
|
output_dir: ./relora-out
|
||||||
|
|
||||||
|
adapter: qlora
|
||||||
|
lora_model_dir:
|
||||||
|
|
||||||
|
sequence_len: 4096
|
||||||
|
sample_packing: true
|
||||||
|
|
||||||
|
lora_r: 8
|
||||||
|
lora_alpha: 16
|
||||||
|
lora_dropout: 0.05
|
||||||
|
lora_target_modules:
|
||||||
|
lora_target_linear: true
|
||||||
|
lora_fan_in_fan_out:
|
||||||
|
|
||||||
|
relora_steps: 150
|
||||||
|
relora_warmup_steps: 10
|
||||||
|
relora_cpu_offload: false
|
||||||
|
|
||||||
|
wandb_project:
|
||||||
|
wandb_entity:
|
||||||
|
wandb_watch:
|
||||||
|
wandb_run_id:
|
||||||
|
wandb_log_model:
|
||||||
|
|
||||||
|
gradient_accumulation_steps: 4
|
||||||
|
micro_batch_size: 4
|
||||||
|
num_epochs: 3
|
||||||
|
optimizer: adamw_bnb_8bit
|
||||||
|
lr_scheduler: cosine
|
||||||
|
learning_rate: 0.0002
|
||||||
|
|
||||||
|
train_on_inputs: false
|
||||||
|
group_by_length: false
|
||||||
|
bf16: true
|
||||||
|
fp16: false
|
||||||
|
tf32: false
|
||||||
|
|
||||||
|
gradient_checkpointing: true
|
||||||
|
early_stopping_patience:
|
||||||
|
resume_from_checkpoint:
|
||||||
|
local_rank:
|
||||||
|
logging_steps: 1
|
||||||
|
xformers_attention:
|
||||||
|
flash_attention: true
|
||||||
|
|
||||||
|
warmup_steps: 10
|
||||||
|
eval_steps: 20
|
||||||
|
save_steps: 50
|
||||||
|
debug:
|
||||||
|
deepspeed:
|
||||||
|
weight_decay: 0.0
|
||||||
|
fsdp:
|
||||||
|
fsdp_config:
|
||||||
|
special_tokens:
|
||||||
|
bos_token: "<s>"
|
||||||
|
eos_token: "</s>"
|
||||||
|
unk_token: "<unk>"
|
||||||
@@ -1,20 +1,23 @@
|
|||||||
|
packaging
|
||||||
peft @ git+https://github.com/huggingface/peft.git
|
peft @ git+https://github.com/huggingface/peft.git
|
||||||
transformers @ git+https://github.com/huggingface/transformers.git
|
transformers @ git+https://github.com/huggingface/transformers.git
|
||||||
bitsandbytes>=0.41.1
|
bitsandbytes>=0.41.1
|
||||||
accelerate @ git+https://github.com/huggingface/accelerate@2a289f6108e77a77a4efffb3f6316bc98538413b
|
accelerate @ git+https://github.com/huggingface/accelerate@2a289f6108e77a77a4efffb3f6316bc98538413b
|
||||||
addict
|
addict
|
||||||
|
evaluate
|
||||||
fire
|
fire
|
||||||
PyYAML==6.0
|
PyYAML>=6.0
|
||||||
datasets
|
datasets
|
||||||
accelerate>=0.19.0
|
flash-attn>=2.0.8
|
||||||
sentencepiece
|
sentencepiece
|
||||||
wandb
|
wandb
|
||||||
einops
|
einops
|
||||||
xformers
|
xformers
|
||||||
optimum
|
optimum
|
||||||
hf_transfer
|
hf_transfer
|
||||||
|
colorama
|
||||||
numba
|
numba
|
||||||
numpy==1.24.4
|
numpy>=1.24.4
|
||||||
# qlora things
|
# qlora things
|
||||||
bert-score==0.3.13
|
bert-score==0.3.13
|
||||||
evaluate==0.4.0
|
evaluate==0.4.0
|
||||||
|
|||||||
@@ -1,52 +0,0 @@
|
|||||||
"""Module to convert json file to jsonl"""
|
|
||||||
|
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Optional, Union
|
|
||||||
|
|
||||||
import fire
|
|
||||||
|
|
||||||
from axolotl.convert import (
|
|
||||||
FileReader,
|
|
||||||
FileWriter,
|
|
||||||
JsonlSerializer,
|
|
||||||
JsonParser,
|
|
||||||
JsonToJsonlConverter,
|
|
||||||
StdoutWriter,
|
|
||||||
)
|
|
||||||
from axolotl.logging_config import configure_logging
|
|
||||||
|
|
||||||
configure_logging()
|
|
||||||
|
|
||||||
# add src to the pythonpath so we don't need to pip install this
|
|
||||||
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
|
|
||||||
src_dir = os.path.join(project_root, "src")
|
|
||||||
sys.path.insert(0, src_dir)
|
|
||||||
|
|
||||||
|
|
||||||
def main(
    file: Path,
    output: Optional[Path] = None,
    to_stdout: Optional[bool] = False,
):
    """
    Convert the JSON file at `file` to JSON Lines.

    The result goes to stdout when `to_stdout` is truthy or when no
    `output` path is supplied; otherwise it is written to `output`.
    """
    reader = FileReader()

    # Pick the sink: stdout wins whenever it is requested or no path is given.
    sink: Union[StdoutWriter, FileWriter] = (
        StdoutWriter() if (to_stdout or output is None) else FileWriter(output)
    )

    pipeline = JsonToJsonlConverter(reader, sink, JsonParser(), JsonlSerializer())
    pipeline.convert(file, output)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
fire.Fire(main)
|
|
||||||
@@ -82,6 +82,8 @@ def do_inference(cfg, model, tokenizer, prompter: Optional[str]):
|
|||||||
max_seq_len=255, mem_freq=50, top_k=5, max_cache_size=None
|
max_seq_len=255, mem_freq=50, top_k=5, max_cache_size=None
|
||||||
)
|
)
|
||||||
|
|
||||||
|
model = model.to(cfg.device)
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
print("=" * 80)
|
print("=" * 80)
|
||||||
# support for multiline inputs
|
# support for multiline inputs
|
||||||
@@ -242,6 +244,21 @@ def train(
|
|||||||
model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization)
|
model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
if cfg.resume_from_checkpoint is None and cfg.auto_resume_from_checkpoints:
|
||||||
|
possible_checkpoints = [
|
||||||
|
str(cp) for cp in Path(cfg.output_dir).glob("checkpoint-*")
|
||||||
|
]
|
||||||
|
if len(possible_checkpoints) > 0:
|
||||||
|
sorted_paths = sorted(
|
||||||
|
possible_checkpoints,
|
||||||
|
key=lambda path: int(path.split("-")[-1]),
|
||||||
|
)
|
||||||
|
cfg.resume_from_checkpoint = sorted_paths[-1]
|
||||||
|
LOG.info(
|
||||||
|
f"Using Auto-resume functionality to start with checkpoint at {cfg.resume_from_checkpoint}"
|
||||||
|
)
|
||||||
|
resume_from_checkpoint = cfg.resume_from_checkpoint
|
||||||
|
|
||||||
trainer = setup_trainer(
|
trainer = setup_trainer(
|
||||||
cfg, train_dataset, eval_dataset, model, tokenizer, total_num_steps
|
cfg, train_dataset, eval_dataset, model, tokenizer, total_num_steps
|
||||||
)
|
)
|
||||||
@@ -273,20 +290,6 @@ def train(
|
|||||||
LOG.info("Starting trainer...")
|
LOG.info("Starting trainer...")
|
||||||
if cfg.group_by_length:
|
if cfg.group_by_length:
|
||||||
LOG.info("hang tight... sorting dataset for group_by_length")
|
LOG.info("hang tight... sorting dataset for group_by_length")
|
||||||
resume_from_checkpoint = cfg.resume_from_checkpoint
|
|
||||||
if cfg.resume_from_checkpoint is None and cfg.auto_resume_from_checkpoints:
|
|
||||||
possible_checkpoints = [
|
|
||||||
str(cp) for cp in Path(cfg.output_dir).glob("checkpoint-*")
|
|
||||||
]
|
|
||||||
if len(possible_checkpoints) > 0:
|
|
||||||
sorted_paths = sorted(
|
|
||||||
possible_checkpoints,
|
|
||||||
key=lambda path: int(path.split("-")[-1]),
|
|
||||||
)
|
|
||||||
resume_from_checkpoint = sorted_paths[-1]
|
|
||||||
LOG.info(
|
|
||||||
f"Using Auto-resume functionality to start with checkpoint at {resume_from_checkpoint}"
|
|
||||||
)
|
|
||||||
|
|
||||||
if not Path(cfg.output_dir).is_dir():
|
if not Path(cfg.output_dir).is_dir():
|
||||||
os.makedirs(cfg.output_dir, exist_ok=True)
|
os.makedirs(cfg.output_dir, exist_ok=True)
|
||||||
@@ -301,6 +304,13 @@ def train(
|
|||||||
|
|
||||||
LOG.info(f"Training Completed!!! Saving pre-trained model to {cfg.output_dir}")
|
LOG.info(f"Training Completed!!! Saving pre-trained model to {cfg.output_dir}")
|
||||||
|
|
||||||
|
if cfg.relora_steps:
|
||||||
|
if cfg.adapter == "lora" and not (cfg.load_in_4bit or cfg.load_in_8bit):
|
||||||
|
model = model.merge_and_unload()
|
||||||
|
else:
|
||||||
|
# final model weights have already been saved by `ReLoRACallback.on_train_end`
|
||||||
|
return
|
||||||
|
|
||||||
# TODO do we need this fix? https://huggingface.co/docs/accelerate/usage_guides/fsdp#saving-and-loading
|
# TODO do we need this fix? https://huggingface.co/docs/accelerate/usage_guides/fsdp#saving-and-loading
|
||||||
# only save on rank 0, otherwise it corrupts output on multi-GPU when multiple processes attempt to write the same file
|
# only save on rank 0, otherwise it corrupts output on multi-GPU when multiple processes attempt to write the same file
|
||||||
if cfg.fsdp:
|
if cfg.fsdp:
|
||||||
@@ -308,6 +318,7 @@ def train(
|
|||||||
elif cfg.local_rank == 0:
|
elif cfg.local_rank == 0:
|
||||||
if cfg.flash_optimum:
|
if cfg.flash_optimum:
|
||||||
model = BetterTransformer.reverse(model)
|
model = BetterTransformer.reverse(model)
|
||||||
|
|
||||||
model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization)
|
model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
8
setup.py
8
setup.py
@@ -7,6 +7,7 @@ with open("./requirements.txt", encoding="utf-8") as requirements_file:
|
|||||||
# don't include peft yet until we check the int4
|
# don't include peft yet until we check the int4
|
||||||
# need to manually install peft for now...
|
# need to manually install peft for now...
|
||||||
reqs = [r.strip() for r in requirements_file.readlines() if "peft" not in r]
|
reqs = [r.strip() for r in requirements_file.readlines() if "peft" not in r]
|
||||||
|
reqs = [r for r in reqs if "flash-attn" not in r]
|
||||||
reqs = [r for r in reqs if r and r[0] != "#"]
|
reqs = [r for r in reqs if r and r[0] != "#"]
|
||||||
for r in reqs:
|
for r in reqs:
|
||||||
install_requires.append(r)
|
install_requires.append(r)
|
||||||
@@ -25,9 +26,14 @@ setup(
|
|||||||
"gptq_triton": [
|
"gptq_triton": [
|
||||||
"alpaca_lora_4bit[triton] @ git+https://github.com/winglian/alpaca_lora_4bit.git@setup_pip",
|
"alpaca_lora_4bit[triton] @ git+https://github.com/winglian/alpaca_lora_4bit.git@setup_pip",
|
||||||
],
|
],
|
||||||
|
"flash-attn": [
|
||||||
|
"flash-attn==2.0.8",
|
||||||
|
],
|
||||||
"extras": [
|
"extras": [
|
||||||
"flash-attn",
|
|
||||||
"deepspeed",
|
"deepspeed",
|
||||||
],
|
],
|
||||||
|
"peft": [
|
||||||
|
"peft @ git+https://github.com/huggingface/peft.git",
|
||||||
|
],
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -1,16 +1,42 @@
|
|||||||
"""Logging configuration settings"""
|
"""
|
||||||
|
Common logging module for axolotl
|
||||||
|
"""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
from logging import Formatter
|
||||||
from logging.config import dictConfig
|
from logging.config import dictConfig
|
||||||
from typing import Any, Dict
|
from typing import Any, Dict
|
||||||
|
|
||||||
|
from colorama import Fore, Style, init
|
||||||
|
|
||||||
|
|
||||||
|
class ColorfulFormatter(Formatter):
    """
    Formatter to add coloring to log messages by log type
    """

    # ANSI prefix applied per level name; levels not listed get no prefix.
    COLORS = {
        "WARNING": Fore.YELLOW,
        "ERROR": Fore.RED,
        "CRITICAL": Fore.RED + Style.BRIGHT,
    }

    def format(self, record):
        """Format `record` with the base formatter and wrap it in the level's color codes."""
        log_message = super().format(record)
        # Style.RESET_ALL (rather than Fore.RESET) is required here: Fore.RESET
        # only restores the foreground color, so the Style.BRIGHT attribute used
        # for CRITICAL would otherwise leak into everything printed afterwards.
        return self.COLORS.get(record.levelname, "") + log_message + Style.RESET_ALL
|
||||||
|
|
||||||
|
|
||||||
DEFAULT_LOGGING_CONFIG: Dict[str, Any] = {
|
DEFAULT_LOGGING_CONFIG: Dict[str, Any] = {
|
||||||
"version": 1,
|
"version": 1,
|
||||||
"formatters": {
|
"formatters": {
|
||||||
"simple": {
|
"simple": {
|
||||||
"format": "[%(asctime)s] [%(levelname)s] [%(name)s.%(funcName)s:%(lineno)d] [PID:%(process)d] %(message)s",
|
"format": "[%(asctime)s] [%(levelname)s] [%(name)s.%(funcName)s:%(lineno)d] [PID:%(process)d] %(message)s",
|
||||||
},
|
},
|
||||||
|
"colorful": {
|
||||||
|
"()": ColorfulFormatter,
|
||||||
|
"format": "[%(asctime)s] [%(levelname)s] [%(name)s.%(funcName)s:%(lineno)d] [PID:%(process)d] %(message)s",
|
||||||
|
},
|
||||||
},
|
},
|
||||||
"filters": {},
|
"filters": {},
|
||||||
"handlers": {
|
"handlers": {
|
||||||
@@ -20,14 +46,25 @@ DEFAULT_LOGGING_CONFIG: Dict[str, Any] = {
|
|||||||
"filters": [],
|
"filters": [],
|
||||||
"stream": sys.stdout,
|
"stream": sys.stdout,
|
||||||
},
|
},
|
||||||
|
"color_console": {
|
||||||
|
"class": "logging.StreamHandler",
|
||||||
|
"formatter": "colorful",
|
||||||
|
"filters": [],
|
||||||
|
"stream": sys.stdout,
|
||||||
|
},
|
||||||
},
|
},
|
||||||
"root": {"handlers": ["console"], "level": os.getenv("LOG_LEVEL", "INFO")},
|
"root": {"handlers": ["console"], "level": os.getenv("LOG_LEVEL", "INFO")},
|
||||||
"loggers": {
|
"loggers": {
|
||||||
"axolotl": {"handlers": ["console"], "level": "DEBUG", "propagate": False},
|
"axolotl": {
|
||||||
|
"handlers": ["color_console"],
|
||||||
|
"level": "DEBUG",
|
||||||
|
"propagate": False,
|
||||||
|
},
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def configure_logging():
|
def configure_logging():
|
||||||
"""Configure with default logging"""
|
"""Configure with default logging"""
|
||||||
|
init() # Initialize colorama
|
||||||
dictConfig(DEFAULT_LOGGING_CONFIG)
|
dictConfig(DEFAULT_LOGGING_CONFIG)
|
||||||
|
|||||||
@@ -2,142 +2,47 @@
|
|||||||
|
|
||||||
# copied from https://github.com/lm-sys/FastChat/blob/main/fastchat/train/llama_flash_attn_monkey_patch.py
|
# copied from https://github.com/lm-sys/FastChat/blob/main/fastchat/train/llama_flash_attn_monkey_patch.py
|
||||||
|
|
||||||
from typing import Optional, Tuple
|
import warnings
|
||||||
|
from typing import List, Optional, Tuple, Union
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
import torch.nn.functional as F
|
||||||
import transformers
|
import transformers
|
||||||
from einops import rearrange
|
from einops import rearrange
|
||||||
from flash_attn.bert_padding import pad_input, unpad_input
|
from flash_attn.bert_padding import pad_input, unpad_input
|
||||||
|
from transformers.modeling_outputs import BaseModelOutputWithPast
|
||||||
|
from transformers.models.llama.modeling_llama import (
|
||||||
|
LlamaDecoderLayer as OriginalLlamaDecoderLayer,
|
||||||
|
)
|
||||||
|
from transformers.models.llama.modeling_llama import apply_rotary_pos_emb, repeat_kv
|
||||||
|
|
||||||
|
from axolotl.monkeypatch.utils import get_cu_seqlens_from_pos_ids
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func
|
from flash_attn.flash_attn_interface import ( # pylint: disable=ungrouped-imports
|
||||||
|
flash_attn_kvpacked_func,
|
||||||
|
flash_attn_varlen_kvpacked_func,
|
||||||
|
flash_attn_varlen_qkvpacked_func,
|
||||||
|
)
|
||||||
except ImportError:
|
except ImportError:
|
||||||
|
from flash_attn.flash_attn_interface import (
|
||||||
|
flash_attn_unpadded_kvpacked_func as flash_attn_varlen_kvpacked_func,
|
||||||
|
)
|
||||||
from flash_attn.flash_attn_interface import (
|
from flash_attn.flash_attn_interface import (
|
||||||
flash_attn_unpadded_qkvpacked_func as flash_attn_varlen_qkvpacked_func,
|
flash_attn_unpadded_qkvpacked_func as flash_attn_varlen_qkvpacked_func,
|
||||||
)
|
)
|
||||||
|
|
||||||
from transformers.models.llama.modeling_llama import apply_rotary_pos_emb
|
|
||||||
|
|
||||||
from axolotl.monkeypatch.utils import get_cu_seqlens_from_pos_ids
|
def replace_llama_attn_with_flash_attn(packed: Optional[bool] = False):
|
||||||
|
transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = ( # pylint: disable=protected-access
|
||||||
|
_prepare_decoder_attention_mask
|
||||||
def forward(
|
|
||||||
self,
|
|
||||||
hidden_states: torch.Tensor,
|
|
||||||
attention_mask: Optional[torch.Tensor] = None,
|
|
||||||
position_ids: Optional[torch.Tensor] = None,
|
|
||||||
past_key_value: Optional[Tuple[torch.Tensor]] = None,
|
|
||||||
output_attentions: bool = False,
|
|
||||||
use_cache: bool = False,
|
|
||||||
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
|
|
||||||
"""Input shape: Batch x Time x Channel
|
|
||||||
|
|
||||||
attention_mask: [bsz, q_len]
|
|
||||||
"""
|
|
||||||
# pylint: disable=duplicate-code
|
|
||||||
bsz, q_len, _ = hidden_states.size()
|
|
||||||
|
|
||||||
query_states = (
|
|
||||||
self.q_proj(hidden_states)
|
|
||||||
.view(bsz, q_len, self.num_heads, self.head_dim)
|
|
||||||
.transpose(1, 2)
|
|
||||||
)
|
)
|
||||||
key_states = (
|
transformers.models.llama.modeling_llama.LlamaAttention.forward = flashattn_forward
|
||||||
self.k_proj(hidden_states)
|
if packed:
|
||||||
.view(bsz, q_len, self.num_heads, self.head_dim)
|
transformers.models.llama.modeling_llama.LlamaDecoderLayer = LlamaDecoderLayer
|
||||||
.transpose(1, 2)
|
transformers.models.llama.modeling_llama.LlamaModel.forward = (
|
||||||
)
|
llama_model_forward
|
||||||
value_states = (
|
|
||||||
self.v_proj(hidden_states)
|
|
||||||
.view(bsz, q_len, self.num_heads, self.head_dim)
|
|
||||||
.transpose(1, 2)
|
|
||||||
)
|
|
||||||
# [bsz, q_len, nh, hd]
|
|
||||||
# [bsz, nh, q_len, hd]
|
|
||||||
|
|
||||||
kv_seq_len = key_states.shape[-2]
|
|
||||||
assert past_key_value is None, "past_key_value is not supported"
|
|
||||||
|
|
||||||
cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
|
|
||||||
query_states, key_states = apply_rotary_pos_emb(
|
|
||||||
query_states, key_states, cos, sin, position_ids
|
|
||||||
)
|
|
||||||
# [bsz, nh, t, hd]
|
|
||||||
assert not output_attentions, "output_attentions is not supported"
|
|
||||||
assert not use_cache, "use_cache is not supported"
|
|
||||||
|
|
||||||
# Flash attention codes from
|
|
||||||
# https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/flash_attention.py
|
|
||||||
|
|
||||||
# transform the data into the format required by flash attention
|
|
||||||
qkv = torch.stack(
|
|
||||||
[query_states, key_states, value_states], dim=2
|
|
||||||
) # [bsz, nh, 3, q_len, hd]
|
|
||||||
qkv = qkv.transpose(1, 3) # [bsz, q_len, 3, nh, hd]
|
|
||||||
# We have disabled _prepare_decoder_attention_mask in LlamaModel
|
|
||||||
# the attention_mask should be the same as the key_padding_mask
|
|
||||||
key_padding_mask = attention_mask
|
|
||||||
|
|
||||||
if key_padding_mask is None:
|
|
||||||
qkv = rearrange(qkv, "b s ... -> (b s) ...")
|
|
||||||
max_s = q_len
|
|
||||||
cu_q_lens = torch.arange(
|
|
||||||
0,
|
|
||||||
(bsz + 1) * q_len,
|
|
||||||
step=q_len,
|
|
||||||
dtype=torch.int32,
|
|
||||||
device=qkv.device,
|
|
||||||
)
|
)
|
||||||
output = flash_attn_varlen_qkvpacked_func(
|
|
||||||
qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True
|
|
||||||
)
|
|
||||||
output = rearrange(output, "(b s) ... -> b s ...", b=bsz)
|
|
||||||
elif attention_mask.shape[0] == 1:
|
|
||||||
# special handling using sample packing
|
|
||||||
qkv = rearrange(qkv, "b s ... -> (b s) ...")
|
|
||||||
cu_q_lens, max_s = get_cu_seqlens_from_pos_ids(position_ids)
|
|
||||||
cu_q_lens = cu_q_lens.squeeze()
|
|
||||||
|
|
||||||
output = flash_attn_varlen_qkvpacked_func(
|
|
||||||
qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True
|
|
||||||
)
|
|
||||||
output = rearrange(output, "(b s) ... -> b s ...", b=bsz)
|
|
||||||
else:
|
|
||||||
nheads = qkv.shape[-2]
|
|
||||||
|
|
||||||
# pylint: disable=invalid-name
|
|
||||||
x = rearrange(qkv, "b s three h d -> b s (three h d)")
|
|
||||||
x_unpad, indices, cu_q_lens, max_s = unpad_input(x, key_padding_mask)
|
|
||||||
x_unpad = rearrange(
|
|
||||||
x_unpad,
|
|
||||||
"nnz (three h d) -> nnz three h d",
|
|
||||||
three=3,
|
|
||||||
h=nheads,
|
|
||||||
)
|
|
||||||
output_unpad = flash_attn_varlen_qkvpacked_func(
|
|
||||||
x_unpad,
|
|
||||||
cu_q_lens,
|
|
||||||
max_s,
|
|
||||||
0.0,
|
|
||||||
softmax_scale=None,
|
|
||||||
causal=True,
|
|
||||||
)
|
|
||||||
output = rearrange(
|
|
||||||
pad_input(
|
|
||||||
rearrange(output_unpad, "nnz h d -> nnz (h d)"),
|
|
||||||
indices,
|
|
||||||
bsz,
|
|
||||||
q_len,
|
|
||||||
),
|
|
||||||
"b s (h d) -> b s h d",
|
|
||||||
h=nheads,
|
|
||||||
)
|
|
||||||
|
|
||||||
return (
|
|
||||||
self.o_proj(rearrange(output, "b s h d -> b s (h d)")),
|
|
||||||
None,
|
|
||||||
None,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
# Disable the transformation of the attention mask in LlamaModel as the flash attention
|
# Disable the transformation of the attention mask in LlamaModel as the flash attention
|
||||||
@@ -153,8 +58,541 @@ def _prepare_decoder_attention_mask(
|
|||||||
return attention_mask
|
return attention_mask
|
||||||
|
|
||||||
|
|
||||||
def replace_llama_attn_with_flash_attn():
|
def flashattn_forward(
|
||||||
transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = ( # pylint: disable=protected-access
|
self,
|
||||||
_prepare_decoder_attention_mask
|
hidden_states: torch.Tensor,
|
||||||
|
attention_mask: Optional[torch.Tensor] = None,
|
||||||
|
position_ids: Optional[torch.Tensor] = None,
|
||||||
|
past_key_value: Optional[Tuple[torch.Tensor]] = None,
|
||||||
|
output_attentions: bool = False,
|
||||||
|
use_cache: bool = False,
|
||||||
|
cu_seqlens: Optional[torch.Tensor] = None,
|
||||||
|
max_seqlen: Optional[torch.Tensor] = None,
|
||||||
|
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
|
||||||
|
"""Input shape: Batch x Time x Channel
|
||||||
|
|
||||||
|
attention_mask: [bsz, q_len]
|
||||||
|
"""
|
||||||
|
# pylint: disable=duplicate-code
|
||||||
|
bsz, q_len, _ = hidden_states.size()
|
||||||
|
|
||||||
|
if not hasattr(self, "pretraining_tp"):
|
||||||
|
self.pretraining_tp = 1
|
||||||
|
|
||||||
|
if self.pretraining_tp > 1:
|
||||||
|
key_value_slicing = (
|
||||||
|
self.num_key_value_heads * self.head_dim
|
||||||
|
) // self.pretraining_tp
|
||||||
|
query_slices = self.q_proj.weight.split(
|
||||||
|
(self.num_heads * self.head_dim) // self.pretraining_tp, dim=0
|
||||||
|
)
|
||||||
|
key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
|
||||||
|
value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)
|
||||||
|
|
||||||
|
query_states = [
|
||||||
|
F.linear(hidden_states, query_slices[i]) for i in range(self.pretraining_tp)
|
||||||
|
]
|
||||||
|
query_states = torch.cat(query_states, dim=-1)
|
||||||
|
|
||||||
|
key_states = [
|
||||||
|
F.linear(hidden_states, key_slices[i]) for i in range(self.pretraining_tp)
|
||||||
|
]
|
||||||
|
key_states = torch.cat(key_states, dim=-1)
|
||||||
|
|
||||||
|
value_states = [
|
||||||
|
F.linear(hidden_states, value_slices[i]) for i in range(self.pretraining_tp)
|
||||||
|
]
|
||||||
|
value_states = torch.cat(value_states, dim=-1)
|
||||||
|
|
||||||
|
else:
|
||||||
|
query_states = self.q_proj(hidden_states)
|
||||||
|
key_states = self.k_proj(hidden_states)
|
||||||
|
value_states = self.v_proj(hidden_states)
|
||||||
|
|
||||||
|
query_states = query_states.view(
|
||||||
|
bsz, q_len, self.num_heads, self.head_dim
|
||||||
|
).transpose(1, 2)
|
||||||
|
key_states = key_states.view(
|
||||||
|
bsz, q_len, self.num_key_value_heads, self.head_dim
|
||||||
|
).transpose(1, 2)
|
||||||
|
value_states = value_states.view(
|
||||||
|
bsz, q_len, self.num_key_value_heads, self.head_dim
|
||||||
|
).transpose(1, 2)
|
||||||
|
# [bsz, q_len, nh, hd]
|
||||||
|
# [bsz, nh, q_len, hd]
|
||||||
|
|
||||||
|
kv_seq_len = key_states.shape[-2]
|
||||||
|
if past_key_value is not None:
|
||||||
|
kv_seq_len += past_key_value[0].shape[-2]
|
||||||
|
|
||||||
|
cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
|
||||||
|
query_states, key_states = apply_rotary_pos_emb(
|
||||||
|
query_states, key_states, cos, sin, position_ids
|
||||||
)
|
)
|
||||||
transformers.models.llama.modeling_llama.LlamaAttention.forward = forward
|
# [bsz, nh, t, hd]
|
||||||
|
|
||||||
|
if past_key_value is not None:
|
||||||
|
# reuse k, v, self_attention
|
||||||
|
key_states = torch.cat([past_key_value[0], key_states], dim=2)
|
||||||
|
value_states = torch.cat([past_key_value[1], value_states], dim=2)
|
||||||
|
|
||||||
|
past_key_value = (key_states, value_states) if use_cache else None
|
||||||
|
|
||||||
|
# repeat k/v heads if n_kv_heads < n_heads
|
||||||
|
key_states = repeat_kv(key_states, self.num_key_value_groups)
|
||||||
|
value_states = repeat_kv(value_states, self.num_key_value_groups)
|
||||||
|
|
||||||
|
if output_attentions:
|
||||||
|
warnings.warn(
|
||||||
|
"Output attentions is not supported for patched `LlamaAttention`, returning `None` instead."
|
||||||
|
)
|
||||||
|
|
||||||
|
#
|
||||||
|
# flash-attn v2 start
|
||||||
|
#
|
||||||
|
|
||||||
|
if self.training:
|
||||||
|
# during training q,k,v always have same seqlen
|
||||||
|
assert key_states.shape == query_states.shape
|
||||||
|
is_causal = True
|
||||||
|
else:
|
||||||
|
# turn off FA causal mask after first inference autoregressive iteration
|
||||||
|
# only on first autoregressive step q,k,v have same seqlen
|
||||||
|
is_causal = key_states.shape == query_states.shape
|
||||||
|
|
||||||
|
if cu_seqlens is not None and max_seqlen is not None:
|
||||||
|
# special handling using sample packing
|
||||||
|
qkv = torch.stack(
|
||||||
|
[query_states, key_states, value_states], dim=2
|
||||||
|
) # [bsz, nh, 3, q_len, hd]
|
||||||
|
qkv = qkv.transpose(1, 3) # [bsz, q_len, 3, nh, hd]
|
||||||
|
qkv = rearrange(qkv, "b s ... -> (b s) ...")
|
||||||
|
|
||||||
|
output = flash_attn_varlen_qkvpacked_func(
|
||||||
|
qkv, cu_seqlens, max_seqlen, 0.0, softmax_scale=None, causal=True
|
||||||
|
)
|
||||||
|
output = rearrange(output, "(b s) ... -> b s ...", b=bsz)
|
||||||
|
elif query_states.shape == key_states.shape:
|
||||||
|
query_states = query_states.transpose(1, 2)
|
||||||
|
key_states = key_states.transpose(1, 2)
|
||||||
|
value_states = value_states.transpose(1, 2)
|
||||||
|
qkv_unpad, cu_seqlens_q, max_seqlen_q, _, output_pad_fn = generate_qkv(
|
||||||
|
query_states,
|
||||||
|
key_states,
|
||||||
|
value_states,
|
||||||
|
qkvpacked=True,
|
||||||
|
# We have disabled _prepare_decoder_attention_mask in LlamaModel
|
||||||
|
# the attention_mask should be the same as the key_padding_mask
|
||||||
|
key_padding_mask=attention_mask,
|
||||||
|
query_padding_mask=attention_mask[:, -query_states.size(1) :]
|
||||||
|
if attention_mask is not None
|
||||||
|
else None,
|
||||||
|
)
|
||||||
|
output_unpad = flash_attn_varlen_qkvpacked_func(
|
||||||
|
qkv_unpad,
|
||||||
|
cu_seqlens_q,
|
||||||
|
max_seqlen_q,
|
||||||
|
0.0,
|
||||||
|
softmax_scale=None,
|
||||||
|
causal=is_causal,
|
||||||
|
)
|
||||||
|
output = output_pad_fn(output_unpad)
|
||||||
|
else:
|
||||||
|
query_states = query_states.transpose(1, 2)
|
||||||
|
key_states = key_states.transpose(1, 2)
|
||||||
|
value_states = value_states.transpose(1, 2)
|
||||||
|
if attention_mask is None or attention_mask.all().item():
|
||||||
|
output = flash_attn_kvpacked_func(
|
||||||
|
query_states,
|
||||||
|
torch.stack([key_states, value_states], 2),
|
||||||
|
causal=is_causal,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
( # pylint: disable=unbalanced-tuple-unpacking
|
||||||
|
q_unpad,
|
||||||
|
kv_unpad,
|
||||||
|
cu_seqlens_q,
|
||||||
|
cu_seqlens_k,
|
||||||
|
max_seqlen_q,
|
||||||
|
max_seqlen_k,
|
||||||
|
_,
|
||||||
|
_,
|
||||||
|
output_pad_fn,
|
||||||
|
) = generate_qkv(
|
||||||
|
query_states,
|
||||||
|
key_states,
|
||||||
|
value_states,
|
||||||
|
kvpacked=True,
|
||||||
|
key_padding_mask=attention_mask,
|
||||||
|
query_padding_mask=attention_mask[:, -query_states.size(1) :]
|
||||||
|
if attention_mask is not None
|
||||||
|
else None,
|
||||||
|
)
|
||||||
|
output_unpad = flash_attn_varlen_kvpacked_func(
|
||||||
|
q_unpad,
|
||||||
|
kv_unpad,
|
||||||
|
cu_seqlens_q,
|
||||||
|
cu_seqlens_k,
|
||||||
|
max_seqlen_q,
|
||||||
|
max_seqlen_k,
|
||||||
|
0.0,
|
||||||
|
softmax_scale=None,
|
||||||
|
causal=is_causal,
|
||||||
|
)
|
||||||
|
output = output_pad_fn(output_unpad)
|
||||||
|
|
||||||
|
attn_output = output
|
||||||
|
if attn_output.size() != (bsz, q_len, self.num_heads, self.head_dim):
|
||||||
|
raise ValueError(
|
||||||
|
f"`attn_output` should be of size {(bsz, q_len, self.num_heads, self.head_dim)}, but is"
|
||||||
|
f" {attn_output.size()}"
|
||||||
|
)
|
||||||
|
attn_output = rearrange(attn_output, "b s h d -> b s (h d)")
|
||||||
|
|
||||||
|
#
|
||||||
|
# flash-attn v2 end
|
||||||
|
#
|
||||||
|
|
||||||
|
if self.pretraining_tp > 1:
|
||||||
|
attn_output = attn_output.split(self.hidden_size // self.pretraining_tp, dim=2)
|
||||||
|
o_proj_slices = self.o_proj.weight.split(
|
||||||
|
self.hidden_size // self.pretraining_tp, dim=1
|
||||||
|
)
|
||||||
|
attn_output = sum(
|
||||||
|
F.linear(attn_output[i], o_proj_slices[i])
|
||||||
|
for i in range(self.pretraining_tp)
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
attn_output = self.o_proj(attn_output)
|
||||||
|
|
||||||
|
return attn_output, None, past_key_value
|
||||||
|
|
||||||
|
|
||||||
|
# based on https://github.com/Dao-AILab/flash-attention/blob/364a5b/tests/test_flash_attn.py#L38
|
||||||
|
def generate_qkv(
    q,
    k,
    v,
    query_padding_mask=None,
    key_padding_mask=None,
    kvpacked=False,
    qkvpacked=False,
):  # pylint: disable=invalid-name
    """
    Flatten (and, when masks are given, unpad) q/k/v for the varlen attention kernels.

    Arguments:
        q: (batch_size, seqlen_q, nheads, d)
        k: (batch_size, seqlen_k, nheads_k, d)
        v: (batch_size, seqlen_k, nheads_k, d)
        query_padding_mask: (batch_size, seqlen), bool
        key_padding_mask: (batch_size, seqlen), bool
        kvpacked: stack k and v together in the returned tensors
        qkvpacked: stack q, k and v together in the returned tensors
            (mutually exclusive with `kvpacked`)

    Returns:
        A tuple whose layout depends on the packing mode:
          * qkvpacked: (qkv_unpad, cu_seqlens_q, max_seqlen_q, qkv, output_pad_fn)
          * kvpacked:  (q_unpad, kv_unpad, cu_seqlens_q, cu_seqlens_k,
                        max_seqlen_q, max_seqlen_k, q, kv, output_pad_fn)
          * otherwise: (q_unpad, k_unpad, v_unpad, cu_seqlens_q, cu_seqlens_k,
                        max_seqlen_q, max_seqlen_k, q, k, v, output_pad_fn)
        `output_pad_fn` maps the flattened attention output back to
        (batch, seqlen_q, heads, dim).
    """
    assert not kvpacked or not qkvpacked
    batch_size, seqlen_q, nheads, d = q.shape
    _, seqlen_k, nheads_k, _ = k.shape
    expected_kv_shape = (batch_size, seqlen_k, nheads_k, d)
    assert k.shape == expected_kv_shape
    assert v.shape == expected_kv_shape

    # --- queries -----------------------------------------------------------
    if query_padding_mask is None:
        # no padding: every row contributes exactly seqlen_q tokens
        q_unpad = rearrange(q, "b s h d -> (b s) h d")
        max_seqlen_q = seqlen_q
        cu_seqlens_q = torch.arange(
            0,
            (batch_size + 1) * seqlen_q,
            step=seqlen_q,
            dtype=torch.int32,
            device=q_unpad.device,
        )

        def output_pad_fn(output_unpad):
            return rearrange(output_unpad, "(b s) h d -> b s h d", b=batch_size)

    else:
        q_unpad, indices_q, cu_seqlens_q, max_seqlen_q = unpad_input(
            q, query_padding_mask
        )

        def output_pad_fn(output_unpad):
            return pad_input(output_unpad, indices_q, batch_size, seqlen_q)

    # --- keys / values -----------------------------------------------------
    if key_padding_mask is None:
        k_unpad = rearrange(k, "b s h d -> (b s) h d")
        v_unpad = rearrange(v, "b s h d -> (b s) h d")
        max_seqlen_k = seqlen_k
        cu_seqlens_k = torch.arange(
            0,
            (batch_size + 1) * seqlen_k,
            step=seqlen_k,
            dtype=torch.int32,
            device=k_unpad.device,
        )
    else:
        k_unpad, _, cu_seqlens_k, max_seqlen_k = unpad_input(k, key_padding_mask)
        v_unpad, _, _, _ = unpad_input(v, key_padding_mask)

    # --- pack according to the requested layout ----------------------------
    if qkvpacked:
        assert nheads == nheads_k
        packed_unpad = torch.stack([q_unpad, k_unpad, v_unpad], dim=1)
        packed = torch.stack([q, k, v], dim=2)
        return (packed_unpad, cu_seqlens_q, max_seqlen_q, packed, output_pad_fn)

    if kvpacked:
        kv_unpad = torch.stack([k_unpad, v_unpad], dim=1)
        kv = torch.stack([k, v], dim=2)
        return (
            q_unpad,
            kv_unpad,
            cu_seqlens_q,
            cu_seqlens_k,
            max_seqlen_q,
            max_seqlen_k,
            q,
            kv,
            output_pad_fn,
        )

    return (
        q_unpad,
        k_unpad,
        v_unpad,
        cu_seqlens_q,
        cu_seqlens_k,
        max_seqlen_q,
        max_seqlen_k,
        q,
        k,
        v,
        output_pad_fn,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def llama_model_forward(
    self,
    input_ids: torch.LongTensor = None,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_values: Optional[List[torch.FloatTensor]] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPast]:
    """
    Model-level forward that threads `cu_seqlens` / `max_seqlen` into each decoder layer.

    When `position_ids` is supplied, the sequence boundaries are derived from it
    via `get_cu_seqlens_from_pos_ids` and handed to every layer. When it is not,
    default positions are generated and both values stay `None`.

    Flags left as `None` (`output_attentions`, `output_hidden_states`,
    `use_cache`, `return_dict`) fall back to the corresponding `self.config` value.

    Returns:
        A `BaseModelOutputWithPast`, or (when `return_dict` is falsy) a tuple of
        the non-`None` values among hidden states, cache, all hidden states and
        attentions.

    Raises:
        ValueError: if both or neither of `input_ids` / `inputs_embeds` are given.
    """
    # resolve unset flags from the model config
    if output_attentions is None:
        output_attentions = self.config.output_attentions
    if output_hidden_states is None:
        output_hidden_states = self.config.output_hidden_states
    if use_cache is None:
        use_cache = self.config.use_cache
    if return_dict is None:
        return_dict = self.config.use_return_dict

    # retrieve input_ids and inputs_embeds
    have_ids = input_ids is not None
    have_embeds = inputs_embeds is not None
    if have_ids and have_embeds:
        raise ValueError(
            "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time"
        )
    if not have_ids and not have_embeds:
        raise ValueError(
            "You have to specify either decoder_input_ids or decoder_inputs_embeds"
        )
    if have_ids:
        batch_size, seq_length = input_ids.shape
    else:
        batch_size, seq_length, _ = inputs_embeds.shape

    past_key_values_length = (
        0 if past_key_values is None else past_key_values[0][0].shape[2]
    )
    seq_length_with_past = seq_length + past_key_values_length

    cu_seqlens = None
    max_seqlen = None
    if position_ids is not None:
        # caller-provided positions: derive the sequence boundaries from them
        position_ids = position_ids.view(-1, seq_length).long()
        cu_seqlens, max_seqlen = get_cu_seqlens_from_pos_ids(position_ids)
        cu_seqlens = cu_seqlens.squeeze()
    else:
        device = input_ids.device if have_ids else inputs_embeds.device
        default_positions = torch.arange(
            past_key_values_length,
            seq_length + past_key_values_length,
            dtype=torch.long,
            device=device,
        )
        position_ids = default_positions.unsqueeze(0).view(-1, seq_length)

    if inputs_embeds is None:
        inputs_embeds = self.embed_tokens(input_ids)

    # embed positions
    if attention_mask is None:
        attention_mask = torch.ones(
            (batch_size, seq_length_with_past),
            dtype=torch.bool,
            device=inputs_embeds.device,
        )
    attention_mask = self._prepare_decoder_attention_mask(  # pylint: disable=protected-access
        attention_mask,
        (batch_size, seq_length),
        inputs_embeds,
        past_key_values_length,
    )

    hidden_states = inputs_embeds

    checkpointing = self.gradient_checkpointing and self.training
    if checkpointing and use_cache:
        transformers.logger.warning_once(
            "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
        )
        use_cache = False

    def create_custom_forward(module):
        def custom_forward(*inputs):
            # None for past_key_value
            return module(*inputs)

        return custom_forward

    # decoder layers
    all_hidden_states = () if output_hidden_states else None
    all_self_attns = () if output_attentions else None
    next_decoder_cache = () if use_cache else None

    for layer_idx, decoder_layer in enumerate(self.layers):
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        past_key_value = (
            None if past_key_values is None else past_key_values[layer_idx]
        )

        if checkpointing:
            # positional order follows the decoder layer's forward signature:
            # the two literal Nones fill past_key_value and use_cache
            layer_outputs = torch.utils.checkpoint.checkpoint(
                create_custom_forward(decoder_layer),
                hidden_states,
                attention_mask,
                position_ids,
                None,
                output_attentions,
                None,
                cu_seqlens,
                max_seqlen,
            )
        else:
            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cu_seqlens=cu_seqlens,
                max_seqlen=max_seqlen,
            )

        hidden_states = layer_outputs[0]

        if use_cache:
            # the cache entry sits after the attention weights when those are returned
            cache_slot = 2 if output_attentions else 1
            next_decoder_cache = next_decoder_cache + (layer_outputs[cache_slot],)

        if output_attentions:
            all_self_attns = all_self_attns + (layer_outputs[1],)

    hidden_states = self.norm(hidden_states)

    # add hidden states from the last decoder layer
    if output_hidden_states:
        all_hidden_states = all_hidden_states + (hidden_states,)

    next_cache = next_decoder_cache if use_cache else None
    if return_dict:
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )

    candidates = [hidden_states, next_cache, all_hidden_states, all_self_attns]
    return tuple(item for item in candidates if item is not None)
|
||||||
|
|
||||||
|
|
||||||
|
class LlamaDecoderLayer(OriginalLlamaDecoderLayer):
    """
    patched version of LlamaDecoderLayer to pass through the precalculated cu_seqlens
    """

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cu_seqlens: Optional[torch.Tensor] = None,
        max_seqlen: Optional[torch.Tensor] = None,
    ) -> Tuple[
        torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]
    ]:
        """
        Run one decoder block: self-attention then MLP, each with a residual add.

        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            position_ids (`torch.LongTensor`, *optional*): forwarded unchanged to the attention module
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to append the attention module's weights output to the result.
            use_cache (`bool`, *optional*):
                If set to `True`, the attention module's present key/value is appended to the result.
            cu_seqlens (`torch.Tensor`, *optional*): cumulative sequence len when packing
            max_seqlen (`torch.Tensor`, *optional*): forwarded to the attention module together with `cu_seqlens`

        Returns:
            Tuple starting with the layer output, followed by the attention
            weights (if `output_attentions`) and the present key/value
            (if `use_cache`), in that order.
        """
        # Self Attention (pre-norm, residual around the attention module)
        attn_input = self.input_layernorm(hidden_states)
        attn_result, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=attn_input,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
        )
        after_attention = hidden_states + attn_result

        # Fully Connected (pre-norm, residual around the MLP)
        mlp_result = self.mlp(self.post_attention_layernorm(after_attention))
        layer_output = after_attention + mlp_result

        collected = [layer_output]
        if output_attentions:
            collected.append(self_attn_weights)
        if use_cache:
            collected.append(present_key_value)

        return tuple(collected)
|
||||||
|
|||||||
140
src/axolotl/monkeypatch/llama_attn_hijack_sdp.py
Normal file
140
src/axolotl/monkeypatch/llama_attn_hijack_sdp.py
Normal file
@@ -0,0 +1,140 @@
|
|||||||
|
"""
|
||||||
|
Patched LlamaAttention to use torch.nn.functional.scaled_dot_product_attention
|
||||||
|
"""
|
||||||
|
|
||||||
|
import warnings
|
||||||
|
from typing import Optional, Tuple
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import torch.nn.functional as F
|
||||||
|
import transformers.models.llama.modeling_llama
|
||||||
|
from transformers.models.llama.modeling_llama import apply_rotary_pos_emb, repeat_kv
|
||||||
|
|
||||||
|
|
||||||
|
def hijack_llama_sdp_attention():
    """Replace `LlamaAttention.forward` in transformers with `sdp_attention_forward`."""
    llama_attention_cls = transformers.models.llama.modeling_llama.LlamaAttention
    llama_attention_cls.forward = sdp_attention_forward
|
||||||
|
|
||||||
|
|
||||||
|
def sdp_attention_forward(
    self,
    hidden_states: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_value: Optional[Tuple[torch.Tensor]] = None,
    output_attentions: bool = False,
    use_cache: bool = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
    """
    Attention forward built on `torch.nn.functional.scaled_dot_product_attention`.

    Projects q/k/v (sharding the projection weights when `pretraining_tp > 1`),
    applies rotary embeddings, appends any cached keys/values, repeats the k/v
    heads to match the query heads, and runs SDP attention with the supplied
    `attention_mask`.

    Returns:
        `(attn_output, None, past_key_value)`. Attention weights are never
        returned; requesting them only emits a warning. `past_key_value` is the
        (key, value) pair when `use_cache` is set, otherwise `None`.

    Raises:
        ValueError: if the attention output does not have shape
            `(bsz, num_heads, q_len, head_dim)`.
    """
    # pylint: disable=duplicate-code
    bsz, q_len, _ = hidden_states.size()

    # modules that lack the attribute are treated as unsharded
    if not hasattr(self, "pretraining_tp"):
        self.pretraining_tp = 1

    if self.pretraining_tp > 1:
        tp_degree = self.pretraining_tp
        kv_shard_size = (self.num_key_value_heads * self.head_dim) // tp_degree
        q_shard_size = (self.num_heads * self.head_dim) // tp_degree

        query_slices = self.q_proj.weight.split(q_shard_size, dim=0)
        key_slices = self.k_proj.weight.split(kv_shard_size, dim=0)
        value_slices = self.v_proj.weight.split(kv_shard_size, dim=0)

        # project shard by shard, then stitch the shards back together
        query_states = torch.cat(
            [F.linear(hidden_states, query_slices[i]) for i in range(tp_degree)],
            dim=-1,
        )
        key_states = torch.cat(
            [F.linear(hidden_states, key_slices[i]) for i in range(tp_degree)],
            dim=-1,
        )
        value_states = torch.cat(
            [F.linear(hidden_states, value_slices[i]) for i in range(tp_degree)],
            dim=-1,
        )
    else:
        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

    # [bsz, q_len, nh, hd] -> [bsz, nh, q_len, hd]
    query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim)
    query_states = query_states.transpose(1, 2)
    key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim)
    key_states = key_states.transpose(1, 2)
    value_states = value_states.view(
        bsz, q_len, self.num_key_value_heads, self.head_dim
    )
    value_states = value_states.transpose(1, 2)

    has_past = past_key_value is not None

    kv_seq_len = key_states.shape[-2]
    if has_past:
        kv_seq_len = kv_seq_len + past_key_value[0].shape[-2]

    cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
    query_states, key_states = apply_rotary_pos_emb(
        query_states, key_states, cos, sin, position_ids
    )
    # [bsz, nh, t, hd]

    if has_past:
        # reuse k, v, self_attention
        cached_keys, cached_values = past_key_value[0], past_key_value[1]
        key_states = torch.cat([cached_keys, key_states], dim=2)
        value_states = torch.cat([cached_values, value_states], dim=2)

    if use_cache:
        past_key_value = (key_states, value_states)
    else:
        past_key_value = None

    # repeat k/v heads if n_kv_heads < n_heads
    key_states = repeat_kv(key_states, self.num_key_value_groups)
    value_states = repeat_kv(value_states, self.num_key_value_groups)

    if output_attentions:
        warnings.warn(
            "Output attentions is not supported for patched `LlamaAttention`, returning `None` instead."
        )

    #
    # sdp-attn start
    #

    with torch.backends.cuda.sdp_kernel():
        attn_output = F.scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            attn_mask=attention_mask,
            is_causal=False,
        )

    if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
        raise ValueError(
            f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
            f" {attn_output.size()}"
        )

    # [bsz, nh, q_len, hd] -> [bsz, q_len, hidden]
    attn_output = attn_output.transpose(1, 2).reshape(bsz, q_len, self.hidden_size)

    #
    # sdp-attn end
    #

    if self.pretraining_tp > 1:
        out_shard_size = self.hidden_size // self.pretraining_tp
        attn_chunks = attn_output.split(out_shard_size, dim=2)
        o_proj_slices = self.o_proj.weight.split(out_shard_size, dim=1)
        attn_output = sum(
            F.linear(attn_chunks[i], o_proj_slices[i])
            for i in range(self.pretraining_tp)
        )
    else:
        attn_output = self.o_proj(attn_output)

    return attn_output, None, past_key_value
|
||||||
@@ -3,13 +3,13 @@ Directly copied the code from https://raw.githubusercontent.com/oobabooga/text-g
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
import math
|
import warnings
|
||||||
from typing import Optional, Tuple
|
from typing import Optional, Tuple
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch.nn.functional as F
|
import torch.nn.functional as F
|
||||||
import transformers.models.llama.modeling_llama
|
import transformers.models.llama.modeling_llama
|
||||||
from torch import nn
|
from transformers.models.llama.modeling_llama import apply_rotary_pos_emb, repeat_kv
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import xformers.ops
|
import xformers.ops
|
||||||
@@ -21,12 +21,6 @@ def hijack_llama_attention():
|
|||||||
transformers.models.llama.modeling_llama.LlamaAttention.forward = xformers_forward
|
transformers.models.llama.modeling_llama.LlamaAttention.forward = xformers_forward
|
||||||
|
|
||||||
|
|
||||||
def hijack_llama_sdp_attention():
|
|
||||||
transformers.models.llama.modeling_llama.LlamaAttention.forward = (
|
|
||||||
sdp_attention_forward
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def xformers_forward(
|
def xformers_forward(
|
||||||
self,
|
self,
|
||||||
hidden_states: torch.Tensor,
|
hidden_states: torch.Tensor,
|
||||||
@@ -81,15 +75,15 @@ def xformers_forward(
|
|||||||
value_states = value_states.view(
|
value_states = value_states.view(
|
||||||
bsz, q_len, self.num_key_value_heads, self.head_dim
|
bsz, q_len, self.num_key_value_heads, self.head_dim
|
||||||
).transpose(1, 2)
|
).transpose(1, 2)
|
||||||
|
# [bsz, q_len, nh, hd]
|
||||||
|
# [bsz, nh, q_len, hd]
|
||||||
|
|
||||||
kv_seq_len = key_states.shape[-2]
|
kv_seq_len = key_states.shape[-2]
|
||||||
if past_key_value is not None:
|
if past_key_value is not None:
|
||||||
kv_seq_len += past_key_value[0].shape[-2]
|
kv_seq_len += past_key_value[0].shape[-2]
|
||||||
|
|
||||||
cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
|
cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
|
||||||
(
|
query_states, key_states = apply_rotary_pos_emb(
|
||||||
query_states,
|
|
||||||
key_states,
|
|
||||||
) = transformers.models.llama.modeling_llama.apply_rotary_pos_emb(
|
|
||||||
query_states, key_states, cos, sin, position_ids
|
query_states, key_states, cos, sin, position_ids
|
||||||
)
|
)
|
||||||
# [bsz, nh, t, hd]
|
# [bsz, nh, t, hd]
|
||||||
@@ -102,74 +96,50 @@ def xformers_forward(
|
|||||||
past_key_value = (key_states, value_states) if use_cache else None
|
past_key_value = (key_states, value_states) if use_cache else None
|
||||||
|
|
||||||
# repeat k/v heads if n_kv_heads < n_heads
|
# repeat k/v heads if n_kv_heads < n_heads
|
||||||
key_states = transformers.models.llama.modeling_llama.repeat_kv(
|
key_states = repeat_kv(key_states, self.num_key_value_groups)
|
||||||
key_states, self.num_key_value_groups
|
value_states = repeat_kv(value_states, self.num_key_value_groups)
|
||||||
)
|
|
||||||
value_states = transformers.models.llama.modeling_llama.repeat_kv(
|
|
||||||
value_states, self.num_key_value_groups
|
|
||||||
)
|
|
||||||
|
|
||||||
# We only apply xformers optimizations if we don't need to output the whole attention matrix
|
if output_attentions:
|
||||||
if not output_attentions:
|
warnings.warn(
|
||||||
query_states = query_states.transpose(1, 2)
|
"Output attentions is not supported for patched `LlamaAttention`, returning `None` instead."
|
||||||
key_states = key_states.transpose(1, 2)
|
)
|
||||||
value_states = value_states.transpose(1, 2)
|
|
||||||
|
|
||||||
# This is a nasty hack. We know attention_mask in transformers is either LowerTriangular or all Zeros.
|
#
|
||||||
# We therefore check if one element in the upper triangular portion is zero. If it is, then the mask is all zeros.
|
# xformers-attn start
|
||||||
if attention_mask is None or attention_mask[0, 0, 0, 1] == 0:
|
#
|
||||||
# input and output should be of form (bsz, q_len, num_heads, head_dim)
|
|
||||||
attn_output = xformers.ops.memory_efficient_attention(
|
query_states = query_states.transpose(1, 2)
|
||||||
query_states, key_states, value_states, attn_bias=None
|
key_states = key_states.transpose(1, 2)
|
||||||
)
|
value_states = value_states.transpose(1, 2)
|
||||||
else:
|
|
||||||
# input and output should be of form (bsz, q_len, num_heads, head_dim)
|
# This is a nasty hack. We know attention_mask in transformers is either LowerTriangular or all Zeros.
|
||||||
attn_output = xformers.ops.memory_efficient_attention(
|
# We therefore check if one element in the upper triangular portion is zero. If it is, then the mask is all zeros.
|
||||||
query_states,
|
if attention_mask is None or attention_mask[0, 0, 0, 1] == 0:
|
||||||
key_states,
|
# input and output should be of form (bsz, q_len, num_heads, head_dim)
|
||||||
value_states,
|
attn_output = xformers.ops.memory_efficient_attention(
|
||||||
# attn_bias=attention_mask,
|
query_states, key_states, value_states, attn_bias=None
|
||||||
attn_bias=xformers.ops.LowerTriangularMask(),
|
)
|
||||||
)
|
|
||||||
attn_weights = None
|
|
||||||
else:
|
else:
|
||||||
attn_weights = torch.matmul(
|
# input and output should be of form (bsz, q_len, num_heads, head_dim)
|
||||||
query_states, key_states.transpose(2, 3)
|
attn_output = xformers.ops.memory_efficient_attention(
|
||||||
) / math.sqrt(self.head_dim)
|
query_states,
|
||||||
|
key_states,
|
||||||
if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
|
value_states,
|
||||||
raise ValueError(
|
# attn_bias=attention_mask,
|
||||||
f"Attention weights should be of size {(bsz * self.num_heads, q_len, kv_seq_len)}, but is"
|
attn_bias=xformers.ops.LowerTriangularMask(),
|
||||||
f" {attn_weights.size()}"
|
)
|
||||||
)
|
|
||||||
|
|
||||||
if attention_mask is not None:
|
|
||||||
if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
|
|
||||||
raise ValueError(
|
|
||||||
f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
|
|
||||||
)
|
|
||||||
attn_weights = attn_weights + attention_mask
|
|
||||||
attn_weights = torch.max(
|
|
||||||
attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min)
|
|
||||||
)
|
|
||||||
|
|
||||||
# upcast attention to fp32
|
|
||||||
attn_weights = nn.functional.softmax(
|
|
||||||
attn_weights, dim=-1, dtype=torch.float32
|
|
||||||
).to(query_states.dtype)
|
|
||||||
attn_output = torch.matmul(attn_weights, value_states)
|
|
||||||
|
|
||||||
if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
|
|
||||||
raise ValueError(
|
|
||||||
f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
|
|
||||||
f" {attn_output.size()}"
|
|
||||||
)
|
|
||||||
|
|
||||||
attn_output = attn_output.transpose(1, 2).contiguous()
|
|
||||||
# end x-formers vs. not x-formers if-else block
|
|
||||||
|
|
||||||
|
if attn_output.size() != (bsz, q_len, self.num_heads, self.head_dim):
|
||||||
|
raise ValueError(
|
||||||
|
f"`attn_output` should be of size {(bsz, q_len, self.num_heads, self.head_dim)}, but is"
|
||||||
|
f" {attn_output.size()}"
|
||||||
|
)
|
||||||
attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
|
attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
|
||||||
|
|
||||||
|
#
|
||||||
|
# xformers-attn end
|
||||||
|
#
|
||||||
|
|
||||||
if self.pretraining_tp > 1:
|
if self.pretraining_tp > 1:
|
||||||
attn_output = attn_output.split(self.hidden_size // self.pretraining_tp, dim=2)
|
attn_output = attn_output.split(self.hidden_size // self.pretraining_tp, dim=2)
|
||||||
o_proj_slices = self.o_proj.weight.split(
|
o_proj_slices = self.o_proj.weight.split(
|
||||||
@@ -182,103 +152,4 @@ def xformers_forward(
|
|||||||
else:
|
else:
|
||||||
attn_output = self.o_proj(attn_output)
|
attn_output = self.o_proj(attn_output)
|
||||||
|
|
||||||
return attn_output, attn_weights, past_key_value
|
return attn_output, None, past_key_value
|
||||||
|
|
||||||
|
|
||||||
def sdp_attention_forward(
|
|
||||||
self,
|
|
||||||
hidden_states: torch.Tensor,
|
|
||||||
attention_mask: Optional[torch.Tensor] = None,
|
|
||||||
position_ids: Optional[torch.LongTensor] = None,
|
|
||||||
past_key_value: Optional[Tuple[torch.Tensor]] = None,
|
|
||||||
output_attentions: bool = False,
|
|
||||||
use_cache: bool = False,
|
|
||||||
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
|
|
||||||
# pylint: disable=duplicate-code
|
|
||||||
bsz, q_len, _ = hidden_states.size()
|
|
||||||
|
|
||||||
query_states = (
|
|
||||||
self.q_proj(hidden_states)
|
|
||||||
.view(bsz, q_len, self.num_heads, self.head_dim)
|
|
||||||
.transpose(1, 2)
|
|
||||||
)
|
|
||||||
key_states = (
|
|
||||||
self.k_proj(hidden_states)
|
|
||||||
.view(bsz, q_len, self.num_heads, self.head_dim)
|
|
||||||
.transpose(1, 2)
|
|
||||||
)
|
|
||||||
value_states = (
|
|
||||||
self.v_proj(hidden_states)
|
|
||||||
.view(bsz, q_len, self.num_heads, self.head_dim)
|
|
||||||
.transpose(1, 2)
|
|
||||||
)
|
|
||||||
|
|
||||||
kv_seq_len = key_states.shape[-2]
|
|
||||||
if past_key_value is not None:
|
|
||||||
kv_seq_len += past_key_value[0].shape[-2]
|
|
||||||
cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
|
|
||||||
(
|
|
||||||
query_states,
|
|
||||||
key_states,
|
|
||||||
) = transformers.models.llama.modeling_llama.apply_rotary_pos_emb(
|
|
||||||
query_states, key_states, cos, sin, position_ids
|
|
||||||
)
|
|
||||||
# [bsz, nh, t, hd]
|
|
||||||
|
|
||||||
if past_key_value is not None:
|
|
||||||
# reuse k, v, self_attention
|
|
||||||
key_states = torch.cat([past_key_value[0], key_states], dim=2)
|
|
||||||
value_states = torch.cat([past_key_value[1], value_states], dim=2)
|
|
||||||
|
|
||||||
past_key_value = (key_states, value_states) if use_cache else None
|
|
||||||
|
|
||||||
# We only apply sdp attention if we don't need to output the whole attention matrix
|
|
||||||
if not output_attentions:
|
|
||||||
with torch.backends.cuda.sdp_kernel():
|
|
||||||
attn_output = torch.nn.functional.scaled_dot_product_attention(
|
|
||||||
query_states,
|
|
||||||
key_states,
|
|
||||||
value_states,
|
|
||||||
attn_mask=attention_mask,
|
|
||||||
is_causal=False,
|
|
||||||
)
|
|
||||||
attn_weights = None
|
|
||||||
else:
|
|
||||||
attn_weights = torch.matmul(
|
|
||||||
query_states, key_states.transpose(2, 3)
|
|
||||||
) / math.sqrt(self.head_dim)
|
|
||||||
|
|
||||||
if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
|
|
||||||
raise ValueError(
|
|
||||||
f"Attention weights should be of size {(bsz * self.num_heads, q_len, kv_seq_len)}, but is"
|
|
||||||
f" {attn_weights.size()}"
|
|
||||||
)
|
|
||||||
|
|
||||||
if attention_mask is not None:
|
|
||||||
if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
|
|
||||||
raise ValueError(
|
|
||||||
f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
|
|
||||||
)
|
|
||||||
attn_weights = attn_weights + attention_mask
|
|
||||||
attn_weights = torch.max(
|
|
||||||
attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min)
|
|
||||||
)
|
|
||||||
|
|
||||||
# upcast attention to fp32
|
|
||||||
attn_weights = nn.functional.softmax(
|
|
||||||
attn_weights, dim=-1, dtype=torch.float32
|
|
||||||
).to(query_states.dtype)
|
|
||||||
attn_output = torch.matmul(attn_weights, value_states)
|
|
||||||
|
|
||||||
if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
|
|
||||||
raise ValueError(
|
|
||||||
f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
|
|
||||||
f" {attn_output.size()}"
|
|
||||||
)
|
|
||||||
|
|
||||||
attn_output = attn_output.transpose(1, 2)
|
|
||||||
attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
|
|
||||||
|
|
||||||
attn_output = self.o_proj(attn_output)
|
|
||||||
|
|
||||||
return attn_output, attn_weights, past_key_value
|
|
||||||
|
|||||||
393
src/axolotl/monkeypatch/relora.py
Normal file
393
src/axolotl/monkeypatch/relora.py
Normal file
@@ -0,0 +1,393 @@
|
|||||||
|
"""Implements the ReLoRA training procedure from https://arxiv.org/abs/2307.05695, minus the initial full fine-tune."""
|
||||||
|
import glob
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os.path
|
||||||
|
import shutil
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, List, Sequence
|
||||||
|
|
||||||
|
import bitsandbytes as bnb
|
||||||
|
import peft
|
||||||
|
import safetensors.torch as st
|
||||||
|
import torch
|
||||||
|
from huggingface_hub import snapshot_download
|
||||||
|
from torch.optim.lr_scheduler import LRScheduler
|
||||||
|
from torch.optim.optimizer import Optimizer
|
||||||
|
from transformers import (
|
||||||
|
TrainerCallback,
|
||||||
|
TrainerControl,
|
||||||
|
TrainerState,
|
||||||
|
TrainingArguments,
|
||||||
|
)
|
||||||
|
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
|
||||||
|
|
||||||
|
from axolotl.utils.dict import DictDefault
|
||||||
|
from axolotl.utils.distributed import is_main_process
|
||||||
|
|
||||||
|
LOG = logging.getLogger("axolotl.relora")
|
||||||
|
|
||||||
|
|
||||||
|
def reset_optimizer(optimizer: torch.optim.Optimizer):
    """
    Zero out the per-parameter state held by `optimizer`, in place.

    For every parameter in every param group:
      * entries whose key contains "qmap" are left untouched
        (presumably bitsandbytes quantisation maps -- confirm against the
        optimizer in use);
      * tensor entries are replaced with `torch.zeros_like` of themselves;
      * plain `int` / `float` entries (e.g. an integer `step` counter) are
        reset to zero of the same type;
      * anything else (`None`, bools, nested objects) is left as it is rather
        than being passed to `torch.zeros_like`, which would raise.

    Parameters that have no state yet are skipped without creating an entry.

    Args:
        optimizer: the optimizer whose `state` should be reset.
    """
    for group in optimizer.param_groups:
        for param in group["params"]:
            # `.get` instead of indexing: torch keeps `state` in a defaultdict,
            # so `state[param]` would insert an empty dict for stateless params.
            param_state = optimizer.state.get(param)
            if not param_state:
                continue

            # only existing keys are reassigned, so iterating items() is safe
            for key, value in param_state.items():
                if "qmap" in key:
                    continue

                if torch.is_tensor(value):
                    param_state[key] = torch.zeros_like(value)
                elif isinstance(value, (int, float)) and not isinstance(value, bool):
                    param_state[key] = type(value)(0)
|
||||||
|
|
||||||
|
|
||||||
|
class ReLoRACallback(TrainerCallback):
    """Callback to merge LoRA weights into the base model and save full-weight checkpoints

    Every ``relora_steps`` optimizer steps the active LoRA delta is merged via
    ``merge_and_save`` (with ``reinit=True``) and the optimizer state is reset.
    For quantized models the merged weights are written to a ``relora``
    sub-folder of the step's checkpoint directory, and that folder becomes the
    source of base weights for the next merge.
    """

    def __init__(self, cfg: DictDefault):
        """Read the ReLoRA settings from ``cfg`` and resolve the base model path.

        If ``cfg.base_model`` is not an existing local path it is passed to
        ``snapshot_download`` and the returned path is used instead.
        """
        self.relora_steps = cfg.relora_steps
        self.cpu_offload = cfg.relora_cpu_offload
        self.quantized = cfg.load_in_4bit or cfg.load_in_8bit
        self.last_full_model = cfg.base_model
        self.resume_from_checkpoint = cfg.resume_from_checkpoint

        if not os.path.exists(self.last_full_model):
            self.last_full_model = str(Path(snapshot_download(cfg.base_model)))

        assert os.path.exists(
            self.last_full_model
        ), "for ReLORA base_model must be a local path"

        self.num_lora_restarts = 0
        # NOTE(review): not read anywhere else in this class.
        self.need_full_save = False

    def on_train_begin(
        self,
        _args: TrainingArguments,
        _state: TrainerState,
        control: TrainerControl,
        model: peft.LoraModel,
        **_kwargs,
    ):
        """When resuming, load merged base weights from ``<checkpoint>/relora`` if present."""
        if self.resume_from_checkpoint:
            weight_path = os.path.join(self.resume_from_checkpoint, "relora")
            if not os.path.exists(weight_path):
                LOG.warning(
                    "Resuming ReLoRA from checkpoint, but no full-weight save found"
                )
            else:
                LOG.info(f"Loading adjusted base weights from {weight_path}")
                load_weight_checkpoint(model, weight_path)
        return control

    def on_step_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        model: peft.LoraModel,
        optimizer: torch.optim.Optimizer,
        **_kwargs,
    ):
        """Merge the LoRA delta and reset the optimizer on every ``relora_steps``-th step."""
        # Step 0 is excluded, so the first restart happens at ``relora_steps``.
        if state.global_step > 0 and state.global_step % self.relora_steps == 0:
            checkpoint_folder = os.path.join(
                args.output_dir,
                f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}",
                "relora",
            )

            with torch.no_grad():
                merge_and_save(
                    model,
                    self.last_full_model,
                    checkpoint_folder,
                    reinit=True,
                    quantized=self.quantized,
                    # only the main process writes shards to disk
                    actually_save=is_main_process(),
                    cpu_offload=self.cpu_offload,
                )
            reset_optimizer(optimizer)

            if self.quantized:
                # the next merge reads its base weights from this folder
                self.last_full_model = checkpoint_folder
            self.num_lora_restarts += 1

        return control

    def on_save(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        model: peft.LoraModel,
        **_kwargs,
    ):
        """Keep a full-weight save inside the checkpoint folder that was just written.

        Only acts on saves at or after the first restart step that do not fall
        exactly on a restart step. Quantized: the main process moves the last
        full-weight shards into this checkpoint's ``relora`` folder and leaves
        symlinks behind. Unquantized: the wrapped model is saved there directly.
        """
        checkpoint_folder = os.path.join(
            args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}", "relora"
        )
        if (
            state.global_step >= self.relora_steps
            and state.global_step % self.relora_steps != 0
        ):
            if self.quantized:
                if is_main_process() and self.last_full_model != checkpoint_folder:
                    # ensure the latest full parameter save is in the latest checkpoint
                    # folder, so that automatic pruning of checkpoints does not remove it
                    LOG.info(f"moving last full parameter save to {checkpoint_folder}")
                    os.makedirs(checkpoint_folder, exist_ok=True)
                    chunks = glob.glob(
                        f"{self.last_full_model}/model*.safetensors"
                    ) + glob.glob(f"{self.last_full_model}/model*.index.json")
                    for path in chunks:
                        new_path = os.path.abspath(shutil.move(path, checkpoint_folder))
                        try:
                            # leave a link at the old location pointing at the moved file
                            os.symlink(new_path, path)
                        except OSError:
                            # probably on windows without permission to symlink
                            pass

                    # NOTE(review): nesting reconstructed from a de-indented
                    # source -- confirm this assignment belongs inside the
                    # main-process branch.
                    self.last_full_model = checkpoint_folder
            else:
                model.model.save_pretrained(checkpoint_folder, safe_serialization=True)

        return control

    def on_log(
        self,
        _args: TrainingArguments,
        _state: TrainerState,
        control: TrainerControl,
        logs: Dict[str, float],
        **_kwargs,
    ):
        """Add the number of LoRA restarts performed so far to the logged metrics."""
        logs["num_lora_restarts"] = self.num_lora_restarts
        return control

    def on_train_end(
        self,
        args: TrainingArguments,
        _state: TrainerState,
        control: TrainerControl,
        model: peft.LoraModel,
        **_kwargs,
    ):
        """For quantized models, merge once more (without re-init) into ``args.output_dir``."""
        if self.quantized:
            # perform final merge and save
            with torch.no_grad():
                merge_and_save(
                    model,
                    self.last_full_model,
                    args.output_dir,
                    reinit=False,
                    quantized=self.quantized,
                    actually_save=is_main_process(),
                    cpu_offload=self.cpu_offload,
                )
        # no need to save if unquantized, as finetune.py will call merge_and_unload()
        return control
|
||||||
|
|
||||||
|
|
||||||
|
class ReLoRAScheduler(LRScheduler):
    """Wraps another scheduler to apply per-lora-restart learning rate warmups."""

    def __init__(
        self,
        optimizer: Optimizer,
        inner_schedule: LRScheduler,
        relora_steps: int,
        warmup_steps: int,
        min_lr_scale: float = 0.001,
    ) -> None:
        """Store the wrapped schedule and the restart/warmup settings.

        Args:
            optimizer: optimizer whose learning rates are scheduled.
            inner_schedule: scheduler providing the un-scaled learning rates.
            relora_steps: number of steps between LoRA restarts.
            warmup_steps: length of the linear warmup after each restart;
                a value ``<= 0`` disables the per-restart warmup.
            min_lr_scale: scale applied at the first step after a restart.
        """
        self.inner_schedule = inner_schedule
        self.relora_steps = relora_steps
        self.warmup_steps = warmup_steps
        self.min_lr_scale = min_lr_scale
        super().__init__(optimizer, inner_schedule.last_epoch, inner_schedule.verbose)

    def get_lr(self) -> float:
        """Return the inner schedule's learning rate(s) scaled by the restart warmup.

        Before the first restart (``last_epoch < relora_steps``) the inner
        values pass through unchanged. Afterwards the scale ramps linearly
        from ``min_lr_scale`` to 1 over ``warmup_steps`` steps of each cycle.
        Despite the annotation, a list is returned when the inner schedule
        yields a sequence.
        """
        # keep the wrapped schedule in lock-step with this one
        self.inner_schedule.last_epoch = self.last_epoch

        original = self.inner_schedule.get_lr()
        step = self.last_epoch
        if step < self.relora_steps or self.warmup_steps <= 0:
            # No warmup configured: dividing by warmup_steps would raise
            # ZeroDivisionError (or go negative), so leave the rate unscaled.
            scale = 1
        else:
            cycle_t = min(1.0, (step % self.relora_steps) / self.warmup_steps)
            scale = cycle_t * (1 - self.min_lr_scale) + self.min_lr_scale

        if isinstance(original, Sequence):
            return [lr * scale for lr in original]
        return original * scale
|
||||||
|
|
||||||
|
|
||||||
|
def sharded_paths(path: str, module_names: List[str]) -> Dict[str, str]:
    """Map weight names to the checkpoint file under ``path`` that holds them.

    ``model.safetensors`` is preferred when that file or its
    ``.index.json`` exists; otherwise ``pytorch_model.bin`` is assumed. If an
    index file for the chosen name exists its ``weight_map`` is returned
    as-is, otherwise every ``<module_name>.weight`` maps to the single file.
    """
    root = Path(path)
    safetensors_name = "model.safetensors"
    has_safetensors = os.path.exists(str(root / safetensors_name)) or os.path.exists(
        str(root / f"{safetensors_name}.index.json")
    )
    model_name = safetensors_name if has_safetensors else "pytorch_model.bin"

    index_path = str(root / f"{model_name}.index.json")
    if not os.path.exists(index_path):
        # unsharded checkpoint: everything lives in one file
        return {(module_name + ".weight"): model_name for module_name in module_names}

    with open(index_path, "r", encoding="utf-8") as file:
        index = json.load(file)
    return index["weight_map"]
|
||||||
|
|
||||||
|
|
||||||
|
def lora_delta_weight(layer: peft.tuners.lora.LoraLayer, device) -> torch.Tensor:
    """Compute the weight delta contributed by ``layer``'s active LoRA adapter.

    For the 8-bit and 4-bit LoRA layer types the product ``B @ A`` is computed
    explicitly on ``device``, transposed when the layer sets
    ``fan_in_fan_out``, and multiplied by the adapter's scaling. Any other
    layer type delegates to the layer's own ``get_delta_weight``.

    Args:
        layer: the LoRA layer to read adapter weights from.
        device: device on which the delta is computed / returned.
    """
    adapter = layer.active_adapter
    if isinstance(layer, (peft.tuners.lora.Linear8bitLt, peft.tuners.lora.Linear4bit)):
        return (
            peft.utils.transpose(
                layer.lora_B[adapter].weight.detach().to(device)
                @ layer.lora_A[adapter].weight.detach().to(device),
                getattr(layer, "fan_in_fan_out", False),
            )
            * layer.scaling[adapter]
        )

    # Pass the adapter explicitly, matching the unquantized branch of
    # merge_and_save; calling get_delta_weight() without it omits the
    # argument that call site supplies.
    return layer.get_delta_weight(adapter).to(device)
|
||||||
|
|
||||||
|
|
||||||
|
def find_lora_modules(model: peft.LoraModel) -> Dict[str, peft.tuners.lora.LoraLayer]:
    """Collect every LoRA layer of ``model.model`` keyed by its module name.

    Module names containing ``"lora"`` are not considered, and names that
    cannot be resolved to a submodule are skipped.
    """
    lora_layers: Dict[str, peft.tuners.lora.LoraLayer] = {}

    candidate_names = [
        name for name, _module in model.model.named_modules() if "lora" not in name
    ]
    for name in candidate_names:
        try:
            # pylint: disable=protected-access
            resolved = peft.utils._get_submodules(model.model, name)
        except AttributeError:
            continue

        _parent, target, _target_name = resolved
        if not isinstance(target, peft.tuners.lora.LoraLayer):
            continue
        lora_layers[name] = target

    return lora_layers
|
||||||
|
|
||||||
|
|
||||||
|
def update_weights(
    target: peft.tuners.lora.LoraLayer, new_weight: torch.Tensor, reinit: bool, device
):
    """Install ``new_weight`` as the base weight of ``target`` on ``device``.

    When ``reinit`` is true, the LoRA parameters of every adapter registered
    in ``lora_A`` and ``lora_embedding_A`` are reset first.
    """
    if reinit:
        for adapters in (target.lora_A, target.lora_embedding_A):
            for adapter_name in adapters:
                target.reset_lora_parameters(adapter_name)

    if isinstance(target, peft.tuners.lora.Linear4bit):
        # This could be faster, but the quantization of Linear4bit weights occurs
        # when the module is moved from cpu to gpu. Without meddling *too* deeply in
        # PEFT's innards or maintaining a duplicate of that codepath, this is good
        # enough for now.
        target.weight.quant_state = None
        target.weight.data = new_weight.cpu()
        target.to(device)
        return

    if isinstance(target, peft.tuners.lora.Linear8bitLt):
        target.weight = bnb.nn.Int8Params(new_weight, requires_grad=False).to(device)
        return

    target.weight.data = new_weight.to(device)
|
||||||
|
|
||||||
|
|
||||||
|
def merge_and_save(
    model: peft.LoraModel,
    model_src: str,
    model_dst: str,
    reinit: bool = False,
    quantized: bool = False,
    cpu_offload: bool = False,
    actually_save: bool = True,
):
    """Merge the active LoRA delta into the base weights, optionally saving them.

    Unquantized: the delta is added to each LoRA layer's weight in place and
    nothing is written to disk.

    Quantized: the original weights are read shard by shard from
    ``model_src``, the delta is added, and the layer is updated through
    ``update_weights``. When ``actually_save`` is true each shard is written
    to ``model_dst`` as half-precision safetensors, plus a
    ``model.safetensors.index.json`` when there is more than one shard.

    Args:
        model: LoRA-wrapped model whose adapters are merged.
        model_src: directory holding the current full-precision base weights.
        model_dst: directory that receives the merged weights.
        reinit: reset the LoRA parameters after merging.
        quantized: whether the base model is loaded in 4-bit / 8-bit.
        cpu_offload: do the merge arithmetic on CPU instead of the layer's device.
        actually_save: write the merged shards to disk.
    """
    modules = find_lora_modules(model)

    if not quantized:
        for target in modules.values():
            update = target.get_delta_weight(target.active_adapter).detach()
            target.weight.data += update

            if reinit:
                for adapter_name in target.lora_A:
                    target.reset_lora_parameters(adapter_name)
                for adapter_name in target.lora_embedding_A:
                    target.reset_lora_parameters(adapter_name)
        return

    os.makedirs(model_dst, exist_ok=True)
    shard_paths = sharded_paths(model_src, modules.keys())
    out_shard_paths = {}

    unique_shards = list(set(shard_paths.values()))
    for shard_path in unique_shards:
        out_tensors = {}
        if shard_path.endswith(".safetensors"):
            in_tensors = st.load_file(str(Path(model_src) / shard_path))
        else:
            # NOTE(review): torch.load unpickles the file; only point
            # model_src at trusted checkpoints.
            in_tensors = torch.load(Path(model_src) / shard_path)
            if "state_dict" in in_tensors:
                in_tensors = in_tensors["state_dict"]

        for module_name, target in modules.items():
            key = module_name + ".weight"
            if key not in shard_paths or shard_paths[key] != shard_path:
                continue

            orig_weight = in_tensors[key]
            old_dev = target.weight.device
            math_dev = "cpu" if cpu_offload else old_dev

            delta_weight = lora_delta_weight(target, math_dev)
            new_weight = orig_weight.to(math_dev) + delta_weight
            del delta_weight

            if actually_save:
                out_tensors[key] = new_weight.half().cpu()

            update_weights(target, new_weight, reinit=reinit, device=old_dev)

        if actually_save:
            out_shard_name = shard_path
            if out_shard_name.startswith("pytorch_model"):
                out_shard_name = out_shard_name.replace("pytorch_model", "model")
                # Drop the ".bin" suffix exactly; rstrip(".bin") would strip any
                # trailing '.', 'b', 'i' or 'n' characters from the stem as well.
                if out_shard_name.endswith(".bin"):
                    out_shard_name = out_shard_name[: -len(".bin")]
                out_shard_name += ".safetensors"

            # carry over tensors that were not touched by the merge
            for module_name in in_tensors:
                if module_name not in out_tensors:
                    out_tensors[module_name] = in_tensors[module_name].half()
                # NOTE(review): nesting reconstructed from a de-indented
                # source -- every tensor in the shard is recorded in the index.
                out_shard_paths[module_name] = out_shard_name

            shard_fn = str(Path(model_dst) / out_shard_name)
            LOG.info(f"saving tensors to {shard_fn}")
            st.save_file(out_tensors, shard_fn, metadata={"format": "pt"})

        # release the shard before loading the next one
        del in_tensors
        del out_tensors
        torch.cuda.empty_cache()

    if actually_save and len(unique_shards) > 1:
        with open(
            str(Path(model_dst, "model.safetensors.index.json")), "w", encoding="utf-8"
        ) as file:
            json.dump({"metadata": {}, "weight_map": out_shard_paths}, file)
|
||||||
|
|
||||||
|
|
||||||
|
def load_weight_checkpoint(model: peft.LoraModel, checkpoint_path: str):
    """Load base weights for every LoRA layer of ``model`` from ``checkpoint_path``.

    Each safetensors shard referenced for the model's LoRA modules is opened
    once, and the ``<module>.weight`` tensors it contains are installed via
    ``update_weights`` on the layer's current device without re-initialising
    the adapters.
    """
    lora_modules = find_lora_modules(model)
    weight_to_shard = sharded_paths(checkpoint_path, lora_modules.keys())

    for shard_name in list(set(weight_to_shard.values())):
        shard_tensors = st.load_file(os.path.join(checkpoint_path, shard_name))

        for module_name, layer in lora_modules.items():
            weight_key = module_name + ".weight"
            in_this_shard = (
                weight_key in weight_to_shard
                and weight_to_shard[weight_key] == shard_name
            )
            if in_this_shard:
                update_weights(
                    layer,
                    shard_tensors[weight_key],
                    reinit=False,
                    device=layer.weight.device,
                )
|
||||||
@@ -2,8 +2,10 @@
|
|||||||
|
|
||||||
import importlib
|
import importlib
|
||||||
|
|
||||||
|
from axolotl.prompt_strategies.user_defined import UserDefinedDatasetConfig
|
||||||
|
|
||||||
def load(strategy, tokenizer, cfg):
|
|
||||||
|
def load(strategy, tokenizer, cfg, ds_cfg):
|
||||||
try:
|
try:
|
||||||
load_fn = "load"
|
load_fn = "load"
|
||||||
if strategy.split(".")[-1].startswith("load_"):
|
if strategy.split(".")[-1].startswith("load_"):
|
||||||
@@ -11,6 +13,9 @@ def load(strategy, tokenizer, cfg):
|
|||||||
strategy = ".".join(strategy.split(".")[:-1])
|
strategy = ".".join(strategy.split(".")[:-1])
|
||||||
mod = importlib.import_module(f".{strategy}", "axolotl.prompt_strategies")
|
mod = importlib.import_module(f".{strategy}", "axolotl.prompt_strategies")
|
||||||
func = getattr(mod, load_fn)
|
func = getattr(mod, load_fn)
|
||||||
return func(tokenizer, cfg)
|
load_kwargs = {}
|
||||||
|
if strategy == "user_defined":
|
||||||
|
load_kwargs["ds_cfg"] = UserDefinedDatasetConfig(**ds_cfg)
|
||||||
|
return func(tokenizer, cfg, **load_kwargs)
|
||||||
except Exception: # pylint: disable=broad-exception-caught
|
except Exception: # pylint: disable=broad-exception-caught
|
||||||
return None
|
return None
|
||||||
|
|||||||
@@ -1,49 +1,8 @@
|
|||||||
"""Module loading the AlpacaInstructPromptTokenizingStrategy class"""
|
"""Module loading the AlpacaInstructPromptTokenizingStrategy class"""
|
||||||
import logging
|
|
||||||
|
|
||||||
from axolotl.prompt_tokenizers import AlpacaPromptTokenizingStrategy
|
from axolotl.prompt_tokenizers import AlpacaPromptTokenizingStrategy
|
||||||
from axolotl.prompters import AlpacaPrompter, PromptStyle, UnpromptedPrompter
|
from axolotl.prompters import AlpacaPrompter, PromptStyle, UnpromptedPrompter
|
||||||
|
|
||||||
LOG = logging.getLogger("axolotl.prompt_strategies.alpaca_instruct")
|
|
||||||
|
|
||||||
|
|
||||||
class LatentSpaceAlpacaPromptTokenizingStrategy(AlpacaPromptTokenizingStrategy):
|
|
||||||
"""
|
|
||||||
Overrides the tokenization to include additional padding tokens as
|
|
||||||
latent space on the inputs
|
|
||||||
"""
|
|
||||||
|
|
||||||
def _tokenize(self, prompt: str, add_eos_token=True, strip_bos_token=False):
|
|
||||||
# pylint: disable=duplicate-code
|
|
||||||
result = self.tokenizer(
|
|
||||||
prompt,
|
|
||||||
truncation=True,
|
|
||||||
max_length=self.sequence_len,
|
|
||||||
padding=False,
|
|
||||||
return_tensors=None,
|
|
||||||
)
|
|
||||||
if len(result["input_ids"]) == 0:
|
|
||||||
LOG.warning("Tokenizer result is empty. You may want to audit your dataset")
|
|
||||||
if (
|
|
||||||
len(result["input_ids"]) > 0
|
|
||||||
and result["input_ids"][-1] != self.tokenizer.eos_token_id
|
|
||||||
and len(result["input_ids"]) < self.sequence_len
|
|
||||||
and add_eos_token
|
|
||||||
):
|
|
||||||
result["input_ids"].append(self.tokenizer.eos_token_id)
|
|
||||||
result["attention_mask"].append(1)
|
|
||||||
|
|
||||||
if result["input_ids"][0] == self.tokenizer.bos_token_id and strip_bos_token:
|
|
||||||
result["input_ids"] = result["input_ids"][1:]
|
|
||||||
result["attention_mask"] = result["attention_mask"][1:]
|
|
||||||
|
|
||||||
# latent space
|
|
||||||
if add_eos_token and not strip_bos_token:
|
|
||||||
result["input_ids"].extend([self.tokenizer.pad_token_id] * 100)
|
|
||||||
|
|
||||||
result["labels"] = result["input_ids"].copy()
|
|
||||||
return result
|
|
||||||
|
|
||||||
|
|
||||||
def load(tokenizer, cfg):
|
def load(tokenizer, cfg):
|
||||||
return AlpacaPromptTokenizingStrategy(
|
return AlpacaPromptTokenizingStrategy(
|
||||||
@@ -61,12 +20,3 @@ def load_no_prompt(tokenizer, cfg):
|
|||||||
cfg.train_on_inputs,
|
cfg.train_on_inputs,
|
||||||
cfg.sequence_len,
|
cfg.sequence_len,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def load_latent_space(tokenizer, cfg):
|
|
||||||
return LatentSpaceAlpacaPromptTokenizingStrategy(
|
|
||||||
AlpacaPrompter(PromptStyle.INSTRUCT.value),
|
|
||||||
tokenizer,
|
|
||||||
cfg.train_on_inputs,
|
|
||||||
cfg.sequence_len,
|
|
||||||
)
|
|
||||||
|
|||||||
@@ -57,6 +57,8 @@ class SystemDataPrompter(AlpacaPrompter):
|
|||||||
Alpaca Style Prompter that uses system prompts from the dataset
|
Alpaca Style Prompter that uses system prompts from the dataset
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
system_format: str = "### System:\n{system}\n\n"
|
||||||
|
|
||||||
def build_prompt_w_system(
|
def build_prompt_w_system(
|
||||||
self,
|
self,
|
||||||
system: str,
|
system: str,
|
||||||
|
|||||||
76
src/axolotl/prompt_strategies/metharme.py
Normal file
76
src/axolotl/prompt_strategies/metharme.py
Normal file
@@ -0,0 +1,76 @@
|
|||||||
|
"""Module containing the MetharmePromptTokenizingStrategy and MetharmePrompter classes"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
from axolotl.prompt_tokenizers import InstructionPromptTokenizingStrategy
|
||||||
|
from axolotl.prompters import AlpacaPrompter
|
||||||
|
|
||||||
|
LOG = logging.getLogger("axolotl")
|
||||||
|
|
||||||
|
IGNORE_TOKEN_ID = -100
|
||||||
|
|
||||||
|
# pylint: disable=duplicate-code
|
||||||
|
|
||||||
|
|
||||||
|
class MetharmePromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
    """
    Tokenizing strategy for the Metharme models
    """

    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]:
        """Return ``(instruction, input, response)``; the input is always empty."""
        return (prompt["prompt"], "", prompt["generation"])

    def _tokenize(
        self,
        prompt: str,
        add_eos_token: bool = True,
        strip_bos_token: bool = False,
        num_eos_tokens: int = 3,
    ):
        """Tokenize ``prompt`` and pad its end with up to ``num_eos_tokens`` EOS tokens.

        EOS tokens are only appended while the sequence is shorter than
        ``self.sequence_len``. An empty tokenizer result is logged and
        returned as-is (with empty ``labels``) instead of raising
        ``IndexError``.
        """
        result = self.tokenizer(
            prompt,
            truncation=True,
            max_length=self.sequence_len,
            padding=False,
            return_tensors=None,
        )
        has_tokens = len(result["input_ids"]) > 0
        if not has_tokens:
            LOG.warning("Tokenizer result is empty. You may want to audit your dataset")
        # If there's already an EOS token there, subtract from the number added
        elif result["input_ids"][-1] == self.tokenizer.eos_token_id:
            num_eos_tokens -= 1

        if num_eos_tokens > 0 and add_eos_token and has_tokens:
            for _ in range(num_eos_tokens):
                if len(result["input_ids"]) < self.sequence_len:
                    result["input_ids"].append(self.tokenizer.eos_token_id)
                    result["attention_mask"].append(1)

        # guard the [0] lookup so an empty result cannot raise
        if (
            has_tokens
            and result["input_ids"][0] == self.tokenizer.bos_token_id
            and strip_bos_token
        ):
            result["input_ids"] = result["input_ids"][1:]
            result["attention_mask"] = result["attention_mask"][1:]

        result["labels"] = result["input_ids"].copy()
        return result
|
||||||
|
|
||||||
|
|
||||||
|
class MetharmePrompter(AlpacaPrompter):
    """
    Prompter for the Metharme models.

    Clears all system prompt text and formats a turn as the bare instruction.
    """

    # no system prompt text and no system wrapper
    system_prompt = system_no_input_prompt = system_format = ""
    # a turn is the instruction text itself, with or without an input
    turn_format = turn_no_input_format = "{instruction}"

    def __init__(self, *args, **kwargs):  # pylint: disable=super-init-not-called
        """Accept and ignore all arguments; the parent initializer is not run."""
|
||||||
|
|
||||||
|
|
||||||
|
def load(tokenizer, cfg):
    """Build the Metharme tokenizing strategy from ``cfg.train_on_inputs`` and ``cfg.sequence_len``."""
    prompter = MetharmePrompter()
    return MetharmePromptTokenizingStrategy(
        prompter,
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
    )
|
||||||
@@ -31,52 +31,6 @@ def load_guanaco(tokenizer, cfg):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def load_latent_space(tokenizer, cfg):
|
|
||||||
return LatentSpaceShareGPTPromptTokenizingStrategy(
|
|
||||||
ShareGPTPrompter(PromptStyle.CHAT.value),
|
|
||||||
tokenizer,
|
|
||||||
cfg.train_on_inputs,
|
|
||||||
cfg.sequence_len,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class LatentSpaceShareGPTPromptTokenizingStrategy(ShareGPTPromptTokenizingStrategy):
|
|
||||||
"""
|
|
||||||
latent space padded sharegpt strategy to grab conversations from the sample row
|
|
||||||
"""
|
|
||||||
|
|
||||||
def get_conversation_thread(self, prompt):
|
|
||||||
return prompt["conversations"]
|
|
||||||
|
|
||||||
def _tokenize(self, prompt, add_eos_token=True, strip_bos_token=False):
|
|
||||||
# pylint: disable=duplicate-code
|
|
||||||
result = self.tokenizer(
|
|
||||||
prompt,
|
|
||||||
truncation=True,
|
|
||||||
max_length=self.sequence_len,
|
|
||||||
padding=False,
|
|
||||||
return_tensors=None,
|
|
||||||
)
|
|
||||||
if (
|
|
||||||
result["input_ids"][-1] != self.tokenizer.eos_token_id
|
|
||||||
and len(result["input_ids"]) < self.sequence_len
|
|
||||||
and add_eos_token
|
|
||||||
):
|
|
||||||
result["input_ids"].append(self.tokenizer.eos_token_id)
|
|
||||||
result["attention_mask"].append(1)
|
|
||||||
|
|
||||||
if result["input_ids"][0] == self.tokenizer.bos_token_id and strip_bos_token:
|
|
||||||
result["input_ids"] = result["input_ids"][1:]
|
|
||||||
result["attention_mask"] = result["attention_mask"][1:]
|
|
||||||
|
|
||||||
# latent space
|
|
||||||
if add_eos_token and not strip_bos_token:
|
|
||||||
result["input_ids"].extend([self.tokenizer.pad_token_id] * 100)
|
|
||||||
|
|
||||||
result["labels"] = result["input_ids"].copy()
|
|
||||||
return result
|
|
||||||
|
|
||||||
|
|
||||||
class SimpleShareGPTPromptTokenizingStrategy(ShareGPTPromptTokenizingStrategy):
|
class SimpleShareGPTPromptTokenizingStrategy(ShareGPTPromptTokenizingStrategy):
|
||||||
"""
|
"""
|
||||||
basic sharegpt strategy to grab conversations from the sample row
|
basic sharegpt strategy to grab conversations from the sample row
|
||||||
|
|||||||
98
src/axolotl/prompt_strategies/user_defined.py
Normal file
98
src/axolotl/prompt_strategies/user_defined.py
Normal file
@@ -0,0 +1,98 @@
|
|||||||
|
"""
|
||||||
|
User Defined prompts with configuration from the YML config
|
||||||
|
"""
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from functools import partial
|
||||||
|
from typing import Optional, Tuple
|
||||||
|
|
||||||
|
from axolotl.prompt_strategies.alpaca_w_system import (
|
||||||
|
InstructionWSystemPromptTokenizingStrategy,
|
||||||
|
SystemDataPrompter,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class UserDefinedDatasetConfig:
    """
    Dataclass describing a user-defined dataset type: which row fields to
    read and which format strings to render them with.
    """

    # fallback system prompt text
    system_prompt: str = ""
    # names of the dataset row fields
    field_system: str = "system"
    field_instruction: str = "instruction"
    field_input: str = "input"
    field_output: str = "output"
    # format strings for a turn with / without an input, and for the system text
    format: str = "{instruction} {input} "
    no_input_format: str = "{instruction} "
    system_format: str = "{system}"

    def __getitem__(self, item):
        """Allow ``config["name"]`` as an alias for ``config.name``."""
        value = getattr(self, item)
        return value
|
||||||
|
|
||||||
|
|
||||||
|
class UserDefinedPromptTokenizationStrategy(InstructionWSystemPromptTokenizingStrategy):
    """
    Tokenization strategy for prompts whose fields and formats are user defined.

    Adds nothing to its parent; ``load`` attaches a field parser per instance.
    """
|
||||||
|
|
||||||
|
|
||||||
|
def load(tokenizer, cfg, ds_cfg: Optional[UserDefinedDatasetConfig] = None):
    """Build a tokenization strategy driven by a user-defined dataset config.

    The returned strategy uses a prompter whose turn/system formats come from
    ``ds_cfg``, and a ``parse_instruction_fields`` bound to the field names
    and default system prompt from ``ds_cfg``.

    Raises:
        ValueError: if ``ds_cfg`` is missing.
    """
    if not ds_cfg:
        raise ValueError("Missing dataset prompt configuration")

    default_system_prompt = ds_cfg.system_prompt if ds_cfg.system_prompt else ""

    def parse_instruction_fields(
        field_instruction,
        field_input,
        field_output,
        field_system,
        system_prompt,
        prompt,
    ) -> Tuple[str, str, str, str]:
        # only the instruction field is required; the others fall back to defaults
        instruction = prompt[field_instruction]
        input_text = prompt[field_input] if field_input in prompt else ""
        output_text = prompt[field_output] if field_output in prompt else ""
        system_text = prompt[field_system] if field_system in prompt else system_prompt
        return (instruction, input_text, output_text, system_text)

    # captured now so the prompter does not depend on later changes to ds_cfg
    with_input_format = ds_cfg.format
    without_input_format = ds_cfg.no_input_format
    system_wrapper = ds_cfg.system_format

    class UserDefinedPrompter(SystemDataPrompter):
        """
        Prompter for user defined prompts
        """

        def match_prompt_style(self):
            self.turn_format = with_input_format
            self.turn_no_input_format = without_input_format
            self.system_format = system_wrapper

    strategy = UserDefinedPromptTokenizationStrategy(
        UserDefinedPrompter(),
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
    )

    # bind the configured field names; callers pass only the dataset row
    bound_parser = partial(
        parse_instruction_fields,
        ds_cfg.field_instruction,
        ds_cfg.field_input,
        ds_cfg.field_output,
        ds_cfg.field_system,
        default_system_prompt,
    )
    setattr(strategy, "parse_instruction_fields", bound_parser)
    return strategy
|
||||||
@@ -13,7 +13,7 @@ from axolotl.prompters import IGNORE_TOKEN_ID
|
|||||||
LOG = logging.getLogger("axolotl")
|
LOG = logging.getLogger("axolotl")
|
||||||
|
|
||||||
IGNORE_INDEX = -100
|
IGNORE_INDEX = -100
|
||||||
LLAMA_DEFAULT_PAD_TOKEN = "[PAD]" # nosec
|
LLAMA_DEFAULT_PAD_TOKEN = "<pad>" # nosec
|
||||||
LLAMA_DEFAULT_EOS_TOKEN = "</s>" # nosec
|
LLAMA_DEFAULT_EOS_TOKEN = "</s>" # nosec
|
||||||
LLAMA_DEFAULT_BOS_TOKEN = "<s>" # nosec
|
LLAMA_DEFAULT_BOS_TOKEN = "<s>" # nosec
|
||||||
LLAMA_DEFAULT_UNK_TOKEN = "<unk>" # nosec
|
LLAMA_DEFAULT_UNK_TOKEN = "<unk>" # nosec
|
||||||
@@ -85,7 +85,11 @@ class PromptTokenizingStrategy(abc.ABC):
|
|||||||
result["input_ids"].append(self.tokenizer.eos_token_id)
|
result["input_ids"].append(self.tokenizer.eos_token_id)
|
||||||
result["attention_mask"].append(1)
|
result["attention_mask"].append(1)
|
||||||
|
|
||||||
if result["input_ids"][0] == self.tokenizer.bos_token_id and strip_bos_token:
|
if (
|
||||||
|
len(result["input_ids"]) > 0
|
||||||
|
and result["input_ids"][0] == self.tokenizer.bos_token_id
|
||||||
|
and strip_bos_token
|
||||||
|
):
|
||||||
result["input_ids"] = result["input_ids"][1:]
|
result["input_ids"] = result["input_ids"][1:]
|
||||||
result["attention_mask"] = result["attention_mask"][1:]
|
result["attention_mask"] = result["attention_mask"][1:]
|
||||||
|
|
||||||
|
|||||||
@@ -26,7 +26,7 @@ class AlpacaPrompter:
|
|||||||
|
|
||||||
system_prompt = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n"
|
system_prompt = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n"
|
||||||
system_no_input_prompt = "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n"
|
system_no_input_prompt = "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n"
|
||||||
system_format: str
|
system_format: str = "{system}"
|
||||||
turn_format: str
|
turn_format: str
|
||||||
turn_no_input_format: str
|
turn_no_input_format: str
|
||||||
prompt_style: Optional[PromptStyle] = None
|
prompt_style: Optional[PromptStyle] = None
|
||||||
@@ -63,13 +63,17 @@ class AlpacaPrompter:
|
|||||||
# returns the full prompt from instruction and optional input
|
# returns the full prompt from instruction and optional input
|
||||||
# if a label (=response, =output) is provided, it's also appended.
|
# if a label (=response, =output) is provided, it's also appended.
|
||||||
if input:
|
if input:
|
||||||
res = self.system_prompt + self.turn_format.format(
|
res = (
|
||||||
instruction=instruction, input=input
|
self.system_format.format(system=self.system_prompt)
|
||||||
)
|
if self.system_prompt
|
||||||
|
else ""
|
||||||
|
) + self.turn_format.format(instruction=instruction, input=input)
|
||||||
else:
|
else:
|
||||||
res = self.system_no_input_prompt + self.turn_no_input_format.format(
|
res = (
|
||||||
instruction=instruction
|
self.system_format.format(system=self.system_no_input_prompt)
|
||||||
)
|
if self.system_prompt
|
||||||
|
else ""
|
||||||
|
) + self.turn_no_input_format.format(instruction=instruction)
|
||||||
if output:
|
if output:
|
||||||
res = f"{res}{output}"
|
res = f"{res}{output}"
|
||||||
yield res
|
yield res
|
||||||
|
|||||||
@@ -1,9 +1,19 @@
|
|||||||
"""Callbacks for Trainer class"""
|
"""Callbacks for Trainer class"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
from typing import TYPE_CHECKING, Dict, List
|
||||||
|
|
||||||
|
import evaluate
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
import torch
|
||||||
|
import torch.distributed as dist
|
||||||
|
from datasets import load_dataset
|
||||||
from optimum.bettertransformer import BetterTransformer
|
from optimum.bettertransformer import BetterTransformer
|
||||||
|
from tqdm import tqdm
|
||||||
from transformers import (
|
from transformers import (
|
||||||
TrainerCallback,
|
TrainerCallback,
|
||||||
TrainerControl,
|
TrainerControl,
|
||||||
@@ -13,8 +23,19 @@ from transformers import (
|
|||||||
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR, IntervalStrategy
|
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR, IntervalStrategy
|
||||||
|
|
||||||
from axolotl.utils.bench import log_gpu_memory_usage
|
from axolotl.utils.bench import log_gpu_memory_usage
|
||||||
|
from axolotl.utils.distributed import (
|
||||||
|
barrier,
|
||||||
|
gather_scalar_from_all_ranks,
|
||||||
|
get_world_size,
|
||||||
|
is_main_process,
|
||||||
|
zero_first,
|
||||||
|
)
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from axolotl.utils.trainer import AxolotlTrainingArguments
|
||||||
|
|
||||||
LOG = logging.getLogger("axolotl.callbacks")
|
LOG = logging.getLogger("axolotl.callbacks")
|
||||||
|
IGNORE_INDEX = -100
|
||||||
|
|
||||||
|
|
||||||
class SavePeftModelCallback(TrainerCallback): # pylint: disable=too-few-public-methods
|
class SavePeftModelCallback(TrainerCallback): # pylint: disable=too-few-public-methods
|
||||||
@@ -33,7 +54,9 @@ class SavePeftModelCallback(TrainerCallback): # pylint: disable=too-few-public-
|
|||||||
)
|
)
|
||||||
|
|
||||||
peft_model_path = os.path.join(checkpoint_folder, "adapter_model")
|
peft_model_path = os.path.join(checkpoint_folder, "adapter_model")
|
||||||
kwargs["model"].save_pretrained(peft_model_path)
|
kwargs["model"].save_pretrained(
|
||||||
|
peft_model_path, save_safetensors=args.save_safetensors
|
||||||
|
)
|
||||||
|
|
||||||
return control
|
return control
|
||||||
|
|
||||||
@@ -94,3 +117,192 @@ class GPUStatsCallback(
|
|||||||
log_gpu_memory_usage(LOG, "while training", self.cfg.device)
|
log_gpu_memory_usage(LOG, "while training", self.cfg.device)
|
||||||
self.logged = True
|
self.logged = True
|
||||||
return control
|
return control
|
||||||
|
|
||||||
|
|
||||||
|
def bench_eval_callback_factory(trainer, tokenizer):
    """
    Build a TrainerCallback class that scores the model on a multiple-choice benchmark.

    The benchmark dataset is loaded, tokenized and filtered once, when this factory
    is called; the returned callback class closes over it and re-runs the benchmark
    on every `on_evaluate` event, logging the results through `trainer.log`.

    Args:
    - trainer: trainer providing `args` (bench_dataset, bench_split, max_bench_samples),
      `model`, `prediction_step`, `get_bench_dataloader` and `log`.
    - tokenizer: tokenizer used to encode the prompts and the answer letters.

    Returns:
    - The `BenchEvalCallback` class (not an instance).

    Raises:
    - ValueError: if `trainer.args.bench_dataset` is not one of the handled values.
    """
    accuracy = evaluate.load("accuracy")
    # First token id of each answer letter; predictions are the argmax over these ids only.
    abcd_idx = [
        tokenizer(letter, add_special_tokens=False).input_ids[0]
        for letter in ("A", "B", "C", "D", "E", "F", "G")
    ]
    # NOTE(review): metric names always use "eval", even when
    # trainer.args.bench_split selects another split - confirm this is intended.
    bench_split = "eval"

    def transform_bench_subject(example):
        # "Name: sub-topic" -> name="name", subject="sub_topic" ("all" when no ':' part)
        parts = example["subject"].split(":")
        first_part = (
            parts[0].strip().lower().replace("-", "_")
        )  # Lowercase the first part
        second_part = (
            parts[1].strip().replace("-", "_") if len(parts) > 1 else "all"
        )  # Replace hyphens with underscores

        # Return the transformed values
        return {"name": first_part, "subject": second_part}

    if trainer.args.bench_dataset == "mmlu-zs":
        bench_dataset = load_dataset(
            "openaccess-ai-collective/mmlu-evals",
            data_files={
                "eval": "zero_shot_mmlu_val.json",
                "test": "zero_shot_mmlu_test.json",
            },
        )
    # MMLU Five-shot (Eval/Test only)
    elif trainer.args.bench_dataset in ["mmlu", "mmlu-fs"]:
        bench_dataset = load_dataset(
            "openaccess-ai-collective/mmlu-evals",
            data_files={
                "eval": "five_shot_mmlu_val.json",
                "test": "five_shot_mmlu_test.json",
            },
        )
    elif "/" in trainer.args.bench_dataset:
        # "<owner>/<repo>/<path/to/file>": the first two components name the
        # dataset repository, the remainder is the data file inside it.
        bench_ds = trainer.args.bench_dataset
        bench_ds_name = "/".join(bench_ds.split("/", 2)[:2])
        bench_ds_data_file = "/".join(bench_ds.split("/", 2)[2:])
        bench_dataset = load_dataset(
            bench_ds_name,
            data_files={
                "eval": bench_ds_data_file,
            },
        )
        bench_dataset["eval"] = bench_dataset["eval"].map(transform_bench_subject)
    else:
        raise ValueError(
            f"unhandled value `{trainer.args.bench_dataset}` for bench_dataset training args"
        )
    bench_dataset = bench_dataset[trainer.args.bench_split]
    if trainer.args.max_bench_samples is not None:
        # Clamp so a cap larger than the dataset does not index out of range.
        num_samples = min(trainer.args.max_bench_samples, len(bench_dataset))
        bench_dataset = bench_dataset.select(range(num_samples))

    def tokenize_evals(example):
        # Prompt and answer are tokenized separately so the prompt can be masked
        # out of the labels with IGNORE_INDEX.
        source = f"{tokenizer.bos_token}{example['input']}"
        target = f"{example['output']}{tokenizer.eos_token}"

        tokenized_source = tokenizer(
            source,
            max_length=2048,
            truncation=True,
            add_special_tokens=False,
        )
        tokenized_target = tokenizer(
            target,
            max_length=2048,
            truncation=True,
            add_special_tokens=False,
        )
        input_ids = tokenized_source["input_ids"] + tokenized_target["input_ids"]
        labels = [IGNORE_INDEX] * len(tokenized_source["input_ids"]) + tokenized_target[
            "input_ids"
        ]

        return {
            "input_ids": input_ids,
            "labels": labels,
            "subject": example["subject"],
        }

    with zero_first(is_main_process()):
        bench_dataset = bench_dataset.map(tokenize_evals)
        # Keep only rows whose second-to-last label is one of the answer-letter ids.
        bench_dataset = bench_dataset.filter(lambda x: x["labels"][-2] in abcd_idx)

    class BenchEvalCallback(TrainerCallback):
        """
        TrainerCallback that runs the MMLU evals
        """

        def on_evaluate(
            self,
            args: AxolotlTrainingArguments,
            state: TrainerState,  # pylint: disable=unused-argument
            control: TrainerControl,  # pylint: disable=unused-argument
            metrics: Dict[str, float],  # pylint: disable=unused-argument
            **kwargs,  # pylint: disable=unused-argument
        ):
            data_loader = trainer.get_bench_dataloader(
                bench_dataset.remove_columns(["input", "subject", "output", "name"])
            )
            trainer.model.eval()
            preds, refs = [], []
            loss_bench = 0
            for batch in tqdm(data_loader, total=len(data_loader)):
                (loss, logits, labels) = trainer.prediction_step(
                    trainer.model,
                    batch,
                    prediction_loss_only=False,
                )
                # There are two tokens, the output, and eos token.
                for i, logit in enumerate(logits):
                    label_non_zero_id = (batch["labels"][i] != IGNORE_INDEX).nonzero()[
                        0
                    ][0]
                    # The logits one position before the first labeled token
                    # predict that token.
                    logit_abcd = logit[label_non_zero_id - 1][abcd_idx]
                    preds.append(torch.argmax(logit_abcd).item())
                # First of the two labeled tokens per row is the answer letter.
                answer_labels = labels[labels != IGNORE_INDEX].view(-1, 2)[:, 0]
                refs += [
                    abcd_idx.index(label) if label in abcd_idx else -1
                    for label in answer_labels.tolist()
                ]
                loss_bench += loss.item()
            # Extract results by subject.
            sample_names = bench_dataset["name"]
            bench_names: dict = {s: {"refs": [], "preds": []} for s in set(sample_names)}
            for s, p, r in zip(sample_names, preds, refs):  # pylint: disable=invalid-name
                bench_names[s]["preds"].append(p)
                bench_names[s]["refs"].append(r)
            barrier()
            gathered_bench_names: List[Dict] = [{} for _ in range(get_world_size())]

            # Collectives require an initialized process group; in a single-process
            # run the local results already are the complete results.
            if dist.is_available() and dist.is_initialized():
                # Gather results from all GPUs to GPU 0
                loss_bench_ranks = gather_scalar_from_all_ranks(
                    lambda: loss_bench, get_world_size()
                )
                len_data_loader_ranks = gather_scalar_from_all_ranks(
                    lambda: len(data_loader), get_world_size()
                )
                if not is_main_process():
                    dist.gather_object(bench_names, dst=0)
                    return
                dist.gather_object(bench_names, gathered_bench_names, dst=0)
            else:
                loss_bench_ranks = [loss_bench]
                len_data_loader_ranks = [len(data_loader)]
                gathered_bench_names = [bench_names]

            bench_loss = sum(loss_bench_ranks) / sum(len_data_loader_ranks)
            results = {"bench_loss": bench_loss}

            # Combine results from all GPUs
            combined_bench_names: Dict[str, Dict[str, List]] = {}
            for rank_results in gathered_bench_names:
                for name, data in rank_results.items():
                    if name not in combined_bench_names:
                        combined_bench_names[name] = {"refs": [], "preds": []}
                    combined_bench_names[name]["refs"].extend(data["refs"])
                    combined_bench_names[name]["preds"].extend(data["preds"])

            bench_scores = []
            for name, data in combined_bench_names.items():
                bench_score = accuracy.compute(
                    references=data["refs"],
                    predictions=data["preds"],
                )["accuracy"]
                # A NaN accuracy is reported as 0.0.
                if pd.isna(bench_score):
                    bench_score = 0.0
                results[f"bench_{bench_split}_accuracy_{name}"] = bench_score
                bench_scores.append(bench_score)
            results[f"bench_{bench_split}_accuracy"] = np.mean(bench_scores)
            trainer.log(results)

    return BenchEvalCallback
|
||||||
|
|||||||
@@ -62,6 +62,13 @@ def normalize_config(cfg):
|
|||||||
else:
|
else:
|
||||||
torch.backends.cuda.matmul.allow_tf32 = cfg.tf32 or False
|
torch.backends.cuda.matmul.allow_tf32 = cfg.tf32 or False
|
||||||
|
|
||||||
|
if cfg.bf16 or cfg.bfloat16:
|
||||||
|
cfg.torch_dtype = torch.bfloat16
|
||||||
|
elif cfg.load_in_8bit or cfg.fp16 or cfg.float16:
|
||||||
|
cfg.torch_dtype = torch.float16
|
||||||
|
else:
|
||||||
|
cfg.torch_dtype = torch.float32
|
||||||
|
|
||||||
log_gpu_memory_usage(LOG, "baseline", cfg.device)
|
log_gpu_memory_usage(LOG, "baseline", cfg.device)
|
||||||
|
|
||||||
|
|
||||||
@@ -119,6 +126,19 @@ def validate_config(cfg):
|
|||||||
if not cfg.load_in_8bit and cfg.adapter == "lora":
|
if not cfg.load_in_8bit and cfg.adapter == "lora":
|
||||||
LOG.warning("We recommend setting `load_in_8bit: true` for LORA finetuning")
|
LOG.warning("We recommend setting `load_in_8bit: true` for LORA finetuning")
|
||||||
|
|
||||||
|
if cfg.relora_steps:
|
||||||
|
if cfg.adapter not in ("lora", "qlora"):
|
||||||
|
raise ValueError("cfg.adapter must be lora or qlora to use ReLoRA")
|
||||||
|
|
||||||
|
if cfg.fsdp:
|
||||||
|
raise ValueError("fsdp not supported with ReLoRA")
|
||||||
|
|
||||||
|
if cfg.deepspeed:
|
||||||
|
raise ValueError("deepspeed not supported with ReLoRA")
|
||||||
|
|
||||||
|
if cfg.lr_scheduler == "one_cycle":
|
||||||
|
raise ValueError("ReLoRA is not compatible with the one_cycle scheduler")
|
||||||
|
|
||||||
if cfg.trust_remote_code:
|
if cfg.trust_remote_code:
|
||||||
LOG.warning(
|
LOG.warning(
|
||||||
"`trust_remote_code` is set to true. Please make sure that you reviewed the remote code/model."
|
"`trust_remote_code` is set to true. Please make sure that you reviewed the remote code/model."
|
||||||
|
|||||||
@@ -41,6 +41,7 @@ from axolotl.prompters import (
|
|||||||
ShareGPTPrompter,
|
ShareGPTPrompter,
|
||||||
SummarizeTLDRPrompter,
|
SummarizeTLDRPrompter,
|
||||||
)
|
)
|
||||||
|
from axolotl.utils.dict import DictDefault
|
||||||
from axolotl.utils.distributed import is_main_process, zero_first
|
from axolotl.utils.distributed import is_main_process, zero_first
|
||||||
from axolotl.utils.trainer import (
|
from axolotl.utils.trainer import (
|
||||||
calculate_total_num_steps,
|
calculate_total_num_steps,
|
||||||
@@ -53,9 +54,10 @@ DEFAULT_DATASET_PREPARED_PATH = "last_run_prepared"
|
|||||||
|
|
||||||
def prepare_dataset(cfg, tokenizer):
|
def prepare_dataset(cfg, tokenizer):
|
||||||
if not cfg.pretraining_dataset:
|
if not cfg.pretraining_dataset:
|
||||||
train_dataset, eval_dataset = load_prepare_datasets(
|
with zero_first(is_main_process()):
|
||||||
tokenizer, cfg, DEFAULT_DATASET_PREPARED_PATH
|
train_dataset, eval_dataset = load_prepare_datasets(
|
||||||
)
|
tokenizer, cfg, DEFAULT_DATASET_PREPARED_PATH
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
train_dataset = load_pretraining_dataset(
|
train_dataset = load_pretraining_dataset(
|
||||||
cfg.pretraining_dataset,
|
cfg.pretraining_dataset,
|
||||||
@@ -160,8 +162,15 @@ def load_tokenized_prepared_datasets(
|
|||||||
split=None,
|
split=None,
|
||||||
)
|
)
|
||||||
elif local_path.is_file():
|
elif local_path.is_file():
|
||||||
|
ds_type = "json"
|
||||||
|
if d.ds_type:
|
||||||
|
ds_type = d.ds_type
|
||||||
|
elif ".parquet" in d.path:
|
||||||
|
ds_type = "parquet"
|
||||||
|
elif ".arrow" in d.path:
|
||||||
|
ds_type = "arrow"
|
||||||
ds = load_dataset(
|
ds = load_dataset(
|
||||||
"json",
|
ds_type,
|
||||||
name=d.name,
|
name=d.name,
|
||||||
data_files=d.path,
|
data_files=d.path,
|
||||||
streaming=False,
|
streaming=False,
|
||||||
@@ -198,13 +207,27 @@ def load_tokenized_prepared_datasets(
|
|||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
ds = ds.shuffle(seed=seed).shard(num_shards=d.shards, index=0)
|
ds = ds.shuffle(seed=seed).shard(num_shards=d.shards, index=0)
|
||||||
|
|
||||||
|
d_base_type = d_prompt_style = None
|
||||||
d_type = d.type
|
d_type = d.type
|
||||||
d_type_split = d_type.split(":")
|
if isinstance(d_type, str):
|
||||||
d_base_type = d_type_split[0]
|
d_type_split = d_type.split(":")
|
||||||
d_prompt_style = d_type_split[1] if len(d_type_split) > 1 else None
|
d_base_type = d_type_split[0]
|
||||||
|
d_prompt_style = d_type_split[1] if len(d_type_split) > 1 else None
|
||||||
if "train" in ds:
|
if "train" in ds:
|
||||||
ds = ds["train"]
|
ds = ds["train"]
|
||||||
if ds_strategy := load(d.type, tokenizer, cfg):
|
if (
|
||||||
|
"input_ids" in ds.features
|
||||||
|
and "attention_mask" in ds.features
|
||||||
|
and "labels" in ds.features
|
||||||
|
):
|
||||||
|
# dataset is already tokenized, just drop it straight in
|
||||||
|
datasets.append(ds)
|
||||||
|
elif isinstance(d.type, DictDefault):
|
||||||
|
ds_strategy = load("user_defined", tokenizer, cfg, d.type.to_dict())
|
||||||
|
ds_wrapper = TokenizedPromptDataset(ds_strategy, ds)
|
||||||
|
datasets.append(ds_wrapper)
|
||||||
|
elif ds_strategy := load(d.type, tokenizer, cfg, d):
|
||||||
ds_wrapper = TokenizedPromptDataset(ds_strategy, ds)
|
ds_wrapper = TokenizedPromptDataset(ds_strategy, ds)
|
||||||
datasets.append(ds_wrapper)
|
datasets.append(ds_wrapper)
|
||||||
elif d_base_type == "alpaca":
|
elif d_base_type == "alpaca":
|
||||||
|
|||||||
@@ -243,6 +243,18 @@ class MultipackDistributedDataloader:
|
|||||||
len_remaining -= 1
|
len_remaining -= 1
|
||||||
if not len_remaining:
|
if not len_remaining:
|
||||||
return
|
return
|
||||||
|
# yield a no-op for cases where we don't have any data left to pack
|
||||||
|
for i in range(0, len_remaining):
|
||||||
|
yield self.collate_fn(
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"input_ids": [0],
|
||||||
|
"labels": [-100],
|
||||||
|
"attention_mask": [True],
|
||||||
|
"position_ids": [0],
|
||||||
|
}
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
def _len_est(self):
|
def _len_est(self):
|
||||||
lengths_sum = np.sum(self.lengths)
|
lengths_sum = np.sum(self.lengths)
|
||||||
|
|||||||
@@ -1,8 +1,10 @@
|
|||||||
"""
|
"""
|
||||||
utility helpers for distributed checks
|
utility helpers for distributed checks
|
||||||
"""
|
"""
|
||||||
|
import os
|
||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
|
|
||||||
|
import torch
|
||||||
import torch.distributed as dist
|
import torch.distributed as dist
|
||||||
from accelerate import Accelerator
|
from accelerate import Accelerator
|
||||||
|
|
||||||
@@ -43,6 +45,10 @@ def is_main_process():
|
|||||||
return dist.get_rank() == 0
|
return dist.get_rank() == 0
|
||||||
|
|
||||||
|
|
||||||
|
def get_world_size():
    """
    Return the process count read from the WORLD_SIZE environment variable (1 if unset).
    """
    world_size = os.environ.get("WORLD_SIZE", "1")
    return int(world_size)
|
||||||
|
|
||||||
|
|
||||||
@contextmanager
|
@contextmanager
|
||||||
def zero_first(is_main):
|
def zero_first(is_main):
|
||||||
"""
|
"""
|
||||||
@@ -53,3 +59,35 @@ def zero_first(is_main):
|
|||||||
yield
|
yield
|
||||||
if is_main: # then rank 0 waits after it has run the context
|
if is_main: # then rank 0 waits after it has run the context
|
||||||
barrier()
|
barrier()
|
||||||
|
|
||||||
|
|
||||||
|
def gather_scalar_from_all_ranks(fn, world_size=1):  # pylint: disable=invalid-name
    """
    Run a callable 'fn' on all ranks and gather the results on rank 0.

    Args:
    - fn (callable): A function that computes the value. This should not have any side effects.
    - world_size (int, optional): Total number of processes in the current distributed setup.

    Returns:
    - A list of computed values from all ranks if on the gathering rank (rank 0), otherwise None.
      When no process group is initialized, a one-element list holding the local value.
    """
    value_scalar = fn()

    # Collectives need an initialized process group; without one the local
    # value is the whole result.
    if not (dist.is_available() and dist.is_initialized()):
        return [value_scalar]

    # Use this process's current device: the global rank only coincides with a
    # local device index when all ranks live on a single node.
    device = torch.cuda.current_device() if torch.cuda.is_available() else "cpu"
    value_tensor = torch.tensor(value_scalar, device=device).float()

    if not is_main_process():
        dist.gather(value_tensor, dst=0)
        return None

    gathered_tensors = [torch.zeros_like(value_tensor) for _ in range(world_size)]
    dist.gather(value_tensor, gather_list=gathered_tensors, dst=0)

    # Convert tensors back to their original type (int or float)
    gathered_values = []
    for tensor in gathered_tensors:
        if tensor == tensor.int():
            gathered_values.append(int(tensor.item()))
        else:
            gathered_values.append(float(tensor.item()))
    return gathered_values
|
||||||
|
|||||||
@@ -21,7 +21,7 @@ from transformers import ( # noqa: F401
|
|||||||
PreTrainedTokenizerBase,
|
PreTrainedTokenizerBase,
|
||||||
)
|
)
|
||||||
|
|
||||||
from axolotl.prompt_tokenizers import LLAMA_DEFAULT_PAD_TOKEN
|
from axolotl.prompt_tokenizers import LLAMA_DEFAULT_EOS_TOKEN
|
||||||
from axolotl.utils.bench import log_gpu_memory_usage
|
from axolotl.utils.bench import log_gpu_memory_usage
|
||||||
|
|
||||||
LOG = logging.getLogger("axolotl")
|
LOG = logging.getLogger("axolotl")
|
||||||
@@ -54,11 +54,18 @@ def load_tokenizer(cfg):
|
|||||||
**tokenizer_kwargs,
|
**tokenizer_kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
if tokenizer.__class__.__name__ in [
|
if (
|
||||||
"LlamaTokenizer",
|
tokenizer.__class__.__name__
|
||||||
"LlamaTokenizerFast",
|
in [
|
||||||
]:
|
"LlamaTokenizer",
|
||||||
tokenizer.pad_token = LLAMA_DEFAULT_PAD_TOKEN
|
"LlamaTokenizerFast",
|
||||||
|
"CodeLlamaTokenizer",
|
||||||
|
]
|
||||||
|
and hasattr(tokenizer, "pad_token")
|
||||||
|
and not tokenizer.pad_token
|
||||||
|
):
|
||||||
|
# set a pad_token, but use eos_token so we don't add a new token
|
||||||
|
tokenizer.pad_token = LLAMA_DEFAULT_EOS_TOKEN
|
||||||
|
|
||||||
LOG.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}")
|
LOG.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}")
|
||||||
LOG.debug(f"BOS: {tokenizer.bos_token_id} / {tokenizer.bos_token}")
|
LOG.debug(f"BOS: {tokenizer.bos_token_id} / {tokenizer.bos_token}")
|
||||||
@@ -103,7 +110,7 @@ def load_model(
|
|||||||
)
|
)
|
||||||
|
|
||||||
LOG.info("patching with flash attention")
|
LOG.info("patching with flash attention")
|
||||||
replace_llama_attn_with_flash_attn()
|
replace_llama_attn_with_flash_attn(packed=cfg.sample_packing)
|
||||||
elif cfg.is_llama_derived_model and cfg.xformers_attention:
|
elif cfg.is_llama_derived_model and cfg.xformers_attention:
|
||||||
from axolotl.monkeypatch.llama_attn_hijack_xformers import (
|
from axolotl.monkeypatch.llama_attn_hijack_xformers import (
|
||||||
hijack_llama_attention,
|
hijack_llama_attention,
|
||||||
@@ -112,9 +119,7 @@ def load_model(
|
|||||||
LOG.info("patching with xformers attention")
|
LOG.info("patching with xformers attention")
|
||||||
hijack_llama_attention()
|
hijack_llama_attention()
|
||||||
elif cfg.is_llama_derived_model and cfg.sdp_attention:
|
elif cfg.is_llama_derived_model and cfg.sdp_attention:
|
||||||
from axolotl.monkeypatch.llama_attn_hijack_xformers import (
|
from axolotl.monkeypatch.llama_attn_hijack_sdp import hijack_llama_sdp_attention
|
||||||
hijack_llama_sdp_attention,
|
|
||||||
)
|
|
||||||
|
|
||||||
LOG.info("patching with sdp attention")
|
LOG.info("patching with sdp attention")
|
||||||
hijack_llama_sdp_attention()
|
hijack_llama_sdp_attention()
|
||||||
@@ -148,12 +153,6 @@ def load_model(
|
|||||||
LOG.info("patching _expand_mask")
|
LOG.info("patching _expand_mask")
|
||||||
hijack_expand_mask()
|
hijack_expand_mask()
|
||||||
|
|
||||||
if cfg.bf16 or cfg.bfloat16:
|
|
||||||
torch_dtype = torch.bfloat16
|
|
||||||
elif cfg.load_in_8bit or cfg.fp16 or cfg.float16:
|
|
||||||
torch_dtype = torch.float16
|
|
||||||
else:
|
|
||||||
torch_dtype = torch.float32
|
|
||||||
try:
|
try:
|
||||||
if cfg.gptq:
|
if cfg.gptq:
|
||||||
from alpaca_lora_4bit.monkeypatch.peft_tuners_lora_monkey_patch import (
|
from alpaca_lora_4bit.monkeypatch.peft_tuners_lora_monkey_patch import (
|
||||||
@@ -185,7 +184,7 @@ def load_model(
|
|||||||
load_in_4bit=True,
|
load_in_4bit=True,
|
||||||
llm_int8_threshold=6.0,
|
llm_int8_threshold=6.0,
|
||||||
llm_int8_has_fp16_weight=False,
|
llm_int8_has_fp16_weight=False,
|
||||||
bnb_4bit_compute_dtype=torch_dtype,
|
bnb_4bit_compute_dtype=cfg.torch_dtype,
|
||||||
bnb_4bit_use_double_quant=True,
|
bnb_4bit_use_double_quant=True,
|
||||||
bnb_4bit_quant_type="nf4",
|
bnb_4bit_quant_type="nf4",
|
||||||
)
|
)
|
||||||
@@ -244,7 +243,7 @@ def load_model(
|
|||||||
device_map=cfg.device_map,
|
device_map=cfg.device_map,
|
||||||
load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
|
load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
|
||||||
load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None,
|
load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None,
|
||||||
torch_dtype=torch_dtype,
|
torch_dtype=cfg.torch_dtype,
|
||||||
**model_kwargs,
|
**model_kwargs,
|
||||||
)
|
)
|
||||||
# elif model_type == "GPTNeoXForCausalLM" and cfg.flash_attention:
|
# elif model_type == "GPTNeoXForCausalLM" and cfg.flash_attention:
|
||||||
@@ -279,7 +278,7 @@ def load_model(
|
|||||||
device_map=cfg.device_map,
|
device_map=cfg.device_map,
|
||||||
load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
|
load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
|
||||||
load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None,
|
load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None,
|
||||||
torch_dtype=torch_dtype,
|
torch_dtype=cfg.torch_dtype,
|
||||||
trust_remote_code=cfg.trust_remote_code or False,
|
trust_remote_code=cfg.trust_remote_code or False,
|
||||||
**model_kwargs,
|
**model_kwargs,
|
||||||
)
|
)
|
||||||
@@ -310,7 +309,7 @@ def load_model(
|
|||||||
device_map=cfg.device_map,
|
device_map=cfg.device_map,
|
||||||
load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
|
load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
|
||||||
load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None,
|
load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None,
|
||||||
torch_dtype=torch_dtype,
|
torch_dtype=cfg.torch_dtype,
|
||||||
trust_remote_code=cfg.trust_remote_code or False,
|
trust_remote_code=cfg.trust_remote_code or False,
|
||||||
**model_kwargs,
|
**model_kwargs,
|
||||||
)
|
)
|
||||||
@@ -324,7 +323,7 @@ def load_model(
|
|||||||
device_map=cfg.device_map,
|
device_map=cfg.device_map,
|
||||||
load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
|
load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
|
||||||
load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None,
|
load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None,
|
||||||
torch_dtype=torch_dtype,
|
torch_dtype=cfg.torch_dtype,
|
||||||
trust_remote_code=cfg.trust_remote_code or False,
|
trust_remote_code=cfg.trust_remote_code or False,
|
||||||
**model_kwargs,
|
**model_kwargs,
|
||||||
)
|
)
|
||||||
@@ -349,6 +348,15 @@ def load_model(
|
|||||||
if model.device.type == "cuda":
|
if model.device.type == "cuda":
|
||||||
log_gpu_memory_usage(LOG, "after model load", model.device)
|
log_gpu_memory_usage(LOG, "after model load", model.device)
|
||||||
|
|
||||||
|
# make sure these are fp32 per Ramesh et al. (2021)
|
||||||
|
for name, module in model.named_modules():
|
||||||
|
if "norm" in name:
|
||||||
|
module.to(torch.float32)
|
||||||
|
if "lm_head" in name or "embed_tokens" in name:
|
||||||
|
if hasattr(module, "weight"):
|
||||||
|
module.to(torch.float32)
|
||||||
|
|
||||||
|
needs_fa2_dtype = cfg.adapter or cfg.fsdp
|
||||||
if not cfg.gptq and (
|
if not cfg.gptq and (
|
||||||
(cfg.adapter == "lora" and load_in_8bit)
|
(cfg.adapter == "lora" and load_in_8bit)
|
||||||
or (cfg.adapter == "qlora" and cfg.load_in_4bit)
|
or (cfg.adapter == "qlora" and cfg.load_in_4bit)
|
||||||
@@ -357,16 +365,18 @@ def load_model(
|
|||||||
model = prepare_model_for_kbit_training(
|
model = prepare_model_for_kbit_training(
|
||||||
model, use_gradient_checkpointing=cfg.gradient_checkpointing
|
model, use_gradient_checkpointing=cfg.gradient_checkpointing
|
||||||
)
|
)
|
||||||
|
needs_fa2_dtype = True
|
||||||
|
|
||||||
# LlamaRMSNorm layers are in fp32 after kbit_training, so we need to
|
# LlamaRMSNorm layers are in fp32 after kbit_training or full finetune, so we need to
|
||||||
# convert them back to fp16/bf16 for flash-attn compatibility.
|
# convert them back to fp16/bf16 for flash-attn compatibility.
|
||||||
if cfg.flash_attention and cfg.is_llama_derived_model:
|
if needs_fa2_dtype and (cfg.flash_attention and cfg.is_llama_derived_model):
|
||||||
for name, module in model.named_modules():
|
LOG.info("converting modules to %s for flash attention", cfg.torch_dtype)
|
||||||
if "norm" in name:
|
for name, module in model.named_modules():
|
||||||
module.to(torch_dtype)
|
if "norm" in name:
|
||||||
if "lm_head" in name or "embed_tokens" in name:
|
module.to(cfg.torch_dtype)
|
||||||
if hasattr(module, "weight"):
|
if "lm_head" in name or "embed_tokens" in name:
|
||||||
module.to(torch_dtype)
|
if hasattr(module, "weight"):
|
||||||
|
module.to(cfg.torch_dtype)
|
||||||
|
|
||||||
model, lora_config = load_adapter(model, cfg, cfg.adapter)
|
model, lora_config = load_adapter(model, cfg, cfg.adapter)
|
||||||
|
|
||||||
@@ -440,7 +450,7 @@ def load_llama_adapter(model, cfg):
|
|||||||
)
|
)
|
||||||
|
|
||||||
if cfg.lora_model_dir:
|
if cfg.lora_model_dir:
|
||||||
LOG.info("Loading pretained LORA")
|
LOG.debug("Loading pretained PEFT - llama_adapter")
|
||||||
model = PeftModel.from_pretrained(
|
model = PeftModel.from_pretrained(
|
||||||
model,
|
model,
|
||||||
cfg.lora_model_dir,
|
cfg.lora_model_dir,
|
||||||
@@ -502,6 +512,7 @@ def load_lora(model, cfg):
|
|||||||
)
|
)
|
||||||
|
|
||||||
if cfg.lora_model_dir:
|
if cfg.lora_model_dir:
|
||||||
|
LOG.debug("Loading pretained PEFT - LoRA")
|
||||||
model = PeftModel.from_pretrained(
|
model = PeftModel.from_pretrained(
|
||||||
model,
|
model,
|
||||||
cfg.lora_model_dir,
|
cfg.lora_model_dir,
|
||||||
|
|||||||
@@ -10,28 +10,30 @@ from functools import partial
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional, Union
|
from typing import Optional, Union
|
||||||
|
|
||||||
import bitsandbytes as bnb
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import torch.cuda
|
import torch.cuda
|
||||||
import transformers
|
import transformers
|
||||||
from datasets import Dataset, set_caching_enabled
|
from datasets import Dataset, set_caching_enabled
|
||||||
from torch import nn
|
|
||||||
from torch.optim.lr_scheduler import OneCycleLR
|
from torch.optim.lr_scheduler import OneCycleLR
|
||||||
from torch.utils.data import DataLoader, DistributedSampler, RandomSampler
|
from torch.utils.data import (
|
||||||
|
DataLoader,
|
||||||
|
DistributedSampler,
|
||||||
|
RandomSampler,
|
||||||
|
SequentialSampler,
|
||||||
|
)
|
||||||
from transformers import EarlyStoppingCallback, Trainer, TrainingArguments
|
from transformers import EarlyStoppingCallback, Trainer, TrainingArguments
|
||||||
from transformers.trainer_pt_utils import get_parameter_names
|
from transformers.trainer_pt_utils import SequentialDistributedSampler
|
||||||
|
|
||||||
|
from axolotl.monkeypatch.relora import ReLoRACallback, ReLoRAScheduler
|
||||||
from axolotl.utils.callbacks import (
|
from axolotl.utils.callbacks import (
|
||||||
GPUStatsCallback,
|
GPUStatsCallback,
|
||||||
SaveBetterTransformerModelCallback,
|
SaveBetterTransformerModelCallback,
|
||||||
SavePeftModelCallback,
|
SavePeftModelCallback,
|
||||||
|
bench_eval_callback_factory,
|
||||||
)
|
)
|
||||||
from axolotl.utils.collators import DataCollatorForSeq2Seq
|
from axolotl.utils.collators import DataCollatorForSeq2Seq
|
||||||
from axolotl.utils.dataloader import MultipackDistributedDataloader
|
from axolotl.utils.dataloader import MultipackDistributedDataloader
|
||||||
from axolotl.utils.schedulers import (
|
from axolotl.utils.schedulers import get_cosine_schedule_with_quadratic_warmup
|
||||||
InterpolatingLogScheduler,
|
|
||||||
get_cosine_schedule_with_quadratic_warmup,
|
|
||||||
)
|
|
||||||
|
|
||||||
LOG = logging.getLogger("axolotl")
|
LOG = logging.getLogger("axolotl")
|
||||||
|
|
||||||
@@ -124,6 +126,35 @@ class AxolotlTrainingArguments(TrainingArguments):
|
|||||||
default=1,
|
default=1,
|
||||||
metadata={"help": "the multiplier for the max len for packed sequences"},
|
metadata={"help": "the multiplier for the max len for packed sequences"},
|
||||||
)
|
)
|
||||||
|
relora_steps: Optional[int] = field(
|
||||||
|
default=None,
|
||||||
|
metadata={"help": "how often to reset for ReLoRA"},
|
||||||
|
)
|
||||||
|
relora_warmup_steps: Optional[int] = field(
|
||||||
|
default=None,
|
||||||
|
metadata={"help": "how many warmup steps to take after reset for ReLoRA"},
|
||||||
|
)
|
||||||
|
bench_split: Optional[str] = field(
|
||||||
|
default="eval", metadata={"help": "The benchmark split to run on"}
|
||||||
|
)
|
||||||
|
bench_dataset: Optional[str] = field(
|
||||||
|
default="pharaouk/dharma-1/dharma_1_mini.json",
|
||||||
|
metadata={
|
||||||
|
"help": "Benchmark dataset to use: options are `mmlu-zs`, `mmlu-fs`, or the full path to the dataset file"
|
||||||
|
},
|
||||||
|
)
|
||||||
|
do_bench_eval: Optional[bool] = field(
|
||||||
|
default=False, metadata={"help": "Whether to run the Benchmark evaluation."}
|
||||||
|
)
|
||||||
|
max_bench_samples: Optional[int] = field(
|
||||||
|
default=None,
|
||||||
|
metadata={
|
||||||
|
"help": "If set, only evaluates on `max_bench_samples` of the benchmark dataset."
|
||||||
|
},
|
||||||
|
)
|
||||||
|
bench_source_max_len: int = field(
|
||||||
|
default=2048, metadata={"help": "Maximum source sequence length for bench."}
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class AxolotlTrainer(Trainer):
|
class AxolotlTrainer(Trainer):
|
||||||
@@ -133,6 +164,10 @@ class AxolotlTrainer(Trainer):
|
|||||||
|
|
||||||
args = None # type: AxolotlTrainingArguments
|
args = None # type: AxolotlTrainingArguments
|
||||||
|
|
||||||
|
def __init__(self, *args, bench_data_collator=None, **kwargs):
    """
    Store the benchmark collator, then forward all other arguments to the parent initializer.

    Args:
    - bench_data_collator: collate function kept on the instance as `bench_data_collator`; defaults to None.
    - *args, **kwargs: passed unchanged to `super().__init__`.
    """
    # set before the parent initializer runs
    self.bench_data_collator = bench_data_collator
    super().__init__(*args, **kwargs)
|
||||||
|
|
||||||
def create_scheduler(
|
def create_scheduler(
|
||||||
self, num_training_steps: int, optimizer: torch.optim.Optimizer = None
|
self, num_training_steps: int, optimizer: torch.optim.Optimizer = None
|
||||||
):
|
):
|
||||||
@@ -171,6 +206,18 @@ class AxolotlTrainer(Trainer):
|
|||||||
)
|
)
|
||||||
return super()._get_train_sampler()
|
return super()._get_train_sampler()
|
||||||
|
|
||||||
|
def _get_eval_sampler(
    self, eval_dataset: Dataset
) -> Optional[torch.utils.data.Sampler]:
    """
    Pick the sampler for the evaluation dataset.

    With more than one process and sample packing enabled, a
    SequentialDistributedSampler is returned; every other configuration
    defers to the parent class.
    """
    packed_multi_process = self.args.world_size > 1 and self.args.sample_packing
    if not packed_multi_process:
        return super()._get_eval_sampler(eval_dataset)

    return SequentialDistributedSampler(
        eval_dataset,
        num_replicas=self.args.world_size,
        rank=self.args.process_index,
        batch_size=self.args.per_device_eval_batch_size,
    )
|
||||||
|
|
||||||
def get_train_dataloader(self) -> Union[DataLoader, MultipackDistributedDataloader]:
|
def get_train_dataloader(self) -> Union[DataLoader, MultipackDistributedDataloader]:
|
||||||
if self.args.sample_packing:
|
if self.args.sample_packing:
|
||||||
train_sampler = self._get_train_sampler()
|
train_sampler = self._get_train_sampler()
|
||||||
@@ -195,6 +242,7 @@ class AxolotlTrainer(Trainer):
|
|||||||
eval_dataset = (
|
eval_dataset = (
|
||||||
eval_dataset if eval_dataset is not None else self.eval_dataset
|
eval_dataset if eval_dataset is not None else self.eval_dataset
|
||||||
)
|
)
|
||||||
|
|
||||||
eval_sampler = self._get_eval_sampler(eval_dataset)
|
eval_sampler = self._get_eval_sampler(eval_dataset)
|
||||||
return self.accelerator.prepare(
|
return self.accelerator.prepare(
|
||||||
MultipackDistributedDataloader(
|
MultipackDistributedDataloader(
|
||||||
@@ -210,6 +258,31 @@ class AxolotlTrainer(Trainer):
|
|||||||
)
|
)
|
||||||
return super().get_eval_dataloader(eval_dataset)
|
return super().get_eval_dataloader(eval_dataset)
|
||||||
|
|
||||||
|
def _get_bench_sampler(
|
||||||
|
self, bench_dataset: Dataset
|
||||||
|
) -> Optional[torch.utils.data.Sampler]:
|
||||||
|
if self.args.world_size <= 1:
|
||||||
|
return SequentialSampler(bench_dataset)
|
||||||
|
return None
|
||||||
|
|
||||||
|
def get_bench_dataloader(
|
||||||
|
self,
|
||||||
|
bench_dataset: Dataset,
|
||||||
|
) -> Union[DataLoader, MultipackDistributedDataloader]:
|
||||||
|
dataloader_params = {
|
||||||
|
"batch_size": self.args.eval_batch_size,
|
||||||
|
"collate_fn": self.bench_data_collator,
|
||||||
|
"num_workers": self.args.dataloader_num_workers,
|
||||||
|
"pin_memory": self.args.dataloader_pin_memory,
|
||||||
|
}
|
||||||
|
|
||||||
|
if not isinstance(bench_dataset, torch.utils.data.IterableDataset):
|
||||||
|
dataloader_params["sampler"] = self._get_bench_sampler(bench_dataset)
|
||||||
|
dataloader_params["drop_last"] = self.args.dataloader_drop_last
|
||||||
|
|
||||||
|
return DataLoader(bench_dataset, **dataloader_params)
|
||||||
|
# return self.accelerator.prepare(DataLoader(bench_dataset, **dataloader_params))
|
||||||
|
|
||||||
def compute_loss(self, model, inputs, return_outputs=False):
|
def compute_loss(self, model, inputs, return_outputs=False):
|
||||||
# use one's weighted cross entropy loss calc
|
# use one's weighted cross entropy loss calc
|
||||||
# if self.args.sample_packing:
|
# if self.args.sample_packing:
|
||||||
@@ -249,6 +322,39 @@ class OneCycleLRSchedulerTrainer(AxolotlTrainer):
|
|||||||
return self.lr_scheduler
|
return self.lr_scheduler
|
||||||
|
|
||||||
|
|
||||||
|
class ReLoRATrainer(AxolotlTrainer):
|
||||||
|
"""
|
||||||
|
Trainer subclass that wraps the base LR scheduler in a ReLoRAScheduler when relora_steps is set
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
super().__init__(*args, **kwargs)
|
||||||
|
self.lr_scheduler = None
|
||||||
|
|
||||||
|
def create_scheduler(
|
||||||
|
self,
|
||||||
|
num_training_steps: int,
|
||||||
|
optimizer: Optional[torch.optim.Optimizer] = None,
|
||||||
|
):
|
||||||
|
optimizer = self.optimizer if optimizer is None else optimizer
|
||||||
|
lr_scheduler = super().create_scheduler(num_training_steps, optimizer)
|
||||||
|
|
||||||
|
if self.args.relora_steps:
|
||||||
|
warmup_steps = (
|
||||||
|
self.args.relora_warmup_steps if self.args.relora_warmup_steps else 10
|
||||||
|
)
|
||||||
|
self.lr_scheduler = ReLoRAScheduler(
|
||||||
|
optimizer,
|
||||||
|
lr_scheduler,
|
||||||
|
self.args.relora_steps,
|
||||||
|
warmup_steps,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
self.lr_scheduler = lr_scheduler
|
||||||
|
|
||||||
|
return self.lr_scheduler
|
||||||
|
|
||||||
|
|
||||||
def add_position_ids(sample):
|
def add_position_ids(sample):
|
||||||
sample["position_ids"] = torch.arange(len(sample["input_ids"]))
|
sample["position_ids"] = torch.arange(len(sample["input_ids"]))
|
||||||
return sample
|
return sample
|
||||||
@@ -268,15 +374,15 @@ def disable_datasets_caching():
|
|||||||
|
|
||||||
|
|
||||||
def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
|
def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
|
||||||
|
drop_long = partial(drop_long_seq, sequence_len=cfg.sequence_len)
|
||||||
|
train_dataset = train_dataset.filter(drop_long, num_proc=os.cpu_count())
|
||||||
|
if eval_dataset:
|
||||||
|
eval_dataset = eval_dataset.filter(drop_long, num_proc=os.cpu_count())
|
||||||
|
|
||||||
if cfg.sample_packing:
|
if cfg.sample_packing:
|
||||||
drop_long = partial(drop_long_seq, sequence_len=cfg.sequence_len)
|
train_dataset = train_dataset.map(add_position_ids, num_proc=os.cpu_count())
|
||||||
train_dataset = train_dataset.filter(drop_long, num_proc=os.cpu_count()).map(
|
|
||||||
add_position_ids, num_proc=os.cpu_count()
|
|
||||||
)
|
|
||||||
if eval_dataset:
|
if eval_dataset:
|
||||||
eval_dataset = eval_dataset.filter(drop_long, num_proc=os.cpu_count()).map(
|
eval_dataset = eval_dataset.map(add_position_ids, num_proc=os.cpu_count())
|
||||||
add_position_ids, num_proc=os.cpu_count()
|
|
||||||
)
|
|
||||||
return train_dataset, eval_dataset
|
return train_dataset, eval_dataset
|
||||||
|
|
||||||
|
|
||||||
@@ -355,10 +461,16 @@ def calculate_total_num_steps(cfg, train_dataset, tokenizer):
|
|||||||
|
|
||||||
def setup_fsdp_envs(cfg):
|
def setup_fsdp_envs(cfg):
|
||||||
os.environ["ACCELERATE_USE_FSDP"] = "true"
|
os.environ["ACCELERATE_USE_FSDP"] = "true"
|
||||||
|
if cfg.fsdp_config.fsdp_offload_params:
|
||||||
|
os.environ["FSDP_OFFLOAD_PARAMS"] = "true"
|
||||||
if cfg.fsdp_config.fsdp_sync_module_states:
|
if cfg.fsdp_config.fsdp_sync_module_states:
|
||||||
os.environ["FSDP_SYNC_MODULE_STATES"] = "true"
|
os.environ["FSDP_SYNC_MODULE_STATES"] = "true"
|
||||||
if cfg.fsdp_config.fsdp_state_dict_type:
|
if cfg.fsdp_config.fsdp_state_dict_type:
|
||||||
os.environ["FSDP_STATE_DICT_TYPE"] = cfg.fsdp_config.fsdp_state_dict_type
|
os.environ["FSDP_STATE_DICT_TYPE"] = cfg.fsdp_config.fsdp_state_dict_type
|
||||||
|
if cfg.fsdp_config.fsdp_transformer_layer_cls_to_wrap:
|
||||||
|
os.environ[
|
||||||
|
"FSDP_TRANSFORMER_CLS_TO_WRAP"
|
||||||
|
] = cfg.fsdp_config.fsdp_transformer_layer_cls_to_wrap
|
||||||
|
|
||||||
|
|
||||||
def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_steps):
|
def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_steps):
|
||||||
@@ -455,6 +567,18 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_
|
|||||||
# we have an eval set, but no steps defined, use epoch
|
# we have an eval set, but no steps defined, use epoch
|
||||||
training_arguments_kwargs["evaluation_strategy"] = "epoch"
|
training_arguments_kwargs["evaluation_strategy"] = "epoch"
|
||||||
|
|
||||||
|
if cfg.save_strategy:
|
||||||
|
training_arguments_kwargs["save_strategy"] = cfg.save_strategy
|
||||||
|
else:
|
||||||
|
training_arguments_kwargs["save_strategy"] = (
|
||||||
|
"steps" if cfg.save_steps else "epoch"
|
||||||
|
)
|
||||||
|
|
||||||
|
if cfg.do_bench_eval:
|
||||||
|
training_arguments_kwargs["do_bench_eval"] = cfg.do_bench_eval
|
||||||
|
if cfg.bench_dataset:
|
||||||
|
training_arguments_kwargs["bench_dataset"] = cfg.bench_dataset
|
||||||
|
|
||||||
training_args = AxolotlTrainingArguments( # pylint: disable=unexpected-keyword-arg
|
training_args = AxolotlTrainingArguments( # pylint: disable=unexpected-keyword-arg
|
||||||
max_steps=total_num_steps if cfg.max_steps else -1,
|
max_steps=total_num_steps if cfg.max_steps else -1,
|
||||||
max_seq_length=cfg.sequence_len,
|
max_seq_length=cfg.sequence_len,
|
||||||
@@ -466,7 +590,6 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_
|
|||||||
eval_accumulation_steps=cfg.gradient_accumulation_steps,
|
eval_accumulation_steps=cfg.gradient_accumulation_steps,
|
||||||
num_train_epochs=cfg.num_epochs,
|
num_train_epochs=cfg.num_epochs,
|
||||||
learning_rate=cfg.learning_rate,
|
learning_rate=cfg.learning_rate,
|
||||||
save_strategy="steps" if cfg.save_steps else "epoch",
|
|
||||||
save_steps=cfg.save_steps,
|
save_steps=cfg.save_steps,
|
||||||
output_dir=cfg.output_dir,
|
output_dir=cfg.output_dir,
|
||||||
save_total_limit=cfg.save_total_limit if cfg.save_total_limit else 4,
|
save_total_limit=cfg.save_total_limit if cfg.save_total_limit else 4,
|
||||||
@@ -489,6 +612,8 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_
|
|||||||
weight_decay=cfg.weight_decay if cfg.weight_decay is not None else 0.0,
|
weight_decay=cfg.weight_decay if cfg.weight_decay is not None else 0.0,
|
||||||
sample_packing=cfg.sample_packing if cfg.sample_packing else False,
|
sample_packing=cfg.sample_packing if cfg.sample_packing else False,
|
||||||
sample_packing_seq_len_multiplier=cfg.micro_batch_size,
|
sample_packing_seq_len_multiplier=cfg.micro_batch_size,
|
||||||
|
relora_steps=cfg.relora_steps,
|
||||||
|
relora_warmup_steps=cfg.relora_warmup_steps,
|
||||||
**training_arguments_kwargs,
|
**training_arguments_kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -498,69 +623,13 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_
|
|||||||
if Path(cfg.torchdistx_path).exists():
|
if Path(cfg.torchdistx_path).exists():
|
||||||
sys.path.append(cfg.torchdistx_path)
|
sys.path.append(cfg.torchdistx_path)
|
||||||
importlib.import_module("torchdistx")
|
importlib.import_module("torchdistx")
|
||||||
if (
|
|
||||||
cfg.optimizer == "adamw_bnb_8bit"
|
|
||||||
and not cfg.gptq
|
|
||||||
and "deepspeed" not in training_arguments_kwargs
|
|
||||||
and not cfg.fsdp
|
|
||||||
):
|
|
||||||
decay_parameters = get_parameter_names(model, [nn.LayerNorm])
|
|
||||||
decay_parameters = [name for name in decay_parameters if "bias" not in name]
|
|
||||||
optimizer_grouped_parameters = [
|
|
||||||
{
|
|
||||||
"params": [
|
|
||||||
p
|
|
||||||
for n, p in model.named_parameters()
|
|
||||||
if (n in decay_parameters and p.requires_grad)
|
|
||||||
],
|
|
||||||
"weight_decay": training_args.weight_decay,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"params": [
|
|
||||||
p
|
|
||||||
for n, p in model.named_parameters()
|
|
||||||
if (n not in decay_parameters and p.requires_grad)
|
|
||||||
],
|
|
||||||
"weight_decay": 0.0,
|
|
||||||
},
|
|
||||||
]
|
|
||||||
|
|
||||||
optimizer = bnb.optim.Adam8bit(
|
|
||||||
optimizer_grouped_parameters,
|
|
||||||
betas=(training_args.adam_beta1, training_args.adam_beta2),
|
|
||||||
eps=training_args.adam_epsilon,
|
|
||||||
lr=training_args.learning_rate,
|
|
||||||
)
|
|
||||||
|
|
||||||
if cfg.lr_scheduler == "one_cycle":
|
|
||||||
lr_scheduler_kwargs = (
|
|
||||||
cfg.lr_scheduler_kwargs if cfg.lr_scheduler_kwargs else {}
|
|
||||||
)
|
|
||||||
lr_scheduler = OneCycleLR(
|
|
||||||
optimizer,
|
|
||||||
cfg.learning_rate,
|
|
||||||
total_steps=total_num_steps,
|
|
||||||
epochs=cfg.num_epochs,
|
|
||||||
div_factor=cfg.lr_div_factor if cfg.lr_div_factor else 6,
|
|
||||||
**lr_scheduler_kwargs,
|
|
||||||
)
|
|
||||||
elif cfg.lr_scheduler == "log_sweep":
|
|
||||||
lr_scheduler = InterpolatingLogScheduler(
|
|
||||||
optimizer,
|
|
||||||
cfg.warmup_steps,
|
|
||||||
cfg.log_sweep_min_lr if cfg.log_sweep_min_lr else 1e-10,
|
|
||||||
cfg.log_sweep_max_lr if cfg.log_sweep_max_lr else 10,
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
lr_scheduler = transformers.get_cosine_schedule_with_warmup(
|
|
||||||
optimizer,
|
|
||||||
training_args.warmup_steps,
|
|
||||||
total_num_steps,
|
|
||||||
)
|
|
||||||
trainer_kwargs["optimizers"] = (optimizer, lr_scheduler)
|
|
||||||
|
|
||||||
callbacks = []
|
callbacks = []
|
||||||
callbacks.append(GPUStatsCallback(cfg))
|
callbacks.append(GPUStatsCallback(cfg))
|
||||||
|
|
||||||
|
if cfg.relora_steps:
|
||||||
|
callbacks.append(ReLoRACallback(cfg))
|
||||||
|
|
||||||
# TODO on_save callback to sync checkpoints to GCP/AWS in background
|
# TODO on_save callback to sync checkpoints to GCP/AWS in background
|
||||||
if cfg.early_stopping_patience:
|
if cfg.early_stopping_patience:
|
||||||
early_stop_cb = EarlyStoppingCallback(
|
early_stop_cb = EarlyStoppingCallback(
|
||||||
@@ -605,11 +674,11 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_
|
|||||||
num_proc=32,
|
num_proc=32,
|
||||||
)
|
)
|
||||||
|
|
||||||
trainer_cls = (
|
trainer_cls = AxolotlTrainer
|
||||||
OneCycleLRSchedulerTrainer
|
if cfg.lr_scheduler == "one_cycle" and (cfg.fsdp or cfg.adapter == "qlora"):
|
||||||
if cfg.lr_scheduler == "one_cycle" and (cfg.fsdp or cfg.adapter == "qlora")
|
trainer_cls = OneCycleLRSchedulerTrainer
|
||||||
else AxolotlTrainer
|
elif cfg.relora_steps:
|
||||||
)
|
trainer_cls = ReLoRATrainer
|
||||||
trainer = trainer_cls(
|
trainer = trainer_cls(
|
||||||
model=model,
|
model=model,
|
||||||
train_dataset=train_dataset,
|
train_dataset=train_dataset,
|
||||||
@@ -620,8 +689,16 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_
|
|||||||
return_tensors="pt",
|
return_tensors="pt",
|
||||||
**data_collator_kwargs,
|
**data_collator_kwargs,
|
||||||
),
|
),
|
||||||
|
bench_data_collator=transformers.DataCollatorForSeq2Seq(
|
||||||
|
tokenizer,
|
||||||
|
return_tensors="pt",
|
||||||
|
**data_collator_kwargs,
|
||||||
|
),
|
||||||
callbacks=callbacks,
|
callbacks=callbacks,
|
||||||
**trainer_kwargs,
|
**trainer_kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if cfg.do_bench_eval:
|
||||||
|
trainer.add_callback(bench_eval_callback_factory(trainer, tokenizer))
|
||||||
|
|
||||||
return trainer
|
return trainer
|
||||||
|
|||||||
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user