Compare commits
25 Commits
multi-gpu-
...
v0.3.0
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
772cd870d4 | ||
|
|
6c5fbe6223 | ||
|
|
bcbc9597e9 | ||
|
|
6d57f2f0f0 | ||
|
|
20ed4c1f9e | ||
|
|
c5dedb17ad | ||
|
|
b56503d423 | ||
|
|
a94f9cb99e | ||
|
|
c1921c9acb | ||
|
|
0b4cf5bc8c | ||
|
|
78ee2cdab2 | ||
|
|
34c0a86a11 | ||
|
|
5e2d8a42d9 | ||
|
|
e30f1e3cf7 | ||
|
|
343714972b | ||
|
|
245c5c41e2 | ||
|
|
a546ca2813 | ||
|
|
3355706e22 | ||
|
|
daa4faca12 | ||
|
|
fc8766e502 | ||
|
|
72a6fe1c1f | ||
|
|
5fe30b1497 | ||
|
|
44454ae4c4 | ||
|
|
09f154397e | ||
|
|
995557bdf3 |
10
.github/workflows/main.yml
vendored
10
.github/workflows/main.yml
vendored
@@ -23,11 +23,6 @@ jobs:
|
|||||||
python_version: "3.10"
|
python_version: "3.10"
|
||||||
pytorch: 2.0.1
|
pytorch: 2.0.1
|
||||||
axolotl_extras:
|
axolotl_extras:
|
||||||
- cuda: 118
|
|
||||||
cuda_version: 11.8.0
|
|
||||||
python_version: "3.9"
|
|
||||||
pytorch: 2.0.1
|
|
||||||
axolotl_extras: gptq
|
|
||||||
runs-on: self-hosted
|
runs-on: self-hosted
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
@@ -73,11 +68,6 @@ jobs:
|
|||||||
pytorch: 2.0.1
|
pytorch: 2.0.1
|
||||||
axolotl_extras:
|
axolotl_extras:
|
||||||
is_latest: true
|
is_latest: true
|
||||||
- cuda: 118
|
|
||||||
cuda_version: 11.8.0
|
|
||||||
python_version: "3.9"
|
|
||||||
pytorch: 2.0.1
|
|
||||||
axolotl_extras: gptq
|
|
||||||
runs-on: self-hosted
|
runs-on: self-hosted
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
|
|||||||
45
.github/workflows/pypi.yml
vendored
Normal file
45
.github/workflows/pypi.yml
vendored
Normal file
@@ -0,0 +1,45 @@
|
|||||||
|
name: publish pypi
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
tags:
|
||||||
|
- '*'
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
pypi-publish:
|
||||||
|
name: Upload release to PyPI
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
environment:
|
||||||
|
name: pypi
|
||||||
|
url: https://pypi.org/p/axolotl
|
||||||
|
permissions:
|
||||||
|
id-token: write # IMPORTANT: this permission is mandatory for trusted publishing
|
||||||
|
steps:
|
||||||
|
- name: Check out repository code
|
||||||
|
uses: actions/checkout@v3
|
||||||
|
|
||||||
|
- name: Setup Python
|
||||||
|
uses: actions/setup-python@v4
|
||||||
|
with:
|
||||||
|
python-version: "3.10"
|
||||||
|
|
||||||
|
- name: Install dependencies
|
||||||
|
run: |
|
||||||
|
pip3 install wheel
|
||||||
|
pip3 install -e .
|
||||||
|
pip3 install -r requirements-tests.txt
|
||||||
|
|
||||||
|
- name: Extract tag name
|
||||||
|
id: tag
|
||||||
|
run: echo ::set-output name=TAG_NAME::$(echo $GITHUB_REF | cut -d / -f 3)
|
||||||
|
|
||||||
|
- name: Update version in setup.py
|
||||||
|
run: >-
|
||||||
|
sed -i -E 's/version="([0-9.]+)",/version="${{ steps.tag.outputs.TAG_NAME }}",/g' setup.py
|
||||||
|
|
||||||
|
- name: Build a binary wheel
|
||||||
|
run: >-
|
||||||
|
python setup.py sdist bdist_wheel
|
||||||
|
|
||||||
|
- name: Publish package distributions to PyPI
|
||||||
|
uses: pypa/gh-action-pypi-publish@release/v1
|
||||||
4
.github/workflows/tests.yml
vendored
4
.github/workflows/tests.yml
vendored
@@ -24,8 +24,8 @@ jobs:
|
|||||||
|
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
run: |
|
run: |
|
||||||
pip install -e .[peft]
|
pip3 install -e .
|
||||||
pip install -r requirements-tests.txt
|
pip3 install -r requirements-tests.txt
|
||||||
|
|
||||||
- name: Run tests
|
- name: Run tests
|
||||||
run: |
|
run: |
|
||||||
|
|||||||
48
README.md
48
README.md
@@ -90,8 +90,7 @@ accelerate launch scripts/finetune.py examples/openllama-3b/lora.yml \
|
|||||||
```bash
|
```bash
|
||||||
docker run --gpus '"all"' --rm -it winglian/axolotl:main-py3.10-cu118-2.0.1
|
docker run --gpus '"all"' --rm -it winglian/axolotl:main-py3.10-cu118-2.0.1
|
||||||
```
|
```
|
||||||
- `winglian/axolotl-runpod:main-py3.10-cu118-2.0.1`: for runpod
|
- `winglian/axolotl-runpod:main-latest`: for runpod or use this [direct link](https://runpod.io/gsc?template=v2ickqhz9s&ref=6i7fkpdz)
|
||||||
- `winglian/axolotl-runpod:main-py3.9-cu118-2.0.1-gptq`: for gptq
|
|
||||||
|
|
||||||
Or run on the current files for development:
|
Or run on the current files for development:
|
||||||
|
|
||||||
@@ -104,19 +103,9 @@ accelerate launch scripts/finetune.py examples/openllama-3b/lora.yml \
|
|||||||
|
|
||||||
2. Install pytorch stable https://pytorch.org/get-started/locally/
|
2. Install pytorch stable https://pytorch.org/get-started/locally/
|
||||||
|
|
||||||
3. Install python dependencies with ONE of the following:
|
3. Install axolotl along with python dependencies
|
||||||
- Recommended, supports QLoRA, NO gptq/int4 support
|
|
||||||
```bash
|
```bash
|
||||||
pip3 install -e .
|
pip3 install -e .[flash-attn]
|
||||||
pip3 install -U git+https://github.com/huggingface/peft.git
|
|
||||||
```
|
|
||||||
- gptq/int4 support, NO QLoRA
|
|
||||||
```bash
|
|
||||||
pip3 install -e .[gptq]
|
|
||||||
```
|
|
||||||
- same as above but not recommended
|
|
||||||
```bash
|
|
||||||
pip3 install -e .[gptq_triton]
|
|
||||||
```
|
```
|
||||||
|
|
||||||
- LambdaLabs
|
- LambdaLabs
|
||||||
@@ -151,10 +140,9 @@ accelerate launch scripts/finetune.py examples/openllama-3b/lora.yml \
|
|||||||
git clone https://github.com/OpenAccess-AI-Collective/axolotl
|
git clone https://github.com/OpenAccess-AI-Collective/axolotl
|
||||||
cd axolotl
|
cd axolotl
|
||||||
|
|
||||||
pip3 install -e . # change depend on needs
|
pip3 install -e .
|
||||||
pip3 install protobuf==3.20.3
|
pip3 install protobuf==3.20.3
|
||||||
pip3 install -U --ignore-installed requests Pillow psutil scipy
|
pip3 install -U --ignore-installed requests Pillow psutil scipy
|
||||||
pip3 install git+https://github.com/huggingface/peft.git # not for gptq
|
|
||||||
```
|
```
|
||||||
|
|
||||||
5. Set path
|
5. Set path
|
||||||
@@ -572,6 +560,30 @@ log_sweep_min_lr:
|
|||||||
log_sweep_max_lr:
|
log_sweep_max_lr:
|
||||||
|
|
||||||
# specify optimizer
|
# specify optimizer
|
||||||
|
# Valid values are driven by the Transformers OptimizerNames class, see:
|
||||||
|
# https://github.com/huggingface/transformers/blob/95b374952dc27d8511541d6f5a4e22c9ec11fb24/src/transformers/training_args.py#L134
|
||||||
|
#
|
||||||
|
# Note that not all optimizers may be available in your environment, ex: 'adamw_anyprecision' is part of
|
||||||
|
# torchdistx, 'adamw_bnb_8bit' is part of bnb.optim.Adam8bit, etc. When in doubt, it is recommended to start with the optimizer used
|
||||||
|
# in the examples/ for your model and fine-tuning use case.
|
||||||
|
#
|
||||||
|
# Valid values for 'optimizer' include:
|
||||||
|
# - adamw_hf
|
||||||
|
# - adamw_torch
|
||||||
|
# - adamw_torch_fused
|
||||||
|
# - adamw_torch_xla
|
||||||
|
# - adamw_apex_fused
|
||||||
|
# - adafactor
|
||||||
|
# - adamw_anyprecision
|
||||||
|
# - sgd
|
||||||
|
# - adagrad
|
||||||
|
# - adamw_bnb_8bit
|
||||||
|
# - lion_8bit
|
||||||
|
# - lion_32bit
|
||||||
|
# - paged_adamw_32bit
|
||||||
|
# - paged_adamw_8bit
|
||||||
|
# - paged_lion_32bit
|
||||||
|
# - paged_lion_8bit
|
||||||
optimizer:
|
optimizer:
|
||||||
# specify weight decay
|
# specify weight decay
|
||||||
weight_decay:
|
weight_decay:
|
||||||
@@ -752,6 +764,10 @@ Try to turn off xformers.
|
|||||||
|
|
||||||
It's safe to ignore it.
|
It's safe to ignore it.
|
||||||
|
|
||||||
|
> NCCL Timeouts during training
|
||||||
|
|
||||||
|
See the [NCCL](docs/nccl.md) guide.
|
||||||
|
|
||||||
## Need help? 🙋♂️
|
## Need help? 🙋♂️
|
||||||
|
|
||||||
Join our [Discord server](https://discord.gg/HhrNrHJPRb) where we can help you
|
Join our [Discord server](https://discord.gg/HhrNrHJPRb) where we can help you
|
||||||
|
|||||||
@@ -9,6 +9,11 @@ services:
|
|||||||
- ~/.cache/huggingface/:/root/.cache/huggingface/
|
- ~/.cache/huggingface/:/root/.cache/huggingface/
|
||||||
# set environment variables
|
# set environment variables
|
||||||
environment:
|
environment:
|
||||||
|
# Set environment variables
|
||||||
|
- GIT_AUTHOR_NAME=${GIT_AUTHOR_NAME}
|
||||||
|
- GIT_AUTHOR_EMAIL=${GIT_AUTHOR_EMAIL}
|
||||||
|
- GIT_COMMITTER_NAME=${GIT_COMMITTER_NAME}
|
||||||
|
- GIT_COMMITTER_EMAIL=${GIT_COMMITTER_EMAIL}
|
||||||
- WANDB_API_KEY=${WANDB_API_KEY}
|
- WANDB_API_KEY=${WANDB_API_KEY}
|
||||||
deploy:
|
deploy:
|
||||||
resources:
|
resources:
|
||||||
|
|||||||
@@ -11,7 +11,6 @@ RUN apt-get update && \
|
|||||||
|
|
||||||
WORKDIR /workspace
|
WORKDIR /workspace
|
||||||
|
|
||||||
RUN pip3 install "peft @ git+https://github.com/huggingface/peft.git@main"
|
|
||||||
RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git
|
RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git
|
||||||
# If AXOLOTL_EXTRAS is set, append it in brackets
|
# If AXOLOTL_EXTRAS is set, append it in brackets
|
||||||
RUN cd axolotl && \
|
RUN cd axolotl && \
|
||||||
|
|||||||
46
docs/nccl.md
Normal file
46
docs/nccl.md
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
# NCCL
|
||||||
|
|
||||||
|
NVIDIA NCCL is a library to facilitate and optimize multi-GPU communication operations, such as broadcast, all-gather, reduce, all-reduce, etc. Broadly, NCCL configuration is highly environment-specific and is controlled via several [environment variables](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html). A common NCCL-related problem occurs when a long-running operation times out, causing the training process to abort:
|
||||||
|
|
||||||
|
```text
|
||||||
|
Watchdog caught collective operation timeout: WorkNCCL(SeqNum=42, OpType=ALLGATHER, Timeout(ms)=1800000) ran for 1806948 milliseconds before timing out.
|
||||||
|
```
|
||||||
|
|
||||||
|
Often, this timeout will happen after 30 minutes (the default setting) and is accompanied by below-average power consumption with near 100% GPU utilization before the error is raised. NVIDIA recommends [disabling PCI access control services (ACS)](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/troubleshooting.html#pci-access-control-services-acs) as a possible solution if this is available to you.
|
||||||
|
|
||||||
|
Forcing cross-GPU communication via [NVLink](https://en.wikipedia.org/wiki/NVLink) may help without increasing timeouts. To verify that your configuration is leveraging NVLink, run the following command:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
nvidia-smi nvlink --status
|
||||||
|
```
|
||||||
|
|
||||||
|
To force NCCL to use NVLink, simply set this in the environment:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
export NCCL_P2P_LEVEL=NVL
|
||||||
|
```
|
||||||
|
|
||||||
|
If NVLink is not available in your environment, there are other options for ``NCCL_P2P_LEVEL`` in the table below:
|
||||||
|
|
||||||
|
| NCCL_P2P_LEVEL | Description |
|
||||||
|
| -------------- | ----------- |
|
||||||
|
| PIX | P2P data transfers through no more than a single PCIe bridge. Faster data transfer rates than paths involving multiple bridges, but slower than direct GPU-to-GPU communication. |
|
||||||
|
| PXB | P2P data transfers through multiple PCIe bridges but not going through the PCIe Host Bridge; this path involves a complex routing process, potentially incurring a moderate level of latency. |
|
||||||
|
| PHB | P2P data transfers occur over PCIe and through a PCIe Host Bridge, typically involving the CPU, which can facilitate direct memory access but might introduce additional latency compared to more direct paths (e.g. PIX, NVL). |
|
||||||
|
|
||||||
|
To validate that acceptable data transfer speeds exist for your training job, running [NCCL Tests](https://github.com/NVIDIA/nccl-tests/blob/master/README.md) can help pinpoint bottlenecks, for example:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
./build/all_reduce_perf -b 8 -e 128M -f 2 -g 3
|
||||||
|
```
|
||||||
|
|
||||||
|
It can be useful when debugging NCCL communication timeouts to activate additional logging in both PyTorch and NCCL:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
export NCCL_DEBUG=INFO
|
||||||
|
export NCCL_DEBUG_SUBSYS=ALL
|
||||||
|
export TORCH_DISTRIBUTED_DEBUG=INFO
|
||||||
|
export TORCHELASTIC_ERROR_FILE=/PATH/TO/torcherror.log
|
||||||
|
```
|
||||||
|
|
||||||
|
Finally, if you believe your training job needs more time, you can increase the timeout past 30 minutes by setting the ``ddp_timeout`` value in the Axolotl configuration. See [PyTorch init_process_group](https://pytorch.org/docs/stable/distributed.html#torch.distributed.init_process_group) for documentation on this value.
|
||||||
@@ -17,6 +17,7 @@ output_dir: ./lora-out
|
|||||||
|
|
||||||
sequence_len: 100000
|
sequence_len: 100000
|
||||||
sample_packing: true
|
sample_packing: true
|
||||||
|
pad_to_sequence_len: true
|
||||||
|
|
||||||
adapter: lora
|
adapter: lora
|
||||||
lora_model_dir:
|
lora_model_dir:
|
||||||
|
|||||||
@@ -20,6 +20,7 @@ lora_model_dir:
|
|||||||
|
|
||||||
sequence_len: 100000
|
sequence_len: 100000
|
||||||
sample_packing: true
|
sample_packing: true
|
||||||
|
pad_to_sequence_len: true
|
||||||
|
|
||||||
lora_r: 32
|
lora_r: 32
|
||||||
lora_alpha: 16
|
lora_alpha: 16
|
||||||
|
|||||||
@@ -17,6 +17,7 @@ output_dir: ./lora-out
|
|||||||
|
|
||||||
sequence_len: 100000
|
sequence_len: 100000
|
||||||
sample_packing: true
|
sample_packing: true
|
||||||
|
pad_to_sequence_len: true
|
||||||
|
|
||||||
adapter: lora
|
adapter: lora
|
||||||
lora_model_dir:
|
lora_model_dir:
|
||||||
|
|||||||
@@ -20,6 +20,7 @@ lora_model_dir:
|
|||||||
|
|
||||||
sequence_len: 100000
|
sequence_len: 100000
|
||||||
sample_packing: true
|
sample_packing: true
|
||||||
|
pad_to_sequence_len: true
|
||||||
|
|
||||||
lora_r: 32
|
lora_r: 32
|
||||||
lora_alpha: 16
|
lora_alpha: 16
|
||||||
|
|||||||
@@ -17,6 +17,7 @@ output_dir: ./lora-out
|
|||||||
|
|
||||||
sequence_len: 100000
|
sequence_len: 100000
|
||||||
sample_packing: true
|
sample_packing: true
|
||||||
|
pad_to_sequence_len: true
|
||||||
|
|
||||||
adapter: lora
|
adapter: lora
|
||||||
lora_model_dir:
|
lora_model_dir:
|
||||||
|
|||||||
@@ -20,6 +20,7 @@ lora_model_dir:
|
|||||||
|
|
||||||
sequence_len: 100000
|
sequence_len: 100000
|
||||||
sample_packing: true
|
sample_packing: true
|
||||||
|
pad_to_sequence_len: true
|
||||||
|
|
||||||
lora_r: 32
|
lora_r: 32
|
||||||
lora_alpha: 16
|
lora_alpha: 16
|
||||||
|
|||||||
@@ -1,8 +0,0 @@
|
|||||||
# LLaMa 7B using LoRA
|
|
||||||
|
|
||||||
This is a good place to start for beginners. This will run on an NVIDIA RTX4090 with no other changes needed.
|
|
||||||
|
|
||||||
```shell
|
|
||||||
accelerate launch scripts/finetune.py examples/gptq-lora-7b/config.yml
|
|
||||||
|
|
||||||
```
|
|
||||||
@@ -1,63 +0,0 @@
|
|||||||
base_model: Neko-Institute-of-Science/LLaMA-7B-4bit-128g
|
|
||||||
base_model_config: Neko-Institute-of-Science/LLaMA-7B-4bit-128g
|
|
||||||
model_type: LlamaForCausalLM
|
|
||||||
tokenizer_type: LlamaTokenizer
|
|
||||||
trust_remote_code:
|
|
||||||
load_in_8bit: true
|
|
||||||
gptq: true
|
|
||||||
datasets:
|
|
||||||
- path: vicgalle/alpaca-gpt4
|
|
||||||
type: alpaca
|
|
||||||
dataset_prepared_path: last_run_prepared
|
|
||||||
val_set_size: 0.02
|
|
||||||
adapter:
|
|
||||||
lora_model_dir:
|
|
||||||
sequence_len: 2048
|
|
||||||
max_packed_sequence_len:
|
|
||||||
lora_r: 8
|
|
||||||
lora_alpha: 16
|
|
||||||
lora_dropout: 0.05
|
|
||||||
lora_target_modules:
|
|
||||||
- q_proj
|
|
||||||
- v_proj
|
|
||||||
lora_fan_in_fan_out: false
|
|
||||||
wandb_project: llama-7b-lora-int4
|
|
||||||
wandb_entity:
|
|
||||||
wandb_watch:
|
|
||||||
wandb_run_id:
|
|
||||||
wandb_log_model:
|
|
||||||
output_dir: ./llama-7b-lora-int4
|
|
||||||
gradient_accumulation_steps: 1
|
|
||||||
micro_batch_size: 1
|
|
||||||
num_epochs: 3
|
|
||||||
optimizer: adamw_bnb_8bit
|
|
||||||
torchdistx_path:
|
|
||||||
lr_scheduler: cosine
|
|
||||||
learning_rate: 0.0000002
|
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
fp16: true
|
|
||||||
bf16: false
|
|
||||||
tf32: true
|
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
|
||||||
local_rank:
|
|
||||||
logging_steps: 5
|
|
||||||
xformers_attention:
|
|
||||||
flash_attention:
|
|
||||||
gradient_checkpointing: true
|
|
||||||
gptq_groupsize: 128
|
|
||||||
gptq_model_v1: false
|
|
||||||
warmup_steps: 20
|
|
||||||
eval_steps: 110
|
|
||||||
save_steps: 660
|
|
||||||
debug:
|
|
||||||
deepspeed:
|
|
||||||
weight_decay: 0.0001
|
|
||||||
fsdp:
|
|
||||||
fsdp_config:
|
|
||||||
tokens:
|
|
||||||
pad_token: "<pad>"
|
|
||||||
bos_token: "<s>"
|
|
||||||
eos_token: "</s>"
|
|
||||||
unk_token: "<unk>"
|
|
||||||
76
examples/llama-2/gptq-lora.yml
Normal file
76
examples/llama-2/gptq-lora.yml
Normal file
@@ -0,0 +1,76 @@
|
|||||||
|
base_model: TheBloke/Llama-2-7B-GPTQ
|
||||||
|
base_model_config: TheBloke/Llama-2-7B-GPTQ
|
||||||
|
is_llama_derived_model: false
|
||||||
|
gptq: true
|
||||||
|
gptq_bits: 4
|
||||||
|
model_type: AutoModelForCausalLM
|
||||||
|
tokenizer_type: LlamaTokenizer
|
||||||
|
tokenizer_use_fast: true
|
||||||
|
tokenizer_legacy: true
|
||||||
|
load_in_8bit: false
|
||||||
|
load_in_4bit: false
|
||||||
|
strict: false
|
||||||
|
push_dataset_to_hub:
|
||||||
|
hf_use_auth_token: true
|
||||||
|
datasets:
|
||||||
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
|
type: alpaca
|
||||||
|
dataset_prepared_path: last_run_prepared
|
||||||
|
val_set_size: 0.01
|
||||||
|
adapter: lora
|
||||||
|
lora_model_dir:
|
||||||
|
sequence_len: 4096
|
||||||
|
sample_packing:
|
||||||
|
lora_r: 8
|
||||||
|
lora_alpha: 32
|
||||||
|
lora_dropout: 0.05
|
||||||
|
lora_target_modules:
|
||||||
|
- k_proj
|
||||||
|
- o_proj
|
||||||
|
- q_proj
|
||||||
|
- v_proj
|
||||||
|
lora_target_linear:
|
||||||
|
lora_fan_in_fan_out:
|
||||||
|
wandb_project:
|
||||||
|
wandb_watch:
|
||||||
|
wandb_run_id:
|
||||||
|
wandb_log_model:
|
||||||
|
output_dir: ./model-out
|
||||||
|
gradient_accumulation_steps: 1
|
||||||
|
micro_batch_size: 1
|
||||||
|
num_epochs: 3
|
||||||
|
optimizer: adamw_torch
|
||||||
|
adam_beta2: 0.95
|
||||||
|
adam_eps: 0.00001
|
||||||
|
max_grad_norm: 1.0
|
||||||
|
torchdistx_path:
|
||||||
|
lr_scheduler: cosine
|
||||||
|
lr_quadratic_warmup: true
|
||||||
|
learning_rate: 0.000017
|
||||||
|
train_on_inputs: false
|
||||||
|
group_by_length: false
|
||||||
|
bf16: false
|
||||||
|
fp16: false
|
||||||
|
float16: true
|
||||||
|
tf32: true
|
||||||
|
gradient_checkpointing: true
|
||||||
|
early_stopping_patience:
|
||||||
|
resume_from_checkpoint:
|
||||||
|
local_rank:
|
||||||
|
logging_steps: 1
|
||||||
|
xformers_attention:
|
||||||
|
flash_attention:
|
||||||
|
sdp_attention:
|
||||||
|
flash_optimum:
|
||||||
|
gptq_groupsize:
|
||||||
|
gptq_model_v1:
|
||||||
|
warmup_steps: 100
|
||||||
|
eval_steps:
|
||||||
|
save_steps:
|
||||||
|
debug:
|
||||||
|
deepspeed:
|
||||||
|
weight_decay: 0.1
|
||||||
|
special_tokens:
|
||||||
|
bos_token: "<s>"
|
||||||
|
eos_token: "</s>"
|
||||||
|
unk_token: "<unk>"
|
||||||
@@ -17,6 +17,7 @@ output_dir: ./lora-out
|
|||||||
|
|
||||||
sequence_len: 4096
|
sequence_len: 4096
|
||||||
sample_packing: true
|
sample_packing: true
|
||||||
|
pad_to_sequence_len: true
|
||||||
|
|
||||||
adapter: lora
|
adapter: lora
|
||||||
lora_model_dir:
|
lora_model_dir:
|
||||||
|
|||||||
@@ -20,6 +20,7 @@ lora_model_dir:
|
|||||||
|
|
||||||
sequence_len: 4096
|
sequence_len: 4096
|
||||||
sample_packing: true
|
sample_packing: true
|
||||||
|
pad_to_sequence_len: true
|
||||||
|
|
||||||
lora_r: 32
|
lora_r: 32
|
||||||
lora_alpha: 16
|
lora_alpha: 16
|
||||||
|
|||||||
@@ -20,6 +20,7 @@ lora_model_dir:
|
|||||||
|
|
||||||
sequence_len: 4096
|
sequence_len: 4096
|
||||||
sample_packing: true
|
sample_packing: true
|
||||||
|
pad_to_sequence_len: true
|
||||||
|
|
||||||
lora_r: 8
|
lora_r: 8
|
||||||
lora_alpha: 16
|
lora_alpha: 16
|
||||||
|
|||||||
@@ -1,14 +1,18 @@
|
|||||||
|
--extra-index-url https://download.pytorch.org/whl/cu118
|
||||||
|
--extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
|
||||||
|
torch==2.0.1
|
||||||
|
auto-gptq
|
||||||
packaging
|
packaging
|
||||||
peft @ git+https://github.com/huggingface/peft.git
|
peft @ git+https://github.com/huggingface/peft.git
|
||||||
transformers @ git+https://github.com/huggingface/transformers.git
|
transformers @ git+https://github.com/huggingface/transformers.git
|
||||||
bitsandbytes>=0.41.1
|
bitsandbytes>=0.41.1
|
||||||
accelerate @ git+https://github.com/huggingface/accelerate@2a289f6108e77a77a4efffb3f6316bc98538413b
|
accelerate @ git+https://github.com/huggingface/accelerate
|
||||||
addict
|
addict
|
||||||
evaluate
|
evaluate
|
||||||
fire
|
fire
|
||||||
PyYAML>=6.0
|
PyYAML>=6.0
|
||||||
datasets
|
datasets
|
||||||
flash-attn>=2.0.8
|
flash-attn>=2.2.1
|
||||||
sentencepiece
|
sentencepiece
|
||||||
wandb
|
wandb
|
||||||
einops
|
einops
|
||||||
|
|||||||
@@ -24,7 +24,7 @@ from axolotl.utils.config import normalize_config, validate_config
|
|||||||
from axolotl.utils.data import prepare_dataset
|
from axolotl.utils.data import prepare_dataset
|
||||||
from axolotl.utils.dict import DictDefault
|
from axolotl.utils.dict import DictDefault
|
||||||
from axolotl.utils.distributed import is_main_process
|
from axolotl.utils.distributed import is_main_process
|
||||||
from axolotl.utils.models import load_model_config, load_tokenizer
|
from axolotl.utils.models import load_tokenizer
|
||||||
from axolotl.utils.tokenization import check_dataset_labels
|
from axolotl.utils.tokenization import check_dataset_labels
|
||||||
from axolotl.utils.wandb import setup_wandb_env_vars
|
from axolotl.utils.wandb import setup_wandb_env_vars
|
||||||
|
|
||||||
@@ -216,15 +216,6 @@ def load_cfg(config: Path = Path("examples/"), **kwargs):
|
|||||||
else:
|
else:
|
||||||
cfg[k] = kwargs[k]
|
cfg[k] = kwargs[k]
|
||||||
|
|
||||||
model_config = load_model_config(cfg)
|
|
||||||
|
|
||||||
# figure out if the model is llama
|
|
||||||
cfg.is_llama_derived_model = (
|
|
||||||
(hasattr(model_config, "model_type") and model_config.model_type == "llama")
|
|
||||||
or cfg.is_llama_derived_model
|
|
||||||
or "llama" in cfg.base_model
|
|
||||||
or (cfg.model_type and "llama" in cfg.model_type.lower())
|
|
||||||
)
|
|
||||||
validate_config(cfg)
|
validate_config(cfg)
|
||||||
|
|
||||||
normalize_config(cfg)
|
normalize_config(cfg)
|
||||||
|
|||||||
45
setup.py
45
setup.py
@@ -2,38 +2,41 @@
|
|||||||
|
|
||||||
from setuptools import find_packages, setup
|
from setuptools import find_packages, setup
|
||||||
|
|
||||||
install_requires = []
|
|
||||||
with open("./requirements.txt", encoding="utf-8") as requirements_file:
|
def parse_requirements():
|
||||||
# don't include peft yet until we check the int4
|
_install_requires = []
|
||||||
# need to manually install peft for now...
|
_dependency_links = []
|
||||||
reqs = [r.strip() for r in requirements_file.readlines() if "peft" not in r]
|
with open("./requirements.txt", encoding="utf-8") as requirements_file:
|
||||||
reqs = [r for r in reqs if "flash-attn" not in r]
|
lines = [r.strip() for r in requirements_file.readlines()]
|
||||||
reqs = [r for r in reqs if r and r[0] != "#"]
|
for line in lines:
|
||||||
for r in reqs:
|
if line.startswith("--extra-index-url"):
|
||||||
install_requires.append(r)
|
# Handle custom index URLs
|
||||||
|
_, url = line.split()
|
||||||
|
_dependency_links.append(url)
|
||||||
|
elif "flash-attn" not in line and line and line[0] != "#":
|
||||||
|
# Handle standard packages
|
||||||
|
_install_requires.append(line)
|
||||||
|
return _install_requires, _dependency_links
|
||||||
|
|
||||||
|
|
||||||
|
install_requires, dependency_links = parse_requirements()
|
||||||
|
|
||||||
|
|
||||||
setup(
|
setup(
|
||||||
name="axolotl",
|
name="axolotl",
|
||||||
version="0.1",
|
version="0.3.0",
|
||||||
description="You know you're going to axolotl questions",
|
description="LLM Trainer",
|
||||||
|
long_description="Axolotl is a tool designed to streamline the fine-tuning of various AI models, offering support for multiple configurations and architectures.",
|
||||||
package_dir={"": "src"},
|
package_dir={"": "src"},
|
||||||
packages=find_packages(),
|
packages=find_packages(),
|
||||||
install_requires=install_requires,
|
install_requires=install_requires,
|
||||||
|
dependency_links=dependency_links,
|
||||||
extras_require={
|
extras_require={
|
||||||
"gptq": [
|
|
||||||
"alpaca_lora_4bit @ git+https://github.com/winglian/alpaca_lora_4bit.git@setup_pip",
|
|
||||||
],
|
|
||||||
"gptq_triton": [
|
|
||||||
"alpaca_lora_4bit[triton] @ git+https://github.com/winglian/alpaca_lora_4bit.git@setup_pip",
|
|
||||||
],
|
|
||||||
"flash-attn": [
|
"flash-attn": [
|
||||||
"flash-attn==2.0.8",
|
"flash-attn>=2.2.1",
|
||||||
],
|
],
|
||||||
"extras": [
|
"extras": [
|
||||||
"deepspeed",
|
"deepspeed",
|
||||||
],
|
],
|
||||||
"peft": [
|
|
||||||
"peft @ git+https://github.com/huggingface/peft.git",
|
|
||||||
],
|
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -23,6 +23,7 @@ class ColorfulFormatter(Formatter):
|
|||||||
}
|
}
|
||||||
|
|
||||||
def format(self, record):
|
def format(self, record):
|
||||||
|
record.rank = int(os.getenv("LOCAL_RANK", "0"))
|
||||||
log_message = super().format(record)
|
log_message = super().format(record)
|
||||||
return self.COLORS.get(record.levelname, "") + log_message + Fore.RESET
|
return self.COLORS.get(record.levelname, "") + log_message + Fore.RESET
|
||||||
|
|
||||||
@@ -35,7 +36,7 @@ DEFAULT_LOGGING_CONFIG: Dict[str, Any] = {
|
|||||||
},
|
},
|
||||||
"colorful": {
|
"colorful": {
|
||||||
"()": ColorfulFormatter,
|
"()": ColorfulFormatter,
|
||||||
"format": "[%(asctime)s] [%(levelname)s] [%(name)s.%(funcName)s:%(lineno)d] [PID:%(process)d] %(message)s",
|
"format": "[%(asctime)s] [%(levelname)s] [%(name)s.%(funcName)s:%(lineno)d] [PID:%(process)d] [RANK:%(rank)d] %(message)s",
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
"filters": {},
|
"filters": {},
|
||||||
|
|||||||
@@ -2,7 +2,9 @@
|
|||||||
|
|
||||||
# copied from https://github.com/lm-sys/FastChat/blob/main/fastchat/train/llama_flash_attn_monkey_patch.py
|
# copied from https://github.com/lm-sys/FastChat/blob/main/fastchat/train/llama_flash_attn_monkey_patch.py
|
||||||
|
|
||||||
|
import logging
|
||||||
import warnings
|
import warnings
|
||||||
|
from functools import partial
|
||||||
from typing import List, Optional, Tuple, Union
|
from typing import List, Optional, Tuple, Union
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
@@ -33,6 +35,9 @@ except ImportError:
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
LOG = logging.getLogger("axolotl")
|
||||||
|
|
||||||
|
|
||||||
def replace_llama_attn_with_flash_attn(packed: Optional[bool] = False):
|
def replace_llama_attn_with_flash_attn(packed: Optional[bool] = False):
|
||||||
transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = ( # pylint: disable=protected-access
|
transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = ( # pylint: disable=protected-access
|
||||||
_prepare_decoder_attention_mask
|
_prepare_decoder_attention_mask
|
||||||
@@ -44,6 +49,34 @@ def replace_llama_attn_with_flash_attn(packed: Optional[bool] = False):
|
|||||||
llama_model_forward
|
llama_model_forward
|
||||||
)
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
from flash_attn.losses.cross_entropy import CrossEntropyLoss
|
||||||
|
|
||||||
|
LOG.info("patching with flash_attn.losses.cross_entropy")
|
||||||
|
transformers.models.llama.modeling_llama.CrossEntropyLoss = partial(
|
||||||
|
CrossEntropyLoss, inplace_backward=True
|
||||||
|
)
|
||||||
|
except ImportError:
|
||||||
|
LOG.info(
|
||||||
|
"optimized flash-attention CrossEntropyLoss not found (run `pip install 'git+https://github.com/Dao-AILab/flash-attention.git#egg=xentropy_cuda_lib&subdirectory=csrc/xentropy'`)"
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
from flash_attn.ops.rms_norm import RMSNorm
|
||||||
|
|
||||||
|
class LlamaRMSNorm(RMSNorm):
|
||||||
|
"""Patched LLamaRMSNorm"""
|
||||||
|
|
||||||
|
def __init__(self, hidden_size, eps=1e-6):
|
||||||
|
super().__init__(hidden_size, eps=eps)
|
||||||
|
|
||||||
|
LOG.info("patching with flash_attn.ops.rms_norm")
|
||||||
|
transformers.models.llama.modeling_llama.LlamaRMSNorm = LlamaRMSNorm
|
||||||
|
except ImportError:
|
||||||
|
LOG.info(
|
||||||
|
"optimized flash-attention RMSNorm not found (run `pip install 'git+https://github.com/Dao-AILab/flash-attention.git#egg=dropout_layer_norm&subdirectory=csrc/layer_norm'`)"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# Disable the transformation of the attention mask in LlamaModel as the flash attention
|
# Disable the transformation of the attention mask in LlamaModel as the flash attention
|
||||||
# requires the attention mask to be the same as the key_padding_mask
|
# requires the attention mask to be the same as the key_padding_mask
|
||||||
|
|||||||
@@ -309,10 +309,6 @@ class ShareGPTPrompter: # pylint: disable=too-few-public-methods
|
|||||||
)
|
)
|
||||||
|
|
||||||
def build_prompt(self, source) -> Generator[str, None, None]:
|
def build_prompt(self, source) -> Generator[str, None, None]:
|
||||||
# ignore the system prompt if provided
|
|
||||||
if source[0]["from"] == "system":
|
|
||||||
source.pop(0)
|
|
||||||
|
|
||||||
if len(source) < 2:
|
if len(source) < 2:
|
||||||
# If there isn't a back and forth conversation, ignore it
|
# If there isn't a back and forth conversation, ignore it
|
||||||
# also happens on the data splitting leaving empty conversations
|
# also happens on the data splitting leaving empty conversations
|
||||||
@@ -321,6 +317,12 @@ class ShareGPTPrompter: # pylint: disable=too-few-public-methods
|
|||||||
)
|
)
|
||||||
|
|
||||||
conv = self._conversation.copy()
|
conv = self._conversation.copy()
|
||||||
|
|
||||||
|
# Add the conversation system prompt if provided, otherwise use the default one
|
||||||
|
if source[0]["from"] == "system":
|
||||||
|
conv.system = source[0]["value"]
|
||||||
|
source.pop(0)
|
||||||
|
|
||||||
roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
|
roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|||||||
@@ -88,6 +88,11 @@ def train(
|
|||||||
if peft_config:
|
if peft_config:
|
||||||
LOG.info(f"Pre-saving adapter config to {cfg.output_dir}")
|
LOG.info(f"Pre-saving adapter config to {cfg.output_dir}")
|
||||||
peft_config.save_pretrained(cfg.output_dir)
|
peft_config.save_pretrained(cfg.output_dir)
|
||||||
|
# additionally presave the tokenizer and model configs
|
||||||
|
if not Path(cfg.output_dir).is_dir():
|
||||||
|
os.makedirs(cfg.output_dir, exist_ok=True)
|
||||||
|
tokenizer.save_pretrained(str(Path(cfg.output_dir)))
|
||||||
|
model.config.save_pretrained(str(Path(cfg.output_dir)))
|
||||||
|
|
||||||
# In case we want to stop early with ctrl+c, this is a nice to have to save the pretrained model
|
# In case we want to stop early with ctrl+c, this is a nice to have to save the pretrained model
|
||||||
if cfg.local_rank == 0:
|
if cfg.local_rank == 0:
|
||||||
@@ -106,9 +111,6 @@ def train(
|
|||||||
if cfg.group_by_length:
|
if cfg.group_by_length:
|
||||||
LOG.info("hang tight... sorting dataset for group_by_length")
|
LOG.info("hang tight... sorting dataset for group_by_length")
|
||||||
|
|
||||||
if not Path(cfg.output_dir).is_dir():
|
|
||||||
os.makedirs(cfg.output_dir, exist_ok=True)
|
|
||||||
tokenizer.save_pretrained(cfg.output_dir)
|
|
||||||
if cfg.flash_optimum:
|
if cfg.flash_optimum:
|
||||||
with torch.backends.cuda.sdp_kernel(
|
with torch.backends.cuda.sdp_kernel(
|
||||||
enable_flash=True, enable_math=True, enable_mem_efficient=True
|
enable_flash=True, enable_math=True, enable_mem_efficient=True
|
||||||
|
|||||||
@@ -11,7 +11,6 @@ import numpy as np
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
import torch
|
import torch
|
||||||
import torch.distributed as dist
|
import torch.distributed as dist
|
||||||
from accelerate.state import PartialState
|
|
||||||
from datasets import load_dataset
|
from datasets import load_dataset
|
||||||
from optimum.bettertransformer import BetterTransformer
|
from optimum.bettertransformer import BetterTransformer
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
@@ -25,9 +24,13 @@ from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR, IntervalStrategy
|
|||||||
|
|
||||||
from axolotl.utils.bench import log_gpu_memory_usage
|
from axolotl.utils.bench import log_gpu_memory_usage
|
||||||
from axolotl.utils.distributed import (
|
from axolotl.utils.distributed import (
|
||||||
|
barrier,
|
||||||
|
broadcast_dict,
|
||||||
gather_scalar_from_all_ranks,
|
gather_scalar_from_all_ranks,
|
||||||
get_world_size,
|
get_world_size,
|
||||||
|
is_distributed,
|
||||||
is_main_process,
|
is_main_process,
|
||||||
|
zero_first,
|
||||||
)
|
)
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
@@ -35,7 +38,6 @@ if TYPE_CHECKING:
|
|||||||
|
|
||||||
LOG = logging.getLogger("axolotl.callbacks")
|
LOG = logging.getLogger("axolotl.callbacks")
|
||||||
IGNORE_INDEX = -100
|
IGNORE_INDEX = -100
|
||||||
dist_state = PartialState()
|
|
||||||
|
|
||||||
|
|
||||||
class SavePeftModelCallback(TrainerCallback): # pylint: disable=too-few-public-methods
|
class SavePeftModelCallback(TrainerCallback): # pylint: disable=too-few-public-methods
|
||||||
@@ -210,7 +212,7 @@ def bench_eval_callback_factory(trainer, tokenizer):
|
|||||||
"subject": example["subject"],
|
"subject": example["subject"],
|
||||||
}
|
}
|
||||||
|
|
||||||
with dist_state.main_process_first():
|
with zero_first(is_main_process()):
|
||||||
bench_dataset = bench_dataset.map(tokenize_evals)
|
bench_dataset = bench_dataset.map(tokenize_evals)
|
||||||
bench_dataset = bench_dataset.filter(lambda x: x["labels"][-2] in abcd_idx)
|
bench_dataset = bench_dataset.filter(lambda x: x["labels"][-2] in abcd_idx)
|
||||||
|
|
||||||
@@ -258,7 +260,7 @@ def bench_eval_callback_factory(trainer, tokenizer):
|
|||||||
for s, p, r in zip(bench_name, preds, refs): # pylint: disable=invalid-name
|
for s, p, r in zip(bench_name, preds, refs): # pylint: disable=invalid-name
|
||||||
bench_names[s]["preds"].append(p)
|
bench_names[s]["preds"].append(p)
|
||||||
bench_names[s]["refs"].append(r)
|
bench_names[s]["refs"].append(r)
|
||||||
dist_state.wait_for_everyone()
|
barrier()
|
||||||
local_bench_names = bench_names
|
local_bench_names = bench_names
|
||||||
gathered_bench_names: List[Dict] = [{} for _ in range(get_world_size())]
|
gathered_bench_names: List[Dict] = [{} for _ in range(get_world_size())]
|
||||||
# Gather results from all GPUs to GPU 0
|
# Gather results from all GPUs to GPU 0
|
||||||
@@ -270,10 +272,14 @@ def bench_eval_callback_factory(trainer, tokenizer):
|
|||||||
lambda: len(data_loader), get_world_size()
|
lambda: len(data_loader), get_world_size()
|
||||||
)
|
)
|
||||||
|
|
||||||
if not is_main_process():
|
results = {}
|
||||||
|
if is_distributed() and not is_main_process():
|
||||||
dist.gather_object(local_bench_names, dst=0)
|
dist.gather_object(local_bench_names, dst=0)
|
||||||
else:
|
else:
|
||||||
dist.gather_object(local_bench_names, gathered_bench_names, dst=0)
|
if is_distributed():
|
||||||
|
dist.gather_object(local_bench_names, gathered_bench_names, dst=0)
|
||||||
|
else:
|
||||||
|
gathered_bench_names = [local_bench_names]
|
||||||
bench_loss = sum(loss_bench_ranks) / sum(len_data_loader_ranks)
|
bench_loss = sum(loss_bench_ranks) / sum(len_data_loader_ranks)
|
||||||
results = {f"{bench_split}_bench_loss": bench_loss}
|
results = {f"{bench_split}_bench_loss": bench_loss}
|
||||||
|
|
||||||
@@ -312,4 +318,8 @@ def bench_eval_callback_factory(trainer, tokenizer):
|
|||||||
)["accuracy"]
|
)["accuracy"]
|
||||||
trainer.log(results)
|
trainer.log(results)
|
||||||
|
|
||||||
|
results = broadcast_dict(results)
|
||||||
|
for key, val in results.items():
|
||||||
|
metrics[key] = val
|
||||||
|
|
||||||
return BenchEvalCallback
|
return BenchEvalCallback
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ import os
|
|||||||
import torch
|
import torch
|
||||||
|
|
||||||
from axolotl.utils.bench import log_gpu_memory_usage
|
from axolotl.utils.bench import log_gpu_memory_usage
|
||||||
|
from axolotl.utils.models import load_model_config
|
||||||
|
|
||||||
LOG = logging.getLogger("axolotl")
|
LOG = logging.getLogger("axolotl")
|
||||||
|
|
||||||
@@ -69,6 +70,16 @@ def normalize_config(cfg):
|
|||||||
else:
|
else:
|
||||||
cfg.torch_dtype = torch.float32
|
cfg.torch_dtype = torch.float32
|
||||||
|
|
||||||
|
model_config = load_model_config(cfg)
|
||||||
|
|
||||||
|
# figure out if the model is llama
|
||||||
|
cfg.is_llama_derived_model = (
|
||||||
|
(hasattr(model_config, "model_type") and model_config.model_type == "llama")
|
||||||
|
or cfg.is_llama_derived_model
|
||||||
|
or "llama" in cfg.base_model
|
||||||
|
or (cfg.model_type and "llama" in cfg.model_type.lower())
|
||||||
|
)
|
||||||
|
|
||||||
log_gpu_memory_usage(LOG, "baseline", cfg.device)
|
log_gpu_memory_usage(LOG, "baseline", cfg.device)
|
||||||
|
|
||||||
|
|
||||||
@@ -86,6 +97,11 @@ def validate_config(cfg):
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if cfg.sample_packing and not cfg.pad_to_sequence_len:
|
||||||
|
LOG.warning(
|
||||||
|
"`pad_to_sequence_len: true` is recommended when using sample_packing"
|
||||||
|
)
|
||||||
|
|
||||||
if cfg.gradient_accumulation_steps and cfg.batch_size:
|
if cfg.gradient_accumulation_steps and cfg.batch_size:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"please set only one of gradient_accumulation_steps or batch_size"
|
"please set only one of gradient_accumulation_steps or batch_size"
|
||||||
@@ -97,9 +113,7 @@ def validate_config(cfg):
|
|||||||
"To calculate the equivalent gradient_accumulation_steps, divide batch_size / micro_batch_size / number of gpus.",
|
"To calculate the equivalent gradient_accumulation_steps, divide batch_size / micro_batch_size / number of gpus.",
|
||||||
)
|
)
|
||||||
if cfg.load_4bit:
|
if cfg.load_4bit:
|
||||||
raise ValueError(
|
raise ValueError("cfg.load_4bit parameter has been deprecated")
|
||||||
"cfg.load_4bit parameter has been deprecated and replaced by cfg.gptq"
|
|
||||||
)
|
|
||||||
|
|
||||||
if cfg.adapter == "qlora":
|
if cfg.adapter == "qlora":
|
||||||
if cfg.merge_lora:
|
if cfg.merge_lora:
|
||||||
@@ -206,6 +220,15 @@ def validate_config(cfg):
|
|||||||
"sample_packing not compatible with xformers_attention. Use flash_attention"
|
"sample_packing not compatible with xformers_attention. Use flash_attention"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if cfg.early_stopping_patience:
|
||||||
|
if not cfg.save_steps or not cfg.eval_steps:
|
||||||
|
raise ValueError(
|
||||||
|
"`early_stopping_patience` requires save_steps and eval_steps to be set. eval_steps should evenly divide save_steps."
|
||||||
|
)
|
||||||
|
if cfg.save_steps % cfg.eval_steps != 0:
|
||||||
|
raise ValueError(
|
||||||
|
"`early_stopping_patience` requires that eval_steps should evenly divide save_steps."
|
||||||
|
)
|
||||||
# TODO
|
# TODO
|
||||||
# MPT 7b
|
# MPT 7b
|
||||||
# https://github.com/facebookresearch/bitsandbytes/issues/25
|
# https://github.com/facebookresearch/bitsandbytes/issues/25
|
||||||
|
|||||||
@@ -2,12 +2,10 @@
|
|||||||
import functools
|
import functools
|
||||||
import hashlib
|
import hashlib
|
||||||
import logging
|
import logging
|
||||||
from hashlib import md5
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Tuple, Union
|
from typing import Tuple, Union
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
from accelerate.state import PartialState
|
|
||||||
from datasets import (
|
from datasets import (
|
||||||
Dataset,
|
Dataset,
|
||||||
DatasetDict,
|
DatasetDict,
|
||||||
@@ -43,6 +41,7 @@ from axolotl.prompters import (
|
|||||||
SummarizeTLDRPrompter,
|
SummarizeTLDRPrompter,
|
||||||
)
|
)
|
||||||
from axolotl.utils.dict import DictDefault
|
from axolotl.utils.dict import DictDefault
|
||||||
|
from axolotl.utils.distributed import is_main_process, zero_first
|
||||||
from axolotl.utils.trainer import (
|
from axolotl.utils.trainer import (
|
||||||
calculate_total_num_steps,
|
calculate_total_num_steps,
|
||||||
process_datasets_for_packing,
|
process_datasets_for_packing,
|
||||||
@@ -50,12 +49,18 @@ from axolotl.utils.trainer import (
|
|||||||
|
|
||||||
LOG = logging.getLogger("axolotl")
|
LOG = logging.getLogger("axolotl")
|
||||||
DEFAULT_DATASET_PREPARED_PATH = "last_run_prepared"
|
DEFAULT_DATASET_PREPARED_PATH = "last_run_prepared"
|
||||||
state = PartialState()
|
|
||||||
|
|
||||||
|
def md5(to_hash: str, encoding: str = "utf-8") -> str:
|
||||||
|
try:
|
||||||
|
return hashlib.md5(to_hash.encode(encoding), usedforsecurity=False).hexdigest()
|
||||||
|
except TypeError:
|
||||||
|
return hashlib.md5(to_hash.encode(encoding)).hexdigest() # nosec
|
||||||
|
|
||||||
|
|
||||||
def prepare_dataset(cfg, tokenizer):
|
def prepare_dataset(cfg, tokenizer):
|
||||||
if not cfg.pretraining_dataset:
|
if not cfg.pretraining_dataset:
|
||||||
with state.main_process_first():
|
with zero_first(is_main_process()):
|
||||||
train_dataset, eval_dataset = load_prepare_datasets(
|
train_dataset, eval_dataset = load_prepare_datasets(
|
||||||
tokenizer, cfg, DEFAULT_DATASET_PREPARED_PATH
|
tokenizer, cfg, DEFAULT_DATASET_PREPARED_PATH
|
||||||
)
|
)
|
||||||
@@ -70,7 +75,7 @@ def prepare_dataset(cfg, tokenizer):
|
|||||||
train_dataset = train_dataset.with_format("torch")
|
train_dataset = train_dataset.with_format("torch")
|
||||||
eval_dataset = None
|
eval_dataset = None
|
||||||
|
|
||||||
with state.main_process_first():
|
with zero_first(is_main_process()):
|
||||||
train_dataset, eval_dataset = process_datasets_for_packing(
|
train_dataset, eval_dataset = process_datasets_for_packing(
|
||||||
cfg, train_dataset, eval_dataset
|
cfg, train_dataset, eval_dataset
|
||||||
)
|
)
|
||||||
@@ -89,7 +94,7 @@ def load_tokenized_prepared_datasets(
|
|||||||
) -> DatasetDict:
|
) -> DatasetDict:
|
||||||
tokenizer_name = tokenizer.__class__.__name__
|
tokenizer_name = tokenizer.__class__.__name__
|
||||||
ds_hash = str(
|
ds_hash = str(
|
||||||
md5( # nosec
|
md5(
|
||||||
(
|
(
|
||||||
str(cfg.sequence_len)
|
str(cfg.sequence_len)
|
||||||
+ "@"
|
+ "@"
|
||||||
@@ -98,8 +103,8 @@ def load_tokenized_prepared_datasets(
|
|||||||
)
|
)
|
||||||
+ "|"
|
+ "|"
|
||||||
+ tokenizer_name
|
+ tokenizer_name
|
||||||
).encode("utf-8")
|
)
|
||||||
).hexdigest()
|
)
|
||||||
)
|
)
|
||||||
prepared_ds_path = (
|
prepared_ds_path = (
|
||||||
Path(cfg.dataset_prepared_path) / ds_hash
|
Path(cfg.dataset_prepared_path) / ds_hash
|
||||||
@@ -375,7 +380,7 @@ def load_prepare_datasets(
|
|||||||
# see if we can go ahead and load the stacked dataset
|
# see if we can go ahead and load the stacked dataset
|
||||||
seed = f"@{str(cfg.seed)}" if cfg.seed else ""
|
seed = f"@{str(cfg.seed)}" if cfg.seed else ""
|
||||||
ds_hash = str(
|
ds_hash = str(
|
||||||
md5( # nosec
|
md5(
|
||||||
(
|
(
|
||||||
str(cfg.sequence_len)
|
str(cfg.sequence_len)
|
||||||
+ "@"
|
+ "@"
|
||||||
@@ -386,8 +391,8 @@ def load_prepare_datasets(
|
|||||||
)
|
)
|
||||||
+ "|"
|
+ "|"
|
||||||
+ tokenizer_name
|
+ tokenizer_name
|
||||||
).encode("utf-8")
|
)
|
||||||
).hexdigest()
|
)
|
||||||
)
|
)
|
||||||
prepared_ds_path = (
|
prepared_ds_path = (
|
||||||
Path(cfg.dataset_prepared_path) / ds_hash
|
Path(cfg.dataset_prepared_path) / ds_hash
|
||||||
@@ -501,14 +506,10 @@ def load_prepare_datasets(
|
|||||||
+ "|"
|
+ "|"
|
||||||
+ str(cfg.seed or 42)
|
+ str(cfg.seed or 42)
|
||||||
)
|
)
|
||||||
train_fingerprint = hashlib.md5(
|
train_fingerprint = md5(to_hash_train)
|
||||||
to_hash_train.encode(), usedforsecurity=False
|
test_fingerprint = md5(to_hash_test)
|
||||||
).hexdigest()
|
|
||||||
test_fingerprint = hashlib.md5(
|
|
||||||
to_hash_test.encode(), usedforsecurity=False
|
|
||||||
).hexdigest()
|
|
||||||
|
|
||||||
with state.main_process_first():
|
with zero_first(is_main_process()):
|
||||||
dataset = dataset.train_test_split(
|
dataset = dataset.train_test_split(
|
||||||
test_size=cfg.val_set_size,
|
test_size=cfg.val_set_size,
|
||||||
shuffle=False,
|
shuffle=False,
|
||||||
|
|||||||
@@ -1,27 +1,30 @@
|
|||||||
"""
|
"""
|
||||||
utility helpers for distributed checks
|
utility helpers for distributed checks
|
||||||
"""
|
"""
|
||||||
|
import os
|
||||||
|
import pickle # nosec
|
||||||
|
from contextlib import contextmanager
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch.distributed as dist
|
import torch.distributed as dist
|
||||||
from accelerate import DistributedType
|
from accelerate import Accelerator
|
||||||
from accelerate.state import PartialState
|
|
||||||
from accelerate.utils import wait_for_everyone
|
|
||||||
|
|
||||||
accelerate = None # pylint: disable=invalid-name
|
accelerate = None # pylint: disable=invalid-name
|
||||||
|
|
||||||
state = PartialState()
|
|
||||||
|
def load_accelerate():
|
||||||
|
global accelerate # pylint: disable=global-statement
|
||||||
|
accelerate = Accelerator()
|
||||||
|
|
||||||
|
|
||||||
def is_distributed():
|
def is_distributed():
|
||||||
"""
|
"""
|
||||||
Check if distributed training is initialized.
|
Check if distributed training is initialized.
|
||||||
"""
|
"""
|
||||||
return state.distributed_type in (
|
global accelerate # pylint: disable=global-statement
|
||||||
DistributedType.MULTI_GPU,
|
if not accelerate:
|
||||||
DistributedType.MULTI_CPU,
|
accelerate = Accelerator()
|
||||||
DistributedType.DEEPSPEED,
|
return dist.is_available() and dist.is_initialized()
|
||||||
DistributedType.FSDP,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def barrier():
|
def barrier():
|
||||||
@@ -29,19 +32,34 @@ def barrier():
|
|||||||
Acts as a barrier to wait for all processes. This ensures that all processes
|
Acts as a barrier to wait for all processes. This ensures that all processes
|
||||||
reach the barrier before proceeding further.
|
reach the barrier before proceeding further.
|
||||||
"""
|
"""
|
||||||
wait_for_everyone()
|
if is_distributed():
|
||||||
|
dist.barrier()
|
||||||
|
|
||||||
|
|
||||||
def is_main_process() -> bool:
|
def is_main_process():
|
||||||
"""
|
"""
|
||||||
Check if the current process is the main process.
|
Check if the current process is the main process.
|
||||||
If not in distributed mode, always return True.
|
If not in distributed mode, always return True.
|
||||||
"""
|
"""
|
||||||
return state.is_main_process
|
if not is_distributed():
|
||||||
|
return True
|
||||||
|
return dist.get_rank() == 0
|
||||||
|
|
||||||
|
|
||||||
def get_world_size() -> int:
|
def get_world_size():
|
||||||
return state.num_processes
|
return int(os.getenv("WORLD_SIZE", "1"))
|
||||||
|
|
||||||
|
|
||||||
|
@contextmanager
|
||||||
|
def zero_first(is_main):
|
||||||
|
"""
|
||||||
|
runs the wrapped context so that rank 0 runs first before other ranks
|
||||||
|
"""
|
||||||
|
if not is_main: # other ranks wait first
|
||||||
|
barrier()
|
||||||
|
yield
|
||||||
|
if is_main: # then rank 0 waits after it has run the context
|
||||||
|
barrier()
|
||||||
|
|
||||||
|
|
||||||
def gather_scalar_from_all_ranks(fn, world_size=1): # pylint: disable=invalid-name
|
def gather_scalar_from_all_ranks(fn, world_size=1): # pylint: disable=invalid-name
|
||||||
@@ -57,9 +75,11 @@ def gather_scalar_from_all_ranks(fn, world_size=1): # pylint: disable=invalid-n
|
|||||||
- A list of computed values from all ranks if on the gathering rank, otherwise None.
|
- A list of computed values from all ranks if on the gathering rank, otherwise None.
|
||||||
"""
|
"""
|
||||||
value_scalar = fn()
|
value_scalar = fn()
|
||||||
|
if not is_distributed():
|
||||||
|
return [value_scalar]
|
||||||
value_tensor = torch.tensor(value_scalar, device=dist.get_rank()).float()
|
value_tensor = torch.tensor(value_scalar, device=dist.get_rank()).float()
|
||||||
|
|
||||||
if not state.is_main_process:
|
if not is_main_process():
|
||||||
dist.gather(value_tensor, dst=0)
|
dist.gather(value_tensor, dst=0)
|
||||||
else:
|
else:
|
||||||
gathered_tensors = [torch.zeros_like(value_tensor) for _ in range(world_size)]
|
gathered_tensors = [torch.zeros_like(value_tensor) for _ in range(world_size)]
|
||||||
@@ -74,3 +94,30 @@ def gather_scalar_from_all_ranks(fn, world_size=1): # pylint: disable=invalid-n
|
|||||||
gathered_values.append(float(tensor.item()))
|
gathered_values.append(float(tensor.item()))
|
||||||
return gathered_values
|
return gathered_values
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def broadcast_dict(vals: dict):
|
||||||
|
if not is_distributed():
|
||||||
|
return vals
|
||||||
|
|
||||||
|
if is_main_process():
|
||||||
|
data_byte = pickle.dumps(vals)
|
||||||
|
data_tensor = torch.ByteTensor(list(data_byte)).to("cuda")
|
||||||
|
data_size = torch.IntTensor([len(data_byte)]).to("cuda")
|
||||||
|
else:
|
||||||
|
data_tensor = torch.empty([1024], dtype=torch.uint8, device="cuda")
|
||||||
|
data_size = torch.IntTensor([0]).to("cuda")
|
||||||
|
|
||||||
|
dist.broadcast(data_size, 0)
|
||||||
|
if not is_main_process():
|
||||||
|
# resize
|
||||||
|
data_tensor = data_tensor.new_empty([data_size.item()])
|
||||||
|
|
||||||
|
dist.broadcast(data_tensor, 0)
|
||||||
|
|
||||||
|
if not is_main_process():
|
||||||
|
data_list = data_tensor.cpu().tolist()
|
||||||
|
data_byte = bytes(data_list[: data_size.item()])
|
||||||
|
vals = pickle.loads(data_byte) # nosec
|
||||||
|
|
||||||
|
return vals
|
||||||
|
|||||||
@@ -4,19 +4,19 @@
|
|||||||
import logging
|
import logging
|
||||||
import math
|
import math
|
||||||
import os
|
import os
|
||||||
from pathlib import Path
|
|
||||||
from typing import Optional, Tuple # noqa: F401
|
from typing import Optional, Tuple # noqa: F401
|
||||||
|
|
||||||
import bitsandbytes as bnb
|
import bitsandbytes as bnb
|
||||||
import torch
|
import torch
|
||||||
import transformers
|
import transformers
|
||||||
from optimum.bettertransformer import BetterTransformer
|
from optimum.bettertransformer import BetterTransformer
|
||||||
from peft import PeftConfig
|
from peft import PeftConfig, prepare_model_for_kbit_training
|
||||||
from transformers import ( # noqa: F401
|
from transformers import ( # noqa: F401
|
||||||
AutoConfig,
|
AutoConfig,
|
||||||
AutoModelForCausalLM,
|
AutoModelForCausalLM,
|
||||||
AutoTokenizer,
|
AutoTokenizer,
|
||||||
BitsAndBytesConfig,
|
BitsAndBytesConfig,
|
||||||
|
GPTQConfig,
|
||||||
LlamaConfig,
|
LlamaConfig,
|
||||||
PreTrainedModel,
|
PreTrainedModel,
|
||||||
PreTrainedTokenizerBase,
|
PreTrainedTokenizerBase,
|
||||||
@@ -155,32 +155,17 @@ def load_model(
|
|||||||
LOG.info("patching _expand_mask")
|
LOG.info("patching _expand_mask")
|
||||||
hijack_expand_mask()
|
hijack_expand_mask()
|
||||||
|
|
||||||
try:
|
|
||||||
if cfg.gptq:
|
|
||||||
from alpaca_lora_4bit.monkeypatch.peft_tuners_lora_monkey_patch import (
|
|
||||||
replace_peft_model_with_int4_lora_model,
|
|
||||||
)
|
|
||||||
|
|
||||||
replace_peft_model_with_int4_lora_model()
|
|
||||||
except Exception as err:
|
|
||||||
LOG.exception(err)
|
|
||||||
raise err
|
|
||||||
|
|
||||||
if not cfg.gptq and (
|
|
||||||
(cfg.adapter == "lora" and load_in_8bit)
|
|
||||||
or (cfg.adapter == "qlora" and cfg.load_in_4bit)
|
|
||||||
):
|
|
||||||
try:
|
|
||||||
from peft import prepare_model_for_kbit_training
|
|
||||||
except ImportError:
|
|
||||||
# For backward compatibility
|
|
||||||
from peft import (
|
|
||||||
prepare_model_for_int8_training as prepare_model_for_kbit_training,
|
|
||||||
)
|
|
||||||
|
|
||||||
model_kwargs = {}
|
model_kwargs = {}
|
||||||
if cfg.model_revision:
|
if cfg.model_revision:
|
||||||
model_kwargs["revision"] = cfg.model_revision
|
model_kwargs["revision"] = cfg.model_revision
|
||||||
|
if cfg.gptq:
|
||||||
|
model_config = load_model_config(cfg)
|
||||||
|
if not hasattr(model_config, "quantization_config"):
|
||||||
|
LOG.warning("model config does not contain quantization_config information")
|
||||||
|
else:
|
||||||
|
model_kwargs["quantization_config"] = GPTQConfig(
|
||||||
|
**model_config.quantization_config
|
||||||
|
)
|
||||||
if cfg.adapter == "qlora" and cfg.load_in_4bit:
|
if cfg.adapter == "qlora" and cfg.load_in_4bit:
|
||||||
model_kwargs["quantization_config"] = BitsAndBytesConfig(
|
model_kwargs["quantization_config"] = BitsAndBytesConfig(
|
||||||
load_in_4bit=True,
|
load_in_4bit=True,
|
||||||
@@ -191,45 +176,7 @@ def load_model(
|
|||||||
bnb_4bit_quant_type="nf4",
|
bnb_4bit_quant_type="nf4",
|
||||||
)
|
)
|
||||||
try:
|
try:
|
||||||
if cfg.gptq and cfg.is_llama_derived_model:
|
if cfg.is_llama_derived_model and not cfg.trust_remote_code and not cfg.gptq:
|
||||||
from alpaca_lora_4bit.autograd_4bit import load_llama_model_4bit_low_ram
|
|
||||||
from huggingface_hub import snapshot_download
|
|
||||||
|
|
||||||
try:
|
|
||||||
snapshot_download_kwargs = {}
|
|
||||||
if cfg.base_model_ignore_patterns:
|
|
||||||
snapshot_download_kwargs[
|
|
||||||
"ignore_patterns"
|
|
||||||
] = cfg.base_model_ignore_patterns
|
|
||||||
cache_model_path = Path(
|
|
||||||
snapshot_download(base_model, **snapshot_download_kwargs)
|
|
||||||
)
|
|
||||||
files = (
|
|
||||||
list(cache_model_path.glob("*.pt"))
|
|
||||||
+ list(cache_model_path.glob("*.safetensors"))
|
|
||||||
+ list(cache_model_path.glob("*.bin"))
|
|
||||||
)
|
|
||||||
if len(files) > 0:
|
|
||||||
model_path = str(files[0])
|
|
||||||
else:
|
|
||||||
LOG.warning(
|
|
||||||
"unable to find a cached model file, this will likely fail..."
|
|
||||||
)
|
|
||||||
model_path = str(cache_model_path)
|
|
||||||
except Exception: # pylint: disable=broad-exception-caught
|
|
||||||
model_path = cfg.base_model
|
|
||||||
model, _ = load_llama_model_4bit_low_ram(
|
|
||||||
base_model_config if base_model_config else base_model,
|
|
||||||
model_path,
|
|
||||||
device_map=cfg.device_map,
|
|
||||||
half=cfg.fp16,
|
|
||||||
groupsize=cfg.gptq_groupsize if cfg.gptq_groupsize else -1,
|
|
||||||
is_v1_model=cfg.gptq_model_v1
|
|
||||||
if cfg.gptq_model_v1 is not None
|
|
||||||
else True,
|
|
||||||
)
|
|
||||||
load_in_8bit = False
|
|
||||||
elif cfg.is_llama_derived_model and not cfg.trust_remote_code:
|
|
||||||
from transformers import LlamaForCausalLM
|
from transformers import LlamaForCausalLM
|
||||||
|
|
||||||
config_kwargs = {}
|
config_kwargs = {}
|
||||||
@@ -275,15 +222,24 @@ def load_model(
|
|||||||
# )
|
# )
|
||||||
# model.train() # sets to train instead of eval mode
|
# model.train() # sets to train instead of eval mode
|
||||||
elif model_type and not cfg.trust_remote_code:
|
elif model_type and not cfg.trust_remote_code:
|
||||||
model = getattr(transformers, model_type).from_pretrained(
|
if cfg.gptq:
|
||||||
base_model,
|
model = AutoModelForCausalLM.from_pretrained(
|
||||||
device_map=cfg.device_map,
|
base_model,
|
||||||
load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
|
device_map=cfg.device_map,
|
||||||
load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None,
|
torch_dtype=cfg.torch_dtype,
|
||||||
torch_dtype=cfg.torch_dtype,
|
trust_remote_code=cfg.trust_remote_code or False,
|
||||||
trust_remote_code=cfg.trust_remote_code or False,
|
**model_kwargs,
|
||||||
**model_kwargs,
|
)
|
||||||
)
|
else:
|
||||||
|
model = getattr(transformers, model_type).from_pretrained(
|
||||||
|
base_model,
|
||||||
|
device_map=cfg.device_map,
|
||||||
|
load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
|
||||||
|
load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None,
|
||||||
|
torch_dtype=cfg.torch_dtype,
|
||||||
|
trust_remote_code=cfg.trust_remote_code or False,
|
||||||
|
**model_kwargs,
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
config = AutoConfig.from_pretrained(
|
config = AutoConfig.from_pretrained(
|
||||||
base_model,
|
base_model,
|
||||||
@@ -359,11 +315,12 @@ def load_model(
|
|||||||
module.to(torch.float32)
|
module.to(torch.float32)
|
||||||
|
|
||||||
needs_fa2_dtype = cfg.adapter or cfg.fsdp
|
needs_fa2_dtype = cfg.adapter or cfg.fsdp
|
||||||
if not cfg.gptq and (
|
if (cfg.adapter == "lora" and load_in_8bit) or (
|
||||||
(cfg.adapter == "lora" and load_in_8bit)
|
cfg.adapter == "qlora" and cfg.load_in_4bit
|
||||||
or (cfg.adapter == "qlora" and cfg.load_in_4bit)
|
|
||||||
):
|
):
|
||||||
LOG.info("converting PEFT model w/ prepare_model_for_kbit_training")
|
LOG.info("converting PEFT model w/ prepare_model_for_kbit_training")
|
||||||
|
if cfg.gradient_checkpointing:
|
||||||
|
model.gradient_checkpointing_enable()
|
||||||
model = prepare_model_for_kbit_training(
|
model = prepare_model_for_kbit_training(
|
||||||
model, use_gradient_checkpointing=cfg.gradient_checkpointing
|
model, use_gradient_checkpointing=cfg.gradient_checkpointing
|
||||||
)
|
)
|
||||||
@@ -385,22 +342,10 @@ def load_model(
|
|||||||
if cfg.ddp and not load_in_8bit:
|
if cfg.ddp and not load_in_8bit:
|
||||||
model.to(f"cuda:{cfg.local_rank}")
|
model.to(f"cuda:{cfg.local_rank}")
|
||||||
|
|
||||||
if cfg.gptq:
|
|
||||||
# Scales to half
|
|
||||||
LOG.info("Fitting 4bit scales and zeros to half")
|
|
||||||
for _, module in model.named_modules():
|
|
||||||
if "Autograd4bitQuantLinear" in str(type(module)) or "Linear4bitLt" in str(
|
|
||||||
type(module)
|
|
||||||
):
|
|
||||||
if hasattr(module, "is_v1_model") and module.is_v1_model:
|
|
||||||
module.zeros = module.zeros.half()
|
|
||||||
module.scales = module.scales.half()
|
|
||||||
module.bias = module.bias.half()
|
|
||||||
|
|
||||||
if (
|
if (
|
||||||
torch.cuda.device_count() > 1
|
torch.cuda.device_count() > 1
|
||||||
and int(os.getenv("WORLD_SIZE", "1")) > 1
|
and int(os.getenv("WORLD_SIZE", "1")) > 1
|
||||||
and (cfg.gptq or cfg.load_in_4bit)
|
and (cfg.load_in_4bit)
|
||||||
):
|
):
|
||||||
# llama is PROBABLY model parallelizable, but the default isn't that it is
|
# llama is PROBABLY model parallelizable, but the default isn't that it is
|
||||||
# so let's only set it for the 4bit, see
|
# so let's only set it for the 4bit, see
|
||||||
|
|||||||
@@ -33,6 +33,7 @@ from axolotl.utils.callbacks import (
|
|||||||
)
|
)
|
||||||
from axolotl.utils.collators import DataCollatorForSeq2Seq
|
from axolotl.utils.collators import DataCollatorForSeq2Seq
|
||||||
from axolotl.utils.dataloader import MultipackDistributedDataloader
|
from axolotl.utils.dataloader import MultipackDistributedDataloader
|
||||||
|
from axolotl.utils.distributed import is_main_process, zero_first
|
||||||
from axolotl.utils.schedulers import get_cosine_schedule_with_quadratic_warmup
|
from axolotl.utils.schedulers import get_cosine_schedule_with_quadratic_warmup
|
||||||
|
|
||||||
LOG = logging.getLogger("axolotl")
|
LOG = logging.getLogger("axolotl")
|
||||||
@@ -375,14 +376,17 @@ def disable_datasets_caching():
|
|||||||
|
|
||||||
def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
|
def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
|
||||||
drop_long = partial(drop_long_seq, sequence_len=cfg.sequence_len)
|
drop_long = partial(drop_long_seq, sequence_len=cfg.sequence_len)
|
||||||
train_dataset = train_dataset.filter(drop_long, num_proc=os.cpu_count())
|
with zero_first(is_main_process()):
|
||||||
if eval_dataset:
|
train_dataset = train_dataset.filter(drop_long, num_proc=os.cpu_count())
|
||||||
eval_dataset = eval_dataset.filter(drop_long, num_proc=os.cpu_count())
|
|
||||||
|
|
||||||
if cfg.sample_packing:
|
|
||||||
train_dataset = train_dataset.map(add_position_ids, num_proc=os.cpu_count())
|
|
||||||
if eval_dataset:
|
if eval_dataset:
|
||||||
eval_dataset = eval_dataset.map(add_position_ids, num_proc=os.cpu_count())
|
eval_dataset = eval_dataset.filter(drop_long, num_proc=os.cpu_count())
|
||||||
|
|
||||||
|
if cfg.sample_packing:
|
||||||
|
train_dataset = train_dataset.map(add_position_ids, num_proc=os.cpu_count())
|
||||||
|
if eval_dataset:
|
||||||
|
eval_dataset = eval_dataset.map(
|
||||||
|
add_position_ids, num_proc=os.cpu_count()
|
||||||
|
)
|
||||||
return train_dataset, eval_dataset
|
return train_dataset, eval_dataset
|
||||||
|
|
||||||
|
|
||||||
@@ -514,23 +518,7 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_
|
|||||||
training_arguments_kwargs["seed"] = cfg.seed
|
training_arguments_kwargs["seed"] = cfg.seed
|
||||||
|
|
||||||
if cfg.gradient_checkpointing:
|
if cfg.gradient_checkpointing:
|
||||||
if cfg.gptq:
|
training_arguments_kwargs["gradient_checkpointing"] = cfg.gradient_checkpointing
|
||||||
from alpaca_lora_4bit.gradient_checkpointing import (
|
|
||||||
apply_gradient_checkpointing,
|
|
||||||
)
|
|
||||||
|
|
||||||
gradient_checkpointing_ratio = (
|
|
||||||
cfg.gradient_checkpointing_ratio
|
|
||||||
if cfg.gradient_checkpointing_ratio
|
|
||||||
else 1.0
|
|
||||||
)
|
|
||||||
apply_gradient_checkpointing(
|
|
||||||
model, checkpoint_ratio=gradient_checkpointing_ratio
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
training_arguments_kwargs[
|
|
||||||
"gradient_checkpointing"
|
|
||||||
] = cfg.gradient_checkpointing
|
|
||||||
if cfg.fsdp:
|
if cfg.fsdp:
|
||||||
training_arguments_kwargs["fsdp"] = cfg.fsdp
|
training_arguments_kwargs["fsdp"] = cfg.fsdp
|
||||||
if cfg.fsdp_config:
|
if cfg.fsdp_config:
|
||||||
@@ -588,6 +576,10 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_
|
|||||||
training_arguments_kwargs["do_bench_eval"] = cfg.do_bench_eval
|
training_arguments_kwargs["do_bench_eval"] = cfg.do_bench_eval
|
||||||
if cfg.bench_dataset:
|
if cfg.bench_dataset:
|
||||||
training_arguments_kwargs["bench_dataset"] = cfg.bench_dataset
|
training_arguments_kwargs["bench_dataset"] = cfg.bench_dataset
|
||||||
|
if cfg.metric_for_best_model:
|
||||||
|
training_arguments_kwargs["metric_for_best_model"] = cfg.metric_for_best_model
|
||||||
|
if cfg.greater_is_better:
|
||||||
|
training_arguments_kwargs["greater_is_better"] = cfg.greater_is_better
|
||||||
|
|
||||||
# DDP Config
|
# DDP Config
|
||||||
if cfg.ddp_timeout:
|
if cfg.ddp_timeout:
|
||||||
@@ -613,11 +605,10 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_
|
|||||||
output_dir=cfg.output_dir,
|
output_dir=cfg.output_dir,
|
||||||
save_total_limit=cfg.save_total_limit if cfg.save_total_limit else 4,
|
save_total_limit=cfg.save_total_limit if cfg.save_total_limit else 4,
|
||||||
load_best_model_at_end=(
|
load_best_model_at_end=(
|
||||||
cfg.load_best_model_at_end is not False
|
(cfg.load_best_model_at_end is not False or cfg.early_stopping_patience)
|
||||||
and cfg.val_set_size > 0
|
and cfg.val_set_size > 0
|
||||||
and cfg.save_steps
|
and cfg.save_steps
|
||||||
and cfg.save_steps % cfg.eval_steps == 0
|
and cfg.save_steps % cfg.eval_steps == 0
|
||||||
and cfg.load_in_8bit is not True
|
|
||||||
)
|
)
|
||||||
or False,
|
or False,
|
||||||
ddp_find_unused_parameters=False if cfg.ddp else None,
|
ddp_find_unused_parameters=False if cfg.ddp else None,
|
||||||
@@ -649,13 +640,6 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_
|
|||||||
if cfg.relora_steps:
|
if cfg.relora_steps:
|
||||||
callbacks.append(ReLoRACallback(cfg))
|
callbacks.append(ReLoRACallback(cfg))
|
||||||
|
|
||||||
# TODO on_save callback to sync checkpoints to GCP/AWS in background
|
|
||||||
if cfg.early_stopping_patience:
|
|
||||||
early_stop_cb = EarlyStoppingCallback(
|
|
||||||
cfg.early_stopping_patience,
|
|
||||||
)
|
|
||||||
callbacks.append(early_stop_cb)
|
|
||||||
|
|
||||||
if cfg.local_rank == 0 and cfg.adapter in [
|
if cfg.local_rank == 0 and cfg.adapter in [
|
||||||
"lora",
|
"lora",
|
||||||
"qlora",
|
"qlora",
|
||||||
@@ -722,4 +706,11 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_
|
|||||||
if cfg.do_bench_eval:
|
if cfg.do_bench_eval:
|
||||||
trainer.add_callback(bench_eval_callback_factory(trainer, tokenizer))
|
trainer.add_callback(bench_eval_callback_factory(trainer, tokenizer))
|
||||||
|
|
||||||
|
# TODO on_save callback to sync checkpoints to GCP/AWS in background
|
||||||
|
if cfg.early_stopping_patience:
|
||||||
|
early_stop_cb = EarlyStoppingCallback(
|
||||||
|
cfg.early_stopping_patience,
|
||||||
|
)
|
||||||
|
trainer.add_callback(early_stop_cb)
|
||||||
|
|
||||||
return trainer
|
return trainer
|
||||||
|
|||||||
64
tests/test_data.py
Normal file
64
tests/test_data.py
Normal file
@@ -0,0 +1,64 @@
|
|||||||
|
"""
|
||||||
|
test module for the axolotl.utils.data module
|
||||||
|
"""
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
from transformers import LlamaTokenizer
|
||||||
|
|
||||||
|
from axolotl.utils.data import encode_pretraining, md5
|
||||||
|
|
||||||
|
|
||||||
|
class TestEncodePretraining(unittest.TestCase):
|
||||||
|
"""
|
||||||
|
test class for encode pretraining and md5 helper
|
||||||
|
"""
|
||||||
|
|
||||||
|
def setUp(self):
|
||||||
|
self.tokenizer = LlamaTokenizer.from_pretrained("huggyllama/llama-7b")
|
||||||
|
self.tokenizer.add_special_tokens(
|
||||||
|
{
|
||||||
|
"eos_token": "</s>",
|
||||||
|
"bos_token": "<s>",
|
||||||
|
"unk_token": "<unk>",
|
||||||
|
"pad_token": "<pad>",
|
||||||
|
}
|
||||||
|
)
|
||||||
|
self.max_tokens = 15 # set a small number for easy inspection
|
||||||
|
|
||||||
|
def test_encode_pretraining(self):
|
||||||
|
examples = {
|
||||||
|
"text": [
|
||||||
|
"Hello, world!",
|
||||||
|
"Nice to meet you.",
|
||||||
|
"lorem ipsum dolor sit amet.",
|
||||||
|
"Nice to meet you again!.",
|
||||||
|
"hello, hello",
|
||||||
|
]
|
||||||
|
}
|
||||||
|
result = encode_pretraining(self.tokenizer, self.max_tokens, examples)
|
||||||
|
|
||||||
|
self.assertEqual(len(result["input_ids"]), 3)
|
||||||
|
|
||||||
|
# Assert the length of input_ids and attention_mask is correct
|
||||||
|
self.assertEqual(len(result["input_ids"][0]), self.max_tokens)
|
||||||
|
self.assertEqual(len(result["attention_mask"][0]), self.max_tokens)
|
||||||
|
|
||||||
|
# Assert EOS and PAD tokens are correctly added
|
||||||
|
# hello world! is 4 tokens
|
||||||
|
self.assertEqual(result["input_ids"][0][0], self.tokenizer.bos_token_id)
|
||||||
|
self.assertEqual(result["input_ids"][0][5], self.tokenizer.eos_token_id)
|
||||||
|
self.assertEqual(result["input_ids"][0][6], self.tokenizer.pad_token_id)
|
||||||
|
# second part, 5 tokens
|
||||||
|
self.assertEqual(result["input_ids"][0][7], self.tokenizer.bos_token_id)
|
||||||
|
self.assertEqual(result["input_ids"][0][13], self.tokenizer.eos_token_id)
|
||||||
|
self.assertEqual(result["input_ids"][0][14], self.tokenizer.pad_token_id)
|
||||||
|
|
||||||
|
def test_md5(self):
|
||||||
|
self.assertEqual(md5("hello world"), "5eb63bbbe01eeed093cb22bb8f5acdc3")
|
||||||
|
self.assertEqual(
|
||||||
|
md5("hello world", "utf-8"), "5eb63bbbe01eeed093cb22bb8f5acdc3"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
unittest.main()
|
||||||
@@ -328,6 +328,20 @@ class ValidationTest(unittest.TestCase):
|
|||||||
for record in self._caplog.records
|
for record in self._caplog.records
|
||||||
)
|
)
|
||||||
|
|
||||||
|
cfg = DictDefault(
|
||||||
|
{
|
||||||
|
"sample_packing": True,
|
||||||
|
"pad_to_sequence_len": None,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
with self._caplog.at_level(logging.WARNING):
|
||||||
|
validate_config(cfg)
|
||||||
|
assert any(
|
||||||
|
"`pad_to_sequence_len: true` is recommended when using sample_packing"
|
||||||
|
in record.message
|
||||||
|
for record in self._caplog.records
|
||||||
|
)
|
||||||
|
|
||||||
cfg = DictDefault(
|
cfg = DictDefault(
|
||||||
{
|
{
|
||||||
"max_packed_sequence_len": 2048,
|
"max_packed_sequence_len": 2048,
|
||||||
|
|||||||
Reference in New Issue
Block a user