Compare commits

...

38 Commits

Author SHA1 Message Date
Wing Lian
a6f5e5eaec Merge pull request #134 from OpenAccess-AI-Collective/gas-batch-fix
fix batch size calculation
2023-05-31 14:24:48 -04:00
Wing Lian
5a631b305b fix batch size calculation 2023-05-31 14:11:32 -04:00
Wing Lian
f94dd626f0 Merge pull request #130 from OpenAccess-AI-Collective/gas
swap batch size for gradient accumulation steps to decouple from num gpu
2023-05-31 13:03:51 -04:00
Wing Lian
5079753b7a Merge pull request #131 from OpenAccess-AI-Collective/fix-packing-mask
fix packing so that concatenated sequences reset the attention
2023-05-31 13:03:37 -04:00
Wing Lian
0136f510f2 don't worry about duplicate code here 2023-05-31 12:05:43 -04:00
Wing Lian
9b8585dc70 fix packing so that concatenated sequences reset the attention 2023-05-31 11:38:52 -04:00
Wing Lian
8eb5811d4e Merge pull request #129 from OpenAccess-AI-Collective/builder-badge
add badge info to readme
2023-05-31 10:37:59 -04:00
Wing Lian
e0011fdf55 Fix base builder, missing tags 2023-05-31 09:52:03 -04:00
Wing Lian
6e9e98720e Merge pull request #127 from OpenAccess-AI-Collective/py310-docker-runpod
add py310 support from base image
2023-05-31 09:39:42 -04:00
Wing Lian
c2a0792680 swap batch size for gradient accumulation steps to decouple from num gpu 2023-05-31 09:38:12 -04:00
Wing Lian
b267d24a2b add badge info to readme 2023-05-31 09:28:44 -04:00
Wing Lian
5c3f5db38b Add files via upload 2023-05-31 09:22:54 -04:00
Wing Lian
e3d03745ba add py310 support from base image 2023-05-31 09:07:28 -04:00
NanoCode012
fac46002d4 Merge pull request #119 from NanoCode012/feat/update-inference
Feat(inference): Swap to GenerationConfig
2023-05-31 14:09:18 +09:00
NanoCode012
33d40179ba Increase max_new_tokens
Co-authored-by: Wing Lian <wing.lian@gmail.com>
2023-05-31 14:04:49 +09:00
Wing Lian
dcb03d6da4 Merge pull request #114 from OpenAccess-AI-Collective/accelerate-dep
Add accelerate dep
2023-05-31 00:47:17 -04:00
NanoCode012
0e4be625ae Merge pull request #118 from NanoCode012/feat/torch-readme
Fix(readme): Fix torch missing from readme
2023-05-31 13:29:41 +09:00
NanoCode012
bdc4bd7d4e Update README.md 2023-05-31 13:24:28 +09:00
Wing Lian
2d0ba3b818 Merge pull request #124 from OpenAccess-AI-Collective/xformers-fix
copy xformers attn from ooba since we removed dep on alpaca_lora_4bit
2023-05-31 00:11:40 -04:00
Wing Lian
c7021e191f Merge pull request #120 from OpenAccess-AI-Collective/model-from-path
split up llama model loading so config can be loaded from base config and models can be loaded from a path
2023-05-31 00:08:38 -04:00
Wing Lian
c56818b119 don't worry about dupes 2023-05-31 00:06:47 -04:00
Wing Lian
2675fb756e update readme for SDP 2023-05-31 00:04:54 -04:00
Wing Lian
1076bcbbca Update src/axolotl/monkeypatch/llama_attn_hijack_xformers.py
Co-authored-by: NanoCode012 <kevinvong@rocketmail.com>
2023-05-31 00:00:19 -04:00
Wing Lian
2daa6835f0 Update src/axolotl/monkeypatch/llama_attn_hijack_xformers.py
Co-authored-by: NanoCode012 <kevinvong@rocketmail.com>
2023-05-30 23:59:05 -04:00
Wing Lian
e3c494ca7b remove unused import and update readme 2023-05-30 23:55:45 -04:00
Wing Lian
ad0ea6aaab black formatting
ignore copied file
fix linting
2023-05-30 23:50:29 -04:00
Wing Lian
876edd83d0 Merge pull request #123 from OpenAccess-AI-Collective/bas-batch
add support for gradient accumulation steps
2023-05-30 23:45:29 -04:00
Wing Lian
6cb2310592 copy xformers attn from ooba since we removed dep on alpaca_lora_4bit 2023-05-30 23:34:36 -04:00
Wing Lian
6fa40bf8ad black formatting 2023-05-30 23:33:37 -04:00
Wing Lian
3aad5f3b3e add support for gradient accumulation steps 2023-05-30 23:24:37 -04:00
Wing Lian
39a208c2bc fix up tokenizer config, isort fix 2023-05-30 23:00:02 -04:00
Wing Lian
2520ecd6df split up llama model loading so config can be loaded from base config and models can be loaded from a path 2023-05-30 22:32:44 -04:00
Wing Lian
c5b0af1a7e define python version (3.10) explicitly as string in yaml 2023-05-30 22:23:35 -04:00
NanoCode012
988aeb9c34 Feat: Swap to GenerationConfig 2023-05-31 10:48:19 +09:00
NanoCode012
cf61f14bff FIx(readme): Fix torch missing from readme 2023-05-31 10:28:49 +09:00
Wing Lian
0abcd71a85 Merge pull request #115 from OpenAccess-AI-Collective/docker-version-fixes
docker fixes: py310, fix cuda arg in deepspeed
2023-05-30 18:11:26 -04:00
Wing Lian
c43c5c84ff py310, fix cuda arg in deepspeed 2023-05-30 18:02:34 -04:00
Wing Lian
36ec6e1a0e Add accelerate dep 2023-05-30 16:36:13 -04:00
33 changed files with 476 additions and 45 deletions

View File

@@ -16,13 +16,22 @@ jobs:
include: include:
- cuda: "118" - cuda: "118"
cuda_version: 11.8.0 cuda_version: 11.8.0
python_version: "3.9"
pytorch: 2.0.0
axolotl_extras:
- cuda: "118"
cuda_version: 11.8.0
python_version: "3.10"
pytorch: 2.0.0
axolotl_extras: axolotl_extras:
- cuda: "117" - cuda: "117"
cuda_version: 11.7.0 cuda_version: 11.7.0
python_version: "3.9"
pytorch: 1.13.1 pytorch: 1.13.1
axolotl_extras: axolotl_extras:
- cuda: "118" - cuda: "118"
cuda_version: 11.8.0 cuda_version: 11.8.0
python_version: "3.9"
pytorch: 2.0.0 pytorch: 2.0.0
axolotl_extras: gptq axolotl_extras: gptq
steps: steps:
@@ -46,12 +55,13 @@ jobs:
context: . context: .
file: ./docker/Dockerfile-base file: ./docker/Dockerfile-base
push: ${{ github.event_name != 'pull_request' }} push: ${{ github.event_name != 'pull_request' }}
tags: ${{ steps.metadata.outputs.tags }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }} tags: ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
labels: ${{ steps.metadata.outputs.labels }} labels: ${{ steps.metadata.outputs.labels }}
cache-from: type=gha cache-from: type=gha
cache-to: type=gha,mode=max cache-to: type=gha,mode=max
build-args: | build-args: |
CUDA_VERSION=${{ matrix.cuda_version }} CUDA_VERSION=${{ matrix.cuda_version }}
CUDA=${{ matrix.cuda }} CUDA=${{ matrix.cuda }}
PYTHON_VERSION=${{ matrix.python_version }}
PYTORCH_VERSION=${{ matrix.pytorch }} PYTORCH_VERSION=${{ matrix.pytorch }}
AXOLOTL_EXTRAS=${{ matrix.axolotl_extras }} AXOLOTL_EXTRAS=${{ matrix.axolotl_extras }}

View File

@@ -15,14 +15,22 @@ jobs:
include: include:
- cuda: cu118 - cuda: cu118
cuda_version: 11.8.0 cuda_version: 11.8.0
python_version: "3.9"
pytorch: 2.0.0 pytorch: 2.0.0
axolotl_extras: axolotl_extras:
- cuda: cu118 - cuda: cu118
cuda_version: 11.8.0 cuda_version: 11.8.0
python_version: "3.10"
pytorch: 2.0.0
axolotl_extras:
- cuda: cu118
cuda_version: 11.8.0
python_version: "3.9"
pytorch: 2.0.0 pytorch: 2.0.0
axolotl_extras: gptq axolotl_extras: gptq
- cuda: cu117 - cuda: cu117
cuda_version: 11.7.0 cuda_version: 11.7.0
python_version: "3.9"
pytorch: 1.13.1 pytorch: 1.13.1
axolotl_extras: axolotl_extras:
runs-on: self-hosted runs-on: self-hosted
@@ -46,10 +54,10 @@ jobs:
with: with:
context: . context: .
build-args: | build-args: |
BASE_TAG=${{ github.ref_name }}-base-${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }} BASE_TAG=${{ github.ref_name }}-base-py${{ matrix.python_version }}-${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
file: ./docker/Dockerfile file: ./docker/Dockerfile
push: ${{ github.event_name != 'pull_request' }} push: ${{ github.event_name != 'pull_request' }}
tags: ${{ steps.metadata.outputs.tags }}-${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }} tags: ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
labels: ${{ steps.metadata.outputs.labels }} labels: ${{ steps.metadata.outputs.labels }}
cache-from: type=gha cache-from: type=gha
cache-to: type=gha,mode=max cache-to: type=gha,mode=max
@@ -62,14 +70,22 @@ jobs:
include: include:
- cuda: cu118 - cuda: cu118
cuda_version: 11.8.0 cuda_version: 11.8.0
python_version: "3.9"
pytorch: 2.0.0 pytorch: 2.0.0
axolotl_extras: axolotl_extras:
- cuda: cu118 - cuda: cu118
cuda_version: 11.8.0 cuda_version: 11.8.0
python_version: "3.10"
pytorch: 2.0.0
axolotl_extras:
- cuda: cu118
cuda_version: 11.8.0
python_version: "3.9"
pytorch: 2.0.0 pytorch: 2.0.0
axolotl_extras: gptq axolotl_extras: gptq
- cuda: cu117 - cuda: cu117
cuda_version: 11.7.0 cuda_version: 11.7.0
python_version: "3.9"
pytorch: 1.13.1 pytorch: 1.13.1
axolotl_extras: axolotl_extras:
runs-on: self-hosted runs-on: self-hosted
@@ -93,10 +109,10 @@ jobs:
with: with:
context: . context: .
build-args: | build-args: |
BASE_TAG=${{ github.ref_name }}-${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }} BASE_TAG=${{ github.ref_name }}-py${{ matrix.python_version }}-${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
file: ./docker/Dockerfile-runpod file: ./docker/Dockerfile-runpod
push: ${{ github.event_name != 'pull_request' }} push: ${{ github.event_name != 'pull_request' }}
tags: ${{ steps.metadata.outputs.tags }}-${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }} tags: ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
labels: ${{ steps.metadata.outputs.labels }} labels: ${{ steps.metadata.outputs.labels }}
cache-from: type=gha cache-from: type=gha
cache-to: type=gha,mode=max cache-to: type=gha,mode=max

View File

@@ -5,6 +5,9 @@ exclude = venv
[mypy-alpaca_lora_4bit.*] [mypy-alpaca_lora_4bit.*]
ignore_missing_imports = True ignore_missing_imports = True
[mypy-axolotl.monkeypatch.*]
ignore_errors = True
[mypy-flash_attn.*] [mypy-flash_attn.*]
ignore_missing_imports = True ignore_missing_imports = True
@@ -31,3 +34,6 @@ ignore_missing_imports = True
[mypy-addict] [mypy-addict]
ignore_missing_imports = True ignore_missing_imports = True
[mypy-xformers.*]
ignore_missing_imports = True

View File

@@ -27,7 +27,7 @@
## Quickstart ⚡ ## Quickstart ⚡
**Requirements**: Python 3.9. **Requirements**: Python 3.9 and Pytorch 2.0.
```bash ```bash
git clone https://github.com/OpenAccess-AI-Collective/axolotl git clone https://github.com/OpenAccess-AI-Collective/axolotl
@@ -58,7 +58,9 @@ accelerate launch scripts/finetune.py examples/lora-openllama-3b/config.yml \
- Conda/Pip venv - Conda/Pip venv
1. Install python **3.9** 1. Install python **3.9**
2. Install python dependencies with ONE of the following: 2. Install pytorch stable https://pytorch.org/get-started/locally/
3. Install python dependencies with ONE of the following:
- `pip3 install -e .` (recommended, supports QLoRA, no gptq/int4 support) - `pip3 install -e .` (recommended, supports QLoRA, no gptq/int4 support)
- `pip3 install -e .[gptq]` (next best if you don't need QLoRA, but want to use gptq) - `pip3 install -e .[gptq]` (next best if you don't need QLoRA, but want to use gptq)
- `pip3 install -e .[gptq_triton]` - `pip3 install -e .[gptq_triton]`
@@ -171,6 +173,9 @@ base_model_ignore_patterns:
# if the base_model repo on hf hub doesn't include configuration .json files, # if the base_model repo on hf hub doesn't include configuration .json files,
# you can set that here, or leave this empty to default to base_model # you can set that here, or leave this empty to default to base_model
base_model_config: ./llama-7b-hf base_model_config: ./llama-7b-hf
# Optional tokenizer configuration override in case you want to use a different tokenizer
# than the one defined in the base model
tokenizer_config:
# If you want to specify the type of model to load, AutoModelForCausalLM is a good choice too # If you want to specify the type of model to load, AutoModelForCausalLM is a good choice too
model_type: AutoModelForCausalLM model_type: AutoModelForCausalLM
# Corresponding tokenizer for the model AutoTokenizer is a good choice # Corresponding tokenizer for the model AutoTokenizer is a good choice
@@ -260,7 +265,7 @@ wandb_log_model: # 'checkpoint'
output_dir: ./completed-model output_dir: ./completed-model
# training hyperparameters # training hyperparameters
batch_size: 8 gradient_accumulation_steps: 1
micro_batch_size: 2 micro_batch_size: 2
eval_batch_size: 2 eval_batch_size: 2
num_epochs: 3 num_epochs: 3
@@ -300,6 +305,9 @@ weight_decay:
xformers_attention: xformers_attention:
# whether to use flash attention patch https://github.com/HazyResearch/flash-attention: # whether to use flash attention patch https://github.com/HazyResearch/flash-attention:
flash_attention: # require a100 for llama flash_attention: # require a100 for llama
# whether to use scaled-dot-product attention
# https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
sdp_attention:
# resume from a specific checkpoint dir # resume from a specific checkpoint dir
resume_from_checkpoint: resume_from_checkpoint:
@@ -403,6 +411,16 @@ Try to turn off xformers.
Join our [Discord server](https://discord.gg/HhrNrHJPRb) where we can help you Join our [Discord server](https://discord.gg/HhrNrHJPRb) where we can help you
## Badge ❤🏷️
Building something cool with Axolotl? Consider adding a badge to your model card.
```markdown
[<img src="https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/OpenAccess-AI-Collective/axolotl)
```
[<img src="https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/OpenAccess-AI-Collective/axolotl)
## Contributing 🤝 ## Contributing 🤝
Bugs? Please check for open issue else create a new [Issue](https://github.com/OpenAccess-AI-Collective/axolotl/issues/new). Bugs? Please check for open issue else create a new [Issue](https://github.com/OpenAccess-AI-Collective/axolotl/issues/new).

View File

@@ -26,7 +26,7 @@ wandb_watch:
wandb_run_id: wandb_run_id:
wandb_log_model: wandb_log_model:
output_dir: ./lora-alpaca output_dir: ./lora-alpaca
batch_size: 32 gradient_accumulation_steps: 1
micro_batch_size: 4 micro_batch_size: 4
num_epochs: 5 num_epochs: 5
learning_rate: 0.0003 learning_rate: 0.0003

View File

@@ -23,7 +23,7 @@ wandb_watch:
wandb_run_id: wandb_run_id:
wandb_log_model: wandb_log_model:
output_dir: ./lora-llama-alpaca output_dir: ./lora-llama-alpaca
batch_size: 32 gradient_accumulation_steps: 1
micro_batch_size: 16 micro_batch_size: 16
num_epochs: 3 num_epochs: 3
learning_rate: 0.00003 learning_rate: 0.00003

View File

@@ -25,7 +25,7 @@ wandb_watch:
wandb_run_id: wandb_run_id:
wandb_log_model: wandb_log_model:
output_dir: ./gpt4all-neox-20b output_dir: ./gpt4all-neox-20b
batch_size: 48 gradient_accumulation_steps: 1
micro_batch_size: 4 micro_batch_size: 4
num_epochs: 5 num_epochs: 5
learning_rate: 0.00003 learning_rate: 0.00003

View File

@@ -23,7 +23,7 @@ wandb_watch:
wandb_run_id: wandb_run_id:
wandb_log_model: wandb_log_model:
output_dir: ./llama-13b-sharegpt output_dir: ./llama-13b-sharegpt
batch_size: 64 gradient_accumulation_steps: 1
micro_batch_size: 2 micro_batch_size: 2
warmup_steps: 1000 warmup_steps: 1000
save_steps: save_steps:

View File

@@ -29,7 +29,7 @@ wandb_watch:
wandb_run_id: wandb_run_id:
wandb_log_model: wandb_log_model:
output_dir: ./lora-llama-alpaca output_dir: ./lora-llama-alpaca
batch_size: 128 gradient_accumulation_steps: 1
micro_batch_size: 16 micro_batch_size: 16
warmup_steps: 1000 warmup_steps: 1000
save_steps: save_steps:

View File

@@ -26,7 +26,7 @@ wandb_watch:
wandb_run_id: wandb_run_id:
wandb_log_model: wandb_log_model:
output_dir: ./lora-test output_dir: ./lora-test
batch_size: 8 gradient_accumulation_steps: 1
micro_batch_size: 2 micro_batch_size: 2
num_epochs: 3 num_epochs: 3
warmup_steps: 100 warmup_steps: 100

View File

@@ -28,7 +28,7 @@ wandb_watch:
wandb_run_id: wandb_run_id:
wandb_log_model: wandb_log_model:
output_dir: ./lora-llama-alpaca output_dir: ./lora-llama-alpaca
batch_size: 128 gradient_accumulation_steps: 1
micro_batch_size: 16 micro_batch_size: 16
num_epochs: 5 num_epochs: 5
learning_rate: 0.00003 learning_rate: 0.00003

View File

@@ -24,7 +24,7 @@ wandb_watch:
wandb_run_id: wandb_run_id:
wandb_log_model: wandb_log_model:
output_dir: ./jeopardy-bot-7b output_dir: ./jeopardy-bot-7b
batch_size: 4 gradient_accumulation_steps: 2
micro_batch_size: 1 micro_batch_size: 1
num_epochs: 2 num_epochs: 2
optimizer: adamw_bnb_8bit optimizer: adamw_bnb_8bit

View File

@@ -28,7 +28,7 @@ wandb_watch:
wandb_run_id: wandb_run_id:
wandb_log_model: wandb_log_model:
output_dir: ./lora-alpaca output_dir: ./lora-alpaca
batch_size: 48 gradient_accumulation_steps: 1
micro_batch_size: 4 micro_batch_size: 4
num_epochs: 5 num_epochs: 5
learning_rate: 0.00001 learning_rate: 0.00001

View File

@@ -26,7 +26,7 @@ wandb_watch:
wandb_run_id: wandb_run_id:
wandb_log_model: wandb_log_model:
output_dir: ./lora-test output_dir: ./lora-test
batch_size: 4 gradient_accumulation_steps: 1
micro_batch_size: 1 micro_batch_size: 1
num_epochs: 3 num_epochs: 3
warmup_steps: 100 warmup_steps: 100

View File

@@ -53,7 +53,8 @@ wandb_log_model:
# where to save the finsihed model to # where to save the finsihed model to
output_dir: ./completed-model output_dir: ./completed-model
# training hyperparameters # training hyperparameters
batch_size: 8 gradient_accumulation_steps: 1
batch_size:
micro_batch_size: 2 micro_batch_size: 2
num_epochs: 3 num_epochs: 3
warmup_steps: 100 warmup_steps: 100

View File

@@ -22,7 +22,7 @@ wandb_watch:
wandb_run_id: wandb_run_id:
wandb_log_model: wandb_log_model:
output_dir: ./stable-alpaca-3b output_dir: ./stable-alpaca-3b
batch_size: 2 gradient_accumulation_steps: 1
micro_batch_size: 1 micro_batch_size: 1
num_epochs: 1 num_epochs: 1
optimizer: adamw_bnb_8bit optimizer: adamw_bnb_8bit

View File

@@ -30,7 +30,7 @@ wandb_watch:
wandb_run_id: wandb_run_id:
wandb_log_model: wandb_log_model:
output_dir: ./lora-reflect output_dir: ./lora-reflect
batch_size: 8 gradient_accumulation_steps: 1
micro_batch_size: 2 micro_batch_size: 2
num_epochs: 3 num_epochs: 3
learning_rate: 0.00003 learning_rate: 0.00003

View File

@@ -52,6 +52,8 @@ RUN git clone https://github.com/HazyResearch/flash-attention.git && \
FROM base-builder AS deepspeed-builder FROM base-builder AS deepspeed-builder
ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
WORKDIR /workspace WORKDIR /workspace
RUN git clone https://github.com/microsoft/DeepSpeed.git && \ RUN git clone https://github.com/microsoft/DeepSpeed.git && \

View File

@@ -26,7 +26,7 @@ wandb_watch:
wandb_run_id: wandb_run_id:
wandb_log_model: wandb_log_model:
output_dir: ./llama-7b-lora-int4 output_dir: ./llama-7b-lora-int4
batch_size: 1 gradient_accumulation_steps: 1
micro_batch_size: 1 micro_batch_size: 1
num_epochs: 3 num_epochs: 3
optimizer: adamw_bnb_8bit optimizer: adamw_bnb_8bit

View File

@@ -24,7 +24,7 @@ wandb_watch:
wandb_run_id: wandb_run_id:
wandb_log_model: wandb_log_model:
output_dir: ./mpt-alpaca-7b output_dir: ./mpt-alpaca-7b
batch_size: 1 gradient_accumulation_steps: 1
micro_batch_size: 1 micro_batch_size: 1
num_epochs: 3 num_epochs: 3
optimizer: adamw_bnb_8bit optimizer: adamw_bnb_8bit

BIN
image/axolotl-badge-web.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 11 KiB

View File

@@ -1,6 +1,7 @@
peft @ git+https://github.com/huggingface/peft.git peft @ git+https://github.com/huggingface/peft.git
transformers @ git+https://github.com/huggingface/transformers.git transformers @ git+https://github.com/huggingface/transformers.git
bitsandbytes>=0.39.0 bitsandbytes>=0.39.0
accelerate
addict addict
fire fire
PyYAML==6.0 PyYAML==6.0

View File

@@ -12,6 +12,7 @@ from typing import Any, Dict, List, Optional, Union
import fire import fire
import torch import torch
import yaml import yaml
from transformers import GenerationConfig
from axolotl.utils.data import load_prepare_datasets from axolotl.utils.data import load_prepare_datasets
from axolotl.utils.dict import DictDefault from axolotl.utils.dict import DictDefault
@@ -73,26 +74,33 @@ def do_inference(cfg, model, tokenizer, prompter="AlpacaPrompter"):
instruction = get_multi_line_input() instruction = get_multi_line_input()
if not instruction: if not instruction:
return return
prompt: str = next(prompter_module().build_prompt(instruction=instruction)) prompt: str = next(
prompter_module().build_prompt(instruction=instruction.strip("\n"))
)
batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=True) batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
model.eval() model.eval()
with torch.no_grad(): with torch.no_grad():
# gc = GenerationConfig() # TODO swap out and use this generation_config = GenerationConfig(
generated = model.generate(
inputs=batch["input_ids"].to(cfg.device),
do_sample=True,
use_cache=True,
repetition_penalty=1.1, repetition_penalty=1.1,
max_new_tokens=100, max_new_tokens=1024,
temperature=0.9, temperature=0.9,
top_p=0.95, top_p=0.95,
top_k=40, top_k=40,
bos_token_id=tokenizer.bos_token_id,
eos_token_id=tokenizer.eos_token_id,
pad_token_id=tokenizer.pad_token_id,
do_sample=True,
use_cache=True,
return_dict_in_generate=True, return_dict_in_generate=True,
output_attentions=False, output_attentions=False,
output_hidden_states=False, output_hidden_states=False,
output_scores=False, output_scores=False,
) )
generated = model.generate(
inputs=batch["input_ids"].to(cfg.device),
generation_config=generation_config,
)
print(tokenizer.decode(generated["sequences"].cpu().tolist()[0])) print(tokenizer.decode(generated["sequences"].cpu().tolist()[0]))
@@ -149,17 +157,23 @@ def train(
else: else:
cfg[k] = kwargs[k] cfg[k] = kwargs[k]
validate_config(cfg)
# setup some derived config / hyperparams # setup some derived config / hyperparams
cfg.gradient_accumulation_steps = cfg.batch_size // cfg.micro_batch_size cfg.gradient_accumulation_steps = cfg.gradient_accumulation_steps or (
cfg.batch_size // cfg.micro_batch_size
)
cfg.batch_size = (
cfg.batch_size or cfg.micro_batch_size * cfg.gradient_accumulation_steps
)
cfg.world_size = int(os.environ.get("WORLD_SIZE", 1)) cfg.world_size = int(os.environ.get("WORLD_SIZE", 1))
cfg.local_rank = int(os.environ.get("LOCAL_RANK", 0)) cfg.local_rank = int(os.environ.get("LOCAL_RANK", 0))
choose_device(cfg) choose_device(cfg)
cfg.ddp = cfg.ddp if cfg.ddp is not None else cfg.world_size != 1 cfg.ddp = cfg.ddp if cfg.ddp is not None else cfg.world_size != 1
if cfg.ddp: if cfg.ddp:
cfg.device_map = {"": int(os.environ.get("LOCAL_RANK", 0))} cfg.device_map = {"": int(os.environ.get("LOCAL_RANK", 0))}
cfg.gradient_accumulation_steps = ( cfg.batch_size = cfg.batch_size * cfg.world_size
cfg.gradient_accumulation_steps // cfg.world_size
)
setup_wandb_env_vars(cfg) setup_wandb_env_vars(cfg)
if cfg.device == "mps": if cfg.device == "mps":
cfg.load_in_8bit = False cfg.load_in_8bit = False
@@ -168,11 +182,10 @@ def train(
cfg.fp16 = True cfg.fp16 = True
cfg.bf16 = False cfg.bf16 = False
validate_config(cfg)
# load the tokenizer first # load the tokenizer first
logging.info("loading tokenizer...") tokenizer_config = cfg.tokenizer_config or cfg.base_model_config
tokenizer = load_tokenizer(cfg.base_model_config, cfg.tokenizer_type, cfg) logging.info(f"loading tokenizer... {tokenizer_config}")
tokenizer = load_tokenizer(tokenizer_config, cfg.tokenizer_type, cfg)
if check_not_in( if check_not_in(
["inference", "shard", "merge_lora"], kwargs ["inference", "shard", "merge_lora"], kwargs

View File

@@ -127,6 +127,11 @@ class ConstantLengthDataset(IterableDataset):
input_ids = example["input_ids"] input_ids = example["input_ids"]
attention_mask = example["attention_mask"] attention_mask = example["attention_mask"]
labels = example["labels"] labels = example["labels"]
if (
buffer["input_ids"]
and input_ids[0] == self.tokenizer.bos_token_id
):
attention_mask[0] = 0
if add_concat_token: if add_concat_token:
input_ids.append(self.concat_token_id) input_ids.append(self.concat_token_id)

View File

@@ -25,6 +25,7 @@ def forward(
attention_mask: [bsz, q_len] attention_mask: [bsz, q_len]
""" """
# pylint: disable=duplicate-code
bsz, q_len, _ = hidden_states.size() bsz, q_len, _ = hidden_states.size()
query_states = ( query_states = (

View File

@@ -0,0 +1,233 @@
"""
Directly copied the code from https://raw.githubusercontent.com/oobabooga/text-generation-webui/main/modules/llama_attn_hijack.py and made some adjustments
"""
import logging
import math
from typing import Optional, Tuple
import torch
import transformers.models.llama.modeling_llama
from torch import nn
try:
import xformers.ops
except ImportError:
logging.error("xformers not found! Please install it before trying to use it.")
def hijack_llama_attention():
transformers.models.llama.modeling_llama.LlamaAttention.forward = xformers_forward
def hijack_llama_sdp_attention():
transformers.models.llama.modeling_llama.LlamaAttention.forward = (
sdp_attention_forward
)
def xformers_forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: bool = False,
use_cache: bool = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
# pylint: disable=duplicate-code
bsz, q_len, _ = hidden_states.size()
query_states = (
self.q_proj(hidden_states)
.view(bsz, q_len, self.num_heads, self.head_dim)
.transpose(1, 2)
)
key_states = (
self.k_proj(hidden_states)
.view(bsz, q_len, self.num_heads, self.head_dim)
.transpose(1, 2)
)
value_states = (
self.v_proj(hidden_states)
.view(bsz, q_len, self.num_heads, self.head_dim)
.transpose(1, 2)
)
kv_seq_len = key_states.shape[-2]
if past_key_value is not None:
kv_seq_len += past_key_value[0].shape[-2]
cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
(
query_states,
key_states,
) = transformers.models.llama.modeling_llama.apply_rotary_pos_emb(
query_states, key_states, cos, sin, position_ids
)
# [bsz, nh, t, hd]
if past_key_value is not None:
# reuse k, v, self_attention
key_states = torch.cat([past_key_value[0], key_states], dim=2)
value_states = torch.cat([past_key_value[1], value_states], dim=2)
past_key_value = (key_states, value_states) if use_cache else None
# We only apply xformers optimizations if we don't need to output the whole attention matrix
if not output_attentions:
query_states = query_states.transpose(1, 2)
key_states = key_states.transpose(1, 2)
value_states = value_states.transpose(1, 2)
# This is a nasty hack. We know attention_mask in transformers is either LowerTriangular or all Zeros.
# We therefore check if one element in the upper triangular portion is zero. If it is, then the mask is all zeros.
if attention_mask is None or attention_mask[0, 0, 0, 1] == 0:
# input and output should be of form (bsz, q_len, num_heads, head_dim)
attn_output = xformers.ops.memory_efficient_attention(
query_states, key_states, value_states, attn_bias=None
)
else:
# input and output should be of form (bsz, q_len, num_heads, head_dim)
attn_output = xformers.ops.memory_efficient_attention(
query_states,
key_states,
value_states,
attn_bias=xformers.ops.LowerTriangularMask(),
)
attn_weights = None
else:
attn_weights = torch.matmul(
query_states, key_states.transpose(2, 3)
) / math.sqrt(self.head_dim)
if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
raise ValueError(
f"Attention weights should be of size {(bsz * self.num_heads, q_len, kv_seq_len)}, but is"
f" {attn_weights.size()}"
)
if attention_mask is not None:
if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
raise ValueError(
f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
)
attn_weights = attn_weights + attention_mask
attn_weights = torch.max(
attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min)
)
# upcast attention to fp32
attn_weights = nn.functional.softmax(
attn_weights, dim=-1, dtype=torch.float32
).to(query_states.dtype)
attn_output = torch.matmul(attn_weights, value_states)
if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
raise ValueError(
f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
f" {attn_output.size()}"
)
attn_output = attn_output.transpose(1, 2)
attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
attn_output = self.o_proj(attn_output)
return attn_output, attn_weights, past_key_value
def sdp_attention_forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: bool = False,
use_cache: bool = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
# pylint: disable=duplicate-code
bsz, q_len, _ = hidden_states.size()
query_states = (
self.q_proj(hidden_states)
.view(bsz, q_len, self.num_heads, self.head_dim)
.transpose(1, 2)
)
key_states = (
self.k_proj(hidden_states)
.view(bsz, q_len, self.num_heads, self.head_dim)
.transpose(1, 2)
)
value_states = (
self.v_proj(hidden_states)
.view(bsz, q_len, self.num_heads, self.head_dim)
.transpose(1, 2)
)
kv_seq_len = key_states.shape[-2]
if past_key_value is not None:
kv_seq_len += past_key_value[0].shape[-2]
cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
(
query_states,
key_states,
) = transformers.models.llama.modeling_llama.apply_rotary_pos_emb(
query_states, key_states, cos, sin, position_ids
)
# [bsz, nh, t, hd]
if past_key_value is not None:
# reuse k, v, self_attention
key_states = torch.cat([past_key_value[0], key_states], dim=2)
value_states = torch.cat([past_key_value[1], value_states], dim=2)
past_key_value = (key_states, value_states) if use_cache else None
# We only apply sdp attention if we don't need to output the whole attention matrix
if not output_attentions:
attn_output = torch.nn.functional.scaled_dot_product_attention(
query_states,
key_states,
value_states,
attn_mask=attention_mask,
is_causal=False,
)
attn_weights = None
else:
attn_weights = torch.matmul(
query_states, key_states.transpose(2, 3)
) / math.sqrt(self.head_dim)
if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
raise ValueError(
f"Attention weights should be of size {(bsz * self.num_heads, q_len, kv_seq_len)}, but is"
f" {attn_weights.size()}"
)
if attention_mask is not None:
if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
raise ValueError(
f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
)
attn_weights = attn_weights + attention_mask
attn_weights = torch.max(
attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min)
)
# upcast attention to fp32
attn_weights = nn.functional.softmax(
attn_weights, dim=-1, dtype=torch.float32
).to(query_states.dtype)
attn_output = torch.matmul(attn_weights, value_states)
if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
raise ValueError(
f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
f" {attn_output.size()}"
)
attn_output = attn_output.transpose(1, 2)
attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
attn_output = self.o_proj(attn_output)
return attn_output, attn_weights, past_key_value

View File

@@ -233,6 +233,7 @@ def load_tokenized_prepared_datasets(
datasets.append(ds_wrapper) datasets.append(ds_wrapper)
else: else:
logging.error(f"unhandled prompt tokenization strategy: {d.type}") logging.error(f"unhandled prompt tokenization strategy: {d.type}")
raise ValueError(f"unhandled prompt tokenization strategy: {d.type}")
logging.info("tokenizing, merging, and shuffling master dataset") logging.info("tokenizing, merging, and shuffling master dataset")
samples: List[int] = [] samples: List[int] = []

View File

@@ -10,9 +10,14 @@ from typing import TYPE_CHECKING, Optional, Tuple # noqa: F401
import bitsandbytes as bnb import bitsandbytes as bnb
import torch import torch
import transformers import transformers
from transformers import AutoModelForCausalLM # noqa: F401
from transformers import PreTrainedModel # noqa: F401 from transformers import PreTrainedModel # noqa: F401
from transformers import AutoConfig, AutoTokenizer, BitsAndBytesConfig from transformers import ( # noqa: F401
AutoConfig,
AutoModelForCausalLM,
AutoTokenizer,
BitsAndBytesConfig,
LlamaConfig,
)
try: try:
from transformers import LlamaForCausalLM from transformers import LlamaForCausalLM
@@ -25,24 +30,23 @@ from axolotl.prompt_tokenizers import LLAMA_DEFAULT_PAD_TOKEN
if TYPE_CHECKING: if TYPE_CHECKING:
from peft import PeftConfig # noqa: F401 from peft import PeftConfig # noqa: F401
from transformers import PreTrainedTokenizer # noqa: F401
from axolotl.utils.dict import DictDefault # noqa: F401 from axolotl.utils.dict import DictDefault # noqa: F401
def load_tokenizer( def load_tokenizer(
base_model_config, tokenizer_config,
tokenizer_type, tokenizer_type,
cfg, cfg,
): ):
if tokenizer_type: if tokenizer_type:
tokenizer = getattr(transformers, tokenizer_type).from_pretrained( tokenizer = getattr(transformers, tokenizer_type).from_pretrained(
base_model_config, tokenizer_config,
trust_remote_code=cfg.trust_remote_code or False, trust_remote_code=cfg.trust_remote_code or False,
) )
else: else:
tokenizer = AutoTokenizer.from_pretrained( tokenizer = AutoTokenizer.from_pretrained(
base_model_config, tokenizer_config,
trust_remote_code=cfg.trust_remote_code or False, trust_remote_code=cfg.trust_remote_code or False,
) )
@@ -97,12 +101,19 @@ def load_model(
logging.info("patching with flash attention") logging.info("patching with flash attention")
replace_llama_attn_with_flash_attn() replace_llama_attn_with_flash_attn()
elif is_llama_derived_model and cfg.xformers_attention: elif is_llama_derived_model and cfg.xformers_attention:
from alpaca_lora_4bit.monkeypatch.llama_attn_hijack_xformers import ( from axolotl.monkeypatch.llama_attn_hijack_xformers import (
hijack_llama_attention, hijack_llama_attention,
) )
logging.info("patching with xformers attention") logging.info("patching with xformers attention")
hijack_llama_attention() hijack_llama_attention()
elif is_llama_derived_model and cfg.sdp_attention:
from axolotl.monkeypatch.llama_attn_hijack_xformers import (
hijack_llama_sdp_attention,
)
logging.info("patching with sdp attention")
hijack_llama_sdp_attention()
if cfg.bf16: if cfg.bf16:
torch_dtype = torch.bfloat16 torch_dtype = torch.bfloat16
@@ -172,8 +183,10 @@ def load_model(
) )
load_in_8bit = False load_in_8bit = False
elif is_llama_derived_model and "LlamaForCausalLM" in globals(): elif is_llama_derived_model and "LlamaForCausalLM" in globals():
config = LlamaConfig.from_pretrained(base_model_config)
model = LlamaForCausalLM.from_pretrained( model = LlamaForCausalLM.from_pretrained(
base_model, base_model,
config=config,
load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None, load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None, load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None,
torch_dtype=torch_dtype, torch_dtype=torch_dtype,

View File

@@ -4,6 +4,10 @@ import logging
def validate_config(cfg): def validate_config(cfg):
if cfg.gradient_accumulation_steps and cfg.batch_size:
raise ValueError(
"please set only one of gradient_accumulation_steps or batch_size"
)
if cfg.load_4bit: if cfg.load_4bit:
raise ValueError( raise ValueError(
"cfg.load_4bit parameter has been deprecated and replaced by cfg.gptq" "cfg.load_4bit parameter has been deprecated and replaced by cfg.gptq"

12
tests/fixtures/alpaca/alpaca.json vendored Normal file
View File

@@ -0,0 +1,12 @@
[
{
"instruction": "You will be given a series of words. Output these words in reverse order, with each word on its own line.",
"input": "Words: ['Hello', 'world'].",
"output": "['world', 'Hello']"
},
{
"instruction": "In this task, you're given a short description of an event. Your job is to order the steps involved in the event from first to last. Note that there may be multiple correct answers for each event.",
"input": "Description: A man walks into a bar and orders a drink. He pays for his drink and leaves the bar.",
"output": "1. The man walks into the bar.\n2. He orders a drink.\n3. He pays for his drink.\n4. He leaves the bar."
}
]

View File

@@ -0,0 +1,65 @@
"""Module for testing dataset sequence packing"""
import unittest
from pathlib import Path
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer
from axolotl.datasets import ConstantLengthDataset, TokenizedPromptDataset
from axolotl.prompt_tokenizers import AlpacaPromptTokenizingStrategy
from axolotl.prompters import AlpacaPrompter
class TestPacking(unittest.TestCase):
"""
Test class for packing dataset sequences
"""
def setUp(self) -> None:
# pylint: disable=duplicate-code
self.tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
self.tokenizer.add_special_tokens(
{
"bos_token": "<s>",
"eos_token": "</s>",
"unk_token": "<unk>",
}
)
def test_resets_attention(self):
prompter = AlpacaPrompter("chat")
strat = AlpacaPromptTokenizingStrategy(
prompter,
self.tokenizer,
False,
2048,
)
dateset = load_dataset(
"json",
data_files=str(Path(__file__).parent / "fixtures/alpaca/alpaca.json"),
)["train"]
dataset = Dataset.from_list(list(TokenizedPromptDataset(strat, dateset)))
constant_len_dataset = ConstantLengthDataset(
self.tokenizer,
[dataset],
seq_length=2048,
)
packed_dataset = Dataset.from_list(list(constant_len_dataset))
example = packed_dataset[0]
next_bos_index = (
example["input_ids"][1:].index(self.tokenizer.bos_token_id) + 1
) # add one since we sliced
# first example doesn't have mask reset
assert example["input_ids"][0] == self.tokenizer.bos_token_id
assert example["attention_mask"][0] == 1
# but subsequent one does
assert example["input_ids"][next_bos_index] == self.tokenizer.bos_token_id
assert example["attention_mask"][next_bos_index] == 0
if __name__ == "__main__":
unittest.main()

View File

@@ -18,6 +18,7 @@ class TestPromptTokenizationStrategies(unittest.TestCase):
""" """
def setUp(self) -> None: def setUp(self) -> None:
# pylint: disable=duplicate-code
self.tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b") self.tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
self.tokenizer.add_special_tokens( self.tokenizer.add_special_tokens(
{ {

View File

@@ -117,3 +117,32 @@ class ValidationTest(unittest.TestCase):
} }
) )
validate_config(cfg) validate_config(cfg)
def test_gradient_accumulations_or_batch_size(self):
cfg = DictDefault(
{
"gradient_accumulation_steps": 1,
"batch_size": 1,
}
)
with pytest.raises(
ValueError, match=r".*gradient_accumulation_steps or batch_size.*"
):
validate_config(cfg)
cfg = DictDefault(
{
"batch_size": 1,
}
)
validate_config(cfg)
cfg = DictDefault(
{
"gradient_accumulation_steps": 1,
}
)
validate_config(cfg)