Compare commits
22 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
dfe591435f | ||
|
|
5dd9364c00 | ||
|
|
6185cd5227 | ||
|
|
b357c93f23 | ||
|
|
21a5094226 | ||
|
|
3a9ad7c66e | ||
|
|
89134f2143 | ||
|
|
6086be85f7 | ||
|
|
4a92a3b9ee | ||
|
|
46a73e3d1a | ||
|
|
da3415bb5a | ||
|
|
8cb127abeb | ||
|
|
05b398a072 | ||
|
|
e634118f90 | ||
|
|
02af0820f7 | ||
|
|
4155e9988f | ||
|
|
25afd35842 | ||
|
|
da265dd796 | ||
|
|
e07347b188 | ||
|
|
bcdc9b1601 | ||
|
|
c19d060a74 | ||
|
|
601b77bc9d |
11
.github/workflows/base.yml
vendored
11
.github/workflows/base.yml
vendored
@@ -16,17 +16,22 @@ jobs:
|
|||||||
cuda_version: 11.8.0
|
cuda_version: 11.8.0
|
||||||
python_version: "3.10"
|
python_version: "3.10"
|
||||||
pytorch: 2.1.2
|
pytorch: 2.1.2
|
||||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 9.0+PTX"
|
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||||
- cuda: "121"
|
- cuda: "121"
|
||||||
cuda_version: 12.1.0
|
cuda_version: 12.1.0
|
||||||
python_version: "3.10"
|
python_version: "3.10"
|
||||||
pytorch: 2.1.2
|
pytorch: 2.1.2
|
||||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 9.0+PTX"
|
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||||
- cuda: "121"
|
- cuda: "121"
|
||||||
cuda_version: 12.1.0
|
cuda_version: 12.1.0
|
||||||
python_version: "3.11"
|
python_version: "3.11"
|
||||||
pytorch: 2.1.2
|
pytorch: 2.1.2
|
||||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 9.0+PTX"
|
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||||
|
- cuda: "121"
|
||||||
|
cuda_version: 12.1.0
|
||||||
|
python_version: "3.11"
|
||||||
|
pytorch: 2.2.1
|
||||||
|
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
uses: actions/checkout@v3
|
uses: actions/checkout@v3
|
||||||
|
|||||||
8
.github/workflows/main.yml
vendored
8
.github/workflows/main.yml
vendored
@@ -28,7 +28,7 @@ jobs:
|
|||||||
- cuda: 121
|
- cuda: 121
|
||||||
cuda_version: 12.1.0
|
cuda_version: 12.1.0
|
||||||
python_version: "3.11"
|
python_version: "3.11"
|
||||||
pytorch: 2.1.2
|
pytorch: 2.2.1
|
||||||
axolotl_extras:
|
axolotl_extras:
|
||||||
runs-on: axolotl-gpu-runner
|
runs-on: axolotl-gpu-runner
|
||||||
steps:
|
steps:
|
||||||
@@ -63,7 +63,7 @@ jobs:
|
|||||||
${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }}
|
${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }}
|
||||||
labels: ${{ steps.metadata.outputs.labels }}
|
labels: ${{ steps.metadata.outputs.labels }}
|
||||||
|
|
||||||
build-axolotl-runpod:
|
build-axolotl-cloud:
|
||||||
needs: build-axolotl
|
needs: build-axolotl
|
||||||
if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'OpenAccess-AI-Collective' }}
|
if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'OpenAccess-AI-Collective' }}
|
||||||
# this job needs to be run on self-hosted GPU runners...
|
# this job needs to be run on self-hosted GPU runners...
|
||||||
@@ -84,7 +84,7 @@ jobs:
|
|||||||
- cuda: 121
|
- cuda: 121
|
||||||
cuda_version: 12.1.0
|
cuda_version: 12.1.0
|
||||||
python_version: "3.11"
|
python_version: "3.11"
|
||||||
pytorch: 2.1.2
|
pytorch: 2.2.1
|
||||||
axolotl_extras:
|
axolotl_extras:
|
||||||
runs-on: axolotl-gpu-runner
|
runs-on: axolotl-gpu-runner
|
||||||
steps:
|
steps:
|
||||||
@@ -113,7 +113,5 @@ jobs:
|
|||||||
push: ${{ github.event_name != 'pull_request' }}
|
push: ${{ github.event_name != 'pull_request' }}
|
||||||
tags: |
|
tags: |
|
||||||
${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
|
${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
|
||||||
winglian/axolotl-runpod:main-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
|
|
||||||
${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }}
|
${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }}
|
||||||
${{ (matrix.is_latest) && format('{0}-latest', 'winglian/axolotl-runpod:main') || '' }}
|
|
||||||
labels: ${{ steps.metadata.outputs.labels }}
|
labels: ${{ steps.metadata.outputs.labels }}
|
||||||
|
|||||||
118
.github/workflows/nightlies.yml
vendored
Normal file
118
.github/workflows/nightlies.yml
vendored
Normal file
@@ -0,0 +1,118 @@
|
|||||||
|
name: docker-nightlies
|
||||||
|
|
||||||
|
on:
|
||||||
|
workflow_dispatch:
|
||||||
|
schedule:
|
||||||
|
- cron: '0 0 * * *' # Runs at 00:00 UTC every day
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
build-axolotl:
|
||||||
|
if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'OpenAccess-AI-Collective' }}
|
||||||
|
strategy:
|
||||||
|
fail-fast: false
|
||||||
|
matrix:
|
||||||
|
include:
|
||||||
|
- cuda: 118
|
||||||
|
cuda_version: 11.8.0
|
||||||
|
python_version: "3.10"
|
||||||
|
pytorch: 2.1.2
|
||||||
|
axolotl_extras:
|
||||||
|
axolotl_args: "--extra-index-url https://download.pytorch.org/whl/cu118"
|
||||||
|
is_latest: true
|
||||||
|
- cuda: 121
|
||||||
|
cuda_version: 12.1.0
|
||||||
|
python_version: "3.10"
|
||||||
|
pytorch: 2.1.2
|
||||||
|
axolotl_extras:
|
||||||
|
- cuda: 121
|
||||||
|
cuda_version: 12.1.0
|
||||||
|
python_version: "3.11"
|
||||||
|
pytorch: 2.2.1
|
||||||
|
axolotl_extras:
|
||||||
|
runs-on: axolotl-gpu-runner
|
||||||
|
steps:
|
||||||
|
- name: Checkout
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
- name: Docker metadata
|
||||||
|
id: metadata
|
||||||
|
uses: docker/metadata-action@v5
|
||||||
|
with:
|
||||||
|
images: winglian/axolotl
|
||||||
|
tags: |
|
||||||
|
type=raw,value={{ branch }}-{{ date 'YYYYMMDD' }}
|
||||||
|
- name: Set up Docker Buildx
|
||||||
|
uses: docker/setup-buildx-action@v3
|
||||||
|
- name: Login to Docker Hub
|
||||||
|
uses: docker/login-action@v3
|
||||||
|
with:
|
||||||
|
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||||
|
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
||||||
|
# guidance for testing before pushing: https://docs.docker.com/build/ci/github-actions/test-before-push/
|
||||||
|
- name: Build and export to Docker
|
||||||
|
uses: docker/build-push-action@v5
|
||||||
|
with:
|
||||||
|
context: .
|
||||||
|
build-args: |
|
||||||
|
BASE_TAG=${{ github.ref_name }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
|
||||||
|
CUDA=${{ matrix.cuda }}
|
||||||
|
PYTORCH_VERSION=${{ matrix.pytorch }}
|
||||||
|
AXOLOTL_ARGS=${{ matrix.axolotl_args }}
|
||||||
|
file: ./docker/Dockerfile
|
||||||
|
push: ${{ github.event_name != 'pull_request' }}
|
||||||
|
tags: |
|
||||||
|
${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
|
||||||
|
labels: ${{ steps.metadata.outputs.labels }}
|
||||||
|
|
||||||
|
build-axolotl-cloud:
|
||||||
|
needs: build-axolotl
|
||||||
|
if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'OpenAccess-AI-Collective' }}
|
||||||
|
# this job needs to be run on self-hosted GPU runners...
|
||||||
|
strategy:
|
||||||
|
matrix:
|
||||||
|
include:
|
||||||
|
- cuda: 118
|
||||||
|
cuda_version: 11.8.0
|
||||||
|
python_version: "3.10"
|
||||||
|
pytorch: 2.1.2
|
||||||
|
axolotl_extras:
|
||||||
|
is_latest: true
|
||||||
|
- cuda: 121
|
||||||
|
cuda_version: 12.1.0
|
||||||
|
python_version: "3.10"
|
||||||
|
pytorch: 2.1.2
|
||||||
|
axolotl_extras:
|
||||||
|
- cuda: 121
|
||||||
|
cuda_version: 12.1.0
|
||||||
|
python_version: "3.11"
|
||||||
|
pytorch: 2.2.1
|
||||||
|
axolotl_extras:
|
||||||
|
runs-on: axolotl-gpu-runner
|
||||||
|
steps:
|
||||||
|
- name: Checkout
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
- name: Docker metadata
|
||||||
|
id: metadata
|
||||||
|
uses: docker/metadata-action@v5
|
||||||
|
with:
|
||||||
|
images: winglian/axolotl-cloud
|
||||||
|
tags: |
|
||||||
|
type=raw,value={{ branch }}-{{ date 'YYYYMMDD' }}
|
||||||
|
- name: Login to Docker Hub
|
||||||
|
uses: docker/login-action@v3
|
||||||
|
with:
|
||||||
|
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||||
|
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
||||||
|
- name: Set up Docker Buildx
|
||||||
|
uses: docker/setup-buildx-action@v2
|
||||||
|
- name: Build
|
||||||
|
uses: docker/build-push-action@v5
|
||||||
|
with:
|
||||||
|
context: .
|
||||||
|
build-args: |
|
||||||
|
BASE_TAG=${{ github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
|
||||||
|
CUDA=${{ matrix.cuda }}
|
||||||
|
file: ./docker/Dockerfile-cloud
|
||||||
|
push: ${{ github.event_name != 'pull_request' }}
|
||||||
|
tags: |
|
||||||
|
${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
|
||||||
|
labels: ${{ steps.metadata.outputs.labels }}
|
||||||
2
.github/workflows/pypi.yml
vendored
2
.github/workflows/pypi.yml
vendored
@@ -25,7 +25,7 @@ jobs:
|
|||||||
|
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
run: |
|
run: |
|
||||||
pip3 install wheel
|
pip3 install wheel packaging
|
||||||
pip3 install -e .
|
pip3 install -e .
|
||||||
pip3 install -r requirements-tests.txt
|
pip3 install -r requirements-tests.txt
|
||||||
|
|
||||||
|
|||||||
7
.github/workflows/tests.yml
vendored
7
.github/workflows/tests.yml
vendored
@@ -48,6 +48,8 @@ jobs:
|
|||||||
|
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
run: |
|
run: |
|
||||||
|
pip3 install --upgrade pip
|
||||||
|
pip3 install --upgrade packaging
|
||||||
pip3 install -U -e .
|
pip3 install -U -e .
|
||||||
pip3 install -r requirements-tests.txt
|
pip3 install -r requirements-tests.txt
|
||||||
|
|
||||||
@@ -77,6 +79,11 @@ jobs:
|
|||||||
python_version: "3.10"
|
python_version: "3.10"
|
||||||
pytorch: 2.1.2
|
pytorch: 2.1.2
|
||||||
num_gpus: 1
|
num_gpus: 1
|
||||||
|
- cuda: 121
|
||||||
|
cuda_version: 12.1.0
|
||||||
|
python_version: "3.11"
|
||||||
|
pytorch: 2.2.1
|
||||||
|
num_gpus: 1
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
|
|||||||
@@ -22,6 +22,7 @@ RUN git fetch origin +$GITHUB_REF && \
|
|||||||
git checkout FETCH_HEAD
|
git checkout FETCH_HEAD
|
||||||
|
|
||||||
# If AXOLOTL_EXTRAS is set, append it in brackets
|
# If AXOLOTL_EXTRAS is set, append it in brackets
|
||||||
|
RUN pip install causal_conv1d
|
||||||
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
|
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
|
||||||
pip install -e .[deepspeed,flash-attn,mamba-ssm,galore,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
|
pip install -e .[deepspeed,flash-attn,mamba-ssm,galore,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
|
||||||
else \
|
else \
|
||||||
|
|||||||
@@ -20,6 +20,7 @@ RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git
|
|||||||
WORKDIR /workspace/axolotl
|
WORKDIR /workspace/axolotl
|
||||||
|
|
||||||
# If AXOLOTL_EXTRAS is set, append it in brackets
|
# If AXOLOTL_EXTRAS is set, append it in brackets
|
||||||
|
RUN pip install causal_conv1d
|
||||||
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
|
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
|
||||||
pip install -e .[deepspeed,flash-attn,mamba-ssm,galore,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
|
pip install -e .[deepspeed,flash-attn,mamba-ssm,galore,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
|
||||||
else \
|
else \
|
||||||
|
|||||||
@@ -21,7 +21,8 @@ lora_dropout: 0.05
|
|||||||
lora_target_linear: true
|
lora_target_linear: true
|
||||||
|
|
||||||
sequence_len: 4096
|
sequence_len: 4096
|
||||||
sample_packing: false
|
sample_packing: true
|
||||||
|
eval_sample_packing: false
|
||||||
pad_to_sequence_len: true
|
pad_to_sequence_len: true
|
||||||
|
|
||||||
wandb_project:
|
wandb_project:
|
||||||
|
|||||||
10
examples/jamba/README.md
Normal file
10
examples/jamba/README.md
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
# Jamba
|
||||||
|
|
||||||
|
- ✅ qlora w/ deepspeed Zero-2 needs at least 2x GPUs and
|
||||||
|
- 35GiB VRAM per GPU w/ minimal context length
|
||||||
|
- 56GiB VRAM per GPU (w multipack enabled)
|
||||||
|
- ✅ qlora w/ deepspeed Zero-3 needs at least 2x GPUs and 67GiB VRAM (unexpectedly higher than Zero-2)
|
||||||
|
- ✅ qlora single-gpu, ~51GiB VRAM
|
||||||
|
- ✅ multipack
|
||||||
|
- ❓ FSDP
|
||||||
|
- ❓ 8-bit LoRA
|
||||||
62
examples/jamba/qlora.yaml
Normal file
62
examples/jamba/qlora.yaml
Normal file
@@ -0,0 +1,62 @@
|
|||||||
|
base_model: ai21labs/Jamba-v0.1
|
||||||
|
trust_remote_code: true
|
||||||
|
|
||||||
|
load_in_8bit: false
|
||||||
|
load_in_4bit: true
|
||||||
|
strict: false
|
||||||
|
|
||||||
|
datasets:
|
||||||
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
|
type: alpaca
|
||||||
|
dataset_prepared_path:
|
||||||
|
val_set_size: 0.0
|
||||||
|
output_dir: ./out
|
||||||
|
|
||||||
|
sequence_len: 4096
|
||||||
|
sample_packing: false
|
||||||
|
pad_to_sequence_len: false
|
||||||
|
eval_sample_packing: false
|
||||||
|
|
||||||
|
wandb_project:
|
||||||
|
wandb_entity:
|
||||||
|
wandb_watch:
|
||||||
|
wandb_name:
|
||||||
|
wandb_log_model:
|
||||||
|
|
||||||
|
adapter: qlora
|
||||||
|
lora_r: 8
|
||||||
|
lora_alpha: 16
|
||||||
|
lora_dropout: 0.05
|
||||||
|
lora_target_linear: true
|
||||||
|
|
||||||
|
low_cpu_mem_usage: true
|
||||||
|
gradient_accumulation_steps: 4
|
||||||
|
micro_batch_size: 1
|
||||||
|
num_epochs: 2
|
||||||
|
optimizer: paged_adamw_8bit
|
||||||
|
lr_scheduler: cosine
|
||||||
|
learning_rate: 0.00001
|
||||||
|
|
||||||
|
train_on_inputs: false
|
||||||
|
group_by_length: false
|
||||||
|
bf16: auto
|
||||||
|
fp16:
|
||||||
|
tf32: false
|
||||||
|
|
||||||
|
gradient_checkpointing: true
|
||||||
|
gradient_checkpointing_kwargs:
|
||||||
|
use_reentrant: false
|
||||||
|
early_stopping_patience:
|
||||||
|
resume_from_checkpoint:
|
||||||
|
local_rank:
|
||||||
|
logging_steps: 1
|
||||||
|
xformers_attention:
|
||||||
|
flash_attention: true
|
||||||
|
|
||||||
|
warmup_steps: 10
|
||||||
|
evals_per_epoch:
|
||||||
|
saves_per_epoch: 1
|
||||||
|
debug:
|
||||||
|
deepspeed:
|
||||||
|
weight_decay: 0.0
|
||||||
|
special_tokens:
|
||||||
62
examples/jamba/qlora_deepspeed.yaml
Normal file
62
examples/jamba/qlora_deepspeed.yaml
Normal file
@@ -0,0 +1,62 @@
|
|||||||
|
base_model: ai21labs/Jamba-v0.1
|
||||||
|
trust_remote_code: true
|
||||||
|
|
||||||
|
load_in_8bit: false
|
||||||
|
load_in_4bit: true
|
||||||
|
strict: false
|
||||||
|
|
||||||
|
datasets:
|
||||||
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
|
type: alpaca
|
||||||
|
dataset_prepared_path:
|
||||||
|
val_set_size: 0.0
|
||||||
|
output_dir: ./out
|
||||||
|
|
||||||
|
sequence_len: 4096
|
||||||
|
sample_packing: false
|
||||||
|
pad_to_sequence_len: false
|
||||||
|
eval_sample_packing: false
|
||||||
|
|
||||||
|
wandb_project:
|
||||||
|
wandb_entity:
|
||||||
|
wandb_watch:
|
||||||
|
wandb_name:
|
||||||
|
wandb_log_model:
|
||||||
|
|
||||||
|
adapter: qlora
|
||||||
|
lora_r: 8
|
||||||
|
lora_alpha: 16
|
||||||
|
lora_dropout: 0.05
|
||||||
|
lora_target_linear: true
|
||||||
|
|
||||||
|
low_cpu_mem_usage: true
|
||||||
|
gradient_accumulation_steps: 4
|
||||||
|
micro_batch_size: 1
|
||||||
|
num_epochs: 2
|
||||||
|
optimizer: paged_adamw_8bit
|
||||||
|
lr_scheduler: cosine
|
||||||
|
learning_rate: 0.00001
|
||||||
|
|
||||||
|
train_on_inputs: false
|
||||||
|
group_by_length: false
|
||||||
|
bf16: auto
|
||||||
|
fp16:
|
||||||
|
tf32: false
|
||||||
|
|
||||||
|
gradient_checkpointing: true
|
||||||
|
gradient_checkpointing_kwargs:
|
||||||
|
use_reentrant: false
|
||||||
|
early_stopping_patience:
|
||||||
|
resume_from_checkpoint:
|
||||||
|
local_rank:
|
||||||
|
logging_steps: 1
|
||||||
|
xformers_attention:
|
||||||
|
flash_attention: true
|
||||||
|
|
||||||
|
warmup_steps: 10
|
||||||
|
evals_per_epoch:
|
||||||
|
saves_per_epoch: 1
|
||||||
|
debug:
|
||||||
|
deepspeed: deepspeed_configs/zero2.json
|
||||||
|
weight_decay: 0.0
|
||||||
|
special_tokens:
|
||||||
75
examples/llama-2/lisa.yml
Normal file
75
examples/llama-2/lisa.yml
Normal file
@@ -0,0 +1,75 @@
|
|||||||
|
base_model: NousResearch/Llama-2-7b-hf
|
||||||
|
model_type: LlamaForCausalLM
|
||||||
|
tokenizer_type: LlamaTokenizer
|
||||||
|
|
||||||
|
load_in_8bit: false
|
||||||
|
load_in_4bit: false
|
||||||
|
strict: false
|
||||||
|
|
||||||
|
datasets:
|
||||||
|
- path: teknium/GPT4-LLM-Cleaned
|
||||||
|
type: alpaca
|
||||||
|
dataset_prepared_path: last_run_prepared
|
||||||
|
val_set_size: 0.05
|
||||||
|
output_dir: ./lisa-out
|
||||||
|
|
||||||
|
sequence_len: 4096
|
||||||
|
sample_packing: true
|
||||||
|
pad_to_sequence_len: true
|
||||||
|
|
||||||
|
adapter:
|
||||||
|
lora_model_dir:
|
||||||
|
lora_r:
|
||||||
|
lora_alpha:
|
||||||
|
lora_dropout:
|
||||||
|
lora_target_linear:
|
||||||
|
lora_fan_in_fan_out:
|
||||||
|
|
||||||
|
lisa_n_layers: 2
|
||||||
|
lisa_step_interval: 20
|
||||||
|
lisa_layers_attribute: model.layers
|
||||||
|
|
||||||
|
wandb_project:
|
||||||
|
wandb_entity:
|
||||||
|
wandb_watch:
|
||||||
|
wandb_name:
|
||||||
|
wandb_log_model:
|
||||||
|
|
||||||
|
gradient_accumulation_steps: 1
|
||||||
|
micro_batch_size: 1
|
||||||
|
num_epochs: 1
|
||||||
|
optimizer: adamw_bnb_8bit
|
||||||
|
lr_scheduler: cosine
|
||||||
|
learning_rate: 5e-5 # recommendation from lisa paper for 7b
|
||||||
|
|
||||||
|
train_on_inputs: false
|
||||||
|
group_by_length: false
|
||||||
|
bf16: auto
|
||||||
|
fp16:
|
||||||
|
tf32: false
|
||||||
|
|
||||||
|
gradient_checkpointing: true
|
||||||
|
early_stopping_patience:
|
||||||
|
resume_from_checkpoint:
|
||||||
|
local_rank:
|
||||||
|
logging_steps: 1
|
||||||
|
xformers_attention:
|
||||||
|
flash_attention: true
|
||||||
|
flash_attn_cross_entropy: false
|
||||||
|
flash_attn_rms_norm: true
|
||||||
|
flash_attn_fuse_qkv: false
|
||||||
|
flash_attn_fuse_mlp: true
|
||||||
|
|
||||||
|
warmup_steps: 100
|
||||||
|
evals_per_epoch: 4
|
||||||
|
eval_table_size:
|
||||||
|
saves_per_epoch: 1
|
||||||
|
debug:
|
||||||
|
deepspeed:
|
||||||
|
weight_decay: 0.1
|
||||||
|
fsdp:
|
||||||
|
fsdp_config:
|
||||||
|
special_tokens:
|
||||||
|
bos_token: "<s>"
|
||||||
|
eos_token: "</s>"
|
||||||
|
unk_token: "<unk>"
|
||||||
10
examples/qwen/README.md
Normal file
10
examples/qwen/README.md
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
# Qwen
|
||||||
|
|
||||||
|
TODO
|
||||||
|
|
||||||
|
# Qwen2 MoE
|
||||||
|
|
||||||
|
✅ multipack
|
||||||
|
✅ qwen2_moe 4-bit QLoRA
|
||||||
|
✅ qwen2_moe 16-bit LoRA
|
||||||
|
❓ qwen2_moe 8-bit LoRA
|
||||||
64
examples/qwen/qwen2-moe-lora.yaml
Normal file
64
examples/qwen/qwen2-moe-lora.yaml
Normal file
@@ -0,0 +1,64 @@
|
|||||||
|
base_model: Qwen/Qwen1.5-MoE-A2.7B
|
||||||
|
trust_remote_code: true
|
||||||
|
|
||||||
|
load_in_8bit: false
|
||||||
|
load_in_4bit: false
|
||||||
|
strict: false
|
||||||
|
|
||||||
|
datasets:
|
||||||
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
|
type: alpaca
|
||||||
|
dataset_prepared_path:
|
||||||
|
val_set_size: 0.05
|
||||||
|
output_dir: ./out
|
||||||
|
|
||||||
|
sequence_len: 1024 # supports up to 32k
|
||||||
|
sample_packing: false
|
||||||
|
pad_to_sequence_len: false
|
||||||
|
|
||||||
|
adapter: lora
|
||||||
|
lora_model_dir:
|
||||||
|
lora_r: 32
|
||||||
|
lora_alpha: 16
|
||||||
|
lora_dropout: 0.05
|
||||||
|
lora_target_linear: true
|
||||||
|
lora_fan_in_fan_out:
|
||||||
|
|
||||||
|
wandb_project:
|
||||||
|
wandb_entity:
|
||||||
|
wandb_watch:
|
||||||
|
wandb_name:
|
||||||
|
wandb_log_model:
|
||||||
|
|
||||||
|
gradient_accumulation_steps: 4
|
||||||
|
micro_batch_size: 1
|
||||||
|
num_epochs: 4
|
||||||
|
optimizer: paged_adamw_8bit
|
||||||
|
lr_scheduler: cosine
|
||||||
|
learning_rate: 0.0002
|
||||||
|
|
||||||
|
train_on_inputs: false
|
||||||
|
group_by_length: false
|
||||||
|
bf16: auto
|
||||||
|
fp16:
|
||||||
|
tf32: true
|
||||||
|
|
||||||
|
gradient_checkpointing: true
|
||||||
|
gradient_checkpointing_kwargs:
|
||||||
|
use_reentrant: false
|
||||||
|
early_stopping_patience:
|
||||||
|
resume_from_checkpoint:
|
||||||
|
local_rank:
|
||||||
|
logging_steps: 1
|
||||||
|
xformers_attention:
|
||||||
|
flash_attention: true
|
||||||
|
|
||||||
|
warmup_steps: 10
|
||||||
|
evals_per_epoch: 4
|
||||||
|
saves_per_epoch: 1
|
||||||
|
debug:
|
||||||
|
deepspeed:
|
||||||
|
weight_decay: 0.0
|
||||||
|
fsdp:
|
||||||
|
fsdp_config:
|
||||||
|
special_tokens:
|
||||||
64
examples/qwen/qwen2-moe-qlora.yaml
Normal file
64
examples/qwen/qwen2-moe-qlora.yaml
Normal file
@@ -0,0 +1,64 @@
|
|||||||
|
base_model: Qwen/Qwen1.5-MoE-A2.7B
|
||||||
|
trust_remote_code: true
|
||||||
|
|
||||||
|
load_in_8bit: false
|
||||||
|
load_in_4bit: true
|
||||||
|
strict: false
|
||||||
|
|
||||||
|
datasets:
|
||||||
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
|
type: alpaca
|
||||||
|
dataset_prepared_path:
|
||||||
|
val_set_size: 0.05
|
||||||
|
output_dir: ./out
|
||||||
|
|
||||||
|
sequence_len: 1024 # supports up to 32k
|
||||||
|
sample_packing: false
|
||||||
|
pad_to_sequence_len: false
|
||||||
|
|
||||||
|
adapter: lora
|
||||||
|
lora_model_dir:
|
||||||
|
lora_r: 32
|
||||||
|
lora_alpha: 16
|
||||||
|
lora_dropout: 0.05
|
||||||
|
lora_target_linear: true
|
||||||
|
lora_fan_in_fan_out:
|
||||||
|
|
||||||
|
wandb_project:
|
||||||
|
wandb_entity:
|
||||||
|
wandb_watch:
|
||||||
|
wandb_name:
|
||||||
|
wandb_log_model:
|
||||||
|
|
||||||
|
gradient_accumulation_steps: 4
|
||||||
|
micro_batch_size: 1
|
||||||
|
num_epochs: 4
|
||||||
|
optimizer: paged_adamw_8bit
|
||||||
|
lr_scheduler: cosine
|
||||||
|
learning_rate: 0.0002
|
||||||
|
|
||||||
|
train_on_inputs: false
|
||||||
|
group_by_length: false
|
||||||
|
bf16: auto
|
||||||
|
fp16:
|
||||||
|
tf32: true
|
||||||
|
|
||||||
|
gradient_checkpointing: true
|
||||||
|
gradient_checkpointing_kwargs:
|
||||||
|
use_reentrant: false
|
||||||
|
early_stopping_patience:
|
||||||
|
resume_from_checkpoint:
|
||||||
|
local_rank:
|
||||||
|
logging_steps: 1
|
||||||
|
xformers_attention:
|
||||||
|
flash_attention: true
|
||||||
|
|
||||||
|
warmup_steps: 10
|
||||||
|
evals_per_epoch: 4
|
||||||
|
saves_per_epoch: 1
|
||||||
|
debug:
|
||||||
|
deepspeed:
|
||||||
|
weight_decay: 0.0
|
||||||
|
fsdp:
|
||||||
|
fsdp_config:
|
||||||
|
special_tokens:
|
||||||
@@ -1,7 +1,7 @@
|
|||||||
--extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
|
--extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
|
||||||
packaging==23.2
|
packaging==23.2
|
||||||
peft==0.9.0
|
peft==0.10.0
|
||||||
transformers @ git+https://github.com/huggingface/transformers.git@73a73b415e36f41481369f6129cb4b62bb127a78
|
transformers @ git+https://github.com/huggingface/transformers.git@43d17c18360ac9c3d3491389328e2fe55fe8f9ce
|
||||||
tokenizers==0.15.0
|
tokenizers==0.15.0
|
||||||
bitsandbytes==0.43.0
|
bitsandbytes==0.43.0
|
||||||
accelerate==0.28.0
|
accelerate==0.28.0
|
||||||
@@ -32,11 +32,11 @@ fschat==0.2.36
|
|||||||
gradio==3.50.2
|
gradio==3.50.2
|
||||||
tensorboard
|
tensorboard
|
||||||
|
|
||||||
mamba-ssm==1.1.1
|
mamba-ssm==1.2.0.post1
|
||||||
|
|
||||||
# remote filesystems
|
# remote filesystems
|
||||||
s3fs
|
s3fs
|
||||||
gcsfs
|
gcsfs
|
||||||
# adlfs
|
# adlfs
|
||||||
|
|
||||||
trl @ git+https://github.com/huggingface/trl.git@304e208f778a5442c30cdda500348226cdc97d90
|
trl @ git+https://github.com/huggingface/trl.git@0ee349dcd43b0f4b3169449f16751c38ac4a609f
|
||||||
|
|||||||
2
setup.py
2
setup.py
@@ -78,7 +78,7 @@ setup(
|
|||||||
"deepspeed-kernels",
|
"deepspeed-kernels",
|
||||||
],
|
],
|
||||||
"mamba-ssm": [
|
"mamba-ssm": [
|
||||||
"mamba-ssm==1.0.1",
|
"mamba-ssm==1.2.0.post1",
|
||||||
],
|
],
|
||||||
"auto-gptq": [
|
"auto-gptq": [
|
||||||
"auto-gptq==0.5.1",
|
"auto-gptq==0.5.1",
|
||||||
|
|||||||
@@ -45,6 +45,7 @@ from axolotl.utils.callbacks import (
|
|||||||
causal_lm_bench_eval_callback_factory,
|
causal_lm_bench_eval_callback_factory,
|
||||||
log_prediction_callback_factory,
|
log_prediction_callback_factory,
|
||||||
)
|
)
|
||||||
|
from axolotl.utils.callbacks.lisa import lisa_callback_factory
|
||||||
from axolotl.utils.collators import (
|
from axolotl.utils.collators import (
|
||||||
BatchSamplerDataCollatorForSeq2Seq,
|
BatchSamplerDataCollatorForSeq2Seq,
|
||||||
DataCollatorForSeq2Seq,
|
DataCollatorForSeq2Seq,
|
||||||
@@ -200,6 +201,18 @@ class AxolotlTrainingArguments(TrainingArguments):
|
|||||||
orpo_alpha: Optional[float] = field(
|
orpo_alpha: Optional[float] = field(
|
||||||
default=None,
|
default=None,
|
||||||
)
|
)
|
||||||
|
lisa_n_layers: Optional[int] = field(
|
||||||
|
default=None,
|
||||||
|
metadata={"help": "the number of activate layers in LISA"},
|
||||||
|
)
|
||||||
|
lisa_step_interval: Optional[int] = field(
|
||||||
|
default=None,
|
||||||
|
metadata={"help": "how often to switch layers in LISA"},
|
||||||
|
)
|
||||||
|
lisa_layers_attribute: Optional[str] = field(
|
||||||
|
default=None,
|
||||||
|
metadata={"help": "path under the model to access the layers"},
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class AxolotlTrainer(Trainer):
|
class AxolotlTrainer(Trainer):
|
||||||
@@ -938,6 +951,8 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
|
|||||||
)
|
)
|
||||||
callbacks.append(early_stop_cb)
|
callbacks.append(early_stop_cb)
|
||||||
|
|
||||||
|
if self.cfg.lisa_step_interval and self.cfg.lisa_n_layers:
|
||||||
|
callbacks.append(lisa_callback_factory(trainer))
|
||||||
return callbacks
|
return callbacks
|
||||||
|
|
||||||
def _get_trainer_cls(self):
|
def _get_trainer_cls(self):
|
||||||
@@ -1229,6 +1244,15 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
|
|||||||
"relora_prune_ratio"
|
"relora_prune_ratio"
|
||||||
] = self.cfg.relora_prune_ratio
|
] = self.cfg.relora_prune_ratio
|
||||||
|
|
||||||
|
if self.cfg.lisa_step_interval and self.cfg.lisa_n_layers:
|
||||||
|
training_arguments_kwargs["lisa_n_layers"] = self.cfg.lisa_n_layers
|
||||||
|
training_arguments_kwargs[
|
||||||
|
"lisa_step_interval"
|
||||||
|
] = self.cfg.lisa_step_interval
|
||||||
|
training_arguments_kwargs[
|
||||||
|
"lisa_layers_attribute"
|
||||||
|
] = self.cfg.lisa_layers_attribute
|
||||||
|
|
||||||
training_arguments_kwargs = self.hook_pre_create_training_args(
|
training_arguments_kwargs = self.hook_pre_create_training_args(
|
||||||
training_arguments_kwargs
|
training_arguments_kwargs
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -284,12 +284,7 @@ def flashattn_forward_with_s2attn(
|
|||||||
# [bsz, nh, q_len, hd]
|
# [bsz, nh, q_len, hd]
|
||||||
# pylint: disable=duplicate-code
|
# pylint: disable=duplicate-code
|
||||||
|
|
||||||
kv_seq_len = key_states.shape[-2]
|
cos, sin = self.rotary_emb(value_states, position_ids=position_ids)
|
||||||
if past_key_value is not None:
|
|
||||||
kv_seq_len += past_key_value[0].shape[-2]
|
|
||||||
cos, sin = self.rotary_emb(
|
|
||||||
value_states, seq_len=kv_seq_len, position_ids=position_ids
|
|
||||||
)
|
|
||||||
query_states, key_states = apply_rotary_pos_emb(
|
query_states, key_states = apply_rotary_pos_emb(
|
||||||
query_states, key_states, cos, sin, position_ids
|
query_states, key_states, cos, sin, position_ids
|
||||||
)
|
)
|
||||||
@@ -435,13 +430,7 @@ def flashattn_forward(
|
|||||||
# [bsz, q_len, nh, hd]
|
# [bsz, q_len, nh, hd]
|
||||||
# [bsz, nh, q_len, hd]
|
# [bsz, nh, q_len, hd]
|
||||||
|
|
||||||
kv_seq_len = key_states.shape[-2]
|
cos, sin = self.rotary_emb(value_states, position_ids=position_ids)
|
||||||
if past_key_value is not None:
|
|
||||||
kv_seq_len += past_key_value[0].shape[-2]
|
|
||||||
|
|
||||||
cos, sin = self.rotary_emb(
|
|
||||||
value_states, seq_len=kv_seq_len, position_ids=position_ids
|
|
||||||
)
|
|
||||||
query_states, key_states = apply_rotary_pos_emb(
|
query_states, key_states = apply_rotary_pos_emb(
|
||||||
query_states, key_states, cos, sin, position_ids
|
query_states, key_states, cos, sin, position_ids
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -80,11 +80,7 @@ def xformers_forward(
|
|||||||
# [bsz, q_len, nh, hd]
|
# [bsz, q_len, nh, hd]
|
||||||
# [bsz, nh, q_len, hd]
|
# [bsz, nh, q_len, hd]
|
||||||
|
|
||||||
kv_seq_len = key_states.shape[-2]
|
cos, sin = self.rotary_emb(value_states)
|
||||||
if past_key_value is not None:
|
|
||||||
kv_seq_len += past_key_value[0].shape[-2]
|
|
||||||
|
|
||||||
cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
|
|
||||||
query_states, key_states = apply_rotary_pos_emb(
|
query_states, key_states = apply_rotary_pos_emb(
|
||||||
query_states, key_states, cos, sin, position_ids
|
query_states, key_states, cos, sin, position_ids
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -12,6 +12,7 @@ from axolotl.monkeypatch.utils import get_unpad_data
|
|||||||
SUPPORTED_MULTIPACK_MODEL_TYPES = [
|
SUPPORTED_MULTIPACK_MODEL_TYPES = [
|
||||||
"mixtral",
|
"mixtral",
|
||||||
"qwen2",
|
"qwen2",
|
||||||
|
"qwen2_moe",
|
||||||
"falcon",
|
"falcon",
|
||||||
"phi",
|
"phi",
|
||||||
"gemma",
|
"gemma",
|
||||||
@@ -31,6 +32,10 @@ def patch_for_multipack(model_type, model_name=None):
|
|||||||
transformers.models.qwen2.modeling_qwen2._get_unpad_data = ( # pylint: disable=protected-access
|
transformers.models.qwen2.modeling_qwen2._get_unpad_data = ( # pylint: disable=protected-access
|
||||||
get_unpad_data
|
get_unpad_data
|
||||||
)
|
)
|
||||||
|
elif model_type == "qwen2_moe":
|
||||||
|
transformers.models.qwen2_moe.modeling_qwen2_moe._get_unpad_data = ( # pylint: disable=protected-access
|
||||||
|
get_unpad_data
|
||||||
|
)
|
||||||
elif model_type == "falcon":
|
elif model_type == "falcon":
|
||||||
transformers.models.falcon.modeling_falcon._get_unpad_data = ( # pylint: disable=protected-access
|
transformers.models.falcon.modeling_falcon._get_unpad_data = ( # pylint: disable=protected-access
|
||||||
get_unpad_data
|
get_unpad_data
|
||||||
@@ -48,14 +53,16 @@ def patch_for_multipack(model_type, model_name=None):
|
|||||||
get_unpad_data
|
get_unpad_data
|
||||||
)
|
)
|
||||||
elif model_type == "gemmoe":
|
elif model_type == "gemmoe":
|
||||||
model_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
|
patch_remote(model_name, ".configuration_gemmoe", ".modeling_gemmoe")
|
||||||
# we need to load the model here in order for modeling_gemmoe to be available
|
elif model_type == "jamba":
|
||||||
with init_empty_weights():
|
patch_remote(model_name, ".configuration_jamba", ".modeling_jamba")
|
||||||
AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
|
|
||||||
module_name = model_config.__class__.__module__.replace(
|
|
||||||
".configuration_gemmoe", ".modeling_gemmoe"
|
def patch_remote(model_name, config_name, modeling_name):
|
||||||
)
|
model_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
|
||||||
modeling_gemmoe = importlib.import_module(module_name)
|
# we need to load the model here in order for modeling_* to be available
|
||||||
modeling_gemmoe._get_unpad_data = ( # pylint: disable=protected-access
|
with init_empty_weights():
|
||||||
get_unpad_data
|
AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
|
||||||
)
|
module_name = model_config.__class__.__module__.replace(config_name, modeling_name)
|
||||||
|
modeling_arch = importlib.import_module(module_name)
|
||||||
|
modeling_arch._get_unpad_data = get_unpad_data # pylint: disable=protected-access
|
||||||
|
|||||||
91
src/axolotl/utils/callbacks/lisa.py
Normal file
91
src/axolotl/utils/callbacks/lisa.py
Normal file
@@ -0,0 +1,91 @@
|
|||||||
|
"""
|
||||||
|
module for LISA
|
||||||
|
|
||||||
|
Adapted from https://github.com/OptimalScale/LMFlow/pull/701 for HF transformers & Axolotl
|
||||||
|
Arxiv: https://arxiv.org/abs/2403.17919
|
||||||
|
License: Apache 2.0
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from functools import reduce
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from transformers import TrainerCallback
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from axolotl.core.trainer_builder import AxolotlTrainer
|
||||||
|
|
||||||
|
LOG = logging.getLogger("axolotl.callbacks.lisa")
|
||||||
|
|
||||||
|
|
||||||
|
def lisa_callback_factory(trainer: "AxolotlTrainer"):
    """Build the LISA layer-switching callback for ``trainer``.

    The callback is configured from ``trainer.args.lisa_n_layers``,
    ``trainer.args.lisa_step_interval`` and
    ``trainer.args.lisa_layers_attribute``.

    Args:
        trainer: trainer whose ``model`` exposes the layers to (un)freeze.

    Returns:
        A ``LISACallback`` instance bound to ``trainer``.

    Raises:
        ValueError: if the configured layer count or step interval cannot be
            used with the model's layers.
    """

    class LISACallback(TrainerCallback):
        """trainer callback for lisa layer switching"""

        def __init__(
            self, n_layers, step_interval, trainer, layers_attribute="model.layers"
        ):
            super().__init__()
            self.n_layers = n_layers
            self.step_interval = step_interval
            self.layers_attribute = layers_attribute
            self.trainer = trainer

            layers = self._get_layers()
            self.total_layers = len(layers)
            self.active_layers_indices = []

            # Fail fast with a readable message instead of an opaque numpy /
            # arithmetic error raised from inside the first training step.
            if n_layers is None or not 0 <= n_layers <= self.total_layers:
                raise ValueError(
                    f"lisa_n_layers must be between 0 and {self.total_layers}, "
                    f"got {n_layers!r}"
                )
            if step_interval is None or step_interval < 1:
                raise ValueError(
                    f"lisa_step_interval must be a positive integer, "
                    f"got {step_interval!r}"
                )

            LOG.info(
                f"LISA will activate {self.n_layers}/{len(layers)} layers ({self.n_layers*100/len(layers)}%) every {self.step_interval} steps"
            )

        def _get_layers(self):
            """Resolve the dotted ``layers_attribute`` path on the trainer's model."""
            return reduce(
                getattr, self.layers_attribute.split("."), self.trainer.model
            )

        def freeze_all_layers(self):
            """Disable gradients for every parameter of every layer."""
            for layer in self._get_layers():
                for param in layer.parameters():
                    param.requires_grad = False

        def on_step_begin(
            self, args, state, control, **kwargs
        ):  # pylint: disable=unused-argument
            """Re-draw the active layers when the step counter hits a boundary."""
            # Step 0 is a multiple of every interval, so the first draw happens
            # there; step 1 additionally forces one more re-draw.
            if state.global_step % self.step_interval == 0 or state.global_step == 1:
                self.switch_active_layers()

        def switch_active_layers(self):
            """Freeze all layers, then unfreeze a fresh random subset of them."""
            # First, disable gradients for all layers
            self.freeze_all_layers()

            # Randomly select n_layers to activate
            layers = self._get_layers()
            self.active_layers_indices = np.random.choice(
                range(self.total_layers), self.n_layers, replace=False
            )
            LOG.info(
                f"Activating layers at indices: {self.active_layers_indices} for the next steps."
            )

            # Enable gradients only for the selected layers
            for idx in self.active_layers_indices:
                for param in layers[idx].parameters():
                    param.requires_grad = True

    lisa_callback = LISACallback(
        n_layers=trainer.args.lisa_n_layers,
        step_interval=trainer.args.lisa_step_interval,
        trainer=trainer,
        layers_attribute=trainer.args.lisa_layers_attribute,
    )

    return lisa_callback
|
||||||
@@ -208,11 +208,11 @@ def validate_config(cfg: DictDefault, capabilities: Optional[dict] = None):
|
|||||||
dict(
|
dict(
|
||||||
AxolotlConfigWCapabilities(
|
AxolotlConfigWCapabilities(
|
||||||
**cfg.to_dict(), capabilities=capabilities
|
**cfg.to_dict(), capabilities=capabilities
|
||||||
).model_dump(exclude_unset=True)
|
).model_dump(exclude_none=True)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
return DictDefault(
|
return DictDefault(
|
||||||
dict(AxolotlInputConfig(**cfg.to_dict()).model_dump(exclude_unset=True))
|
dict(AxolotlInputConfig(**cfg.to_dict()).model_dump(exclude_none=True))
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ Module for pydantic models for configuration
|
|||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from typing import Any, Dict, List, Literal, Optional, Union
|
from typing import Any, Dict, List, Literal, Optional, Tuple, Union
|
||||||
|
|
||||||
from pydantic import BaseModel, Field, conlist, field_validator, model_validator
|
from pydantic import BaseModel, Field, conlist, field_validator, model_validator
|
||||||
from transformers import SchedulerType
|
from transformers import SchedulerType
|
||||||
@@ -151,12 +151,6 @@ class PeftConfig(BaseModel):
|
|||||||
loftq_config: Optional[LoftQConfig] = None
|
loftq_config: Optional[LoftQConfig] = None
|
||||||
|
|
||||||
|
|
||||||
class AutoType(str, Enum):
|
|
||||||
"""auto type string configuration subset - used for bf16"""
|
|
||||||
|
|
||||||
AUTO = "auto"
|
|
||||||
|
|
||||||
|
|
||||||
class SpecialTokensConfig(BaseModel):
|
class SpecialTokensConfig(BaseModel):
|
||||||
"""Special tokens configuration subset"""
|
"""Special tokens configuration subset"""
|
||||||
|
|
||||||
@@ -185,7 +179,8 @@ class LoraConfig(BaseModel):
|
|||||||
peft_layers_to_transform: Optional[List[int]] = None
|
peft_layers_to_transform: Optional[List[int]] = None
|
||||||
peft: Optional[PeftConfig] = None
|
peft: Optional[PeftConfig] = None
|
||||||
peft_use_dora: Optional[bool] = None
|
peft_use_dora: Optional[bool] = None
|
||||||
peft_use_relora: Optional[bool] = None
|
peft_use_rslora: Optional[bool] = None
|
||||||
|
peft_layer_replication: Optional[List[Tuple[int, int]]] = None
|
||||||
|
|
||||||
lora_on_cpu: Optional[bool] = None
|
lora_on_cpu: Optional[bool] = None
|
||||||
gptq: Optional[bool] = None
|
gptq: Optional[bool] = None
|
||||||
@@ -307,12 +302,14 @@ class HyperparametersConfig(BaseModel):
|
|||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
train_on_inputs: Optional[bool] = None
|
train_on_inputs: Optional[bool] = False
|
||||||
group_by_length: Optional[bool] = None
|
group_by_length: Optional[bool] = None
|
||||||
|
|
||||||
learning_rate: Union[str, float]
|
learning_rate: Union[str, float]
|
||||||
weight_decay: Optional[float] = None
|
weight_decay: Optional[float] = 0.0
|
||||||
optimizer: Optional[Union[OptimizerNames, Literal["lion_pytorch"]]] = None
|
optimizer: Optional[
|
||||||
|
Union[OptimizerNames, Literal["lion_pytorch"]]
|
||||||
|
] = OptimizerNames.ADAMW_HF.value
|
||||||
optim_args: Optional[Union[str, Dict[str, Any]]] = Field(
|
optim_args: Optional[Union[str, Dict[str, Any]]] = Field(
|
||||||
default=None, metadata={"help": "Optional arguments to supply to optimizer."}
|
default=None, metadata={"help": "Optional arguments to supply to optimizer."}
|
||||||
)
|
)
|
||||||
@@ -323,7 +320,7 @@ class HyperparametersConfig(BaseModel):
|
|||||||
},
|
},
|
||||||
)
|
)
|
||||||
torchdistx_path: Optional[str] = None
|
torchdistx_path: Optional[str] = None
|
||||||
lr_scheduler: Optional[SchedulerType] = None
|
lr_scheduler: Optional[SchedulerType] = "cosine"
|
||||||
lr_scheduler_kwargs: Optional[Dict[str, Any]] = None
|
lr_scheduler_kwargs: Optional[Dict[str, Any]] = None
|
||||||
lr_quadratic_warmup: Optional[bool] = None
|
lr_quadratic_warmup: Optional[bool] = None
|
||||||
cosine_min_lr_ratio: Optional[float] = None
|
cosine_min_lr_ratio: Optional[float] = None
|
||||||
@@ -373,6 +370,23 @@ class MLFlowConfig(BaseModel):
|
|||||||
hf_mlflow_log_artifacts: Optional[bool] = None
|
hf_mlflow_log_artifacts: Optional[bool] = None
|
||||||
|
|
||||||
|
|
||||||
|
class LISAConfig(BaseModel):
    """Configuration subset holding the LISA layer-sampling options"""

    # Number of layers selected per LISA switch; unset by default.
    lisa_n_layers: Optional[int] = Field(
        None, metadata={"help": "the number of activate layers in LISA"}
    )
    # Number of steps between two layer switches; unset by default.
    lisa_step_interval: Optional[int] = Field(
        None, metadata={"help": "how often to switch layers in LISA"}
    )
    # Dotted attribute path, relative to the model, that yields the layers.
    lisa_layers_attribute: Optional[str] = Field(
        "model.layers",
        metadata={"help": "path under the model to access the layers"},
    )
|
||||||
|
|
||||||
|
|
||||||
class WandbConfig(BaseModel):
|
class WandbConfig(BaseModel):
|
||||||
"""wandb configuration subset"""
|
"""wandb configuration subset"""
|
||||||
|
|
||||||
@@ -407,6 +421,7 @@ class AxolotlInputConfig(
|
|||||||
HyperparametersConfig,
|
HyperparametersConfig,
|
||||||
WandbConfig,
|
WandbConfig,
|
||||||
MLFlowConfig,
|
MLFlowConfig,
|
||||||
|
LISAConfig,
|
||||||
RemappedParameters,
|
RemappedParameters,
|
||||||
DeprecatedParameters,
|
DeprecatedParameters,
|
||||||
BaseModel,
|
BaseModel,
|
||||||
@@ -473,7 +488,7 @@ class AxolotlInputConfig(
|
|||||||
loss_watchdog_threshold: Optional[float] = None
|
loss_watchdog_threshold: Optional[float] = None
|
||||||
loss_watchdog_patience: Optional[int] = None
|
loss_watchdog_patience: Optional[int] = None
|
||||||
|
|
||||||
bf16: Optional[Union[AutoType, bool]] = AutoType.AUTO
|
bf16: Optional[Union[Literal["auto"], bool]] = "auto"
|
||||||
fp16: Optional[bool] = None
|
fp16: Optional[bool] = None
|
||||||
bfloat16: Optional[bool] = None # for non-AMP cases
|
bfloat16: Optional[bool] = None # for non-AMP cases
|
||||||
float16: Optional[bool] = None # for non-AMP cases
|
float16: Optional[bool] = None # for non-AMP cases
|
||||||
@@ -487,7 +502,7 @@ class AxolotlInputConfig(
|
|||||||
|
|
||||||
unfrozen_parameters: Optional[List[str]] = None
|
unfrozen_parameters: Optional[List[str]] = None
|
||||||
|
|
||||||
sequence_len: int = Field(default=1024)
|
sequence_len: int = Field(default=512)
|
||||||
sample_packing: Optional[bool] = None
|
sample_packing: Optional[bool] = None
|
||||||
eval_sample_packing: Optional[bool] = None
|
eval_sample_packing: Optional[bool] = None
|
||||||
pad_to_sequence_len: Optional[bool] = None
|
pad_to_sequence_len: Optional[bool] = None
|
||||||
@@ -536,6 +551,7 @@ class AxolotlInputConfig(
|
|||||||
Dict[Union[int, Literal["cpu", "disk"]], Union[int, str]]
|
Dict[Union[int, Literal["cpu", "disk"]], Union[int, str]]
|
||||||
] = None
|
] = None
|
||||||
gpu_memory_limit: Optional[Union[int, str]] = None
|
gpu_memory_limit: Optional[Union[int, str]] = None
|
||||||
|
low_cpu_mem_usage: Optional[bool] = None
|
||||||
|
|
||||||
chat_template: Optional[ChatTemplate] = None
|
chat_template: Optional[ChatTemplate] = None
|
||||||
default_system_message: Optional[str] = None
|
default_system_message: Optional[str] = None
|
||||||
@@ -548,10 +564,10 @@ class AxolotlInputConfig(
|
|||||||
sample_packing_eff_est: Optional[float] = None
|
sample_packing_eff_est: Optional[float] = None
|
||||||
axolotl_config_path: Optional[str] = None
|
axolotl_config_path: Optional[str] = None
|
||||||
|
|
||||||
is_falcon_derived_model: Optional[bool] = Field(default=False)
|
is_falcon_derived_model: Optional[bool] = Field(default=None)
|
||||||
is_llama_derived_model: Optional[bool] = Field(default=False)
|
is_llama_derived_model: Optional[bool] = Field(default=None)
|
||||||
is_mistral_derived_model: Optional[bool] = Field(default=False)
|
is_mistral_derived_model: Optional[bool] = Field(default=None)
|
||||||
is_qwen_derived_model: Optional[bool] = Field(default=False)
|
is_qwen_derived_model: Optional[bool] = Field(default=None)
|
||||||
|
|
||||||
@field_validator("datasets", mode="before")
|
@field_validator("datasets", mode="before")
|
||||||
@classmethod
|
@classmethod
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
"""Module containing data utilities"""
|
"""Module containing data utilities"""
|
||||||
|
|
||||||
import functools
|
import functools
|
||||||
import hashlib
|
import hashlib
|
||||||
import logging
|
import logging
|
||||||
@@ -223,7 +224,7 @@ def load_tokenized_prepared_datasets(
|
|||||||
token=use_auth_token,
|
token=use_auth_token,
|
||||||
)
|
)
|
||||||
ds_from_hub = True
|
ds_from_hub = True
|
||||||
except (FileNotFoundError, ConnectionError, HFValidationError):
|
except (FileNotFoundError, ConnectionError, HFValidationError, ValueError):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
ds_from_cloud = False
|
ds_from_cloud = False
|
||||||
@@ -290,14 +291,17 @@ def load_tokenized_prepared_datasets(
|
|||||||
local_path = Path(config_dataset.path)
|
local_path = Path(config_dataset.path)
|
||||||
if local_path.exists():
|
if local_path.exists():
|
||||||
if local_path.is_dir():
|
if local_path.is_dir():
|
||||||
# TODO dirs with arrow or parquet files could be loaded with `load_from_disk`
|
if config_dataset.data_files:
|
||||||
ds = load_dataset(
|
ds_type = get_ds_type(config_dataset)
|
||||||
config_dataset.path,
|
ds = load_dataset(
|
||||||
name=config_dataset.name,
|
ds_type,
|
||||||
data_files=config_dataset.data_files,
|
name=config_dataset.name,
|
||||||
streaming=False,
|
data_files=config_dataset.data_files,
|
||||||
split=None,
|
streaming=False,
|
||||||
)
|
split=None,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
ds = load_from_disk(config_dataset.path)
|
||||||
elif local_path.is_file():
|
elif local_path.is_file():
|
||||||
ds_type = get_ds_type(config_dataset)
|
ds_type = get_ds_type(config_dataset)
|
||||||
|
|
||||||
|
|||||||
@@ -402,7 +402,9 @@ def load_model(
|
|||||||
from accelerate import infer_auto_device_map
|
from accelerate import infer_auto_device_map
|
||||||
|
|
||||||
with init_empty_weights():
|
with init_empty_weights():
|
||||||
model_canvas = AutoModelForCausalLM.from_config(model_config)
|
model_canvas = AutoModelForCausalLM.from_config(
|
||||||
|
model_config, trust_remote_code=cfg.trust_remote_code or False
|
||||||
|
)
|
||||||
model_canvas.tie_weights()
|
model_canvas.tie_weights()
|
||||||
device_map = infer_auto_device_map(
|
device_map = infer_auto_device_map(
|
||||||
model_canvas,
|
model_canvas,
|
||||||
@@ -454,6 +456,10 @@ def load_model(
|
|||||||
"bnb_4bit_quant_type": "nf4",
|
"bnb_4bit_quant_type": "nf4",
|
||||||
"bnb_4bit_quant_storage": torch.bfloat16,
|
"bnb_4bit_quant_storage": torch.bfloat16,
|
||||||
}
|
}
|
||||||
|
if not cfg.deepspeed:
|
||||||
|
# for some reason, this causes the loss to be off by an order of magnitude
|
||||||
|
# but deepspeed needs this still in bfloat16
|
||||||
|
bnb_config["bnb_4bit_quant_storage"] = torch.float32
|
||||||
|
|
||||||
if cfg.bnb_config_kwargs:
|
if cfg.bnb_config_kwargs:
|
||||||
bnb_config.update(cfg.bnb_config_kwargs)
|
bnb_config.update(cfg.bnb_config_kwargs)
|
||||||
@@ -502,6 +508,9 @@ def load_model(
|
|||||||
model_kwargs["attn_implementation"] = "eager"
|
model_kwargs["attn_implementation"] = "eager"
|
||||||
model_config._attn_implementation = "eager" # pylint: disable=protected-access
|
model_config._attn_implementation = "eager" # pylint: disable=protected-access
|
||||||
|
|
||||||
|
if cfg.low_cpu_mem_usage:
|
||||||
|
model_kwargs["low_cpu_mem_usage"] = True
|
||||||
|
|
||||||
qlora_fsdp = cfg.fsdp and cfg.adapter == "qlora"
|
qlora_fsdp = cfg.fsdp and cfg.adapter == "qlora"
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@@ -849,7 +858,9 @@ def load_lora(model, cfg, inference=False, config_only=False):
|
|||||||
if cfg.peft_use_dora:
|
if cfg.peft_use_dora:
|
||||||
lora_config_kwargs["use_dora"] = cfg.peft_use_dora
|
lora_config_kwargs["use_dora"] = cfg.peft_use_dora
|
||||||
if cfg.peft_use_rslora:
|
if cfg.peft_use_rslora:
|
||||||
lora_config_kwargs["use_rslora"] = cfg.use_rslora
|
lora_config_kwargs["use_rslora"] = cfg.peft_use_rslora
|
||||||
|
if cfg.peft_layer_replication:
|
||||||
|
lora_config_kwargs["layer_replication"] = cfg.peft_layer_replication
|
||||||
|
|
||||||
lora_config = LoraConfig(
|
lora_config = LoraConfig(
|
||||||
r=cfg.lora_r,
|
r=cfg.lora_r,
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ import torch.cuda
|
|||||||
from accelerate.logging import get_logger
|
from accelerate.logging import get_logger
|
||||||
from datasets import set_caching_enabled
|
from datasets import set_caching_enabled
|
||||||
from torch.utils.data import DataLoader, RandomSampler
|
from torch.utils.data import DataLoader, RandomSampler
|
||||||
|
from transformers.utils import is_torch_bf16_gpu_available
|
||||||
|
|
||||||
from axolotl.core.trainer_builder import HFCausalTrainerBuilder, HFDPOTrainerBuilder
|
from axolotl.core.trainer_builder import HFCausalTrainerBuilder, HFDPOTrainerBuilder
|
||||||
from axolotl.utils.distributed import is_main_process, reduce_and_broadcast, zero_first
|
from axolotl.utils.distributed import is_main_process, reduce_and_broadcast, zero_first
|
||||||
@@ -124,9 +125,10 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
|
|||||||
eval_dataset = eval_dataset.remove_columns("attention_mask")
|
eval_dataset = eval_dataset.remove_columns("attention_mask")
|
||||||
|
|
||||||
if cfg.model_config_type == "falcon":
|
if cfg.model_config_type == "falcon":
|
||||||
LOG.info("dropping token_type_ids column")
|
LOG.info("dropping token_type_ids column if it exists")
|
||||||
train_dataset = train_dataset.remove_columns("token_type_ids")
|
if "token_type_ids" in train_dataset.column_names:
|
||||||
if eval_dataset:
|
train_dataset = train_dataset.remove_columns("token_type_ids")
|
||||||
|
if eval_dataset and "token_type_ids" in eval_dataset.column_names:
|
||||||
eval_dataset = eval_dataset.remove_columns("token_type_ids")
|
eval_dataset = eval_dataset.remove_columns("token_type_ids")
|
||||||
|
|
||||||
train_dataset = train_dataset.filter(
|
train_dataset = train_dataset.filter(
|
||||||
@@ -310,6 +312,8 @@ def setup_fsdp_envs(cfg):
|
|||||||
os.environ["FSDP_USE_ORIG_PARAMS"] = "true"
|
os.environ["FSDP_USE_ORIG_PARAMS"] = "true"
|
||||||
if cfg.fsdp_config.fsdp_state_dict_type:
|
if cfg.fsdp_config.fsdp_state_dict_type:
|
||||||
os.environ["FSDP_STATE_DICT_TYPE"] = cfg.fsdp_config.fsdp_state_dict_type
|
os.environ["FSDP_STATE_DICT_TYPE"] = cfg.fsdp_config.fsdp_state_dict_type
|
||||||
|
if cfg.fsdp_config.fsdp_auto_wrap_policy:
|
||||||
|
os.environ["FSDP_AUTO_WRAP_POLICY"] = cfg.fsdp_config.fsdp_auto_wrap_policy
|
||||||
if cfg.fsdp_config.fsdp_transformer_layer_cls_to_wrap:
|
if cfg.fsdp_config.fsdp_transformer_layer_cls_to_wrap:
|
||||||
os.environ[
|
os.environ[
|
||||||
"FSDP_TRANSFORMER_CLS_TO_WRAP"
|
"FSDP_TRANSFORMER_CLS_TO_WRAP"
|
||||||
@@ -323,6 +327,11 @@ def prepare_optim_env(cfg):
|
|||||||
os.environ["ACCELERATE_USE_DEEPSPEED"] = "true"
|
os.environ["ACCELERATE_USE_DEEPSPEED"] = "true"
|
||||||
os.environ["ACCELERATE_DEEPSPEED_CONFIG_FILE"] = cfg.deepspeed
|
os.environ["ACCELERATE_DEEPSPEED_CONFIG_FILE"] = cfg.deepspeed
|
||||||
|
|
||||||
|
if (cfg.bf16 == "auto" and is_torch_bf16_gpu_available()) or cfg.bf16 is True:
|
||||||
|
os.environ["ACCELERATE_MIXED_PRECISION"] = "bf16"
|
||||||
|
elif cfg.fp16:
|
||||||
|
os.environ["ACCELERATE_MIXED_PRECISION"] = "fp16"
|
||||||
|
|
||||||
|
|
||||||
def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_steps):
|
def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_steps):
|
||||||
if cfg.rl in ["dpo", "ipo", "kto_pair"]:
|
if cfg.rl in ["dpo", "ipo", "kto_pair"]:
|
||||||
|
|||||||
272
tests/test_datasets.py
Normal file
272
tests/test_datasets.py
Normal file
@@ -0,0 +1,272 @@
|
|||||||
|
"""
|
||||||
|
Test dataset loading under various conditions.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import shutil
|
||||||
|
import tempfile
|
||||||
|
import unittest
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from datasets import Dataset
|
||||||
|
from huggingface_hub import snapshot_download
|
||||||
|
from transformers import AutoTokenizer
|
||||||
|
|
||||||
|
from axolotl.utils.data import load_tokenized_prepared_datasets
|
||||||
|
from axolotl.utils.dict import DictDefault
|
||||||
|
|
||||||
|
|
||||||
|
class TestDatasetPreparation(unittest.TestCase):
    """Test a configured dataloader."""

    TOKENIZER_NAME = "huggyllama/llama-7b"
    HUB_DATASET = "mhenrichsen/alpaca_2k_test"

    def setUp(self) -> None:
        self.tokenizer = AutoTokenizer.from_pretrained(self.TOKENIZER_NAME)
        self.tokenizer.add_special_tokens(
            {
                "bos_token": "<s>",
                "eos_token": "</s>",
                "unk_token": "<unk>",
            }
        )
        # Alpaca dataset.
        self.dataset = Dataset.from_list(
            [
                {
                    "instruction": "Evaluate this sentence for spelling and grammar mistakes",
                    "input": "He finnished his meal and left the resturant",
                    "output": "He finished his meal and left the restaurant.",
                }
            ]
        )

    def _build_cfg(self, dataset_cfg, sequence_len):
        """Return a minimal config that loads the single dataset ``dataset_cfg``."""
        return DictDefault(
            {
                "tokenizer_config": self.TOKENIZER_NAME,
                "sequence_len": sequence_len,
                "datasets": [dataset_cfg],
            }
        )

    def _load_and_check(self, cfg, prepared_path, expected_rows):
        """Load the configured dataset and verify row count and tokenized columns."""
        dataset, _ = load_tokenized_prepared_datasets(
            self.tokenizer, cfg, prepared_path
        )

        assert len(dataset) == expected_rows
        for column in ("input_ids", "attention_mask", "labels"):
            assert column in dataset.features

    def test_load_hub(self):
        """Core use case.  Verify that processing data from the hub works"""
        with tempfile.TemporaryDirectory() as tmp_dir:
            prepared_path = Path(tmp_dir) / "prepared"
            cfg = self._build_cfg(
                {
                    "path": self.HUB_DATASET,
                    "type": "alpaca",
                },
                sequence_len=1024,
            )

            self._load_and_check(cfg, prepared_path, expected_rows=2000)

    def test_load_local_hub(self):
        """Niche use case.  Verify that a local copy of a hub dataset can be loaded"""
        # The local copy deliberately lives at a CWD-relative path equal to the
        # hub id, so it has to be removed even when the test fails.
        tmp_ds_path = Path(self.HUB_DATASET)
        created_parent = not tmp_ds_path.parent.exists()
        tmp_ds_path.mkdir(parents=True, exist_ok=True)
        try:
            with tempfile.TemporaryDirectory() as tmp_dir:
                snapshot_download(
                    repo_id=self.HUB_DATASET,
                    repo_type="dataset",
                    local_dir=tmp_ds_path,
                )

                prepared_path = Path(tmp_dir) / "prepared"
                # Right now a local copy that doesn't fully conform to a dataset
                # must list data_files and ds_type otherwise the loader won't know
                # how to load it.
                cfg = self._build_cfg(
                    {
                        "path": self.HUB_DATASET,
                        "ds_type": "parquet",
                        "type": "alpaca",
                        "data_files": [
                            f"{self.HUB_DATASET}/alpaca_2000.parquet",
                        ],
                    },
                    sequence_len=1024,
                )

                self._load_and_check(cfg, prepared_path, expected_rows=2000)
        finally:
            # Also drop the parent directory when this test created it.
            shutil.rmtree(
                tmp_ds_path.parent if created_parent else tmp_ds_path,
                ignore_errors=True,
            )

    def test_load_from_save_to_disk(self):
        """Usual use case.  Verify datasets saved via `save_to_disk` can be loaded."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_ds_name = Path(tmp_dir) / "tmp_dataset"
            self.dataset.save_to_disk(tmp_ds_name)

            prepared_path = Path(tmp_dir) / "prepared"
            cfg = self._build_cfg(
                {
                    "path": str(tmp_ds_name),
                    "type": "alpaca",
                },
                sequence_len=256,
            )

            self._load_and_check(cfg, prepared_path, expected_rows=1)

    def test_load_from_dir_of_parquet(self):
        """Usual use case.  Verify a directory of parquet files can be loaded."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_ds_dir = Path(tmp_dir) / "tmp_dataset"
            tmp_ds_dir.mkdir()
            tmp_ds_path = tmp_ds_dir / "shard1.parquet"
            self.dataset.to_parquet(tmp_ds_path)

            prepared_path: Path = Path(tmp_dir) / "prepared"
            cfg = self._build_cfg(
                {
                    "path": str(tmp_ds_dir),
                    "ds_type": "parquet",
                    "name": "test_data",
                    "data_files": [
                        str(tmp_ds_path),
                    ],
                    "type": "alpaca",
                },
                sequence_len=256,
            )

            self._load_and_check(cfg, prepared_path, expected_rows=1)

    def test_load_from_dir_of_json(self):
        """Standard use case.  Verify a directory of json files can be loaded."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_ds_dir = Path(tmp_dir) / "tmp_dataset"
            tmp_ds_dir.mkdir()
            tmp_ds_path = tmp_ds_dir / "shard1.json"
            self.dataset.to_json(tmp_ds_path)

            prepared_path: Path = Path(tmp_dir) / "prepared"
            cfg = self._build_cfg(
                {
                    "path": str(tmp_ds_dir),
                    "ds_type": "json",
                    "name": "test_data",
                    "data_files": [
                        str(tmp_ds_path),
                    ],
                    "type": "alpaca",
                },
                sequence_len=256,
            )

            self._load_and_check(cfg, prepared_path, expected_rows=1)

    def test_load_from_single_parquet(self):
        """Standard use case.  Verify a single parquet file can be loaded."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_ds_path = Path(tmp_dir) / "tmp_dataset.parquet"
            self.dataset.to_parquet(tmp_ds_path)

            prepared_path: Path = Path(tmp_dir) / "prepared"
            cfg = self._build_cfg(
                {
                    "path": str(tmp_ds_path),
                    "name": "test_data",
                    "type": "alpaca",
                },
                sequence_len=256,
            )

            self._load_and_check(cfg, prepared_path, expected_rows=1)

    def test_load_from_single_json(self):
        """Standard use case.  Verify a single json file can be loaded."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_ds_path = Path(tmp_dir) / "tmp_dataset.json"
            self.dataset.to_json(tmp_ds_path)

            prepared_path: Path = Path(tmp_dir) / "prepared"
            cfg = self._build_cfg(
                {
                    "path": str(tmp_ds_path),
                    "name": "test_data",
                    "type": "alpaca",
                },
                sequence_len=256,
            )

            self._load_and_check(cfg, prepared_path, expected_rows=1)
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
unittest.main()
|
||||||
@@ -54,6 +54,18 @@ class TestValidation(BaseValidation):
|
|||||||
Test the validation module
|
Test the validation module
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
def test_defaults(self, minimal_cfg):
    """Validated config reports the train_on_inputs default and a None weight_decay."""
    # weight_decay is explicitly passed as None on top of the minimal config
    overrides = {"weight_decay": None}
    test_cfg = DictDefault(overrides | minimal_cfg)

    cfg = validate_config(test_cfg)

    assert cfg.train_on_inputs is False
    assert cfg.weight_decay is None
|
||||||
|
|
||||||
def test_datasets_min_length(self):
|
def test_datasets_min_length(self):
|
||||||
cfg = DictDefault(
|
cfg = DictDefault(
|
||||||
{
|
{
|
||||||
|
|||||||
Reference in New Issue
Block a user