Compare commits: v0.4.0 ... sdpa-multi

11 commits:

1a538be9c2
74c72ca5eb
e923e62d24
ba944e6554
badda3783b
a01b998c0f
33e117088f
b4ac96adef
98b4762077
ee0b5f60e5
08719b9609
.github/workflows/base.yml (21 changes, vendored)

@@ -1,10 +1,7 @@
 name: ci-cd-base

 on:
-  push:
-    branches:
-      - "main-base"
-      - "dev-base"
+  workflow_dispatch:

 jobs:
   build-base:
@@ -15,11 +12,6 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - cuda: "118"
-            cuda_version: 11.8.0
-            python_version: "3.9"
-            pytorch: 2.0.1
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 9.0+PTX"
           - cuda: "118"
             cuda_version: 11.8.0
             python_version: "3.10"
@@ -28,12 +20,17 @@ jobs:
           - cuda: "118"
             cuda_version: 11.8.0
             python_version: "3.10"
-            pytorch: 2.1.1
+            pytorch: 2.1.2
             torch_cuda_arch_list: "7.0 7.5 8.0 8.6 9.0+PTX"
           - cuda: "121"
             cuda_version: 12.1.0
             python_version: "3.10"
-            pytorch: 2.1.1
+            pytorch: 2.1.2
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 9.0+PTX"
+          - cuda: "121"
+            cuda_version: 12.1.0
+            python_version: "3.11"
+            pytorch: 2.1.2
             torch_cuda_arch_list: "7.0 7.5 8.0 8.6 9.0+PTX"
     steps:
       - name: Checkout
@@ -56,7 +53,7 @@ jobs:
           context: .
           file: ./docker/Dockerfile-base
           push: ${{ github.event_name != 'pull_request' }}
-          tags: ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
+          tags: ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
           labels: ${{ steps.metadata.outputs.labels }}
           build-args: |
             CUDA_VERSION=${{ matrix.cuda_version }}
.github/workflows/main.yml (33 changes, vendored)

@@ -4,6 +4,7 @@ on:
   push:
     branches:
       - "main"
+  workflow_dispatch:

 jobs:
   build-axolotl:
@@ -15,24 +16,24 @@ jobs:
         include:
           - cuda: 118
             cuda_version: 11.8.0
-            python_version: "3.9"
+            python_version: "3.10"
             pytorch: 2.0.1
             axolotl_extras:
           - cuda: 118
             cuda_version: 11.8.0
             python_version: "3.10"
-            pytorch: 2.0.1
+            pytorch: 2.1.2
             axolotl_extras:
             is_latest: true
-          - cuda: 118
-            cuda_version: 11.8.0
-            python_version: "3.10"
-            pytorch: 2.1.1
-            axolotl_extras:
           - cuda: 121
             cuda_version: 12.1.0
             python_version: "3.10"
-            pytorch: 2.1.1
+            pytorch: 2.1.2
+            axolotl_extras:
+          - cuda: 121
+            cuda_version: 12.1.0
+            python_version: "3.11"
+            pytorch: 2.1.2
             axolotl_extras:
     runs-on: [self-hosted, gpu, docker]
     steps:
@@ -86,24 +87,24 @@ jobs:
         include:
           - cuda: 118
             cuda_version: 11.8.0
-            python_version: "3.9"
+            python_version: "3.10"
             pytorch: 2.0.1
             axolotl_extras:
           - cuda: 118
             cuda_version: 11.8.0
             python_version: "3.10"
-            pytorch: 2.0.1
+            pytorch: 2.1.2
             axolotl_extras:
             is_latest: true
-          - cuda: 118
-            cuda_version: 11.8.0
-            python_version: "3.10"
-            pytorch: 2.1.1
-            axolotl_extras:
           - cuda: 121
             cuda_version: 12.1.0
             python_version: "3.10"
-            pytorch: 2.1.1
+            pytorch: 2.1.2
+            axolotl_extras:
+          - cuda: 121
+            cuda_version: 12.1.0
+            python_version: "3.11"
+            pytorch: 2.1.2
             axolotl_extras:
     runs-on: [self-hosted, gpu, docker]
     steps:
.github/workflows/tests.yml (4 changes, vendored)

@@ -106,3 +106,7 @@ jobs:
       - name: GPU Unit Tests monkeypatched w docker image
         run: |
           docker run --privileged --gpus "all" --env WANDB_DISABLED=true --rm ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }} pytest /workspace/axolotl/tests/e2e/patched/
+      - name: Prune image from docker
+        if: github.ref != 'refs/heads/main'
+        run: |
+          docker rmi -f ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
@@ -613,6 +613,8 @@ rl:
 # Saves the desired chat template to the tokenizer_config.json for easier inferencing
 # Currently supports chatml and inst (mistral/mixtral)
 chat_template: chatml
+# Changes the default system message
+default_system_message: You are a helpful assistant. Please give a long and detailed answer. # Currently only supports chatml.
 # Axolotl attempts to save the dataset as an arrow after packing the data together so
 # subsequent training attempts load faster, relative path
 dataset_prepared_path: data/last_run_prepared
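A minimal sketch of how the two new config keys fit together, shown in Python since the option is consumed at config-parse time (the values are taken from the hunk above):

import yaml

# default_system_message currently only applies when chat_template is "chatml".
cfg = yaml.safe_load(
    """
chat_template: chatml
default_system_message: You are a helpful assistant. Please give a long and detailed answer.
"""
)
print(cfg["default_system_message"])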
@@ -15,15 +15,6 @@
     "hysteresis": 2,
     "min_loss_scale": 1
   },
-  "optimizer": {
-    "type": "AdamW",
-    "params": {
-      "lr": "auto",
-      "betas": "auto",
-      "eps": "auto",
-      "weight_decay": "auto"
-    }
-  },
   "gradient_accumulation_steps": "auto",
   "train_batch_size": "auto",
   "train_micro_batch_size_per_gpu": "auto",

@@ -19,15 +19,6 @@
     "hysteresis": 2,
     "min_loss_scale": 1
   },
-  "optimizer": {
-    "type": "AdamW",
-    "params": {
-      "lr": "auto",
-      "betas": "auto",
-      "eps": "auto",
-      "weight_decay": "auto"
-    }
-  },
   "gradient_accumulation_steps": "auto",
   "train_batch_size": "auto",
   "train_micro_batch_size_per_gpu": "auto",

@@ -23,15 +23,6 @@
     "hysteresis": 2,
     "min_loss_scale": 1
   },
-  "optimizer": {
-    "type": "AdamW",
-    "params": {
-      "lr": "auto",
-      "betas": "auto",
-      "eps": "auto",
-      "weight_decay": "auto"
-    }
-  },
   "gradient_accumulation_steps": "auto",
   "train_batch_size": "auto",
   "train_micro_batch_size_per_gpu": "auto",

@@ -23,15 +23,6 @@
     "hysteresis": 2,
     "min_loss_scale": 1
   },
-  "optimizer": {
-    "type": "AdamW",
-    "params": {
-      "lr": "auto",
-      "betas": "auto",
-      "eps": "auto",
-      "weight_decay": "auto"
-    }
-  },
   "gradient_accumulation_steps": "auto",
   "train_batch_size": "auto",
   "train_micro_batch_size_per_gpu": "auto",
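Dropping the hard-coded AdamW block from these DeepSpeed configs means DeepSpeed wraps whatever optimizer the HuggingFace Trainer builds from axolotl's `optimizer:` setting, instead of instantiating its own. A minimal sketch of the effect (the file path is illustrative):

import json

with open("deepspeed/zero2.json", encoding="utf-8") as f:  # illustrative path
    ds_cfg = json.load(f)

# With no "optimizer" section, the Trainer-created optimizer is passed through.
assert "optimizer" not in ds_cfg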
examples/colab-notebooks/colab-axolotl-example.ipynb (197 additions, new file)

@@ -0,0 +1,197 @@
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "AKjdG7tbTb-n"
+      },
+      "source": [
+        "# Example notebook for running Axolotl on Google Colab"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "RcbNpOgWRcii"
+      },
+      "outputs": [],
+      "source": [
+        "import torch\n",
+        "# Check that a GPU is available; a T4 (free tier) is enough to run this notebook\n",
+        "assert (torch.cuda.is_available()==True)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "h3nLav8oTRA5"
+      },
+      "source": [
+        "## Install Axolotl and dependencies"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "3c3yGAwnOIdi",
+        "outputId": "e3777b5a-40ef-424f-e181-62dfecd1dd01"
+      },
+      "outputs": [],
+      "source": [
+        "!pip install -e git+https://github.com/OpenAccess-AI-Collective/axolotl#egg=axolotl\n",
+        "!pip install flash-attn==\"2.5.0\"\n",
+        "!pip install deepspeed==\"0.13.1\""
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "BW2MFr7HTjub"
+      },
+      "source": [
+        "## Create a YAML config file"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "9pkF2dSoQEUN"
+      },
+      "outputs": [],
+      "source": [
+        "import yaml\n",
+        "\n",
+        "# Your YAML string\n",
+        "yaml_string = \"\"\"\n",
+        "base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T\n",
+        "model_type: LlamaForCausalLM\n",
+        "tokenizer_type: LlamaTokenizer\n",
+        "is_llama_derived_model: true\n",
+        "\n",
+        "load_in_8bit: false\n",
+        "load_in_4bit: true\n",
+        "strict: false\n",
+        "\n",
+        "datasets:\n",
+        "  - path: mhenrichsen/alpaca_2k_test\n",
+        "    type: alpaca\n",
+        "dataset_prepared_path:\n",
+        "val_set_size: 0.05\n",
+        "output_dir: ./qlora-out\n",
+        "\n",
+        "adapter: qlora\n",
+        "lora_model_dir:\n",
+        "\n",
+        "sequence_len: 1096\n",
+        "sample_packing: true\n",
+        "pad_to_sequence_len: true\n",
+        "\n",
+        "lora_r: 32\n",
+        "lora_alpha: 16\n",
+        "lora_dropout: 0.05\n",
+        "lora_target_modules:\n",
+        "lora_target_linear: true\n",
+        "lora_fan_in_fan_out:\n",
+        "\n",
+        "wandb_project:\n",
+        "wandb_entity:\n",
+        "wandb_watch:\n",
+        "wandb_name:\n",
+        "wandb_log_model:\n",
+        "\n",
+        "mlflow_experiment_name: colab-example\n",
+        "\n",
+        "gradient_accumulation_steps: 1\n",
+        "micro_batch_size: 1\n",
+        "num_epochs: 4\n",
+        "max_steps: 20\n",
+        "optimizer: paged_adamw_32bit\n",
+        "lr_scheduler: cosine\n",
+        "learning_rate: 0.0002\n",
+        "\n",
+        "train_on_inputs: false\n",
+        "group_by_length: false\n",
+        "bf16: false\n",
+        "fp16: true\n",
+        "tf32: false\n",
+        "\n",
+        "gradient_checkpointing: true\n",
+        "early_stopping_patience:\n",
+        "resume_from_checkpoint:\n",
+        "local_rank:\n",
+        "logging_steps: 1\n",
+        "xformers_attention:\n",
+        "flash_attention: false\n",
+        "\n",
+        "warmup_steps: 10\n",
+        "evals_per_epoch:\n",
+        "saves_per_epoch:\n",
+        "debug:\n",
+        "deepspeed:\n",
+        "weight_decay: 0.0\n",
+        "fsdp:\n",
+        "fsdp_config:\n",
+        "special_tokens:\n",
+        "\n",
+        "\"\"\"\n",
+        "\n",
+        "# Convert the YAML string to a Python dictionary\n",
+        "yaml_dict = yaml.safe_load(yaml_string)\n",
+        "\n",
+        "# Specify your file path\n",
+        "file_path = 'test_axolotl.yaml'\n",
+        "\n",
+        "# Write the YAML file\n",
+        "with open(file_path, 'w') as file:\n",
+        "    yaml.dump(yaml_dict, file)\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "bidoj8YLTusD"
+      },
+      "source": [
+        "## Launch the training"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "ydTI2Jk2RStU",
+        "outputId": "d6d0df17-4b53-439c-c802-22c0456d301b"
+      },
+      "outputs": [],
+      "source": [
+        "# By using the !, the command will be executed as a bash command\n",
+        "!accelerate launch -m axolotl.cli.train /content/test_axolotl.yaml"
+      ]
+    }
+  ],
+  "metadata": {
+    "accelerator": "GPU",
+    "colab": {
+      "gpuType": "T4",
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
@@ -1,6 +1,6 @@
 --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
 packaging==23.2
-peft==0.7.0
+peft==0.7.1
 transformers==4.37.0
 tokenizers==0.15.0
 bitsandbytes>=0.41.1
@@ -15,16 +15,14 @@ sentencepiece
 wandb
 einops
 xformers==0.0.22
-optimum==1.13.2
+optimum==1.16.2
 hf_transfer
 colorama
 numba
 numpy>=1.24.4
 mlflow
 # qlora things
-bert-score==0.3.13
 evaluate==0.4.0
-rouge-score==0.1.2
 scipy
 scikit-learn==1.2.2
 pynvml
setup.py (4 changes)

@@ -27,9 +27,9 @@ def parse_requirements():

     try:
         torch_version = version("torch")
-        if torch_version.startswith("2.1.1"):
+        if torch_version.startswith("2.1."):
             _install_requires.pop(_install_requires.index("xformers==0.0.22"))
-            _install_requires.append("xformers==0.0.23")
+            _install_requires.append("xformers>=0.0.23")
     except PackageNotFoundError:
         pass
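The loosened prefix check now matches any torch 2.1.x release (both 2.1.1 and 2.1.2 appear in the workflow matrices above), and the exact xformers pin becomes a floor. A quick sketch of the behavior:

from importlib.metadata import PackageNotFoundError, version

# Any torch 2.1.x now triggers the xformers swap, not just 2.1.1.
try:
    if version("torch").startswith("2.1."):
        print("replace xformers==0.0.22 with xformers>=0.0.23")
except PackageNotFoundError:
    pass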
@@ -18,6 +18,7 @@ from axolotl.cli import (
 )
 from axolotl.common.cli import PreprocessCliArgs
 from axolotl.common.const import DEFAULT_DATASET_PREPARED_PATH
+from axolotl.prompt_strategies.sharegpt import register_chatml_template

 LOG = logging.getLogger("axolotl.cli.preprocess")

@@ -34,6 +35,14 @@ def do_cli(config: Path = Path("examples/"), **kwargs):
         return_remaining_strings=True
     )

+    if parsed_cfg.chat_template == "chatml" and parsed_cfg.default_system_message:
+        LOG.info(
+            f"ChatML set. Adding default system message: {parsed_cfg.default_system_message}"
+        )
+        register_chatml_template(parsed_cfg.default_system_message)
+    else:
+        register_chatml_template()
+
     if not parsed_cfg.dataset_prepared_path:
         msg = (
             Fore.RED
@@ -18,6 +18,7 @@ from axolotl.cli import (
     print_axolotl_text_art,
 )
 from axolotl.common.cli import TrainerCliArgs
+from axolotl.prompt_strategies.sharegpt import register_chatml_template
 from axolotl.train import train

 LOG = logging.getLogger("axolotl.cli.train")
@@ -37,6 +38,14 @@ def do_train(cfg, cli_args) -> Tuple[PreTrainedModel, PreTrainedTokenizer]:
     print_axolotl_text_art()
     check_accelerate_default_config()
     check_user_token()
+    if cfg.chat_template == "chatml" and cfg.default_system_message:
+        LOG.info(
+            f"ChatML set. Adding default system message: {cfg.default_system_message}"
+        )
+        register_chatml_template(cfg.default_system_message)
+    else:
+        register_chatml_template()
+
     if cfg.rl:
         dataset_meta = load_rl_datasets(cfg=cfg, cli_args=cli_args)
     else:
@@ -170,24 +170,30 @@ class AxolotlTrainer(Trainer):
             num_training_steps (int): The number of training steps to do.
             optimizer (torch.optim.Optimizer): The training optimizer
         """
+        use_cosine_quadratic = (
+            self.args.lr_scheduler_type == "cosine"
+            and self.args.lr_quadratic_warmup is True
+        )
+
+        use_cosine_min_lr = (
+            self.args.lr_scheduler_type == "cosine"
+            and self.args.cosine_min_lr_ratio is not None
+        )
+
         # fmt: off
         if self.lr_scheduler is None:  # type: ignore # pylint: disable=access-member-before-definition
             # fmt: on
-            if (
-                self.args.lr_scheduler_type == "cosine"
-                and self.args.lr_quadratic_warmup is True
-            ):
+            if use_cosine_quadratic:
+                if use_cosine_min_lr:
+                    LOG.warning("Both cosine quadratic warmup and min lr detected. Using quadratic warmup.")
+
                 self.lr_scheduler = get_cosine_schedule_with_quadratic_warmup(  # pylint: disable=attribute-defined-outside-init
                     optimizer,
                     num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
                     num_training_steps=num_training_steps,
                 )
-            elif self.args.lr_scheduler_type == "cosine" and self.args.cosine_min_lr_ratio is not None:
+            elif self.args.cosine_min_lr_ratio and use_cosine_min_lr:
                 assert 0 <= self.args.cosine_min_lr_ratio <= 1.0, "cosine_min_lr_ratio must be between 0.0 and 1.0"
-                if self.args.deepspeed:
-                    LOG.warning("Using cosine scheduler with deepspeed. This may be ignored if a scheduler is set \
-                        in the deepspeed JSON")
                 self.lr_scheduler = get_cosine_schedule_with_min_lr(  # pylint: disable=attribute-defined-outside-init
                     optimizer,
                     num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
@@ -196,6 +202,13 @@ class AxolotlTrainer(Trainer):
                 )
             else:
                 return super().create_scheduler(num_training_steps, optimizer)
+        else:
+            if use_cosine_quadratic:
+                LOG.warning("axolotl's cosine scheduler with quadratic warmup not used (e.g., because of deepspeed).")
+
+            if use_cosine_min_lr:
+                LOG.warning("axolotl's cosine scheduler with min lr not used (e.g., because of deepspeed).")
+
         return self.lr_scheduler

     def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]:
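Hoisting the two conditions into `use_cosine_quadratic` and `use_cosine_min_lr` lets the branch where a scheduler already exists (e.g. one created by deepspeed) warn that the axolotl schedulers were silently skipped. For intuition, a minimal sketch of the cosine-with-minimum-LR idea; this is not axolotl's exact `get_cosine_schedule_with_min_lr` implementation:

import math

from torch.optim.lr_scheduler import LambdaLR

def cosine_with_min_lr(optimizer, num_warmup_steps, num_training_steps, min_lr_ratio):
    # Cosine decay that floors at min_lr_ratio * base_lr instead of decaying to 0.
    def lr_lambda(step):
        if step < num_warmup_steps:
            return step / max(1, num_warmup_steps)
        progress = (step - num_warmup_steps) / max(
            1, num_training_steps - num_warmup_steps
        )
        cosine = 0.5 * (1.0 + math.cos(math.pi * progress))
        return min_lr_ratio + (1.0 - min_lr_ratio) * cosine

    return LambdaLR(optimizer, lr_lambda)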
@@ -638,7 +651,7 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
             training_arguments_kwargs[
                 "gradient_checkpointing"
             ] = self.cfg.gradient_checkpointing
-            if self.cfg.gradient_checkpointing_kwargs:
+            if self.cfg.gradient_checkpointing_kwargs is not None:
                 training_arguments_kwargs[
                     "gradient_checkpointing_kwargs"
                 ] = self.cfg.gradient_checkpointing_kwargs
@@ -1015,6 +1028,18 @@ class HFDPOTrainerBuilder(TrainerBuilderBase):
             training_args_kwargs[
                 "dataloader_prefetch_factor"
             ] = self.cfg.dataloader_prefetch_factor
+        if self.cfg.gradient_checkpointing:
+            training_args_kwargs[
+                "gradient_checkpointing"
+            ] = self.cfg.gradient_checkpointing
+            if self.cfg.gradient_checkpointing_kwargs is not None:
+                training_args_kwargs[
+                    "gradient_checkpointing_kwargs"
+                ] = self.cfg.gradient_checkpointing_kwargs
+            else:
+                training_args_kwargs["gradient_checkpointing_kwargs"] = {
+                    "use_reentrant": False
+                }

         training_args = TrainingArguments(
             per_device_train_batch_size=self.cfg.micro_batch_size,
@@ -1025,9 +1050,6 @@ class HFDPOTrainerBuilder(TrainerBuilderBase):
             save_steps=self.cfg.save_steps,
             output_dir=self.cfg.output_dir,
             warmup_steps=self.cfg.warmup_steps,
-            gradient_checkpointing=self.cfg.gradient_checkpointing,
-            gradient_checkpointing_kwargs=self.cfg.gradient_checkpointing_kwargs
-            or {"use_reentrant": False},
             logging_first_step=True,
             logging_steps=1,
             optim=self.cfg.optimizer,
@@ -1050,6 +1072,10 @@ class HFDPOTrainerBuilder(TrainerBuilderBase):
             dpo_trainer_kwargs["eval_dataset"] = self.eval_dataset
         if self.cfg.adapter and self.peft_config:
             dpo_trainer_kwargs["peft_config"] = self.peft_config
+        if self.cfg.precompute_ref_log_probs is not None:
+            dpo_trainer_kwargs[
+                "precompute_ref_log_probs"
+            ] = self.cfg.precompute_ref_log_probs
         dpo_trainer = DPOTrainer(
             self.model,
             self.model_ref,
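The DPO builder now forwards gradient checkpointing through `training_args_kwargs` just like the causal builder, defaulting to non-reentrant checkpointing when no kwargs are configured. A hedged sketch of the arguments this produces (`output_dir` is illustrative):

from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="./dpo-out",  # illustrative
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
)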
@@ -23,6 +23,31 @@ def argilla(
     return transform_fn


+def icr(
+    cfg,
+):  # pylint: disable=possibly-unused-variable,unused-argument
+    """
+    chatml transforms for datasets with system, input, chosen, rejected
+    ex. https://huggingface.co/datasets/argilla/distilabel-intel-orca-dpo-pairs
+    """
+
+    def transform_fn(sample):
+        if "system" in sample and sample["system"]:
+            sample["prompt"] = (
+                f"<|im_start|>system\n{sample['system']}<|im_end|>\n"
+                f"<|im_start|>user\n{sample['input']}<|im_end|>\n<|im_start|>assistant\n"
+            )
+        else:
+            sample[
+                "prompt"
+            ] = f"<|im_start|>user\n{sample['input']}<|im_end|>\n<|im_start|>assistant\n"
+        sample["chosen"] = f"{sample['chosen']}<|im_end|>"
+        sample["rejected"] = f"{sample['rejected']}<|im_end|>"
+        return sample
+
+    return transform_fn
+
+
 def intel(cfg):  # pylint: disable=possibly-unused-variable,unused-argument
     """
     For Intel Orca DPO Pairs
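What the new `icr` transform produces for a single sample; the import path below is assumed from the repo layout and is not stated in this diff:

from axolotl.prompt_strategies.dpo.chatml import icr  # assumed module path

transform = icr(cfg=None)  # cfg is unused by the transform
sample = {
    "system": "You are concise.",
    "input": "What is 2+2?",
    "chosen": "4",
    "rejected": "5",
}
out = transform(sample)
# out["prompt"] ends with "<|im_start|>assistant\n";
# out["chosen"] and out["rejected"] are closed with "<|im_end|>".
print(out["prompt"])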
@@ -6,16 +6,19 @@ from fastchat.conversation import Conversation, SeparatorStyle, register_conv_te
 from axolotl.prompt_tokenizers import ShareGPTPromptTokenizingStrategy
 from axolotl.prompters import ShareGPTPrompterV2

-register_conv_template(
-    Conversation(
-        name="chatml",
-        system_template="<|im_start|>system\n{system_message}",
-        system_message="You are a helpful assistant.",
-        roles=["<|im_start|>user", "<|im_start|>assistant"],
-        sep_style=SeparatorStyle.CHATML,
-        sep="<|im_end|>",
+
+def register_chatml_template(system_message=None):
+    system_message = system_message or "You are a helpful assistant."
+    register_conv_template(
+        Conversation(
+            name="chatml",
+            system_template="<|im_start|>system\n{system_message}",
+            system_message=system_message,
+            roles=["<|im_start|>user", "<|im_start|>assistant"],
+            sep_style=SeparatorStyle.CHATML,
+            sep="<|im_end|>",
+        )
     )
-)


 def load(tokenizer, cfg, ds_cfg: Optional[Dict[str, Any]] = None):
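Turning the module-level registration into `register_chatml_template()` lets callers (the preprocess and train CLIs above) choose the system message at runtime. A minimal usage sketch, assuming the name "chatml" is free to register in your fastchat version:

from fastchat.conversation import get_conv_template

from axolotl.prompt_strategies.sharegpt import register_chatml_template

register_chatml_template("You are a terse assistant.")  # custom system message
conv = get_conv_template("chatml")
conv.append_message(conv.roles[0], "hello")
conv.append_message(conv.roles[1], None)
print(conv.get_prompt())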
@@ -63,6 +63,8 @@ def train(
     msg += " and peft_config..."
     LOG.debug(msg)
     model, peft_config = load_model(cfg, tokenizer, inference=cli_args.inference)
+    model.generation_config.do_sample = True
+
     model_ref = None
     if cfg.rl:
         if cfg.adapter and not cfg.rl_adapter_ref_model:
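Enabling `do_sample` on the loaded model's generation config means sampling parameters such as temperature and top_p take effect in later generation instead of being ignored under the greedy default. Sketch:

from transformers import GenerationConfig

# With do_sample=False (the default), temperature/top_p would be ignored.
gen_cfg = GenerationConfig(do_sample=True, temperature=0.7, top_p=0.9)
print(gen_cfg.do_sample)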
@@ -20,7 +20,7 @@ def chat_templates(user_choice: str):

     templates = {
         "inst": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",  # I don't know what this one is called. Used by Mistral/Mixtral.
-        "chatml": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+        "chatml": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = 'You are a helpful assistant.' %}{% endif %}{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{{'<|im_start|>system\n' + system_message + '<|im_end|>\n'}}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
     }

     if user_choice in templates:
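The new chatml template injects a system turn even when the conversation supplies none, falling back to "You are a helpful assistant." (which `load_tokenizer` below swaps for `default_system_message`). A usage sketch; the module path and model name are illustrative:

from transformers import AutoTokenizer

from axolotl.utils.chat_templates import chat_templates  # assumed module path

tokenizer = AutoTokenizer.from_pretrained(
    "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"  # illustrative model
)
tokenizer.chat_template = chat_templates("chatml")
messages = [{"role": "user", "content": "hi"}]
# The rendered prompt now starts with an injected system turn.
print(tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))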
@@ -95,7 +95,7 @@ def normalize_config(cfg):
         save_steps = 1.0 / (cfg.saves_per_epoch * cfg.num_epochs)
         if save_steps < 1.0:  # prevent saves on every step
             cfg.save_steps = save_steps
-    if cfg.evals_per_epoch:
+    if (cfg.val_set_size or cfg.test_datasets) and cfg.evals_per_epoch:
         eval_steps = 1.0 / (cfg.evals_per_epoch * cfg.num_epochs)
         if eval_steps < 1.0:  # prevent evals on every step
             cfg.eval_steps = eval_steps
@@ -163,6 +163,7 @@
         cfg.gradient_checkpointing
         and cfg.unfrozen_parameters is None
         and cfg.gradient_checkpointing_kwargs is None
+        and cfg.rl is None
     ):
         cfg.gradient_checkpointing_kwargs = {"use_reentrant": True}

@@ -484,35 +485,43 @@ def validate_config(cfg):
                 "`use_reentrant` must be false when used with partially frozen model."
             )

-    if cfg.flash_attention and cfg.deepspeed and Path(cfg.deepspeed).is_file():
+    if cfg.deepspeed and Path(cfg.deepspeed).is_file():
         with open(cfg.deepspeed, encoding="utf-8") as file:
             contents = file.read()
             deepspeed_cfg: DictDefault = DictDefault(json.loads(contents))
-            if (
-                deepspeed_cfg.zero_optimization
-                and deepspeed_cfg.zero_optimization.stage == 3
-            ):
-                if not (
-                    (
-                        deepspeed_cfg.bf16
-                        and deepspeed_cfg.bf16.enabled  # pylint: disable=no-member
-                        is True
-                    )
-                    or (
-                        deepspeed_cfg.fp16
-                        and deepspeed_cfg.fp16.enabled  # pylint: disable=no-member
-                        is True
-                    )
+            if cfg.flash_attention:
+                if (
+                    deepspeed_cfg.zero_optimization
+                    and deepspeed_cfg.zero_optimization.stage == 3
                 ):
-                    raise ValueError(
-                        "bf16.enabled or fp16.enabled must be set to true when using ZeRO-3 with flash-attention"
-                    )
+                    if not (
+                        (
+                            deepspeed_cfg.bf16
+                            and deepspeed_cfg.bf16.enabled  # pylint: disable=no-member
+                            is True
+                        )
+                        or (
+                            deepspeed_cfg.fp16
+                            and deepspeed_cfg.fp16.enabled  # pylint: disable=no-member
+                            is True
+                        )
+                    ):
+                        raise ValueError(
+                            "bf16.enabled or fp16.enabled must be set to true when using ZeRO-3 with flash-attention"
+                        )
+            if "8bit" in cfg.optimizer and deepspeed_cfg.optimizer:
+                LOG.warning(
+                    f"conflicting optimizer: {cfg.optimizer} used alongside deepspeed optimizer."
+                )

     if cfg.test_datasets and cfg.val_set_size:
         raise ValueError(
             "non-zero val_set_size should not be used with test_datasets configuration"
         )

+    if cfg.fsdp and "bnb" in cfg.optimizer:
+        raise ValueError(f"FSDP not compatible with {cfg.optimizer}")
+
     # TODO
     # MPT 7b
     # https://github.com/facebookresearch/bitsandbytes/issues/25
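Two of the new guards in plain form: a config-supplied 8-bit optimizer alongside a deepspeed optimizer only warns, while FSDP plus a bitsandbytes optimizer fails fast. A standalone sketch with illustrative config values:

cfg = {"fsdp": ["full_shard"], "optimizer": "adamw_bnb_8bit"}  # illustrative values

if cfg["fsdp"] and "bnb" in cfg["optimizer"]:
    raise ValueError(f"FSDP not compatible with {cfg['optimizer']}")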
@@ -16,6 +16,7 @@ from datasets import (
     load_from_disk,
 )
 from huggingface_hub import hf_hub_download
+from huggingface_hub.utils import HFValidationError
 from torch.utils.data import RandomSampler
 from transformers import PreTrainedTokenizerBase

@@ -213,7 +214,7 @@ def load_tokenized_prepared_datasets(
                 token=use_auth_token,
             )
             ds_from_hub = True
-        except (FileNotFoundError, ConnectionError):
+        except (FileNotFoundError, ConnectionError, HFValidationError):
            pass

     ds_from_cloud = False
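Local dataset paths are not valid hub repo ids, so probing the hub can raise `HFValidationError` rather than `FileNotFoundError`; catching it lets dataset loading fall through to the local and cloud loaders. Sketch:

from huggingface_hub import hf_hub_download
from huggingface_hub.utils import HFValidationError

try:
    # "./data/train.jsonl" is not a valid repo id, so this raises HFValidationError.
    hf_hub_download(repo_id="./data/train.jsonl", filename="train.jsonl", repo_type="dataset")
except (FileNotFoundError, ConnectionError, HFValidationError):
    pass  # not on the hub; fall through to other sources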
@@ -67,7 +67,7 @@ def check_model_config(cfg: DictDefault, model_config: Union[AutoConfig, DictDef
     ):
         lora_modules_to_save = ", ".join(map(lambda x: f"`{x}`", lora_modules_to_save))
         raise ValueError(
-            f"`lora_modules_to_save` not properly set when adding new tokens. Please include {lora_modules_to_save} in `lora_modules_to_save`."
+            f"`lora_modules_to_save` not properly set when adding new tokens. Please include [{lora_modules_to_save}] in `lora_modules_to_save`."
         )


@@ -182,7 +182,7 @@ def load_tokenizer(cfg):
             [f"`{x}`" for x in lora_modules_to_save]
         )
         raise ValueError(
-            f"Please set lora_modules_to_save to {lora_modules_to_save} when using an adapter and changing the special tokens."
+            f"Please set lora_modules_to_save to [{lora_modules_to_save}] when using an adapter and changing the special tokens."
         )

     tokenizer.add_special_tokens(
@@ -219,7 +219,13 @@ def load_tokenizer(cfg):
     LOG.debug(f"UNK: {tokenizer.unk_token_id} / {tokenizer.unk_token}")

     if cfg.chat_template:
-        tokenizer.chat_template = chat_templates(cfg.chat_template)
+        chat_template_string = chat_templates(cfg.chat_template)
+        if cfg.default_system_message and cfg.chat_template == "chatml":
+            chat_template_string = chat_template_string.replace(
+                "You are a helpful assistant.", cfg.default_system_message
+            )
+
+        tokenizer.chat_template = chat_template_string
     else:
         LOG.info(
             "No Chat template selected. Consider adding a chat template for easier inference."
@@ -636,15 +642,17 @@ def load_model(

     # make sure these are fp32 per Ramesh et al. (2021)
     embedding_modules = get_linear_embedding_layers(cfg.model_config_type)
-    for name, module in model.named_modules():
-        if any(m in name for m in ["norm", "gate"]):
-            module.to(torch.float32)
-        if model_config.model_type == "btlm":
-            # don't upcast lm_head for btlm
-            continue
-        if any(m in name for m in embedding_modules):
-            if hasattr(module, "weight"):
-                module.to(torch.float32)
+    if not cfg.fsdp:
+        # FSDP doesn't like mixed Float and BFloat16
+        for name, module in model.named_modules():
+            if any(m in name for m in ["norm", "gate"]):
+                module.to(torch.float32)
+            if model_config.model_type == "btlm":
+                # don't upcast lm_head for btlm
+                continue
+            if any(m in name for m in embedding_modules):
+                if hasattr(module, "weight"):
+                    module.to(torch.float32)

     needs_fa2_dtype = cfg.adapter or cfg.fsdp
     skip_prepare_model_for_kbit_training = False
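The float32 upcast loop is now skipped under FSDP, since FSDP flattens parameters into uniform-dtype buffers and rejects modules mixing float32 with bfloat16. The guard extracted as a standalone sketch (the helper name is illustrative):

import torch
from torch import nn

def upcast_norm_and_gate_layers(model: nn.Module, fsdp_enabled: bool) -> None:
    # Illustrative helper: skip the upcast entirely when FSDP is enabled,
    # because FSDP requires a uniform dtype within each flattened group.
    if fsdp_enabled:
        return
    for name, module in model.named_modules():
        if any(m in name for m in ["norm", "gate"]):
            module.to(torch.float32)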
@@ -7,9 +7,14 @@ from tokenizers import AddedToken
 from transformers import AutoTokenizer

 from axolotl.datasets import TokenizedPromptDataset
-from axolotl.prompt_strategies.sharegpt import SimpleShareGPTPromptTokenizingStrategy
+from axolotl.prompt_strategies.sharegpt import (
+    SimpleShareGPTPromptTokenizingStrategy,
+    register_chatml_template,
+)
 from axolotl.prompters import ShareGPTPrompterV2

+register_chatml_template()
+

 @pytest.fixture(name="sharegpt_dataset")
 def fixture_sharegpt_dataset():
@@ -39,6 +39,32 @@ class TestExpandMask(unittest.TestCase):
         # Check that the output matches the expected output
         self.assertTrue(torch.allclose(_expand_mask(mask, dtype), expected_output))

+    def test_output_multipack(self):
+        mask = torch.tensor([[1, 1, 1, 0], [2, 2, 3, 3]])
+        dtype = torch.float32
+        expected_output = torch.tensor(
+            [
+                [
+                    [
+                        [0.0000e00, -3.4028e38, -3.4028e38, -3.4028e38],
+                        [0.0000e00, 0.0000e00, -3.4028e38, -3.4028e38],
+                        [0.0000e00, 0.0000e00, 0.0000e00, -3.4028e38],
+                        [-3.4028e38, -3.4028e38, -3.4028e38, -3.4028e38],
+                    ]
+                ],
+                [
+                    [
+                        [0.0000e00, -3.4028e38, -3.4028e38, -3.4028e38],
+                        [0.0000e00, 0.0000e00, -3.4028e38, -3.4028e38],
+                        [-3.4028e38, -3.4028e38, 0.0000e00, -3.4028e38],
+                        [-3.4028e38, -3.4028e38, 0.0000e00, 0.0000e00],
+                    ]
+                ],
+            ]
+        )
+        # Check that the output matches the expected output
+        self.assertTrue(torch.allclose(_expand_mask(mask, dtype), expected_output))
+
+
 if __name__ == "__main__":
     unittest.main()
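The new test encodes the multipack contract: within one packed row, a token may only attend to earlier positions carrying the same sequence id, and id 0 is padding. A small sketch that derives the allowed-attention pattern the expected tensor encodes (0.0 where attention is allowed, a large negative value where masked):

import torch

mask = torch.tensor([[1, 1, 1, 0], [2, 2, 3, 3]])  # packed sequence ids, 0 = pad
same_seq = (mask.unsqueeze(1) == mask.unsqueeze(2)) & (mask.unsqueeze(1) != 0)
causal = torch.tril(torch.ones(mask.shape[1], mask.shape[1], dtype=torch.bool))
allowed = same_seq & causal
print(allowed[1].int())  # second row: two independent two-token sequences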