Compare commits

7 Commits

Author       SHA1        Message                                            Date
NanoCode012  2b9a2dde4b  chore: update title                                2025-04-26 16:21:31 -04:00
Wing Lian    388e950016  restore dockerfile                                 2025-04-26 16:21:30 -04:00
NanoCode012  fb4adbb311  fix: trim allowed cuda versions                    2025-04-26 16:21:30 -04:00
Wing Lian    5e8abca54f  use axolotl cloud image as base and various fixes  2025-04-26 16:21:30 -04:00
Wing Lian    168ec339e5  chore: lint                                        2025-04-26 16:21:30 -04:00
zeke         cb7185998b  remove LICENSE and fix README                      2025-04-26 16:21:30 -04:00
zeke         c2fc35f520  Add runpod sls handler                             2025-04-26 16:21:30 -04:00
211 changed files with 762 additions and 6003 deletions

View File

@@ -22,6 +22,12 @@ jobs:
fail-fast: false
matrix:
include:
- cuda: "124"
cuda_version: 12.4.1
cudnn_version: ""
python_version: "3.11"
pytorch: 2.4.1
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
- cuda: "124"
cuda_version: 12.4.1
cudnn_version: ""

View File

@@ -18,8 +18,13 @@ jobs:
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.5.1
pytorch: 2.4.1
axolotl_extras:
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.5.1
axolotl_extras: vllm
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
@@ -30,7 +35,7 @@ jobs:
cuda_version: 12.6.3
python_version: "3.11"
pytorch: 2.7.0
axolotl_extras:
axolotl_extras: vllm
runs-on: axolotl-gpu-runner
steps:
- name: Checkout
@@ -62,7 +67,6 @@ jobs:
CUDA=${{ matrix.cuda }}
PYTORCH_VERSION=${{ matrix.pytorch }}
AXOLOTL_ARGS=${{ matrix.axolotl_args }}
AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}
file: ./docker/Dockerfile
push: ${{ github.event_name != 'pull_request' }}
tags: |
@@ -78,6 +82,11 @@ jobs:
strategy:
matrix:
include:
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.4.1
axolotl_extras:
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"

View File

@@ -9,7 +9,6 @@ on:
- 'pyproject.toml'
- '.github/workflows/multi-gpu-e2e.yml'
- 'src/axolotl/core/trainers/mixins/sequence_parallel.py'
- 'src/axolotl/utils/distributed.py'
workflow_dispatch:
schedule:
- cron: '0 0 * * 1,4' # Runs at 00:00 UTC every monday & thursday
@@ -33,11 +32,18 @@ jobs:
axolotl_extras: vllm
num_gpus: 2
nightly_build: "true"
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.4.1
axolotl_extras: # no vllm support for 2.4.1
num_gpus: 2
nightly_build: "true"
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.5.1
axolotl_extras:
axolotl_extras: vllm
num_gpus: 2
nightly_build: "true"
- cuda: 126

View File

@@ -12,6 +12,11 @@ jobs:
fail-fast: false
matrix:
include:
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.4.1
axolotl_extras:
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
@@ -65,6 +70,11 @@ jobs:
strategy:
matrix:
include:
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.4.1
axolotl_extras:
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"

View File

@@ -1,61 +0,0 @@
name: Preview
on:
workflow_dispatch:
pull_request:
types: [opened, synchronize, reopened]
# Run the workflow only when one of these files changes
paths:
- '**/*.md' # any Markdown file
- '**/*.qmd' # any Quarto file
- '_quarto.yaml'
permissions:
checks: write
contents: write
deployments: write
issues: write
discussions: write
pages: write
pull-requests: write
statuses: write
jobs:
preview:
runs-on: ubuntu-latest
steps:
- name: Check out repository
uses: actions/checkout@v4
- name: Set up Quarto
uses: quarto-dev/quarto-actions/setup@v2
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: '3.11'
- name: Install dependencies
run: |
python3 -m pip install jupyter quartodoc
python3 -m pip install -e . --no-deps
- name: Build autodoc
run: quartodoc build
- name: Quarto render
run: quarto render
- name: Netlify Publish
uses: nwtgck/actions-netlify@v3.0
with:
publish-dir: './_site'
enable-pull-request-comment: true
enable-github-deployment: true
github-token: ${{ secrets.GITHUB_TOKEN }}
deploy-message: "Deployed On Netlify"
github-deployment-environment: 'preview'
github-deployment-description: 'Preview Deployment'
env:
NETLIFY_AUTH_TOKEN: ${{ secrets.NETLIFY_AUTH_TOKEN }}
NETLIFY_SITE_ID: ${{ secrets.NETLIFY_SITE_ID }}

View File

@@ -26,7 +26,7 @@ jobs:
max-parallel: 2
matrix:
python_version: ["3.11"]
pytorch_version: ["2.5.1", "2.6.0", "2.7.0"]
pytorch_version: ["2.4.1", "2.5.1", "2.6.0"]
timeout-minutes: 20
steps:
@@ -106,6 +106,13 @@ jobs:
fail-fast: false
matrix:
include:
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.4.1
num_gpus: 1
axolotl_extras:
nightly_build: "true"
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"

View File

@@ -27,9 +27,6 @@ concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
env:
TRANSFORMERS_IS_CI: "yes"
jobs:
pre-commit:
name: pre-commit
@@ -44,101 +41,15 @@ jobs:
env:
SKIP: no-commit-to-branch
preload-cache:
name: Preload HF cache
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python_version: ["3.11"]
pytorch_version: ["2.6.0"]
timeout-minutes: 20
env:
AXOLOTL_IS_CI_CACHE_PRELOAD: "1"
steps:
- name: Check out repository code
uses: actions/checkout@v4
- name: Restore HF cache
id: hf-cache-restore
uses: actions/cache/restore@v4
with:
path: |
/home/runner/.cache/huggingface/hub/datasets--*
/home/runner/.cache/huggingface/hub/models--*
key: ${{ runner.os }}-hf-hub-cache-v2
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python_version }}
cache: 'pip' # caching pip dependencies
- name: upgrade pip
run: |
pip3 install --upgrade pip
pip3 install --upgrade packaging==23.2 setuptools==75.8.0 wheel
- name: Install PyTorch
run: |
pip3 install torch==${{ matrix.pytorch_version }}
- name: Install dependencies
run: |
pip3 show torch
pip3 install --no-build-isolation -U -e .
python scripts/unsloth_install.py | sh
python scripts/cutcrossentropy_install.py | sh
pip3 install -r requirements-dev.txt -r requirements-tests.txt
- name: Make sure PyTorch version wasn't clobbered
run: |
python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
- name: Ensure axolotl CLI was installed
run: |
axolotl --help
- name: Pre-Download dataset fixture
run: |
huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
- name: Run tests
run: |
pytest -v tests/conftest.py
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v5
with:
token: ${{ secrets.CODECOV_TOKEN }}
files: ./coverage.xml
flags: unittests,pytorch-${{ matrix.pytorch_version }}
fail_ci_if_error: false
- name: cleanup pip cache
run: |
find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
- name: Save HF cache
id: hf-cache
uses: actions/cache/save@v4
with:
path: |
/home/runner/.cache/huggingface/hub/datasets--*
/home/runner/.cache/huggingface/hub/models--*
key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
pytest:
name: PyTest
runs-on: ubuntu-latest
needs: [preload-cache]
strategy:
fail-fast: false
max-parallel: 2
matrix:
python_version: ["3.11"]
pytorch_version: ["2.5.1", "2.6.0", "2.7.0"]
pytorch_version: ["2.4.1", "2.5.1", "2.6.0", "2.7.0"]
timeout-minutes: 20
steps:
@@ -207,15 +118,24 @@ jobs:
run: |
find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
- name: Save HF cache
id: hf-cache
uses: actions/cache/save@v4
with:
path: |
/home/runner/.cache/huggingface/hub/datasets--*
/home/runner/.cache/huggingface/hub/models--*
key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
pytest-sdist:
name: PyTest from Source Dist
runs-on: ubuntu-latest
needs: [preload-cache]
strategy:
fail-fast: false
max-parallel: 1
matrix:
python_version: ["3.11"]
pytorch_version: ["2.5.1", "2.6.0", "2.7.0"]
pytorch_version: ["2.4.1", "2.5.1", "2.6.0"]
timeout-minutes: 20
steps:
@@ -276,6 +196,15 @@ jobs:
run: |
find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
- name: Save HF cache
id: hf-cache
uses: actions/cache/save@v4
with:
path: |
/home/runner/.cache/huggingface/hub/datasets--*
/home/runner/.cache/huggingface/hub/models--*
key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
docker-e2e-tests-1st:
if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' }}
# this job needs to be run on self-hosted GPU runners...
@@ -329,12 +258,6 @@ jobs:
fail-fast: false
matrix:
include:
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.6.0
num_gpus: 1
axolotl_extras: llmcompressor
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
@@ -346,7 +269,7 @@ jobs:
python_version: "3.11"
pytorch: 2.5.1
num_gpus: 1
axolotl_extras:
axolotl_extras: vllm
- cuda: 126
cuda_version: 12.6.3
python_version: "3.11"

View File

@@ -1,10 +1,11 @@
FROM axolotlai/axolotl-cloud:main-py3.11-cu124-2.6.0
FROM runpod/pytorch:3.10-2.0.0-117
COPY .runpod/requirements.txt /requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install --upgrade pip && \
python3 -m pip install --upgrade -r /requirements.txt
# Environment settings
ARG BASE_VOLUME="/runpod-volume"
ENV BASE_VOLUME=$BASE_VOLUME
@@ -14,5 +15,4 @@ ENV TRANSFORMERS_CACHE="${BASE_VOLUME}/huggingface-cache/hub"
COPY .runpod/src /src
WORKDIR /src
CMD ["python3", "/src/handler.py"]

View File

@@ -5,3 +5,11 @@
# git+https://github.com/runpod/runpod-python.git
# To learn more, see https://pip.pypa.io/en/stable/reference/requirements-file-format/
runpod~=1.7.0
huggingface_hub
typing-extensions
pydantic
pydantic-settings
hf-transfer
setuptools
numpy==2.0.0
axolotl[flash-attn,deepspeed]

View File

@@ -1,86 +0,0 @@
{
"input": {
"name": "quick_smoke_test_sft",
"user_id": "user",
"model_id": "llama-test",
"run_id": "llama-test",
"credentials": {
"wandb_api_key": "",
"hf_token": ""
},
"args": {
"base_model": "HuggingFaceTB/SmolLM2-135M",
"model_type": "AutoModelForCausalLM",
"tokenizer_type": "AutoTokenizer",
"load_in_4bit": true,
"strict": false,
"datasets": [
{
"path": "mhenrichsen/alpaca_2k_test",
"type": "alpaca",
"split": "train[:10%]"
}
],
"val_set_size": 0.02,
"output_dir": "./outputs/lora-out",
"sequence_len": 4096,
"sample_packing": true,
"eval_sample_packing": false,
"pad_to_sequence_len": true,
"adapter": "qlora",
"lora_r": 32,
"lora_alpha": 64,
"lora_dropout": 0.05,
"lora_target_linear": true,
"lora_modules_to_save": [
"embed_tokens",
"lm_head"
],
"gradient_accumulation_steps": 2,
"micro_batch_size": 1,
"num_epochs": 1,
"optimizer": "adamw_torch_fused",
"lr_scheduler": "cosine",
"learning_rate": 0.0002,
"train_on_inputs": false,
"group_by_length": false,
"bf16": "auto",
"tf32": true,
"gradient_checkpointing": true,
"logging_steps": 1,
"flash_attention": true,
"warmup_steps": 1,
"evals_per_epoch": 1,
"eval_max_new_tokens": 128,
"saves_per_epoch": 1,
"weight_decay": 0.0,
"special_tokens": {
"pad_token": "<|endoftext|>"
},
"max_steps": 20
},
"timeout": 100000
},
"config": {
"gpuTypeId": "NVIDIA GeForce RTX 4090",
"gpuCount": 1,
"containerDiskInGb": 200,
"env": [
{
"key": "TOKENIZER",
"value": ""
},
{
"key": "DISABLE_LOG_STATS",
"value": "true"
}
],
"allowedCudaVersions": [
"12.8",
"12.7",
"12.6",
"12.5",
"12.4"
]
}
}

View File

@@ -11,43 +11,43 @@
"hf_token": ""
},
"args": {
"base_model": "HuggingFaceTB/SmolLM2-135M",
"model_type": "AutoModelForCausalLM",
"base_model": "NousResearch/Meta-Llama-3-8B",
"model_type": "LlamaForCausalLM",
"tokenizer_type": "AutoTokenizer",
"load_in_4bit": true,
"load_in_8bit": true,
"load_in_4bit": false,
"strict": false,
"datasets": [
{
"path": "mhenrichsen/alpaca_2k_test",
"type": "alpaca",
"split": "train[:10%]"
"type": "alpaca"
}
],
"val_set_size": 0.02,
"val_set_size": 0.05,
"output_dir": "./outputs/lora-out",
"sequence_len": 4096,
"sample_packing": true,
"eval_sample_packing": false,
"pad_to_sequence_len": true,
"adapter": "qlora",
"adapter": "lora",
"lora_r": 32,
"lora_alpha": 64,
"lora_alpha": 16,
"lora_dropout": 0.05,
"lora_target_linear": true,
"lora_modules_to_save": [
"embed_tokens",
"lm_head"
],
"gradient_accumulation_steps": 2,
"micro_batch_size": 1,
"gradient_accumulation_steps": 4,
"micro_batch_size": 2,
"num_epochs": 1,
"optimizer": "adamw_torch_fused",
"optimizer": "adamw_bnb_8bit",
"lr_scheduler": "cosine",
"learning_rate": 0.0002,
"train_on_inputs": false,
"group_by_length": false,
"bf16": "auto",
"tf32": true,
"tf32": false,
"gradient_checkpointing": true,
"logging_steps": 1,
"flash_attention": true,
@@ -57,9 +57,8 @@
"saves_per_epoch": 1,
"weight_decay": 0.0,
"special_tokens": {
"pad_token": "<|endoftext|>"
},
"max_steps": 20
"pad_token": "<|end_of_text|>"
}
}
},
"timeout": 100000

View File

@@ -20,4 +20,4 @@ pytest -v --durations=10 -n1 /workspace/axolotl/tests/e2e/multigpu/patched/ \
--cov-report=xml:multigpu-coverage.xml
# Upload coverage to Codecov
codecov upload-process -t "${CODECOV_TOKEN}" -f multigpu-coverage.xml -F multigpu,docker-tests,pytorch-${PYTORCH_VERSION} || true
codecov upload-process -t $CODECOV_TOKEN -f multigpu-coverage.xml -F multigpu,docker-tests,pytorch-${PYTORCH_VERSION}

View File

@@ -32,8 +32,6 @@ tokenizer_legacy:
resize_token_embeddings_to_32x:
# Optional[bool] Whether to shrink the embeddings to len(tokenizer). By default, we won't shrink.
shrink_embeddings:
# Optional[bool] Don't upcast the embeddings to float32 when using PEFT. Useful for low-VRAM GPUs
embeddings_skip_upcast:
# Whether to load the model with randomly initialized weights. Useful for
# pre-training a model from scratch or debugging purposes.
random_init_weights:
@@ -75,12 +73,11 @@ load_in_8bit: true
load_in_4bit:
# Use CUDA bf16
bf16: true # bool or 'full' for `bf16_full_eval`, or 'auto' for automatic detection. require >=ampere
bf16: true # bool or 'full' for `bf16_full_eval`. require >=ampere
# Use CUDA fp16
fp16: true
# Use CUDA tf32
tf32: true # require >=ampere
# Note: if bf16 is set to 'auto', and fp16 is set to true, we will prefer the explicit fp16 setting
# No AMP (automatic mixed precision)
bfloat16: true # require >=ampere
@@ -157,10 +154,6 @@ datasets:
# Key containing the messages (default: "messages")
field_messages: messages
# Key containing the system message (default: "system")
# If the system message is not present in the dataset sample, it will be loaded from the field_system property.
field_system: system
# Mapping of properties from the input dataset to the chat template.
# (default: message_property_mappings={'role':'role', 'content':'content'})
# If a property exists in the template but not in this mapping, the system will attempt
@@ -187,14 +180,10 @@ datasets:
# adding a system turn with empty content.
drop_system_message:
# Optional[bool]. (for Qwen3 template only) Whether to split the assistant content based on a reasoning trace inside delimited tags
# See example at `docs/dataset-formats/conversation.qmd`
split_thinking:
# IMPORTANT: The following fields determine which parts of the conversation to train on.
# Priority order: message_field_training > message_field_training_detail > train_on_inputs or role in roles_to_train
# See examples at `docs/dataset-formats/conversation.qmd`
# Note: If the below 5 fields are empty, defaults to training only on the last message.
# Note: If the below 4 fields are set to empty, defaults to training only on the last message.
# Optional[List[str]]. Roles to train on. The tokens from these roles will be considered for the loss.
roles_to_train: ["assistant"] # default
@@ -203,13 +192,7 @@ datasets:
# - turn (default): train on the EOS token at the end of each trainable turn
# - last: train on the last EOS token in the conversation
# TIP: Please make sure that your `tokenizer.eos_token` is the same as the EOS/EOT token in the template. Otherwise, set `eos_token` under `special_tokens`.
train_on_eos: turn
# Optional[str]. Which EOT (End-of-Turn) tokens to train on in the conversation. Possible values are:
# - all: train on all EOT tokens
# - turn: train on the EOT token at the end of each trainable turn
# - last: train on the last EOT token in the conversation
# If not specified, defaults to the value of train_on_eos for backward compatibility.
train_on_eot:
train_on_eos: last
# The key in the message turn that indicates via boolean whether tokens of a turn should be considered for training. Useful to selectively train on certain turns besides the `roles_to_train`.
message_field_training: training
# The key in the message turn that contains the training details. Useful to selectively train on certain tokens in a turn.
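For orientation, here is a minimal sketch combining the turn-training fields documented in this hunk (values illustrative; `train_on_eot` exists only on the newer side of this diff):

```yaml
datasets:
  - path: ...
    type: chat_template
    roles_to_train: ["assistant"]     # loss only on assistant turns (default)
    train_on_eos: turn                # unmask EOS at the end of each trainable turn
    train_on_eot: turn                # falls back to train_on_eos when unset
    message_field_training: training  # per-turn boolean override
```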
@@ -292,17 +275,8 @@ process_reward_model:
chat_template: tokenizer_default
# custom jinja template for chat template. This will be only used if chat_template is set to `jinja` or `null` (in which case chat_template is automatically set to `jinja`). Default is null.
chat_template_jinja: null
# Optional[List[str]]. Custom EOT (End-of-Turn) tokens to mask/unmask during training.
# These tokens mark the boundaries between conversation turns.
# For example: ["/INST", "</s>", "[/SYSTEM_PROMPT]"]
# If not specified, defaults to just the model's eos_token.
# This is useful for templates that use multiple delimiter tokens.
eot_tokens:
# - "</s>"
# - "[/INST]"
# - "[/SYSTEM_PROMPT]"
# Changes the default system message
default_system_message: You are a helpful assistant. Please give a long and detailed answer. # Currently only supports chatml.
# Changes the default system message. Currently only supports chatml.
default_system_message: You are a helpful assistant. Please give a long and detailed answer.
# Axolotl attempts to save the dataset as an arrow after packing the data together so
# subsequent training attempts load faster, relative path
dataset_prepared_path: data/last_run_prepared
@@ -550,7 +524,7 @@ gradient_checkpointing: false
early_stopping_patience: 3
# Specify a scheduler and kwargs to use with the optimizer
lr_scheduler: # 'one_cycle' | 'rex' | 'log_sweep' | 'linear' | 'cosine_with_restarts' | 'polynomial' | 'constant' | 'constant_with_warmup' | 'inverse_sqrt' | 'reduce_lr_on_plateau' | 'cosine_with_min_lr' | 'warmup_stable_decay' | empty for cosine
lr_scheduler: # 'one_cycle' | 'rex' | 'log_sweep' | empty for cosine
lr_scheduler_kwargs:
cosine_min_lr_ratio: # decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of peak lr
cosine_constant_lr_ratio: # freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means start cosine_min_lr at 80% of training step (https://arxiv.org/pdf/2308.04014.pdf)
@@ -687,10 +661,8 @@ special_tokens:
# unk_token: "<unk>"
# pad_token: "[PAD]"
# Optional[list[str]]. Add extra tokens to the tokenizer.
# Add extra tokens.
tokens:
# - "<|startoftext|>"
# - "<|endoftext|>"
# Mapping token_id to new_token_string to override reserved added_tokens in the tokenizer.
# Only works for tokens that are not part of the base vocab (aka are added_tokens).

View File

@@ -49,8 +49,7 @@ sections = [
("Knowledge Distillation (KD)", "kd"),
("Liger Kernels", "liger"),
("Language Model Evaluation Harness (LM Eval)", "lm_eval"),
("Spectrum", "spectrum"),
("LLMCompressor", "llm_compressor")
("Spectrum", "spectrum")
]
for section_name, folder_name in sections:

View File

@@ -4,6 +4,18 @@ description: Conversation format for supervised fine-tuning.
order: 3
---
## sharegpt
::: {.callout-important}
ShareGPT is deprecated! Please see the [chat_template](#chat_template) section below.
:::
## pygmalion
```{.json filename="data.jsonl"}
{"conversations": [{"role": "...", "value": "..."}]}
```
## chat_template
The chat_template strategy uses a jinja2 template to convert a list of messages into a prompt. It supports the tokenizer's own template, one of the natively supported templates, or a custom jinja2 template.
@@ -52,7 +64,7 @@ We recommend checking the below examples for other use cases.
### Examples
1. (Legacy) Using the default chat template in the tokenizer_config.json on OpenAI messages format, training on only last message.
1. Using the default chat template in the tokenizer_config.json on OpenAI messages format, training on only last message.
```yaml
datasets:
@@ -97,55 +109,10 @@ datasets:
```
::: {.callout-important}
Please make sure that your `tokenizer.eos_token` is the same as the EOS (End-of-Sequence) token in the template. Otherwise, set `eos_token` under `special_tokens: `.
Please make sure that your `tokenizer.eos_token` is the same as the EOS/EOT token in the template. Otherwise, set `eos_token` under `special_tokens`.
:::
5. If you are using a template that has a different EOT (End-of-Turn) token from EOS token or multiple EOT tokens (like Mistral V7 Tekken), set the `eot_tokens: ` config. The handling of EOT tokens follows `train_on_eos: ` which defaults to turn.
```yaml
eot_tokens:
- "[/INST]"
# - "[/SYSTEM_PROMPT]"
datasets:
- path: ...
type: chat_template
# optional
train_on_eot: turn # defaults read from train_on_eos (which defaults to turn)
```
::: {.callout-tip}
See [config documentation](../config.qmd) for detailed explanations of "turn", "last", and "all" options for training on tokens.
:::
::: {.callout-note}
Using `eot_tokens` requires each token that exists in `chat_template` to be a single token in the tokenizer. Otherwise, the tokenizer will split the token and cause unexpected behavior.
You can add those tokens as new tokens under `tokens: ` or (recommended) override unused added_tokens via `added_tokens_overrides: `. See [config](../config.qmd) for more details.
:::
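As a sketch of the recommended route from the note above, overriding two unused reserved added_tokens so that each EOT string encodes as a single token (the token ids here are hypothetical; pick unused added_token ids from your tokenizer):

```yaml
added_tokens_overrides:   # token_id -> new token string
  128011: "[/INST]"
  128012: "[/SYSTEM_PROMPT]"
eot_tokens:
  - "[/INST]"
  - "[/SYSTEM_PROMPT]"
```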
6. Continuing from the previous example, if you want to train on all EOT token trainable turns but only last EOS token, set `train_on_eos: last`.
```yaml
eot_tokens:
- "[/INST]"
# ...
datasets:
- path: ...
type: chat_template
train_on_eos: last
train_on_eot: turn
```
::: {.callout-tip}
If the EOS token only appears at the end of a prompt, `train_on_eos: last` is equivalent to `train_on_eos: turn`. Therefore, you can generally leave both at their defaults and omit them.
:::
7. (Advanced) Using fine-grained control over tokens and turns to train in a conversation
5. (Advanced) Using fine-grained control over tokens and turns to train in a conversation
For a data sample that looks like:
@@ -195,43 +162,3 @@ datasets:
::: {.callout-tip}
It is not necessary to set both `message_field_training` and `message_field_training_detail` at once.
:::
8. (For Qwen3 template only) Enable reasoning split, where the reasoning is split from the content and passed as a separate field into the template.
```yaml
datasets:
- path: ...
type: chat_template
chat_template: qwen3
split_thinking: true
```
For example, the content can look like:
```json
{
"content": "<think>Some thinking outputs</think>Output after thinking."
}
```
After split, it will look like:
```json
{
"reasoning_content": "Some thinking outputs",
"content": "Output after thinking..."
}
```
## sharegpt
::: {.callout-important}
ShareGPT is deprecated! Please see the [chat_template](#chat_template) section.
:::
## pygmalion
```{.json filename="data.jsonl"}
{"conversations": [{"role": "...", "value": "..."}]}
```

View File

@@ -73,40 +73,10 @@ description: Frequently asked questions
> A: This is likely an empty turn.
**Q: The EOS token is incorrectly being masked or not being masked / `EOS token __ not found in chat template`.**
**Q: The EOS/EOT token is incorrectly being masked or not being masked.**
> A: There can be two reasons:
> 1. This is because of a mismatch between `tokenizer.eos_token` and the EOS token in the template. Please make sure to set `eos_token: ` under `special_tokens: ` to the same EOS token as in the template.
> 2. The EOS token is not in the template. Please check if your template is correct. As an example, the `phi_35` template does not use its dedicated EOS token `<|endoftext|>` at the end.
> A: This is because of a mismatch between `tokenizer.eos_token` and the EOS/EOT token in the template. Please make sure to set `eos_token` under `special_tokens` to the same EOS/EOT token as in the template.
**Q: "`chat_template` choice is `tokenizer_default` but tokenizer's `chat_template` is null. Please add a `chat_template` in tokenizer config"**
> A: This is because the tokenizer does not have a chat template. Please add a chat template in the tokenizer config. See [chat_template](dataset-formats/conversation.qmd#chat-template) for more details.
**Q: The EOT token(s) are incorrectly being masked or not being masked / `EOT token __ not found in chat template`.**
> A: There can be two reasons:
> 1. The EOT token is different from the EOS token and was not specified under `eot_tokens: `. Please set `eot_tokens: ` to the same EOT token(s) as in template.
> 2. There is more than one EOT token per turn in the template. Please raise an issue with examples as we recognize this as an edge case.
**Q: `EOT token encoding failed. Please check if the token is valid and can be encoded.`**
> A: There could be some issue with the tokenizer or unicode encoding. Please raise an issue with examples with the EOT token & tokenizer causing the issue.
**Q: `EOT token __ is encoded as multiple tokens.`**
> A: This is because the EOT token is encoded as multiple tokens which can cause unexpected behavior. Please add it under `tokens: ` or (recommended) override unused added_tokens via `added_tokens_overrides: `.
**Q: `Conflict between train_on_eos and train_on_eot. eos_token is in eot_tokens and train_on_eos != train_on_eot`**
> A: This happens when the EOS token is included in `eot_tokens: ` while `train_on_eos: ` and `train_on_eot: ` differ, which causes one setting to override the other. Please ensure that `train_on_eos: ` and `train_on_eot: ` match, or remove the EOS token from `eot_tokens: `.
**Q: If `eot_tokens: ` is not provided, what happens?**
> A: If `eot_tokens: ` is not provided, the default behavior is the same as before. EOS tokens used to delimit turns are masked/unmasked depending on whether the turn is trainable.
> Internally, `eot_tokens: tokenizer.eos_token` and `train_on_eot: train_on_eos` (which defaults to `turn`). This transition helps clarify the naming and behavior of EOT/EOS tokens.
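Taken together, the answers above correspond to a config sketch like this (token strings illustrative):

```yaml
special_tokens:
  eos_token: "</s>"   # match the EOS token your chat template emits
eot_tokens:
  - "[/INST]"         # EOT strings that differ from the EOS token
train_on_eos: turn
train_on_eot: turn    # keep equal to train_on_eos whenever eos_token appears in eot_tokens
```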

View File

@@ -164,7 +164,7 @@ Here is an example of a multi-modal dataset:
{
"role": "user",
"content": [
{"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"},
{"type": "image", "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"},
{"type": "text", "text": "Describe this image in detail."}
]
},

View File

@@ -502,7 +502,9 @@ The input format is a simple JSON input with customizable fields based on the ab
Check out our [GRPO cookbook](https://github.com/axolotl-ai-cloud/axolotl-cookbook/tree/main/grpo#training-an-r1-style-large-language-model-using-grpo).
:::
In the latest GRPO implementation, `vLLM` is used to significantly speed up trajectory generation during training. In this example, we're using 4 GPUs - 2 for training, and 2 for vLLM:
If you have multiple GPUs available, we recommend using `vLLM` with the `GRPOTrainer` to significantly speed up trajectory generation during training.
First, launch a `vLLM` server using `trl vllm-serve` - you may use a config file or CLI overrides to configure your vLLM server. In this example, we're using 4 GPUs - 2 for training, and 2 for vLLM:
::: {.callout-important}
Make sure you've installed the correct version of vLLM by including it as an extra when installing axolotl, e.g. `pip install axolotl[vllm]`.
@@ -537,10 +539,6 @@ Your `vLLM` instance will now attempt to spin up, and it's time to kick off trai
CUDA_VISIBLE_DEVICES=0,1 axolotl train grpo.yaml --num-processes 2
```
::: {.callout-note}
Due to TRL's implementation with vLLM, the vLLM instance must use the last N GPUs instead of the first N GPUs. This is why in the example above, we use `CUDA_VISIBLE_DEVICES=2,3` for the vLLM instance.
:::
#### Reward functions
GRPO uses custom reward functions and transformations. Please have them ready locally.
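As an illustration, here is a hypothetical `grpo.yaml` fragment wiring in a local reward function; the key names follow the pattern in these docs, but treat this as a sketch and check the current schema:

```yaml
rl: grpo
trl:
  use_vllm: true
  num_generations: 4
  reward_funcs:
    - rewards.format_reward  # resolves to format_reward() in a local rewards.py
```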

View File

@@ -59,7 +59,9 @@ gradient_checkpointing: false
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
sdp_attention:
flash_optimum:
gptq_groupsize:
gptq_model_v1:

View File

@@ -39,7 +39,8 @@ tf32: true
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: xformers
xformers_attention: true
flash_attention:
gptq_groupsize:
gptq_model_v1:
warmup_steps: 10

View File

@@ -45,8 +45,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -46,8 +46,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -45,8 +45,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -46,8 +46,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -45,8 +45,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -46,8 +46,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -49,8 +49,7 @@ tf32: true
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_ratio: 0.1
evals_per_epoch:

View File

@@ -112,7 +112,9 @@
"early_stopping_patience:\n",
"resume_from_checkpoint:\n",
"logging_steps: 1\n",
"attention: sdpa\n",
"xformers_attention:\n",
"flash_attention: false\n",
"sdp_attention: true\n",
"\n",
"warmup_steps: 1\n",
"max_steps: 25\n",

View File

@@ -52,8 +52,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch:

View File

@@ -55,8 +55,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch:

View File

@@ -39,8 +39,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch:

View File

@@ -35,8 +35,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 100
evals_per_epoch: 2

View File

@@ -59,8 +59,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 100
evals_per_epoch: 2

View File

@@ -43,7 +43,8 @@ tf32: true
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: xformers
xformers_attention: true
flash_attention:
gptq_groupsize:
gptq_model_v1:
warmup_steps: 40

View File

@@ -73,7 +73,8 @@ early_stopping_patience: 3
resume_from_checkpoint:
auto_resume_from_checkpoints: true
logging_steps: 1
attention: xformers
xformers_attention: true
flash_attention:
gptq_groupsize:
gptq_model_v1:
warmup_steps: 10

View File

@@ -40,7 +40,8 @@ tf32: true
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: xformers
xformers_attention: true
flash_attention:
gptq_groupsize:
gptq_model_v1:
warmup_steps: 40

View File

@@ -47,8 +47,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_ratio: 0.1
evals_per_epoch: 4

View File

@@ -53,8 +53,7 @@ tf32: true
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_ratio: 0.1
evals_per_epoch:

View File

@@ -43,8 +43,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_ratio: 0.1
evals_per_epoch:

View File

@@ -57,8 +57,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_ratio: 0.1
evals_per_epoch:

View File

@@ -51,7 +51,8 @@ gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
logging_steps: 1
attention: flash
flash_attention: true
eager_attention:
warmup_ratio: 0.1
evals_per_epoch: 1

View File

@@ -53,7 +53,8 @@ gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
logging_steps: 1
attention: flash
flash_attention: true
eager_attention:
warmup_ratio: 0.1
evals_per_epoch: 1

View File

@@ -36,7 +36,8 @@ tf32: true
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: xformers
xformers_attention: true
flash_attention:
gptq_groupsize:
gptq_model_v1:
warmup_steps: 10

View File

@@ -47,8 +47,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch:

View File

@@ -46,8 +46,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch:

View File

@@ -45,8 +45,7 @@ gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: true
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch: 1

View File

@@ -37,7 +37,8 @@ bf16: auto
tf32: true
resume_from_checkpoint:
logging_steps: 5
attention: xformers
xformers_attention: true
flash_attention:
gptq_groupsize:
gptq_model_v1:
warmup_steps: 20

View File

@@ -42,8 +42,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
flash_attn_cross_entropy: false
flash_attn_rms_norm: true
flash_attn_fuse_qkv: false

View File

@@ -53,7 +53,9 @@ tf32: true
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention:
sdp_attention:
flash_optimum:
warmup_steps: 100
evals_per_epoch: 4
saves_per_epoch: 1

View File

@@ -46,8 +46,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
flash_attn_cross_entropy: false
flash_attn_rms_norm: true
flash_attn_fuse_qkv: false

View File

@@ -45,8 +45,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -45,8 +45,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -48,8 +48,7 @@ gradient_checkpointing_kwargs:
use_reentrant: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -46,8 +46,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -48,8 +48,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -50,7 +50,8 @@ tf32: true
gradient_checkpointing: true
logging_steps: 1
attention: flash
flash_attention: true
eager_attention:
warmup_ratio: 0.1
evals_per_epoch: 1

View File

@@ -49,8 +49,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 100
evals_per_epoch: 2

View File

@@ -34,8 +34,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 100
evals_per_epoch: 2

View File

@@ -61,8 +61,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -56,8 +56,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -77,8 +77,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -53,8 +53,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -54,8 +54,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

View File

@@ -48,8 +48,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

View File

@@ -55,8 +55,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -48,8 +48,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

View File

@@ -49,8 +49,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -53,8 +53,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 20
evals_per_epoch: 4

View File

@@ -51,8 +51,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

View File

@@ -39,8 +39,7 @@ gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: true
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -48,8 +48,7 @@ gradient_checkpointing_kwargs:
use_reentrant: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -46,8 +46,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -1,77 +0,0 @@
base_model: neuralmagic/Sparse-Llama-3.1-8B-2of4
plugins:
- axolotl.integrations.llm_compressor.LLMCompressorPlugin
load_in_8bit: false
load_in_4bit: false
strict: false
datasets:
- path: tatsu-lab/alpaca
type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.05
output_dir: ./outputs/out
sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true
eval_sample_packing: false
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 8
micro_batch_size: 1
num_epochs: 1
optimizer: paged_adamw_8bit
lr_scheduler: cosine
learning_rate: 2e-5
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
early_stopping_patience:
resume_from_checkpoint:
logging_steps: 1
xformers_attention:
flash_attention: true
warmup_steps: 100
evals_per_epoch: 2
eval_table_size:
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
pad_token: <|end_of_text|>
llmcompressor:
recipe:
finetuning_stage:
finetuning_modifiers:
ConstantPruningModifier:
targets: [
're:.*q_proj.weight',
're:.*k_proj.weight',
're:.*v_proj.weight',
're:.*o_proj.weight',
're:.*gate_proj.weight',
're:.*up_proj.weight',
're:.*down_proj.weight',
]
start: 0
save_compressed: true

View File

@@ -46,7 +46,8 @@ tf32: true
gradient_checkpointing: true
logging_steps: 1
attention: flash
flash_attention: true
eager_attention:
warmup_ratio: 0.1
evals_per_epoch: 1

View File

@@ -39,7 +39,7 @@ tf32: true
gradient_checkpointing: false
resume_from_checkpoint:
logging_steps: 1
attention: eager
flash_attention:
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -42,8 +42,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
save_total_limit: 1
save_steps:

View File

@@ -36,8 +36,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -53,7 +53,8 @@ tf32: true
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: sdpa
flash_attention: false
sdp_attention: true
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

View File

@@ -54,8 +54,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

View File

@@ -71,7 +71,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: eager
flash_attention: false
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -51,8 +51,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

View File

@@ -59,8 +59,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

View File

@@ -48,7 +48,9 @@ tf32: true
gradient_checkpointing: true
logging_steps: 1
attention: eager # PixtralVisionModel does not support Flash Attention 2.0 yet.
flash_attention: false # PixtralVisionModel does not support Flash Attention 2.0 yet.
eager_attention:
warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1

View File

@@ -49,8 +49,7 @@ tf32: true
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

View File

@@ -51,8 +51,7 @@ tf32: true
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

View File

@@ -69,8 +69,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

View File

@@ -40,8 +40,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
save_total_limit: 1
save_steps:

View File

@@ -54,8 +54,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

View File

@@ -39,7 +39,7 @@ bf16: auto
tf32: true
resume_from_checkpoint:
logging_steps: 5
attention: eager
flash_attention:
gptq_groupsize:
gptq_model_v1:
warmup_steps: 20

View File

@@ -39,8 +39,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
gptq_groupsize:
gptq_model_v1:
warmup_steps: 20

View File

@@ -47,8 +47,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
gptq_groupsize:
gptq_model_v1:
warmup_steps: 20

View File

@@ -40,8 +40,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
gptq_groupsize:
gptq_model_v1:
warmup_steps: 20

View File

@@ -1,341 +0,0 @@
# Finetuning LLMs to output audio
In this example, we finetune canopylabs/orpheus-tts-0.1-pretrained (a Llama 3.2 3B model) to output audio.
The `finetune.yml` with the current settings will run on any Nvidia GPU with 45GB of VRAM or more. If you reduce the batch size, it can easily run on GPUs with under 24GB of VRAM.
## Dataset pre-processing for pre-training
If you are adding another voice in English, please jump ahead to finetuning pre-processing.
For this to work, we need to preprocess our dataset. Since we are expecting to output audio, we will need to add tokens to the tokenizer.
This code will download the SNAC model, add the correct tokens to the tokenizer, and upload the final dataset.
```python
import torch
from snac import SNAC
from datasets import load_dataset
from huggingface_hub import snapshot_download
from datasets import load_dataset
import random
import torchaudio.transforms as T
from transformers import AutoTokenizer
import os
my_original_dataset_name = "<huggingface-id-of-dataset-that-we-want-to-preprocess>"
name_to_push_dataset_to = "<huggingface-id-of-where-to-save-dataset>"
dsn = my_original_dataset_name
snapshot_download(
repo_id=dsn,
repo_type="dataset",
revision="main",
max_workers=64,
)
ds = load_dataset(dsn, split="train")
ds_sample_rate = ds[0]["audio"]["sampling_rate"]
model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
model = model.to("mps")
def tokenise_audio(waveform):
waveform = torch.from_numpy(waveform).unsqueeze(0)
waveform = waveform.to(dtype=torch.float32)
resample_transform = T.Resample(orig_freq=ds_sample_rate, new_freq=24000)
waveform = resample_transform(waveform)
waveform = waveform.unsqueeze(0).to("cuda")
#generate the codes from snac
with torch.inference_mode():
codes = model.encode(waveform)
all_codes = []
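# SNAC encodes at three temporal resolutions (1, 2, and 4 codes per frame);
# each frame is flattened into 7 interleaved tokens, with every position offset
# by a multiple of 4096 above 128266 so the ids land in the new audio vocab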
for i in range(codes[0].shape[1]):
all_codes.append(codes[0][0][i].item()+128266)
all_codes.append(codes[1][0][2*i].item()+128266+4096)
all_codes.append(codes[2][0][4*i].item()+128266+(2*4096))
all_codes.append(codes[2][0][(4*i)+1].item()+128266+(3*4096))
all_codes.append(codes[1][0][(2*i)+1].item()+128266+(4*4096))
all_codes.append(codes[2][0][(4*i)+2].item()+128266+(5*4096))
all_codes.append(codes[2][0][(4*i)+3].item()+128266+(6*4096))
return all_codes
def add_codes(example):
# Always initialize codes_list to None
codes_list = None
try:
answer_audio = example.get("audio")
# If there's a valid audio array, tokenise it
if answer_audio and "array" in answer_audio:
audio_array = answer_audio["array"]
codes_list = tokenise_audio(audio_array)
except Exception as e:
print(f"Skipping row due to error: {e}")
# Keep codes_list as None if we fail
example["codes_list"] = codes_list
return example
ds = ds.map(add_codes, remove_columns=["audio"])
#@title Load Tokenizer
tokeniser_length = 128256
start_of_text = 128000
end_of_text = 128009
start_of_speech = tokeniser_length + 1
end_of_speech = tokeniser_length + 2
start_of_human = tokeniser_length + 3
end_of_human = tokeniser_length + 4
start_of_ai = tokeniser_length + 5
end_of_ai = tokeniser_length + 6
pad_token = tokeniser_length + 7
audio_tokens_start = tokeniser_length + 10
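# ids 128257-128262 mark the speech/human/ai span boundaries, 128263 is padding,
# and audio codes start at audio_tokens_start = 128266, matching the +128266
# offsets applied in tokenise_audio above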
tokenizer_name = "canopylabs/orpheus-3b-0.1-pretrained"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
num_proc = os.cpu_count() - 2
ds = ds.filter(lambda x: x["codes_list"] is not None)
ds = ds.filter(lambda x: len(x["codes_list"]) > 0)
#@title Create Input Ids
def remove_duplicate_frames(example):
vals = example["codes_list"]
if len(vals) % 7 != 0:
raise ValueError("Input list length must be divisible by 7")
result = vals[:7]
removed_frames = 0
for i in range(7, len(vals), 7):
current_first = vals[i]
previous_first = result[-7]
if current_first != previous_first:
result.extend(vals[i:i+7])
else:
removed_frames += 1
example["codes_list"] = result
return example
ds = ds.map(remove_duplicate_frames, num_proc=num_proc)
def create_input_ids(example):
text_ids = tokenizer.encode(example['text'], add_special_tokens=True)
text_ids.append(end_of_text)
example["text_tokens"] = text_ids
input_ids = (
[start_of_human]
+ example["text_tokens"]
+ [end_of_human]
+ [start_of_ai]
+ [start_of_speech]
+ example["codes_list"]
+ [end_of_speech]
+ [end_of_ai]
)
example["input_ids"] = input_ids
example["labels"] = input_ids
example["attention_mask"] = [1] * len(input_ids)
return example
ds = ds.map(create_input_ids, num_proc=num_proc, remove_columns=["text", "codes_list"])
#@title Remove unnecessary columns
columns_to_keep = ["input_ids", "labels", "attention_mask"]
columns_to_remove = [col for col in ds.column_names if col not in columns_to_keep]
ds = ds.remove_columns(columns_to_remove)
ds.push_to_hub(name_to_push_dataset_to)
```
## Finetune pre-processing
Use this code to add a new voice.
```python
import torch
from snac import SNAC
from datasets import load_dataset
from huggingface_hub import snapshot_download
from datasets import load_dataset
import random
import torchaudio.transforms as T
from transformers import AutoTokenizer
import os
my_original_dataset_name = "<huggingface-id-of-dataset-that-we-want-to-preprocess>"
name_to_push_dataset_to = "<huggingface-id-of-where-to-save-dataset>"
dsn = my_original_dataset_name
snapshot_download(
repo_id=dsn,
repo_type="dataset",
revision="main",
max_workers=64,
)
ds = load_dataset(dsn, split="train")
ds_sample_rate = ds[0]["audio"]["sampling_rate"]
model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
model = model.to("mps")
def tokenise_audio(waveform):
waveform = torch.from_numpy(waveform).unsqueeze(0)
waveform = waveform.to(dtype=torch.float32)
resample_transform = T.Resample(orig_freq=ds_sample_rate, new_freq=24000)
waveform = resample_transform(waveform)
waveform = waveform.unsqueeze(0).to("cuda")
#generate the codes from snac
with torch.inference_mode():
codes = model.encode(waveform)
all_codes = []
for i in range(codes[0].shape[1]):
all_codes.append(codes[0][0][i].item()+128266)
all_codes.append(codes[1][0][2*i].item()+128266+4096)
all_codes.append(codes[2][0][4*i].item()+128266+(2*4096))
all_codes.append(codes[2][0][(4*i)+1].item()+128266+(3*4096))
all_codes.append(codes[1][0][(2*i)+1].item()+128266+(4*4096))
all_codes.append(codes[2][0][(4*i)+2].item()+128266+(5*4096))
all_codes.append(codes[2][0][(4*i)+3].item()+128266+(6*4096))
return all_codes
def add_codes(example):
# Always initialize codes_list to None
codes_list = None
try:
answer_audio = example.get("audio")
# If there's a valid audio array, tokenise it
if answer_audio and "array" in answer_audio:
audio_array = answer_audio["array"]
codes_list = tokenise_audio(audio_array)
except Exception as e:
print(f"Skipping row due to error: {e}")
# Keep codes_list as None if we fail
example["codes_list"] = codes_list
return example
ds = ds.map(add_codes, remove_columns=["audio"])
#@title Load Tokenizer
tokeniser_length = 128256
start_of_text = 128000
end_of_text = 128009
start_of_speech = tokeniser_length + 1
end_of_speech = tokeniser_length + 2
start_of_human = tokeniser_length + 3
end_of_human = tokeniser_length + 4
start_of_ai = tokeniser_length + 5
end_of_ai = tokeniser_length + 6
pad_token = tokeniser_length + 7
audio_tokens_start = tokeniser_length + 10
tokenizer_name = "canopylabs/orpheus-3b-0.1-pretrained"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
num_proc = os.cpu_count() - 2
ds = ds.filter(lambda x: x["codes_list"] is not None)
ds = ds.filter(lambda x: len(x["codes_list"]) > 0)
#@title Create Input Ids
def remove_duplicate_frames(example):
vals = example["codes_list"]
if len(vals) % 7 != 0:
raise ValueError("Input list length must be divisible by 7")
result = vals[:7]
removed_frames = 0
for i in range(7, len(vals), 7):
current_first = vals[i]
previous_first = result[-7]
if current_first != previous_first:
result.extend(vals[i:i+7])
else:
removed_frames += 1
example["codes_list"] = result
return example
ds = ds.map(remove_duplicate_frames, num_proc=num_proc)
tok_info = '''*** HERE you can modify the text prompt
i.e. if you wanted a multispeaker model like canopylabs/orpheus-3b-0.1-ft, you can pass:
f"{example["source"]}: {example["text"]}", as is passed.
'''
print(tok_info)
def create_input_ids(example):
text_ids = tokenizer.encode(f"{example['speaker_id']}: {example['text']}", add_special_tokens=True)
text_ids.append(end_of_text)
example["text_tokens"] = text_ids
input_ids = (
[start_of_human]
+ example["text_tokens"]
+ [end_of_human]
+ [start_of_ai]
+ [start_of_speech]
+ example["codes_list"]
+ [end_of_speech]
+ [end_of_ai]
)
example["input_ids"] = input_ids
example["labels"] = input_ids
example["attention_mask"] = [1] * len(input_ids)
return example
ds = ds.map(create_input_ids, num_proc=num_proc, remove_columns=["text", "codes_list"])
#@title Remove unnecessary columns
columns_to_keep = ["input_ids", "labels", "attention_mask"]
columns_to_remove = [col for col in ds.column_names if col not in columns_to_keep]
ds = ds.remove_columns(columns_to_remove)
ds.push_to_hub(name_to_push_dataset_to)
```
## Training
After preprocessing is done, fill in the blanks in `finetune.yml` and simply run `axolotl train finetune.yml`.
## Inference
For inference, please refer to the original [orpheus github](https://github.com/canopyai/Orpheus-TTS/tree/main).

View File

@@ -1,52 +0,0 @@
base_model: canopylabs/orpheus-3b-0.1-pretrained
hub_model_id: <your-hub-model-id>
plugins:
- axolotl.integrations.liger.LigerPlugin
liger_rope: true
liger_rms_norm: true
liger_glu_activation: true
liger_fused_linear_cross_entropy: true
datasets:
- path: <your-hf-dataset-id>
type: # leave empty to load pre-tokenized
dataset_prepared_path: last_run_prepared
val_set_size: 0.01
output_dir: ./outputs/out
sequence_len: 8192
sample_packing: true
pad_to_sequence_len: true
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 8
micro_batch_size: 4
num_epochs: 3
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 2e-5
bf16: auto
tf32: false
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_steps: 20
evals_per_epoch: 5
saves_per_epoch: 5
weight_decay: 0.05
special_tokens:
pad_token: <custom_token_7>

View File

@@ -48,8 +48,7 @@ gradient_checkpointing_kwargs:
use_reentrant: True
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 100
evals_per_epoch: 4

View File

@@ -51,8 +51,7 @@ gradient_checkpointing_kwargs:
use_reentrant: True
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 100
evals_per_epoch: 4

View File

@@ -48,8 +48,7 @@ gradient_checkpointing_kwargs:
use_reentrant: True
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 100
evals_per_epoch: 4

View File

@@ -49,8 +49,7 @@ gradient_checkpointing_kwargs:
use_reentrant: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 100
evals_per_epoch: 4

View File

@@ -44,8 +44,7 @@ gradient_checkpointing_kwargs:
use_reentrant: True
early_stopping_patience: 3
logging_steps: 1
attention: flash
flash_attention: true
eval_steps: 1000
save_steps: 5000

View File

@@ -46,7 +46,8 @@ tf32: true
gradient_checkpointing: true
logging_steps: 1
attention: eager # PixtralVisionModel does not support Flash Attention 2.0 yet
flash_attention: false # PixtralVisionModel does not support Flash Attention 2.0 yet
eager_attention:
warmup_ratio: 0.1
evals_per_epoch: 1

Some files were not shown because too many files have changed in this diff.