Compare commits


27 Commits

Author SHA1 Message Date
Wing Lian
6affbb1f85 move import of llmcompressor to reset session inside test 2025-04-30 18:10:44 -04:00
Wing Lian
0ed4b4c310 make sure to reset the session after each test 2025-04-30 17:21:53 -04:00
Wing Lian
f4a0f496a0 move decorator to test method instead of class 2025-04-30 17:21:53 -04:00
Wing Lian
82b16bd040 split llmcompressor from vllm checks 2025-04-30 17:21:53 -04:00
Wing Lian
fd5c985038 additional fixes for docker and saving compressed 2025-04-30 17:21:53 -04:00
Rahul Tuli
5246aebc04 Fix: Test
Signed-off-by: Rahul Tuli <rtuli@redhat.com>
2025-04-30 17:21:53 -04:00
Rahul Tuli
f4bcc71c86 Apply patch from @winglian
Signed-off-by: Rahul Tuli <rtuli@redhat.com>
2025-04-30 17:21:53 -04:00
Rahul Tuli
3a9e172272 Add: line about further optimizations using llmcompressor
Signed-off-by: Rahul Tuli <rtuli@redhat.com>
2025-04-30 17:21:53 -04:00
Rahul Tuli
372f0e137b Address Review Comments:
* deleted redundant docs/llm_compressor.qmd
* incorporated feedback in integration README.md
* added llmcompressor integration to docs/custom_integrations.qmd

Signed-off-by: Rahul Tuli <rtuli@redhat.com>
2025-04-30 17:21:52 -04:00
Rahul Tuli
17dffec71d Add: .qmd file 2025-04-30 17:21:52 -04:00
Rahul Tuli
3a8b637598 Tests, Style, Updates 2025-04-30 17:21:52 -04:00
Rahul Tuli
12cd09e6f5 Rebase and updates! 2025-04-30 17:21:52 -04:00
Rahul Tuli
fe82f62248 Add: llm_compressor integration documentation 2025-04-30 17:21:52 -04:00
Rahul Tuli
db31d7ad22 Move: LLMCompressorPlugin into it's own submodule 2025-04-30 17:21:52 -04:00
Rahul Tuli
eb7f2aa4b9 Update model config 2025-04-30 17:21:51 -04:00
Rahul Tuli
f80e36ddd2 Use: absolute import 2025-04-30 17:21:51 -04:00
Rahul Tuli
412d2ec6d0 Rename: sft.yaml to sparse-finetuning.yaml 2025-04-30 17:21:51 -04:00
Rahul Tuli
50fc5e6984 Add: llcompressor installable 2025-04-30 17:21:51 -04:00
Rahul Tuli
83a88b745f Address review comments from @markurtz 2025-04-30 17:21:51 -04:00
Rahul Tuli
8855bb115f Apply suggestions from @markurtz
Co-authored-by: Mark Kurtz <mark.j.kurtz@gmail.com>
2025-04-30 17:21:51 -04:00
Rahul Tuli
ef9543b371 Update llmcompressor version to latest 2025-04-30 17:21:51 -04:00
Rahul Tuli
25e701e885 Revert: TODO's 2025-04-30 17:21:50 -04:00
Rahul Tuli
891a21e599 Use: warning over warn 2025-04-30 17:21:50 -04:00
Rahul Tuli
8beb2f27ad pre commit hooks 2025-04-30 17:21:50 -04:00
Rahul Tuli
56ba66b60f Add:llmcompressor instalable 2025-04-30 17:21:50 -04:00
Rahul Tuli
13d4b865d6 Update: review comments! 2025-04-30 17:21:50 -04:00
Rahul Tuli
3da866b2b9 Add: SFTPlugin with llmcompressor 2025-04-30 17:21:50 -04:00
150 changed files with 417 additions and 2173 deletions

View File

@@ -30,7 +30,7 @@ jobs:
cuda_version: 12.6.3
python_version: "3.11"
pytorch: 2.7.0
axolotl_extras:
axolotl_extras: vllm
runs-on: axolotl-gpu-runner
steps:
- name: Checkout

View File

@@ -4,12 +4,6 @@ on:
pull_request:
types: [opened, synchronize, reopened]
# Run the workflow only when one of these files changes
paths:
- '**/*.md' # any Markdown file
- '**/*.qmd' # any Quarto file
- '_quarto.yaml'
permissions:
checks: write
contents: write

View File

@@ -44,98 +44,12 @@ jobs:
env:
SKIP: no-commit-to-branch
preload-cache:
name: Preload HF cache
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python_version: ["3.11"]
pytorch_version: ["2.6.0"]
timeout-minutes: 20
env:
AXOLOTL_IS_CI_CACHE_PRELOAD: "1"
steps:
- name: Check out repository code
uses: actions/checkout@v4
- name: Restore HF cache
id: hf-cache-restore
uses: actions/cache/restore@v4
with:
path: |
/home/runner/.cache/huggingface/hub/datasets--*
/home/runner/.cache/huggingface/hub/models--*
key: ${{ runner.os }}-hf-hub-cache-v2
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python_version }}
cache: 'pip' # caching pip dependencies
- name: upgrade pip
run: |
pip3 install --upgrade pip
pip3 install --upgrade packaging==23.2 setuptools==75.8.0 wheel
- name: Install PyTorch
run: |
pip3 install torch==${{ matrix.pytorch_version }}
- name: Install dependencies
run: |
pip3 show torch
pip3 install --no-build-isolation -U -e .
python scripts/unsloth_install.py | sh
python scripts/cutcrossentropy_install.py | sh
pip3 install -r requirements-dev.txt -r requirements-tests.txt
- name: Make sure PyTorch version wasn't clobbered
run: |
python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
- name: Ensure axolotl CLI was installed
run: |
axolotl --help
- name: Pre-Download dataset fixture
run: |
huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
- name: Run tests
run: |
pytest -v tests/conftest.py
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v5
with:
token: ${{ secrets.CODECOV_TOKEN }}
files: ./coverage.xml
flags: unittests,pytorch-${{ matrix.pytorch_version }}
fail_ci_if_error: false
- name: cleanup pip cache
run: |
find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
- name: Save HF cache
id: hf-cache
uses: actions/cache/save@v4
with:
path: |
/home/runner/.cache/huggingface/hub/datasets--*
/home/runner/.cache/huggingface/hub/models--*
key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
pytest:
name: PyTest
runs-on: ubuntu-latest
needs: [preload-cache]
strategy:
fail-fast: false
max-parallel: 2
matrix:
python_version: ["3.11"]
pytorch_version: ["2.5.1", "2.6.0", "2.7.0"]
@@ -207,12 +121,21 @@ jobs:
run: |
find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
- name: Save HF cache
id: hf-cache
uses: actions/cache/save@v4
with:
path: |
/home/runner/.cache/huggingface/hub/datasets--*
/home/runner/.cache/huggingface/hub/models--*
key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
pytest-sdist:
name: PyTest from Source Dist
runs-on: ubuntu-latest
needs: [preload-cache]
strategy:
fail-fast: false
max-parallel: 1
matrix:
python_version: ["3.11"]
pytorch_version: ["2.5.1", "2.6.0", "2.7.0"]
@@ -276,6 +199,15 @@ jobs:
run: |
find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
- name: Save HF cache
id: hf-cache
uses: actions/cache/save@v4
with:
path: |
/home/runner/.cache/huggingface/hub/datasets--*
/home/runner/.cache/huggingface/hub/models--*
key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
docker-e2e-tests-1st:
if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' }}
# this job needs to be run on self-hosted GPU runners...

View File

@@ -1,90 +0,0 @@
{
  "tests": [
    {
      "name": "quick_smoke_test_sft",
      "input": {
        "user_id": "user",
        "model_id": "llama-test",
        "run_id": "llama-test",
        "credentials": {
          "wandb_api_key": "",
          "hf_token": ""
        },
        "args": {
          "base_model": "HuggingFaceTB/SmolLM2-135M",
          "model_type": "AutoModelForCausalLM",
          "tokenizer_type": "AutoTokenizer",
          "load_in_4bit": true,
          "strict": false,
          "datasets": [
            {
              "path": "mhenrichsen/alpaca_2k_test",
              "type": "alpaca",
              "split": "train[:10%]"
            }
          ],
          "val_set_size": 0.02,
          "output_dir": "./outputs/lora-out",
          "sequence_len": 4096,
          "sample_packing": true,
          "eval_sample_packing": false,
          "pad_to_sequence_len": true,
          "adapter": "qlora",
          "lora_r": 32,
          "lora_alpha": 64,
          "lora_dropout": 0.05,
          "lora_target_linear": true,
          "lora_modules_to_save": [
            "embed_tokens",
            "lm_head"
          ],
          "gradient_accumulation_steps": 2,
          "micro_batch_size": 1,
          "num_epochs": 1,
          "optimizer": "adamw_torch_fused",
          "lr_scheduler": "cosine",
          "learning_rate": 0.0002,
          "train_on_inputs": false,
          "group_by_length": false,
          "bf16": "auto",
          "tf32": true,
          "gradient_checkpointing": true,
          "logging_steps": 1,
          "flash_attention": true,
          "warmup_steps": 1,
          "evals_per_epoch": 1,
          "eval_max_new_tokens": 128,
          "saves_per_epoch": 1,
          "weight_decay": 0.0,
          "special_tokens": {
            "pad_token": "<|endoftext|>"
          },
          "max_steps": 20
        }
      },
      "timeout": 100000
    }
  ],
  "config": {
    "gpuTypeId": "NVIDIA GeForce RTX 4090",
    "gpuCount": 1,
    "containerDiskInGb": 200,
    "env": [
      {
        "key": "TOKENIZER",
        "value": ""
      },
      {
        "key": "DISABLE_LOG_STATS",
        "value": "true"
      }
    ],
    "allowedCudaVersions": [
      "12.8",
      "12.7",
      "12.6",
      "12.5",
      "12.4"
    ]
  }
}

View File

@@ -32,8 +32,6 @@ tokenizer_legacy:
resize_token_embeddings_to_32x:
# Optional[bool] Whether to shrink the embeddings to len(tokenizer). By default, we won't shrink.
shrink_embeddings:
# Optional[bool] Don't upcast the embeddings to float32 when using PEFT. Useful for low-VRAM GPUs
embeddings_skip_upcast:
# Whether to load the model with randomly initialized weights. Useful for
# pre-training a model from scratch or debugging purposes.
random_init_weights:
@@ -75,12 +73,11 @@ load_in_8bit: true
load_in_4bit:
# Use CUDA bf16
bf16: true # bool or 'full' for `bf16_full_eval`, or 'auto' for automatic detection. require >=ampere
bf16: true # bool or 'full' for `bf16_full_eval`. require >=ampere
# Use CUDA fp16
fp16: true
# Use CUDA tf32
tf32: true # require >=ampere
# Note: if bf16 is set to 'auto', and fp16 is set to true, we will prefer the explict fp16 setting
# No AMP (automatic mixed precision)
bfloat16: true # require >=ampere
@@ -187,8 +184,8 @@ datasets:
# adding a system turn with empty content.
drop_system_message:
# Optional[bool]. (for Qwen3 template only) Whether to split the assistant content based on a reasoning trace inside delimited tags
# See example at `docs/dataset-formats/conversation.qmd`
# Optional[bool]. Whether to split the assistant turn based on a reasoning trace inside delimited tags
# defaults to False
split_thinking:
# IMPORTANT: The following fields determine which parts of the conversation to train on.
@@ -550,7 +547,7 @@ gradient_checkpointing: false
early_stopping_patience: 3
# Specify a scheduler and kwargs to use with the optimizer
lr_scheduler: # 'one_cycle' | 'rex' | 'log_sweep' | 'linear' | 'cosine_with_restarts' | 'polynomial' | 'constant' | 'constant_with_warmup' | 'inverse_sqrt' | 'reduce_lr_on_plateau' | 'cosine_with_min_lr' | 'warmup_stable_decay' | empty for cosine
lr_scheduler: # 'one_cycle' | 'rex' | 'log_sweep' | empty for cosine
lr_scheduler_kwargs:
cosine_min_lr_ratio: # decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of peak lr
cosine_constant_lr_ratio: # freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means start cosine_min_lr at 80% of training step (https://arxiv.org/pdf/2308.04014.pdf)

View File

@@ -196,34 +196,6 @@ datasets:
It is not necessary to set both `message_field_training` and `message_field_training_detail` at once.
:::
8. (For Qwen3 template only) Enable reasoning split, where the reasoning is split from the content and passed as a separate field into the template.
```yaml
datasets:
  - path: ...
    type: chat_template
    chat_template: qwen3
    split_thinking: true
```
For example, the content can look like:
```json
{
  "content": "<think>Some thinking outputs</think>Output after thinking."
}
```
After split, it will look like:
```json
{
  "reasoning_content": "Some thinking outputs",
  "content": "Output after thinking..."
}
```
## sharegpt
::: {.callout-important}

View File

@@ -59,7 +59,9 @@ gradient_checkpointing: false
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
sdp_attention:
flash_optimum:
gptq_groupsize:
gptq_model_v1:

View File

@@ -39,7 +39,8 @@ tf32: true
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: xformers
xformers_attention: true
flash_attention:
gptq_groupsize:
gptq_model_v1:
warmup_steps: 10

View File

@@ -45,8 +45,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -46,8 +46,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -45,8 +45,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -46,8 +46,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -45,8 +45,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -46,8 +46,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -49,8 +49,7 @@ tf32: true
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_ratio: 0.1
evals_per_epoch:

View File

@@ -112,7 +112,9 @@
"early_stopping_patience:\n",
"resume_from_checkpoint:\n",
"logging_steps: 1\n",
"attention: sdpa\n",
"xformers_attention:\n",
"flash_attention: false\n",
"sdp_attention: true\n",
"\n",
"warmup_steps: 1\n",
"max_steps: 25\n",

View File

@@ -52,8 +52,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch:

View File

@@ -55,8 +55,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch:

View File

@@ -39,8 +39,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch:

View File

@@ -35,8 +35,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 100
evals_per_epoch: 2

View File

@@ -59,8 +59,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 100
evals_per_epoch: 2

View File

@@ -43,7 +43,8 @@ tf32: true
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: xformers
xformers_attention: true
flash_attention:
gptq_groupsize:
gptq_model_v1:
warmup_steps: 40

View File

@@ -73,7 +73,8 @@ early_stopping_patience: 3
resume_from_checkpoint:
auto_resume_from_checkpoints: true
logging_steps: 1
attention: xformers
xformers_attention: true
flash_attention:
gptq_groupsize:
gptq_model_v1:
warmup_steps: 10

View File

@@ -40,7 +40,8 @@ tf32: true
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: xformers
xformers_attention: true
flash_attention:
gptq_groupsize:
gptq_model_v1:
warmup_steps: 40

View File

@@ -47,8 +47,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_ratio: 0.1
evals_per_epoch: 4

View File

@@ -53,8 +53,7 @@ tf32: true
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_ratio: 0.1
evals_per_epoch:

View File

@@ -43,8 +43,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_ratio: 0.1
evals_per_epoch:

View File

@@ -57,8 +57,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_ratio: 0.1
evals_per_epoch:

View File

@@ -51,7 +51,8 @@ gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
logging_steps: 1
attention: flash
flash_attention: true
eager_attention:
warmup_ratio: 0.1
evals_per_epoch: 1

View File

@@ -53,7 +53,8 @@ gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
logging_steps: 1
attention: flash
flash_attention: true
eager_attention:
warmup_ratio: 0.1
evals_per_epoch: 1

View File

@@ -36,7 +36,8 @@ tf32: true
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: xformers
xformers_attention: true
flash_attention:
gptq_groupsize:
gptq_model_v1:
warmup_steps: 10

View File

@@ -47,8 +47,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch:

View File

@@ -46,8 +46,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch:

View File

@@ -45,8 +45,7 @@ gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: true
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch: 1

View File

@@ -37,7 +37,8 @@ bf16: auto
tf32: true
resume_from_checkpoint:
logging_steps: 5
attention: xformers
xformers_attention: true
flash_attention:
gptq_groupsize:
gptq_model_v1:
warmup_steps: 20

View File

@@ -42,8 +42,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
flash_attn_cross_entropy: false
flash_attn_rms_norm: true
flash_attn_fuse_qkv: false

View File

@@ -53,7 +53,9 @@ tf32: true
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention:
sdp_attention:
flash_optimum:
warmup_steps: 100
evals_per_epoch: 4
saves_per_epoch: 1

View File

@@ -46,8 +46,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
flash_attn_cross_entropy: false
flash_attn_rms_norm: true
flash_attn_fuse_qkv: false

View File

@@ -45,8 +45,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -45,8 +45,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -48,8 +48,7 @@ gradient_checkpointing_kwargs:
use_reentrant: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -46,8 +46,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -48,8 +48,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -50,7 +50,8 @@ tf32: true
gradient_checkpointing: true
logging_steps: 1
attention: flash
flash_attention: true
eager_attention:
warmup_ratio: 0.1
evals_per_epoch: 1

View File

@@ -49,8 +49,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 100
evals_per_epoch: 2

View File

@@ -34,8 +34,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 100
evals_per_epoch: 2

View File

@@ -61,8 +61,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -56,8 +56,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -77,8 +77,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -53,8 +53,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -54,8 +54,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

View File

@@ -48,8 +48,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

View File

@@ -55,8 +55,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -48,8 +48,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

View File

@@ -49,8 +49,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -53,8 +53,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 20
evals_per_epoch: 4

View File

@@ -51,8 +51,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

View File

@@ -39,8 +39,7 @@ gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: true
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -48,8 +48,7 @@ gradient_checkpointing_kwargs:
use_reentrant: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -46,8 +46,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -46,7 +46,8 @@ tf32: true
gradient_checkpointing: true
logging_steps: 1
attention: flash
flash_attention: true
eager_attention:
warmup_ratio: 0.1
evals_per_epoch: 1

View File

@@ -39,7 +39,7 @@ tf32: true
gradient_checkpointing: false
resume_from_checkpoint:
logging_steps: 1
attention: eager
flash_attention:
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -42,8 +42,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
save_total_limit: 1
save_steps:

View File

@@ -36,8 +36,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -53,7 +53,8 @@ tf32: true
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: sdpa
flash_attention: false
sdp_attention: true
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

View File

@@ -54,8 +54,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

View File

@@ -71,7 +71,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: eager
flash_attention: false
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -51,8 +51,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

View File

@@ -59,8 +59,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

View File

@@ -48,7 +48,9 @@ tf32: true
gradient_checkpointing: true
logging_steps: 1
attention: eager # PixtralVisionModel does not support Flash Attention 2.0 yet.
flash_attention: false # PixtralVisionModel does not support Flash Attention 2.0 yet.
eager_attention:
warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1

View File

@@ -49,8 +49,7 @@ tf32: true
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

View File

@@ -51,8 +51,7 @@ tf32: true
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

View File

@@ -69,8 +69,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

View File

@@ -40,8 +40,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
save_total_limit: 1
save_steps:

View File

@@ -54,8 +54,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

View File

@@ -39,7 +39,7 @@ bf16: auto
tf32: true
resume_from_checkpoint:
logging_steps: 5
attention: eager
flash_attention:
gptq_groupsize:
gptq_model_v1:
warmup_steps: 20

View File

@@ -39,8 +39,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
gptq_groupsize:
gptq_model_v1:
warmup_steps: 20

View File

@@ -47,8 +47,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
gptq_groupsize:
gptq_model_v1:
warmup_steps: 20

View File

@@ -40,8 +40,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
gptq_groupsize:
gptq_model_v1:
warmup_steps: 20

View File

@@ -1,341 +0,0 @@
# Finetuning LLMs to output audio
In this example, we finetune canopylabs/orpheus-3b-0.1-pretrained (a Llama 3.2 3B model) to output audio.

The `finetune.yml` with the current settings will run on any NVIDIA GPU with 45GB of VRAM or more. If you lower the batch size, it can easily run on a GPU with less than 24GB.

## Dataset pre-processing for pre-training

If you are adding another voice in English, skip ahead to the finetuning pre-processing section.

For this to work, we need to preprocess our dataset. Since the model is expected to output audio, we need to add audio tokens to the tokenizer. The code below downloads the SNAC model, encodes the audio into the corresponding tokens, and uploads the final dataset.
```python
import os

import torch
import torchaudio.transforms as T
from datasets import load_dataset
from huggingface_hub import snapshot_download
from snac import SNAC
from transformers import AutoTokenizer

my_original_dataset_name = "<huggingface-id-of-dataset-that-we-want-to-preprocess>"
name_to_push_dataset_to = "<huggingface-id-of-where-to-save-dataset>"

dsn = my_original_dataset_name
snapshot_download(
    repo_id=dsn,
    repo_type="dataset",
    revision="main",
    max_workers=64,
)
ds = load_dataset(dsn, split="train")
ds_sample_rate = ds[0]["audio"]["sampling_rate"]

model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
model = model.to("cuda")  # keep the model on the same device as the audio tensors below


def tokenise_audio(waveform):
    waveform = torch.from_numpy(waveform).unsqueeze(0)
    waveform = waveform.to(dtype=torch.float32)
    resample_transform = T.Resample(orig_freq=ds_sample_rate, new_freq=24000)
    waveform = resample_transform(waveform)
    waveform = waveform.unsqueeze(0).to("cuda")

    # generate the codes from snac
    with torch.inference_mode():
        codes = model.encode(waveform)

    # SNAC yields 3 codebook layers at 1x, 2x and 4x temporal resolution;
    # interleave them into 7 tokens per frame, shifting each of the 7
    # positions into its own 4096-wide id range starting at 128266.
    all_codes = []
    for i in range(codes[0].shape[1]):
        all_codes.append(codes[0][0][i].item() + 128266)
        all_codes.append(codes[1][0][2 * i].item() + 128266 + 4096)
        all_codes.append(codes[2][0][4 * i].item() + 128266 + (2 * 4096))
        all_codes.append(codes[2][0][(4 * i) + 1].item() + 128266 + (3 * 4096))
        all_codes.append(codes[1][0][(2 * i) + 1].item() + 128266 + (4 * 4096))
        all_codes.append(codes[2][0][(4 * i) + 2].item() + 128266 + (5 * 4096))
        all_codes.append(codes[2][0][(4 * i) + 3].item() + 128266 + (6 * 4096))
    return all_codes


def add_codes(example):
    # Always initialize codes_list to None
    codes_list = None
    try:
        answer_audio = example.get("audio")
        # If there's a valid audio array, tokenise it
        if answer_audio and "array" in answer_audio:
            audio_array = answer_audio["array"]
            codes_list = tokenise_audio(audio_array)
    except Exception as e:
        print(f"Skipping row due to error: {e}")
        # Keep codes_list as None if we fail
    example["codes_list"] = codes_list
    return example


ds = ds.map(add_codes, remove_columns=["audio"])

# @title Load Tokenizer
tokeniser_length = 128256
start_of_text = 128000
end_of_text = 128009

start_of_speech = tokeniser_length + 1
end_of_speech = tokeniser_length + 2
start_of_human = tokeniser_length + 3
end_of_human = tokeniser_length + 4
start_of_ai = tokeniser_length + 5
end_of_ai = tokeniser_length + 6
pad_token = tokeniser_length + 7
audio_tokens_start = tokeniser_length + 10

tokenizer_name = "canopylabs/orpheus-3b-0.1-pretrained"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
num_proc = os.cpu_count() - 2

ds = ds.filter(lambda x: x["codes_list"] is not None)
ds = ds.filter(lambda x: len(x["codes_list"]) > 0)


# @title Create Input Ids
def remove_duplicate_frames(example):
    vals = example["codes_list"]
    if len(vals) % 7 != 0:
        raise ValueError("Input list length must be divisible by 7")

    result = vals[:7]
    removed_frames = 0
    for i in range(7, len(vals), 7):
        current_first = vals[i]
        previous_first = result[-7]
        if current_first != previous_first:
            result.extend(vals[i : i + 7])
        else:
            removed_frames += 1

    example["codes_list"] = result
    return example


ds = ds.map(remove_duplicate_frames, num_proc=num_proc)


def create_input_ids(example):
    text_ids = tokenizer.encode(example["text"], add_special_tokens=True)
    text_ids.append(end_of_text)
    example["text_tokens"] = text_ids
    input_ids = (
        [start_of_human]
        + example["text_tokens"]
        + [end_of_human]
        + [start_of_ai]
        + [start_of_speech]
        + example["codes_list"]
        + [end_of_speech]
        + [end_of_ai]
    )
    example["input_ids"] = input_ids
    example["labels"] = input_ids
    example["attention_mask"] = [1] * len(input_ids)
    return example


ds = ds.map(create_input_ids, num_proc=num_proc, remove_columns=["text", "codes_list"])

# @title Remove unnecessary columns
columns_to_keep = ["input_ids", "labels", "attention_mask"]
columns_to_remove = [col for col in ds.column_names if col not in columns_to_keep]
ds = ds.remove_columns(columns_to_remove)

ds.push_to_hub(name_to_push_dataset_to)
```
## Finetune pre-processing

Use this code to add a new voice.
```python
import os

import torch
import torchaudio.transforms as T
from datasets import load_dataset
from huggingface_hub import snapshot_download
from snac import SNAC
from transformers import AutoTokenizer

my_original_dataset_name = "<huggingface-id-of-dataset-that-we-want-to-preprocess>"
name_to_push_dataset_to = "<huggingface-id-of-where-to-save-dataset>"

dsn = my_original_dataset_name
snapshot_download(
    repo_id=dsn,
    repo_type="dataset",
    revision="main",
    max_workers=64,
)
ds = load_dataset(dsn, split="train")
ds_sample_rate = ds[0]["audio"]["sampling_rate"]

model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
model = model.to("cuda")  # keep the model on the same device as the audio tensors below


def tokenise_audio(waveform):
    waveform = torch.from_numpy(waveform).unsqueeze(0)
    waveform = waveform.to(dtype=torch.float32)
    resample_transform = T.Resample(orig_freq=ds_sample_rate, new_freq=24000)
    waveform = resample_transform(waveform)
    waveform = waveform.unsqueeze(0).to("cuda")

    # generate the codes from snac
    with torch.inference_mode():
        codes = model.encode(waveform)

    # SNAC yields 3 codebook layers at 1x, 2x and 4x temporal resolution;
    # interleave them into 7 tokens per frame, shifting each of the 7
    # positions into its own 4096-wide id range starting at 128266.
    all_codes = []
    for i in range(codes[0].shape[1]):
        all_codes.append(codes[0][0][i].item() + 128266)
        all_codes.append(codes[1][0][2 * i].item() + 128266 + 4096)
        all_codes.append(codes[2][0][4 * i].item() + 128266 + (2 * 4096))
        all_codes.append(codes[2][0][(4 * i) + 1].item() + 128266 + (3 * 4096))
        all_codes.append(codes[1][0][(2 * i) + 1].item() + 128266 + (4 * 4096))
        all_codes.append(codes[2][0][(4 * i) + 2].item() + 128266 + (5 * 4096))
        all_codes.append(codes[2][0][(4 * i) + 3].item() + 128266 + (6 * 4096))
    return all_codes


def add_codes(example):
    # Always initialize codes_list to None
    codes_list = None
    try:
        answer_audio = example.get("audio")
        # If there's a valid audio array, tokenise it
        if answer_audio and "array" in answer_audio:
            audio_array = answer_audio["array"]
            codes_list = tokenise_audio(audio_array)
    except Exception as e:
        print(f"Skipping row due to error: {e}")
        # Keep codes_list as None if we fail
    example["codes_list"] = codes_list
    return example


ds = ds.map(add_codes, remove_columns=["audio"])

# @title Load Tokenizer
tokeniser_length = 128256
start_of_text = 128000
end_of_text = 128009

start_of_speech = tokeniser_length + 1
end_of_speech = tokeniser_length + 2
start_of_human = tokeniser_length + 3
end_of_human = tokeniser_length + 4
start_of_ai = tokeniser_length + 5
end_of_ai = tokeniser_length + 6
pad_token = tokeniser_length + 7
audio_tokens_start = tokeniser_length + 10

tokenizer_name = "canopylabs/orpheus-3b-0.1-pretrained"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
num_proc = os.cpu_count() - 2

ds = ds.filter(lambda x: x["codes_list"] is not None)
ds = ds.filter(lambda x: len(x["codes_list"]) > 0)


# @title Create Input Ids
def remove_duplicate_frames(example):
    vals = example["codes_list"]
    if len(vals) % 7 != 0:
        raise ValueError("Input list length must be divisible by 7")

    result = vals[:7]
    removed_frames = 0
    for i in range(7, len(vals), 7):
        current_first = vals[i]
        previous_first = result[-7]
        if current_first != previous_first:
            result.extend(vals[i : i + 7])
        else:
            removed_frames += 1

    example["codes_list"] = result
    return example


ds = ds.map(remove_duplicate_frames, num_proc=num_proc)

tok_info = """*** HERE you can modify the text prompt.
E.g. if you wanted a multispeaker model like canopylabs/orpheus-3b-0.1-ft, you can
prefix the text with a speaker name, f"{example['speaker_id']}: {example['text']}",
as is done below.
"""
print(tok_info)


def create_input_ids(example):
    text_ids = tokenizer.encode(
        f"{example['speaker_id']}: {example['text']}", add_special_tokens=True
    )
    text_ids.append(end_of_text)
    example["text_tokens"] = text_ids
    input_ids = (
        [start_of_human]
        + example["text_tokens"]
        + [end_of_human]
        + [start_of_ai]
        + [start_of_speech]
        + example["codes_list"]
        + [end_of_speech]
        + [end_of_ai]
    )
    example["input_ids"] = input_ids
    example["labels"] = input_ids
    example["attention_mask"] = [1] * len(input_ids)
    return example


ds = ds.map(create_input_ids, num_proc=num_proc, remove_columns=["text", "codes_list"])

# @title Remove unnecessary columns
columns_to_keep = ["input_ids", "labels", "attention_mask"]
columns_to_remove = [col for col in ds.column_names if col not in columns_to_keep]
ds = ds.remove_columns(columns_to_remove)

ds.push_to_hub(name_to_push_dataset_to)
```
## Training

After preprocessing is done, fill in the placeholders in `finetune.yml` (see the sketch below) and run `axolotl train finetune.yml`.
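For reference, a sketch of the placeholders to fill in; the values below are illustrative, not defaults:

```yaml
hub_model_id: your-username/orpheus-voice-ft        # where the trained model is pushed
datasets:
  - path: your-username/your-preprocessed-dataset   # the dataset pushed by the script above
    type: # leave empty to load the pre-tokenized dataset as-is
```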
## Inference

For inference, please refer to the original [Orpheus GitHub repository](https://github.com/canopyai/Orpheus-TTS/tree/main).
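To give a rough idea of what inference involves, here is a minimal decoding sketch, not taken from that repository: it assumes `generated_ids` is a hypothetical list holding only the audio tokens emitted between `start_of_speech` and `end_of_speech`, reverses the 7-token frame layout produced by `tokenise_audio` above, and decodes the result with SNAC.

```python
import torch
from snac import SNAC

snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").to("cuda")


def frames_to_codes(generated_ids, base=128266, device="cuda"):
    # Undo the per-position offset: position j of each 7-token frame was
    # shifted into its own 4096-wide range starting at base + j * 4096.
    assert len(generated_ids) % 7 == 0, "expected whole 7-token frames"
    l0, l1, l2 = [], [], []
    for i in range(0, len(generated_ids), 7):
        f = [tok - base - j * 4096 for j, tok in enumerate(generated_ids[i : i + 7])]
        l0.append(f[0])                       # coarse layer: 1 code per frame
        l1.extend([f[1], f[4]])               # middle layer: 2 codes per frame
        l2.extend([f[2], f[3], f[5], f[6]])   # fine layer: 4 codes per frame
    return [
        torch.tensor(layer, dtype=torch.long, device=device).unsqueeze(0)
        for layer in (l0, l1, l2)
    ]


codes = frames_to_codes(generated_ids)  # generated_ids: hypothetical model output
with torch.inference_mode():
    waveform = snac_model.decode(codes)  # (1, 1, num_samples) at 24 kHz
```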

View File

@@ -1,52 +0,0 @@
base_model: canopylabs/orpheus-3b-0.1-pretrained
hub_model_id: <your-hub-model-id>

plugins:
  - axolotl.integrations.liger.LigerPlugin
liger_rope: true
liger_rms_norm: true
liger_glu_activation: true
liger_fused_linear_cross_entropy: true

datasets:
  - path: <your-hf-dataset-id>
    type: # leave empty to load pre-tokenized
dataset_prepared_path: last_run_prepared
val_set_size: 0.01
output_dir: ./outputs/out

sequence_len: 8192
sample_packing: true
pad_to_sequence_len: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 8
micro_batch_size: 4
num_epochs: 3
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 2e-5

bf16: auto
tf32: false

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_steps: 20
evals_per_epoch: 5
saves_per_epoch: 5
weight_decay: 0.05

special_tokens:
  pad_token: <custom_token_7>

View File

@@ -48,8 +48,7 @@ gradient_checkpointing_kwargs:
use_reentrant: True
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 100
evals_per_epoch: 4

View File

@@ -51,8 +51,7 @@ gradient_checkpointing_kwargs:
use_reentrant: True
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 100
evals_per_epoch: 4

View File

@@ -48,8 +48,7 @@ gradient_checkpointing_kwargs:
use_reentrant: True
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 100
evals_per_epoch: 4

View File

@@ -49,8 +49,7 @@ gradient_checkpointing_kwargs:
use_reentrant: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 100
evals_per_epoch: 4

View File

@@ -44,8 +44,7 @@ gradient_checkpointing_kwargs:
use_reentrant: True
early_stopping_patience: 3
logging_steps: 1
attention: flash
flash_attention: true
eval_steps: 1000
save_steps: 5000

View File

@@ -46,7 +46,8 @@ tf32: true
gradient_checkpointing: true
logging_steps: 1
attention: eager # PixtralVisionModel does not support Flash Attention 2.0 yet
flash_attention: false # PixtralVisionModel does not support Flash Attention 2.0 yet
eager_attention:
warmup_ratio: 0.1
evals_per_epoch: 1

View File

@@ -47,7 +47,7 @@ tf32: false
gradient_checkpointing: false
resume_from_checkpoint:
logging_steps: 1
attention: eager
flash_attention:
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -47,7 +47,7 @@ tf32: false
gradient_checkpointing: false
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention:
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -43,8 +43,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -46,8 +46,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -46,7 +46,8 @@ tf32: true
gradient_checkpointing: true
logging_steps: 1
attention: flash
flash_attention: true
eager_attention:
warmup_ratio: 0.1
evals_per_epoch: 1

View File

@@ -49,8 +49,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -47,8 +47,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_ratio: 0.1
evals_per_epoch:

View File

@@ -47,8 +47,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -43,8 +43,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
warmup_ratio: 0.1
evals_per_epoch:

View File

@@ -40,7 +40,7 @@ bf16: auto
tf32: true
resume_from_checkpoint:
logging_steps: 5
attention: flash
flash_attention:
gptq_groupsize:
gptq_model_v1:
warmup_steps: 20

View File

@@ -38,7 +38,7 @@ tf32: true
gradient_checkpointing:
resume_from_checkpoint:
logging_steps: 1
attention: eager
flash_attention:
gptq_groupsize:
gptq_model_v1:
warmup_steps: 20

View File

@@ -44,8 +44,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
flash_attn_cross_entropy: false
flash_attn_rms_norm: true
flash_attn_fuse_qkv: false

View File

@@ -47,8 +47,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
attention: flash
flash_attention: true
flash_attn_cross_entropy: false
flash_attn_rms_norm: true

Some files were not shown because too many files have changed in this diff.