Compare commits
81 Commits
v0.11.0
...
custom-mod
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
08aa74e418 | ||
|
|
dfa14f87ab | ||
|
|
fbe1b504da | ||
|
|
5b8370969c | ||
|
|
22810c97b7 | ||
|
|
2eb7ff95af | ||
|
|
90e5598930 | ||
|
|
1d2aa1e467 | ||
|
|
430be216d8 | ||
|
|
28804b82e4 | ||
|
|
add3e5076b | ||
|
|
41434f0c28 | ||
|
|
f7ea140838 | ||
|
|
460e0f9ed9 | ||
|
|
e80faea0db | ||
|
|
0ff2f172ef | ||
|
|
1407aac779 | ||
|
|
b34c3371ed | ||
|
|
5f1a4306b0 | ||
|
|
93709eb5ce | ||
|
|
208fb7b8e7 | ||
|
|
b86a1d47b0 | ||
|
|
01d8175d48 | ||
|
|
631268a0ca | ||
|
|
3a208cfd84 | ||
|
|
7267edc168 | ||
|
|
dfba881e99 | ||
|
|
d32058e149 | ||
|
|
bc1076d8a2 | ||
|
|
b7e8f66e5a | ||
|
|
e207762928 | ||
|
|
fefb0797ee | ||
|
|
af8d257aa2 | ||
|
|
db5f6f4693 | ||
|
|
8e5f146701 | ||
|
|
31a15a49b6 | ||
|
|
b986f7c7cb | ||
|
|
e5734e5cf0 | ||
|
|
109d9c7442 | ||
|
|
170322a1f0 | ||
|
|
5f5ae76213 | ||
|
|
a798975b7c | ||
|
|
d23f972602 | ||
|
|
8e41317250 | ||
|
|
9f2bb188a4 | ||
|
|
9dde9e1b71 | ||
|
|
f2474ef941 | ||
|
|
8a4bcacdb2 | ||
|
|
d2c3d5a954 | ||
|
|
36cbe13d18 | ||
|
|
2c408b5c5e | ||
|
|
942005f526 | ||
|
|
10ba1622f7 | ||
|
|
d320ef6199 | ||
|
|
354eaaf0d3 | ||
|
|
a061446540 | ||
|
|
cd079b5536 | ||
|
|
5cc16040a8 | ||
|
|
38359a8997 | ||
|
|
7dc3ac6cb3 | ||
|
|
99187cd208 | ||
|
|
aa684122f1 | ||
|
|
ca4d4ef793 | ||
|
|
37edbe4999 | ||
|
|
e581c15d40 | ||
|
|
af92151a7b | ||
|
|
80dc4c261a | ||
|
|
7ccbbd8e77 | ||
|
|
5081db7f8a | ||
|
|
41664c7c4c | ||
|
|
9a8073e73d | ||
|
|
7fb8441e0e | ||
|
|
4dc5910e1c | ||
|
|
fb7bc9250d | ||
|
|
d6e4a611e5 | ||
|
|
eb662557a7 | ||
|
|
03b2a113fe | ||
|
|
9b95a625ab | ||
|
|
c370d0795c | ||
|
|
76aeb16156 | ||
|
|
7c5ea0010f |
41
.axolotl-complete.bash
Normal file
41
.axolotl-complete.bash
Normal file
@@ -0,0 +1,41 @@
|
||||
#!/bin/bash
|
||||
|
||||
_axolotl_completions() {
|
||||
local cur prev
|
||||
COMPREPLY=()
|
||||
cur="${COMP_WORDS[COMP_CWORD]}"
|
||||
prev="${COMP_WORDS[COMP_CWORD-1]}"
|
||||
|
||||
# If we're completing the first argument (the command)
|
||||
if [[ $COMP_CWORD -eq 1 ]]; then
|
||||
mapfile -t COMPREPLY < <(compgen -W "delinearize-llama4 fetch lm-eval merge-sharded-fsdp-weights quantize vllm-serve evaluate inference merge-lora preprocess train" -- "$cur")
|
||||
return 0
|
||||
fi
|
||||
|
||||
# Commands that should complete with directories and YAML files
|
||||
local -a yaml_commands=("merge-sharded-fsdp-weights" "quantize" "vllm-serve" "evaluate" "inference" "merge-lora" "preprocess" "train")
|
||||
|
||||
# Check if previous word is in our list
|
||||
if [[ " ${yaml_commands[*]} " =~ (^|[[:space:]])$prev($|[[:space:]]) ]]; then
|
||||
# Use filename completion which handles directories properly
|
||||
compopt -o filenames
|
||||
mapfile -t COMPREPLY < <(compgen -f -- "$cur")
|
||||
|
||||
# Filter to only include directories and YAML files
|
||||
local -a filtered=()
|
||||
for item in "${COMPREPLY[@]}"; do
|
||||
if [[ -d "$item" ]] || [[ "$item" == *.yaml ]] || [[ "$item" == *.yml ]]; then
|
||||
filtered+=("$item")
|
||||
fi
|
||||
done
|
||||
COMPREPLY=("${filtered[@]}")
|
||||
|
||||
return 0
|
||||
fi
|
||||
|
||||
# Default: no completion
|
||||
return 0
|
||||
}
|
||||
|
||||
# Remove the -o nospace option - let filenames handle it
|
||||
complete -F _axolotl_completions axolotl
|
||||
16
.coderabbit.yaml
Normal file
16
.coderabbit.yaml
Normal file
@@ -0,0 +1,16 @@
|
||||
# yaml-language-server: $schema=https://coderabbit.ai/integrations/schema.v2.json
|
||||
language: "en-US"
|
||||
early_access: false
|
||||
reviews:
|
||||
profile: "chill"
|
||||
request_changes_workflow: false
|
||||
high_level_summary: true
|
||||
review_status: true
|
||||
collapse_walkthrough: true
|
||||
poem: false
|
||||
sequence_diagrams: false
|
||||
auto_review:
|
||||
enabled: true
|
||||
drafts: false
|
||||
chat:
|
||||
auto_reply: true
|
||||
4
.github/workflows/base.yml
vendored
4
.github/workflows/base.yml
vendored
@@ -17,7 +17,7 @@ on:
|
||||
|
||||
jobs:
|
||||
build-base:
|
||||
if: github.repository_owner == 'axolotl-ai-cloud'
|
||||
if: ${{ github.repository_owner == 'axolotl-ai-cloud' && (github.event_name != 'pull_request' || !github.event.pull_request.draft) }}
|
||||
timeout-minutes: 480
|
||||
# this job needs to be run on self-hosted GPU runners...
|
||||
runs-on: ubuntu-latest-m
|
||||
@@ -108,7 +108,7 @@ jobs:
|
||||
PYTORCH_VERSION=${{ matrix.pytorch }}
|
||||
TORCH_CUDA_ARCH_LIST=${{ matrix.torch_cuda_arch_list }}
|
||||
build-base-uv:
|
||||
if: github.repository_owner == 'axolotl-ai-cloud'
|
||||
if: ${{ github.repository_owner == 'axolotl-ai-cloud' && (github.event_name != 'pull_request' || !github.event.pull_request.draft) }}
|
||||
timeout-minutes: 480
|
||||
runs-on: ubuntu-latest-m
|
||||
strategy:
|
||||
|
||||
2
.github/workflows/lint.yml
vendored
2
.github/workflows/lint.yml
vendored
@@ -3,6 +3,7 @@ on:
|
||||
# check on PRs, and manual triggers
|
||||
merge_group:
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened, ready_for_review]
|
||||
paths:
|
||||
- '**.py'
|
||||
- 'requirements.txt'
|
||||
@@ -16,6 +17,7 @@ jobs:
|
||||
pre-commit:
|
||||
name: pre-commit
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ !github.event.pull_request.draft }}
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/setup-python@v5
|
||||
|
||||
2
.github/workflows/main.yml
vendored
2
.github/workflows/main.yml
vendored
@@ -87,7 +87,6 @@ jobs:
|
||||
python_version: "3.11"
|
||||
pytorch: 2.6.0
|
||||
axolotl_extras:
|
||||
is_latest: true
|
||||
- cuda: 126
|
||||
cuda_version: 12.6.3
|
||||
python_version: "3.11"
|
||||
@@ -98,6 +97,7 @@ jobs:
|
||||
python_version: "3.11"
|
||||
pytorch: 2.7.1
|
||||
axolotl_extras:
|
||||
is_latest: true
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
|
||||
11
.github/workflows/multi-gpu-e2e.yml
vendored
11
.github/workflows/multi-gpu-e2e.yml
vendored
@@ -21,7 +21,7 @@ concurrency:
|
||||
|
||||
jobs:
|
||||
test-axolotl-multigpu:
|
||||
if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' }}
|
||||
if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' && (github.event_name != 'pull_request' || !github.event.pull_request.draft) }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
@@ -36,10 +36,17 @@ jobs:
|
||||
- cuda: 126
|
||||
cuda_version: 12.6.3
|
||||
python_version: "3.11"
|
||||
pytorch: 2.7.1
|
||||
pytorch: 2.7.0
|
||||
axolotl_extras:
|
||||
num_gpus: 2
|
||||
nightly_build: "true"
|
||||
- cuda: 126
|
||||
cuda_version: 12.6.3
|
||||
python_version: "3.11"
|
||||
pytorch: 2.7.1
|
||||
axolotl_extras: vllm
|
||||
num_gpus: 2
|
||||
nightly_build: "true"
|
||||
runs-on: [self-hosted, modal]
|
||||
timeout-minutes: 120
|
||||
steps:
|
||||
|
||||
15
.github/workflows/nightlies.yml
vendored
15
.github/workflows/nightlies.yml
vendored
@@ -12,11 +12,16 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- cuda: 124
|
||||
cuda_version: 12.4.1
|
||||
- cuda: 126
|
||||
cuda_version: 12.6.3
|
||||
python_version: "3.11"
|
||||
pytorch: 2.6.0
|
||||
axolotl_extras:
|
||||
- cuda: 126
|
||||
cuda_version: 12.6.3
|
||||
python_version: "3.11"
|
||||
pytorch: 2.7.1
|
||||
axolotl_extras:
|
||||
runs-on: axolotl-gpu-runner
|
||||
steps:
|
||||
- name: Checkout
|
||||
@@ -60,15 +65,15 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- cuda: 124
|
||||
cuda_version: 12.4.1
|
||||
- cuda: 126
|
||||
cuda_version: 12.6.3
|
||||
python_version: "3.11"
|
||||
pytorch: 2.6.0
|
||||
axolotl_extras:
|
||||
- cuda: 126
|
||||
cuda_version: 12.6.3
|
||||
python_version: "3.11"
|
||||
pytorch: 2.6.0
|
||||
pytorch: 2.7.1
|
||||
axolotl_extras:
|
||||
runs-on: axolotl-gpu-runner
|
||||
steps:
|
||||
|
||||
21
.github/workflows/preview-docs.yml
vendored
21
.github/workflows/preview-docs.yml
vendored
@@ -2,7 +2,7 @@ name: Preview
|
||||
on:
|
||||
workflow_dispatch:
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
types: [opened, synchronize, reopened, ready_for_review]
|
||||
|
||||
# Run the workflow only when one of these files changes
|
||||
paths:
|
||||
@@ -25,9 +25,12 @@ permissions:
|
||||
jobs:
|
||||
preview:
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ !github.event.pull_request.draft }}
|
||||
steps:
|
||||
- name: Check out repository
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
ref: ${{ github.event.pull_request.head.sha }}
|
||||
|
||||
- name: Set up Quarto
|
||||
uses: quarto-dev/quarto-actions/setup@v2
|
||||
@@ -50,10 +53,12 @@ jobs:
|
||||
|
||||
- name: Netlify Publish
|
||||
uses: nwtgck/actions-netlify@v3.0
|
||||
if: ${{ secrets.NETLIFY_AUTH_TOKEN != '' }}
|
||||
id: netlify
|
||||
with:
|
||||
publish-dir: './_site'
|
||||
enable-pull-request-comment: true
|
||||
enable-github-deployment: true
|
||||
enable-pull-request-comment: false
|
||||
enable-github-deployment: false
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
deploy-message: "Deployed On Netlify"
|
||||
github-deployment-environment: 'preview'
|
||||
@@ -61,3 +66,13 @@ jobs:
|
||||
env:
|
||||
NETLIFY_AUTH_TOKEN: ${{ secrets.NETLIFY_AUTH_TOKEN }}
|
||||
NETLIFY_SITE_ID: ${{ secrets.NETLIFY_SITE_ID }}
|
||||
|
||||
- name: Update PR with preview link
|
||||
if: ${{ steps.netlify.outcome == 'success' && secrets.NETLIFY_AUTH_TOKEN != '' }}
|
||||
uses: marocchino/sticky-pull-request-comment@v2
|
||||
with:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
message: |
|
||||
📖 **Documentation Preview**: ${{ steps.netlify.outputs.deploy-url }}
|
||||
|
||||
Deployed on Netlify from commit ${{ github.event.pull_request.head.sha }}
|
||||
|
||||
55
.github/workflows/tests-nightly.yml
vendored
55
.github/workflows/tests-nightly.yml
vendored
@@ -52,7 +52,7 @@ jobs:
|
||||
|
||||
- name: Install PyTorch
|
||||
run: |
|
||||
pip3 install torch==${{ matrix.pytorch_version }}
|
||||
pip3 install torch==${{ matrix.pytorch_version }} torchvision
|
||||
|
||||
- name: Update requirements.txt
|
||||
run: |
|
||||
@@ -92,7 +92,7 @@ jobs:
|
||||
if: github.repository_owner == 'axolotl-ai-cloud'
|
||||
# this job needs to be run on self-hosted GPU runners...
|
||||
runs-on: [self-hosted, modal]
|
||||
timeout-minutes: 60
|
||||
timeout-minutes: 120
|
||||
needs: [pre-commit, pytest]
|
||||
|
||||
strategy:
|
||||
@@ -106,6 +106,13 @@ jobs:
|
||||
num_gpus: 1
|
||||
axolotl_extras:
|
||||
nightly_build: "true"
|
||||
- cuda: 126
|
||||
cuda_version: 12.6.3
|
||||
python_version: "3.11"
|
||||
pytorch: 2.7.1
|
||||
num_gpus: 1
|
||||
axolotl_extras:
|
||||
nightly_build: "true"
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
@@ -116,7 +123,7 @@ jobs:
|
||||
- name: Install Modal
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install modal==0.71.8 jinja2
|
||||
pip install modal==1.0.2 jinja2
|
||||
- name: Update env vars
|
||||
run: |
|
||||
echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
|
||||
@@ -130,3 +137,45 @@ jobs:
|
||||
- name: Run tests job on Modal
|
||||
run: |
|
||||
modal run cicd.e2e_tests
|
||||
docker-e2e-multigpu-tests:
|
||||
if: github.repository_owner == 'axolotl-ai-cloud'
|
||||
# this job needs to be run on self-hosted GPU runners...
|
||||
runs-on: [self-hosted, modal]
|
||||
timeout-minutes: 120
|
||||
needs: [pre-commit, pytest, docker-e2e-tests]
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- cuda: 126
|
||||
cuda_version: 12.6.3
|
||||
python_version: "3.11"
|
||||
pytorch: 2.7.1
|
||||
num_gpus: 2
|
||||
axolotl_extras:
|
||||
nightly_build: "true"
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
- name: Install Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.11"
|
||||
- name: Install Modal
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install modal==1.0.2 jinja2
|
||||
- name: Update env vars
|
||||
run: |
|
||||
echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
|
||||
echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
|
||||
echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
|
||||
echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
|
||||
echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
|
||||
echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
|
||||
echo "NIGHTLY_BUILD=${{ matrix.nightly_build }}" >> $GITHUB_ENV
|
||||
echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
|
||||
- name: Run tests job on Modal
|
||||
run: |
|
||||
modal run cicd.multigpu
|
||||
|
||||
13
.github/workflows/tests.yml
vendored
13
.github/workflows/tests.yml
vendored
@@ -13,6 +13,7 @@ on:
|
||||
- 'cicd/cicd.sh'
|
||||
- 'cicd/Dockerfile.jinja'
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened, ready_for_review]
|
||||
paths:
|
||||
- '**.py'
|
||||
- 'requirements.txt'
|
||||
@@ -34,6 +35,7 @@ jobs:
|
||||
pre-commit:
|
||||
name: pre-commit
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ !github.event.pull_request.draft }}
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/setup-python@v5
|
||||
@@ -47,6 +49,7 @@ jobs:
|
||||
pytest:
|
||||
name: PyTest
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ !github.event.pull_request.draft }}
|
||||
# needs: [preload-cache]
|
||||
strategy:
|
||||
fail-fast: false
|
||||
@@ -78,7 +81,7 @@ jobs:
|
||||
|
||||
- name: Install PyTorch
|
||||
run: |
|
||||
pip3 install torch==${{ matrix.pytorch_version }}
|
||||
pip3 install torch==${{ matrix.pytorch_version }} torchvision
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
@@ -121,6 +124,7 @@ jobs:
|
||||
pytest-sdist:
|
||||
name: PyTest from Source Dist
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ !github.event.pull_request.draft }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
@@ -151,7 +155,7 @@ jobs:
|
||||
|
||||
- name: Install PyTorch
|
||||
run: |
|
||||
pip3 install torch==${{ matrix.pytorch_version }}
|
||||
pip3 install torch==${{ matrix.pytorch_version }} torchvision
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
@@ -185,7 +189,7 @@ jobs:
|
||||
|
||||
docker-e2e-tests-1st:
|
||||
# Run this job first as a gate for running the remainder of the test matrix
|
||||
if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' }}
|
||||
if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' && !github.event.pull_request.draft }}
|
||||
# this job needs to be run on self-hosted GPU runners...
|
||||
runs-on: [self-hosted, modal]
|
||||
timeout-minutes: 120
|
||||
@@ -235,7 +239,7 @@ jobs:
|
||||
modal run cicd.e2e_tests
|
||||
|
||||
docker-e2e-tests:
|
||||
if: github.repository_owner == 'axolotl-ai-cloud'
|
||||
if: ${{ github.repository_owner == 'axolotl-ai-cloud' && !github.event.pull_request.draft }}
|
||||
# this job needs to be run on self-hosted GPU runners...
|
||||
runs-on: [self-hosted, modal]
|
||||
timeout-minutes: 120
|
||||
@@ -289,6 +293,7 @@ jobs:
|
||||
runs-on: [self-hosted, modal]
|
||||
timeout-minutes: 90
|
||||
needs: [docker-e2e-tests]
|
||||
if: ${{ !github.event.pull_request.draft }}
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
|
||||
@@ -27,7 +27,7 @@ repos:
|
||||
hooks:
|
||||
- id: pylint
|
||||
- repo: https://github.com/pre-commit/mirrors-mypy
|
||||
rev: v1.16.1
|
||||
rev: v1.17.0
|
||||
hooks:
|
||||
- id: mypy
|
||||
additional_dependencies:
|
||||
|
||||
@@ -119,14 +119,15 @@ datasets:
|
||||
|
||||
## Dataset Processing
|
||||
|
||||
| Option | Default | Description |
|
||||
| ----------------------------- | -------------------------- | --------------------------------- |
|
||||
| `dataset_prepared_path` | `"data/last_run_prepared"` | Path for prepared dataset |
|
||||
| `push_dataset_to_hub` | `""` | Push dataset to HF hub |
|
||||
| `dataset_processes` | `4` | Number of preprocessing processes |
|
||||
| `dataset_keep_in_memory` | `false` | Keep dataset in memory |
|
||||
| `shuffle_merged_datasets` | `true` | Shuffle merged datasets |
|
||||
| `dataset_exact_deduplication` | `true` | Deduplicate datasets |
|
||||
| Option | Default | Description |
|
||||
| --------------------------------- | -------------------------- | ----------------------------------- |
|
||||
| `dataset_prepared_path` | `"data/last_run_prepared"` | Path for prepared dataset |
|
||||
| `push_dataset_to_hub` | `""` | Push dataset to HF hub |
|
||||
| `dataset_processes` | `4` | Number of preprocessing processes |
|
||||
| `dataset_keep_in_memory` | `false` | Keep dataset in memory |
|
||||
| `shuffle_merged_datasets` | `true` | Shuffle merged datasets |
|
||||
| `shuffle_before_merging_datasets` | `false` | Shuffle each dataset before merging |
|
||||
| `dataset_exact_deduplication` | `true` | Deduplicate datasets |
|
||||
|
||||
## LoRA Configuration
|
||||
|
||||
|
||||
@@ -97,7 +97,7 @@
|
||||
# # 'no_input_format' cannot include {input}
|
||||
# no_input_format: "{instruction} "
|
||||
|
||||
# # For `completion` datsets only, uses the provided field instead of `text` column
|
||||
# # For `completion` datasets only, uses the provided field instead of `text` column
|
||||
# field:
|
||||
|
||||
# # Axolotl attempts to save the dataset as an arrow after packing the data together so
|
||||
|
||||
22
README.md
22
README.md
@@ -25,6 +25,8 @@
|
||||
|
||||
## 🎉 Latest Updates
|
||||
|
||||
- 2025/07: Voxtral with mistral-common tokenizer support has been integrated in Axolotl. Read the [docs](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/voxtral)!
|
||||
- 2025/07: TiledMLP support for single-GPU to multi-GPU training with DDP, DeepSpeed and FSDP support has been added to support Arctic Long Sequence Training. (ALST). See [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/alst) for using ALST with Axolotl!
|
||||
- 2025/06: Magistral with mistral-common tokenizer support has been added to Axolotl. See [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/magistral) to start training your own Magistral models with Axolotl!
|
||||
- 2025/05: Quantization Aware Training (QAT) support has been added to Axolotl. Explore the [docs](https://docs.axolotl.ai/docs/qat.html) to learn more!
|
||||
- 2025/04: Llama 4 support has been added in Axolotl. See [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/llama-4) to start training your own Llama 4 models with Axolotl's linearized version!
|
||||
@@ -79,6 +81,20 @@ docker run --gpus '"all"' --rm -it axolotlai/axolotl:main-latest
|
||||
|
||||
Other installation approaches are described [here](https://docs.axolotl.ai/docs/installation.html).
|
||||
|
||||
#### Cloud Providers
|
||||
|
||||
<details>
|
||||
|
||||
- [RunPod](https://runpod.io/gsc?template=v2ickqhz9s&ref=6i7fkpdz)
|
||||
- [Vast.ai](https://cloud.vast.ai?ref_id=62897&template_id=bdd4a49fa8bce926defc99471864cace&utm_source=github&utm_medium=developer_community&utm_campaign=template_launch_axolotl&utm_content=readme)
|
||||
- [PRIME Intellect](https://app.primeintellect.ai/dashboard/create-cluster?image=axolotl&location=Cheapest&security=Cheapest&show_spot=true)
|
||||
- [Modal](https://www.modal.com?utm_source=github&utm_medium=github&utm_campaign=axolotl)
|
||||
- [Novita](https://novita.ai/gpus-console?templateId=311)
|
||||
- [JarvisLabs.ai](https://jarvislabs.ai/templates/axolotl)
|
||||
- [Latitude.sh](https://latitude.sh/blueprint/989e0e79-3bf6-41ea-a46b-1f246e309d5c)
|
||||
|
||||
</details>
|
||||
|
||||
### Your First Fine-tune
|
||||
|
||||
```bash
|
||||
@@ -120,12 +136,6 @@ Contributions are welcome! Please see our [Contributing Guide](https://github.co
|
||||
|
||||
## ❤️ Sponsors
|
||||
|
||||
Thank you to our sponsors who help make Axolotl possible:
|
||||
|
||||
- [Modal](https://www.modal.com?utm_source=github&utm_medium=github&utm_campaign=axolotl) - Modal lets you run
|
||||
jobs in the cloud, by just writing a few lines of Python. Customers use Modal to deploy Gen AI models at large scale,
|
||||
fine-tune large language models, run protein folding simulations, and much more.
|
||||
|
||||
Interested in sponsoring? Contact us at [wing@axolotl.ai](mailto:wing@axolotl.ai)
|
||||
|
||||
## 📜 License
|
||||
|
||||
@@ -268,6 +268,8 @@ website:
|
||||
- docs/batch_vs_grad.qmd
|
||||
- docs/dataset_preprocessing.qmd
|
||||
- docs/multipack.qmd
|
||||
- docs/mixed_precision.qmd
|
||||
- docs/gradient_accumulation.qmd
|
||||
|
||||
- section: "Advanced Features"
|
||||
contents:
|
||||
@@ -276,6 +278,7 @@ website:
|
||||
- docs/torchao.qmd
|
||||
- docs/custom_integrations.qmd
|
||||
- docs/sequence_parallelism.qmd
|
||||
- docs/gradient_checkpointing.qmd
|
||||
|
||||
- section: "Troubleshooting"
|
||||
contents:
|
||||
|
||||
@@ -11,7 +11,7 @@ ENV NIGHTLY_BUILD="{{ NIGHTLY_BUILD }}"
|
||||
ENV HF_HOME="{{ HF_HOME }}"
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev
|
||||
apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev ibverbs-providers ibverbs-utils infiniband-diags librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm
|
||||
|
||||
WORKDIR /workspace
|
||||
|
||||
|
||||
@@ -12,7 +12,7 @@ ENV HF_HOME="{{ HF_HOME }}"
|
||||
ENV AXOLOTL_DATASET_PROCESSES="8"
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev
|
||||
apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev ibverbs-providers ibverbs-utils infiniband-diags librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm
|
||||
|
||||
WORKDIR /workspace
|
||||
|
||||
|
||||
@@ -19,5 +19,7 @@ pytest -v --durations=10 -n1 /workspace/axolotl/tests/e2e/multigpu/patched/ \
|
||||
--cov-append \
|
||||
--cov-report=xml:multigpu-coverage.xml
|
||||
|
||||
# Upload coverage to Codecov
|
||||
codecov upload-process -t "${CODECOV_TOKEN}" -f multigpu-coverage.xml -F multigpu,docker-tests,pytorch-${PYTORCH_VERSION} || true
|
||||
# Upload coverage to Codecov if CODECOV_TOKEN is available
|
||||
if [ -n "$CODECOV_TOKEN" ]; then
|
||||
codecov upload-process -t "${CODECOV_TOKEN}" -f multigpu-coverage.xml -F multigpu,docker-tests,pytorch-${PYTORCH_VERSION} || true
|
||||
fi
|
||||
|
||||
@@ -22,6 +22,7 @@ coverage:
|
||||
only_pulls: true
|
||||
flags: null
|
||||
paths: null
|
||||
informational: true
|
||||
patch:
|
||||
default:
|
||||
# basic
|
||||
|
||||
@@ -7,9 +7,9 @@
|
||||
"reduce_bucket_size": "auto",
|
||||
"stage3_prefetch_bucket_size": "auto",
|
||||
"stage3_param_persistence_threshold": "auto",
|
||||
"stage3_max_live_parameters": 0,
|
||||
"stage3_max_reuse_distance": 0,
|
||||
"stage3_gather_16bit_weights_on_model_save": true
|
||||
"max_live_parameters": 0,
|
||||
"max_reuse_distance": 0,
|
||||
"gather_16bit_weights_on_model_save": true
|
||||
},
|
||||
"bf16": {
|
||||
"enabled": "auto"
|
||||
|
||||
@@ -7,9 +7,9 @@
|
||||
"reduce_bucket_size": "auto",
|
||||
"stage3_prefetch_bucket_size": "auto",
|
||||
"stage3_param_persistence_threshold": "auto",
|
||||
"stage3_max_live_parameters": 0,
|
||||
"stage3_max_reuse_distance": 0,
|
||||
"stage3_gather_16bit_weights_on_model_save": true
|
||||
"max_live_parameters": 0,
|
||||
"max_reuse_distance": 0,
|
||||
"gather_16bit_weights_on_model_save": true
|
||||
},
|
||||
"bf16": {
|
||||
"enabled": true
|
||||
|
||||
@@ -17,9 +17,9 @@
|
||||
"reduce_bucket_size": "auto",
|
||||
"stage3_prefetch_bucket_size": "auto",
|
||||
"stage3_param_persistence_threshold": "auto",
|
||||
"stage3_max_live_parameters": 0,
|
||||
"stage3_max_reuse_distance": 0,
|
||||
"stage3_gather_16bit_weights_on_model_save": true
|
||||
"max_live_parameters": 0,
|
||||
"max_reuse_distance": 0,
|
||||
"gather_16bit_weights_on_model_save": true
|
||||
},
|
||||
"bf16": {
|
||||
"enabled": true
|
||||
|
||||
@@ -13,9 +13,9 @@
|
||||
"reduce_bucket_size": "auto",
|
||||
"stage3_prefetch_bucket_size": "auto",
|
||||
"stage3_param_persistence_threshold": "auto",
|
||||
"stage3_max_live_parameters": 0,
|
||||
"stage3_max_reuse_distance": 0,
|
||||
"stage3_gather_16bit_weights_on_model_save": true
|
||||
"max_live_parameters": 0,
|
||||
"max_reuse_distance": 0,
|
||||
"gather_16bit_weights_on_model_save": true
|
||||
},
|
||||
"bf16": {
|
||||
"enabled": true
|
||||
|
||||
@@ -10,7 +10,9 @@ ARG PYTORCH_VERSION="2.1.2"
|
||||
ENV PYTORCH_VERSION=$PYTORCH_VERSION
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev rsync s3fs
|
||||
apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev rsync s3fs && \
|
||||
rm -rf /var/cache/apt/archives && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
WORKDIR /workspace
|
||||
|
||||
@@ -23,17 +25,17 @@ RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
|
||||
pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
|
||||
else \
|
||||
pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray] $AXOLOTL_ARGS; \
|
||||
fi
|
||||
fi && \
|
||||
python scripts/unsloth_install.py | sh && \
|
||||
python scripts/cutcrossentropy_install.py | sh && \
|
||||
pip install pytest && \
|
||||
pip cache purge
|
||||
|
||||
RUN python scripts/unsloth_install.py | sh
|
||||
RUN python scripts/cutcrossentropy_install.py | sh
|
||||
|
||||
# So we can test the Docker image
|
||||
RUN pip install pytest
|
||||
|
||||
# fix so that git fetch/pull from remote works
|
||||
# fix so that git fetch/pull from remote works with shallow clone
|
||||
RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
|
||||
git config --get remote.origin.fetch
|
||||
git config --get remote.origin.fetch && \
|
||||
git config --global credential.helper store
|
||||
|
||||
# helper for huggingface-login cli
|
||||
RUN git config --global credential.helper store
|
||||
COPY .axolotl-complete.bash /root/.axolotl-complete.bash
|
||||
RUN chmod +x /root/.axolotl-complete.bash && \
|
||||
echo 'source /root/.axolotl-complete.bash' >> ~/.bashrc
|
||||
|
||||
@@ -16,12 +16,16 @@ ENV PYTHON_VERSION=$PYTHON_VERSION
|
||||
ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev pkg-config && rm -rf /var/lib/apt/lists/* \
|
||||
&& apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev pkg-config \
|
||||
&& rm -rf /var/cache/apt/archives \
|
||||
&& rm -rf /var/lib/apt/lists/* \
|
||||
&& wget \
|
||||
https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
|
||||
&& mkdir /root/.conda \
|
||||
&& bash Miniconda3-latest-Linux-x86_64.sh -b \
|
||||
&& rm -f Miniconda3-latest-Linux-x86_64.sh \
|
||||
&& conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main \
|
||||
&& conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r \
|
||||
&& conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}"
|
||||
|
||||
ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
|
||||
@@ -31,12 +35,14 @@ WORKDIR /workspace
|
||||
RUN python3 -m pip install --upgrade pip && pip3 install -U packaging==23.2 setuptools==75.8.0 wheel && \
|
||||
python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} torchvision --extra-index-url https://download.pytorch.org/whl/cu$CUDA && \
|
||||
python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \
|
||||
python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main"
|
||||
python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" && \
|
||||
python3 -m pip cache purge
|
||||
|
||||
RUN git lfs install --skip-repo && \
|
||||
pip3 install awscli && \
|
||||
# The base image ships with `pydantic==1.8.2` which is not working
|
||||
pip3 install -U --no-cache-dir pydantic==1.10.10
|
||||
pip3 install -U --no-cache-dir pydantic==1.10.10 && \
|
||||
pip3 cache purge
|
||||
|
||||
RUN if [ "$PYTORCH_VERSION" = "2.6.0" ] && [ "$CUDA" = "124" ] ; then \
|
||||
FLASH_ATTENTION_FORCE_BUILD="TRUE" pip3 install --no-build-isolation flash-attn==2.8.0.post2; \
|
||||
|
||||
@@ -22,18 +22,22 @@ RUN apt-get update \
|
||||
&& mkdir /root/.conda \
|
||||
&& bash Miniconda3-latest-Linux-x86_64.sh -b \
|
||||
&& rm -f Miniconda3-latest-Linux-x86_64.sh \
|
||||
&& conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main \
|
||||
&& conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r \
|
||||
&& conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}"
|
||||
|
||||
ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
|
||||
|
||||
WORKDIR /workspace
|
||||
|
||||
RUN python3 -m pip install --upgrade pip && pip3 install packaging && \
|
||||
RUN python3 -m pip install --upgrade pip && pip3 install -U packaging==23.2 setuptools==75.8.0 wheel && \
|
||||
python3 -m pip install --no-cache-dir -U torch --extra-index-url https://download.pytorch.org/whl/nightly/cu$CUDA && \
|
||||
python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \
|
||||
python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main"
|
||||
python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" && \
|
||||
python3 -m pip cache purge
|
||||
|
||||
RUN git lfs install --skip-repo && \
|
||||
pip3 install awscli && \
|
||||
# The base image ships with `pydantic==1.8.2` which is not working
|
||||
pip3 install -U --no-cache-dir pydantic==1.10.10
|
||||
pip3 install -U --no-cache-dir pydantic==1.10.10 && \
|
||||
pip3 cache purge
|
||||
|
||||
@@ -14,7 +14,10 @@ COPY scripts/motd /etc/motd
|
||||
|
||||
RUN pip install jupyterlab notebook ipywidgets && \
|
||||
jupyter lab clean
|
||||
RUN apt install --yes --no-install-recommends openssh-server tmux iproute2 nvtop && \
|
||||
RUN apt update && \
|
||||
apt install --yes --no-install-recommends openssh-server tmux iproute2 nvtop ibverbs-providers ibverbs-utils infiniband-diags librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm && \
|
||||
rm -rf /var/cache/apt/archives && \
|
||||
rm -rf /var/lib/apt/lists/* && \
|
||||
mkdir -p ~/.ssh && \
|
||||
chmod 700 ~/.ssh && \
|
||||
printf "\n[[ -z \"\$TMUX\" ]] && { tmux attach-session -t ssh_tmux || tmux new-session -s ssh_tmux; exit; }\n" >> ~/.bashrc && \
|
||||
|
||||
@@ -9,13 +9,15 @@ ENV HF_HUB_ENABLE_HF_TRANSFER="1"
|
||||
EXPOSE 8888
|
||||
EXPOSE 22
|
||||
|
||||
COPY scripts/cloud-entrypoint-term.sh /root/cloud-entrypoint.sh
|
||||
COPY scripts/cloud-entrypoint.sh /root/cloud-entrypoint.sh
|
||||
COPY scripts/motd /etc/motd
|
||||
|
||||
RUN pip install jupyterlab notebook ipywidgets && \
|
||||
jupyter lab clean
|
||||
RUN apt install --yes --no-install-recommends openssh-server tmux sudo && \
|
||||
pip3 install -U --no-cache-dir grpcio ray[default]==2.9.3 && \
|
||||
RUN apt update && \
|
||||
apt install --yes --no-install-recommends openssh-server tmux iproute2 nvtop ibverbs-providers ibverbs-utils infiniband-diags librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm && \
|
||||
rm -rf /var/cache/apt/archives && \
|
||||
rm -rf /var/lib/apt/lists/* && \
|
||||
mkdir -p ~/.ssh && \
|
||||
chmod 700 ~/.ssh && \
|
||||
printf "[ ! -z \"\$TERM\" -a -r /etc/motd ] && cat /etc/motd\n" >> ~/.bashrc && \
|
||||
|
||||
@@ -187,6 +187,7 @@ Instead of passing `tools` via the system prompt, an alternative method would be
|
||||
"role": "assistant", // call the function via assistant
|
||||
"tool_calls": [
|
||||
{
|
||||
"id": "...", // required only for mistral
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "...",
|
||||
@@ -199,6 +200,7 @@ Instead of passing `tools` via the system prompt, an alternative method would be
|
||||
},
|
||||
{
|
||||
"role": "tool",
|
||||
"tool_call_id": "...", // required only for mistral
|
||||
"name": "...",
|
||||
"content": "..."
|
||||
},
|
||||
|
||||
@@ -34,6 +34,7 @@ Tags examples:
|
||||
|
||||
- `main-base-py3.11-cu128-2.7.1`
|
||||
- `main-base-py3.11-cu126-2.7.1`
|
||||
- `main-base-py3.11-cu126-2.7.0`
|
||||
- `main-base-py3.11-cu126-2.6.0`
|
||||
- `main-base-py3.11-cu124-2.6.0`
|
||||
|
||||
@@ -75,6 +76,7 @@ Tags examples:
|
||||
|
||||
- `main-py3.11-cu128-2.7.1`
|
||||
- `main-py3.11-cu126-2.7.1`
|
||||
- `main-py3.11-cu126-2.7.0`
|
||||
- `main-py3.11-cu126-2.6.0`
|
||||
- `main-py3.11-cu124-2.6.0`
|
||||
- `main-latest`
|
||||
|
||||
@@ -136,3 +136,7 @@ description: Frequently asked questions
|
||||
> dynamic: false
|
||||
> mode: max-autotune-no-cudagraphs
|
||||
> ```
|
||||
|
||||
**Q: `ValueError("Backward pass should have cleared tracker of all tensors")`
|
||||
|
||||
> A: This may happen due to edge cases in using the modern OffloadActivations context manager for CUDA streams. If you encounter this error, you may have success using the naive implementation with `offload_activations: legacy` in your YAML.
|
||||
|
||||
29
docs/gradient_checkpointing.qmd
Normal file
29
docs/gradient_checkpointing.qmd
Normal file
@@ -0,0 +1,29 @@
|
||||
---
|
||||
title: Gradient Checkpointing and Activation Offloading
|
||||
---
|
||||
|
||||
Gradient checkpointing and activation offloading are techniques used to optimize the performance of deep learning
|
||||
models by reducing the memory footprint and improving computational efficiency.
|
||||
|
||||
### Enabling Gradient Checkpointing
|
||||
|
||||
```yaml
|
||||
gradient_checkpointing: true
|
||||
```
|
||||
|
||||
### Enabling Activation Offloading
|
||||
|
||||
```yaml
|
||||
gradient_checkpointing: true # required for activation offloading
|
||||
activation_offloading: true
|
||||
```
|
||||
|
||||
Activation offloading variants:
|
||||
|
||||
The default `activation_offloading: true` offloads activations to CPU and uses CUDA streams
|
||||
to overlap the communications and computations when offloading.
|
||||
|
||||
The `activation_offloading: legacy` naively offloads activations to CPU and without additional optimizations.
|
||||
|
||||
For resource constrained environments with limited CPU memory, `activation_offloading: disk` offloads
|
||||
activations to disk instead of CPU RAM so that much larger context lengths can be trained with minimal memory.
|
||||
@@ -124,10 +124,13 @@ For providers supporting Docker:
|
||||
|
||||
- Use `axolotlai/axolotl-cloud:main-latest`
|
||||
- Available on:
|
||||
- [Latitude.sh](https://latitude.sh/blueprint/989e0e79-3bf6-41ea-a46b-1f246e309d5c)
|
||||
- [JarvisLabs.ai](https://jarvislabs.ai/templates/axolotl)
|
||||
- [RunPod](https://runpod.io/gsc?template=v2ickqhz9s&ref=6i7fkpdz)
|
||||
- [Novita](https://novita.ai/gpus-console?templateId=311)
|
||||
- [RunPod](https://runpod.io/gsc?template=v2ickqhz9s&ref=6i7fkpdz)
|
||||
- [Vast.ai](https://cloud.vast.ai?ref_id=62897&template_id=bdd4a49fa8bce926defc99471864cace&utm_source=axolotl&utm_medium=partner&utm_campaign=template_launch_july2025&utm_content=docs_link)
|
||||
- [PRIME Intellect](https://app.primeintellect.ai/dashboard/create-cluster?image=axolotl&location=Cheapest&security=Cheapest&show_spot=true)
|
||||
- [Modal](https://www.modal.com?utm_source=github&utm_medium=github&utm_campaign=axolotl)
|
||||
- [Novita](https://novita.ai/gpus-console?templateId=311)
|
||||
- [JarvisLabs.ai](https://jarvislabs.ai/templates/axolotl)
|
||||
- [Latitude.sh](https://latitude.sh/blueprint/989e0e79-3bf6-41ea-a46b-1f246e309d5c)
|
||||
|
||||
### Google Colab {#sec-colab}
|
||||
|
||||
|
||||
149
docs/mixed_precision.qmd
Normal file
149
docs/mixed_precision.qmd
Normal file
@@ -0,0 +1,149 @@
|
||||
---
|
||||
title: "Mixed Precision Training"
|
||||
format:
|
||||
html:
|
||||
toc: true
|
||||
toc-depth: 3
|
||||
number-sections: true
|
||||
code-tools: true
|
||||
execute:
|
||||
enabled: false
|
||||
---
|
||||
|
||||
Mixed precision training uses lower precision data types to reduce memory usage and increase training speed while maintaining model quality. Axolotl supports several mixed precision formats:
|
||||
|
||||
- **FP16** - Half precision 16-bit (Pascal generation+)
|
||||
- **BF16** - Brain Float 16-bit (Ampere generation+)
|
||||
- **FP8** - 8-bit floating point (Hopper generation+)
|
||||
|
||||
## FP16 Mixed Precision {#sec-fp16}
|
||||
|
||||
### Overview {#sec-fp16-overview}
|
||||
|
||||
FP16 is the traditional half-precision format, supported on older GPUs but can be less numerically stable than BF16.
|
||||
|
||||
### Configuration {#sec-fp16-config}
|
||||
|
||||
```{.yaml}
|
||||
fp16: true
|
||||
```
|
||||
|
||||
### FP16 Considerations {#sec-fp16-considerations}
|
||||
|
||||
- May require gradient scaling to prevent underflow
|
||||
- Less numerically stable than BF16
|
||||
- Can cause training instability with some model architectures
|
||||
- Consider using BF16 if your hardware supports it
|
||||
|
||||
## BF16 Mixed Precision {#sec-bf16}
|
||||
|
||||
### Overview {#sec-bf16-overview}
|
||||
|
||||
BF16 (Brain Float 16) offers better numerical stability than FP16 and is the recommended mixed precision format for modern GPUs. It provides the same dynamic range as FP32 while using half the memory.
|
||||
|
||||
### Configuration {#sec-bf16-config}
|
||||
|
||||
```{.yaml}
|
||||
# Automatic BF16 detection (recommended)
|
||||
bf16: auto
|
||||
|
||||
# Or explicitly enable
|
||||
bf16: true
|
||||
|
||||
# For evaluation with BF16
|
||||
bf16: full # Equivalent to bf16_full_eval in the HF trainer
|
||||
```
|
||||
|
||||
## FP8 Mixed Precision {#sec-fp8}
|
||||
|
||||
::: {.callout-note}
|
||||
FP8 support is experimental and requires compatible hardware (H100, H200) and recent PyTorch versions with TorchAO.
|
||||
:::
|
||||
|
||||
### What is FP8? {#sec-fp8-overview}
|
||||
|
||||
FP8 (8-bit floating point) can provide significant time savings compared to FP16/BF16 while maintaining training stability. Axolotl's implementation uses PyTorch's TorchAO library with "tensorwise" scaling strategy.
|
||||
|
||||
### Requirements {#sec-fp8-software}
|
||||
|
||||
- Hopper+ GPUs (H100/H200)
|
||||
- PyTorch 2.7+ (+ compatible TorchAO version)
|
||||
- CUDA 12.4+
|
||||
|
||||
### Configuration {#sec-fp8-config}
|
||||
|
||||
Add to your YAML config:
|
||||
|
||||
```{.yaml}
|
||||
# Enable FP8 mixed precision
|
||||
fp8: true
|
||||
|
||||
# Optional: Enable FP8 for FSDP all-gather operations
|
||||
fp8_enable_fsdp_float8_all_gather: true
|
||||
|
||||
# Enable torch.compile (almost always necessary for FP8 speedups)
|
||||
torch_compile: true
|
||||
```
|
||||
|
||||
::: {.callout-important}
|
||||
**torch.compile is critical for FP8 performance**
|
||||
|
||||
FP8 training requires `torch_compile: true` to see meaningful speedups. Without compilation, FP8 may actually be slower and use more memory than FP16/BF16.
|
||||
:::
|
||||
|
||||
### Advanced FP8 Configs {#sec-fp8-advanced}
|
||||
|
||||
For [FSDP](multi-gpu.qmd#sec-fsdp) (Fully Sharded Data Parallel) training:
|
||||
|
||||
```{.yaml}
|
||||
fp8: true
|
||||
fp8_enable_fsdp_float8_all_gather: true
|
||||
|
||||
torch_compile: true
|
||||
|
||||
# FSDP configuration
|
||||
fsdp_version: 2
|
||||
fsdp_config:
|
||||
offload_params: false
|
||||
cpu_ram_efficient_loading: true
|
||||
auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||
transformer_layer_cls_to_wrap: LlamaDecoderLayer
|
||||
state_dict_type: FULL_STATE_DICT
|
||||
reshard_after_forward: true
|
||||
```
|
||||
|
||||
## Best Practices {#sec-best-practices}
|
||||
|
||||
### Choosing Precision Format {#sec-choosing-format}
|
||||
|
||||
- **Start with automatic detection**: `bf16: auto`
|
||||
- **For Hopper+ (H100/H200)**: Try FP8 + torch.compile for maximum speed
|
||||
- **For Ampere (A100/RTX 30/40)**: Use BF16
|
||||
- **For older Pascal/Turing GPUs**: Use FP16 with caution
|
||||
- **For very old or unsupported GPUs**: Use FP32
|
||||
|
||||
### Validation and Testing {#sec-validation}
|
||||
|
||||
Always validate your mixed precision setup:
|
||||
|
||||
- **Start with a small dataset** to verify stability
|
||||
- **Monitor loss curves** for irregularities
|
||||
- **Compare with FP32 baseline** when possible
|
||||
- **Test evaluation metrics** match expectations
|
||||
|
||||
### FP8 Particulars {#sec-fp8-details}
|
||||
|
||||
- Use cases
|
||||
- Single GPU training
|
||||
- Multi GPU training with FSDP2 or Deepspeed
|
||||
- Speedups
|
||||
- Please refer to the [TorchAO FP8 training benchmarks](https://github.com/pytorch/ao/tree/main/torchao/float8#rowwise-scaling) for expected matmul speedups for different (M, K, N) settings
|
||||
- Concrete number for LLaMA 3 8B training can be found [here](https://github.com/pytorch/ao/tree/main/torchao/float8#training-benchmarks)
|
||||
- Known issues:
|
||||
- FP8 + DDP + `torch.compile` (causes [error](https://gist.github.com/djsaunde/0c1664c32e44a64d31b5e01b4aafe5c4))
|
||||
- FP8 + FSDP2 + `torch.compile` + FSDP2 activation checkpointing tends to be _slower_ than the BF16 equivalent training
|
||||
- Flash Attention 2 does not play nicely with `torch.compile`
|
||||
|
||||
See `examples/llama-3/3b-fp8-fsdp2.yaml` for an optimized example config. Enabling FP8 mixed precision + FP8 all-gather training results in ~10% faster iterations per second vs. BF16 for a relatively small (3B param) model
|
||||
|
||||
For more information on multi-GPU training, see our [Multi-GPU guide](multi-gpu.qmd).
|
||||
@@ -23,8 +23,6 @@ Axolotl supports several methods for multi-GPU training:
|
||||
|
||||
## DeepSpeed {#sec-deepspeed}
|
||||
|
||||
DeepSpeed is the recommended approach for multi-GPU training due to its stability and performance. It provides various optimization levels through ZeRO stages.
|
||||
|
||||
### Configuration {#sec-deepspeed-config}
|
||||
|
||||
Add to your YAML config:
|
||||
@@ -32,7 +30,6 @@ Add to your YAML config:
|
||||
```{.yaml}
|
||||
deepspeed: deepspeed_configs/zero1.json
|
||||
```
|
||||
|
||||
### Usage {#sec-deepspeed-usage}
|
||||
|
||||
```{.bash}
|
||||
@@ -66,9 +63,75 @@ Start from Stage 1 -> Stage 2 -> Stage 3.
|
||||
|
||||
:::
|
||||
|
||||
## FSDP {#sec-fsdp}
|
||||
::: {.callout-tip}
|
||||
|
||||
### Basic FSDP Configuration {#sec-fsdp-config}
|
||||
Using ZeRO Stage 3 with Single-GPU training
|
||||
|
||||
ZeRO Stage 3 can be used for training on a single GPU by manually setting the environment variables:
|
||||
`WORLD_SIZE=1 LOCAL_RANK=0 MASTER_ADDR=0.0.0.0 MASTER_PORT=29500`
|
||||
|
||||
:::
|
||||
|
||||
## Fully Sharded Data Parallel (FSDP) {#sec-fsdp}
|
||||
|
||||
::: {.callout-note}
|
||||
|
||||
FSDP2 is recommended for new users. FSDP1 is deprecated and will be removed in an upcoming release of Axolotl.
|
||||
|
||||
:::
|
||||
|
||||
### Migrating from FSDP1 to FSDP2 {#sec-migrate-fsdp1-fsdp2}
|
||||
|
||||
To migrate your config from FSDP1 to FSDP2, you must use the `fsdp_version` top-level config field to specify the FSDP version, and
|
||||
also follow the config field mapping below to update field names.
|
||||
|
||||
#### Config mapping
|
||||
|
||||
FSDP1 | FSDP2
|
||||
-------- | --------
|
||||
fsdp_sharding_strategy | reshard_after_forward
|
||||
fsdp_backward_prefetch_policy | **REMOVED**
|
||||
fsdp_backward_prefetch | **REMOVED**
|
||||
fsdp_forward_prefetch | **REMOVED**
|
||||
fsdp_sync_module_states | **REMOVED**
|
||||
fsdp_cpu_ram_efficient_loading | cpu_ram_efficient_loading
|
||||
fsdp_state_dict_type | state_dict_type
|
||||
fsdp_use_orig_params | **REMOVED**
|
||||
|
||||
For more details, please see the migration guide in the [torchtitan repo](https://github.com/pytorch/torchtitan/blob/main/docs/fsdp.md). In Axolotl,
|
||||
if you were using the following FSDP1 config:
|
||||
|
||||
```{.yaml}
|
||||
fsdp_version: 1
|
||||
fsdp_config:
|
||||
fsdp_offload_params: false
|
||||
fsdp_cpu_ram_efficient_loading: true
|
||||
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||
fsdp_transformer_layer_cls_to_wrap: Qwen3DecoderLayer
|
||||
fsdp_state_dict_type: FULL_STATE_DICT
|
||||
fsdp_sharding_strategy: FULL_SHARD
|
||||
```
|
||||
|
||||
You can migrate to the following FSDP2 config:
|
||||
|
||||
```{.yaml}
|
||||
fsdp_version: 2
|
||||
fsdp_config:
|
||||
offload_params: false
|
||||
cpu_ram_efficient_loading: true
|
||||
auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||
transformer_layer_cls_to_wrap: Qwen3DecoderLayer
|
||||
state_dict_type: FULL_STATE_DICT
|
||||
reshard_after_forward: true
|
||||
```
|
||||
|
||||
### FSDP1 (deprecated) {#sec-fsdp-config}
|
||||
|
||||
::: {.callout-note}
|
||||
|
||||
Using `fsdp` to configure FSDP is deprecated and will be removed in an upcoming release of Axolotl. Please use `fsdp_config` as above instead.
|
||||
|
||||
:::
|
||||
|
||||
```{.yaml}
|
||||
fsdp:
|
||||
@@ -80,6 +143,7 @@ fsdp_config:
|
||||
fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
|
||||
```
|
||||
|
||||
|
||||
## Sequence parallelism {#sec-sequence-parallelism}
|
||||
|
||||
We support sequence parallelism (SP) via the
|
||||
|
||||
@@ -40,13 +40,13 @@ use_cpu: false
|
||||
|
||||
Configure your model to use FSDP in the Axolotl yaml. For example:
|
||||
```yaml
|
||||
fsdp:
|
||||
- full_shard
|
||||
- auto_wrap
|
||||
fsdp_version: 2
|
||||
fsdp_config:
|
||||
fsdp_offload_params: true
|
||||
fsdp_state_dict_type: FULL_STATE_DICT
|
||||
fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
|
||||
offload_params: true
|
||||
state_dict_type: FULL_STATE_DICT
|
||||
auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||
transformer_layer_cls_to_wrap: LlamaDecoderLayer
|
||||
reshard_after_forward: true
|
||||
```
|
||||
|
||||
All you have to do now is launch using accelerate as you would usually do on each machine and voila, the processes will start once you have launched accelerate on every machine.
|
||||
|
||||
@@ -14,6 +14,7 @@ format:
|
||||
- [Llava-1.5](#sec-llava-15)
|
||||
- [Mistral-Small-3.1](#sec-mistral-small-31)
|
||||
- [Gemma-3](#sec-gemma-3)
|
||||
- [Gemma-3n](#sec-gemma-3n)
|
||||
- [Qwen2-VL](#sec-qwen2-vl)
|
||||
- [Qwen2.5-VL](#sec-qwen25-vl)
|
||||
|
||||
@@ -110,6 +111,22 @@ base_model: google/gemma-3-4b-it
|
||||
chat_template: gemma3
|
||||
```
|
||||
|
||||
### Gemma-3n {#sec-gemma-3n}
|
||||
|
||||
::: {.callout-warning}
|
||||
The model's initial loss and grad norm will be very high. We suspect this to be due to the Conv in the vision layers.
|
||||
:::
|
||||
|
||||
::: {.callout-tip}
|
||||
Please make sure to install `timm` via `pip3 install timm==1.0.17`
|
||||
:::
|
||||
|
||||
```yaml
|
||||
base_model: google/gemma-3n-E2B-it
|
||||
|
||||
chat_template: gemma3n
|
||||
```
|
||||
|
||||
### Qwen2-VL {#sec-qwen2-vl}
|
||||
|
||||
```yaml
|
||||
@@ -132,7 +149,9 @@ For multi-modal datasets, we adopt an extended `chat_template` format similar to
|
||||
|
||||
- A message is a list of `role` and `content`.
|
||||
- `role` can be `system`, `user`, `assistant`, etc.
|
||||
- `content` is a list of `type` and (`text` or `image` or `path` or `url` or `base64`).
|
||||
- `content` is a list of `type` and (`text`, `image`, `path`, `url`, `base64`, or `audio`).
|
||||
|
||||
### Image
|
||||
|
||||
::: {.callout-note}
|
||||
For backwards compatibility:
|
||||
@@ -141,15 +160,29 @@ For backwards compatibility:
|
||||
- If `content` is a string, it will be converted to a list with `type` as `text`.
|
||||
:::
|
||||
|
||||
::: {.callout-tip}
|
||||
For image loading, you can use the following keys within `content` alongside `"type": "image"`:
|
||||
|
||||
- `"path": "/path/to/image.jpg"`
|
||||
- `"url": "https://example.com/image.jpg"`
|
||||
- `"base64": "..."`
|
||||
- `"image": PIL.Image`
|
||||
|
||||
### Audio
|
||||
|
||||
For audio loading, you can use the following keys within `content` alongside `"type": "audio"`:
|
||||
|
||||
- `"path": "/path/to/audio.mp3"`
|
||||
- `"url": "https://example.com/audio.mp3"`
|
||||
- `"audio": np.ndarray`
|
||||
|
||||
::: {.callout-tip}
|
||||
|
||||
You may need to install `librosa` via `pip3 install librosa==0.11.0`.
|
||||
|
||||
:::
|
||||
|
||||
### Example
|
||||
|
||||
Here is an example of a multi-modal dataset:
|
||||
```json
|
||||
[
|
||||
@@ -178,3 +211,9 @@ Here is an example of a multi-modal dataset:
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
## FAQ
|
||||
|
||||
1. `PIL.UnidentifiedImageError: cannot identify image file ...`
|
||||
|
||||
`PIL` could not retrieve the file at `url` using `requests`. Please check for typo. One alternative reason is that the request is blocked by the server.
|
||||
|
||||
@@ -17,7 +17,6 @@ feedback. Various methods include, but not limited to:
|
||||
- [Kahneman-Tversky Optimization (KTO)](#kto)
|
||||
- [Odds Ratio Preference Optimization (ORPO)](#orpo)
|
||||
- [Group Relative Policy Optimization (GRPO)](#grpo)
|
||||
- Proximal Policy Optimization (PPO) (not yet supported in axolotl, if you're interested in contributing, please reach out!)
|
||||
|
||||
|
||||
## RLHF using Axolotl
|
||||
@@ -275,15 +274,14 @@ rl: dpo
|
||||
datasets:
|
||||
- path: ...
|
||||
split: train
|
||||
type: user_defined.default
|
||||
|
||||
field_prompt: "prompt"
|
||||
field_system: "system"
|
||||
field_chosen: "chosen"
|
||||
field_rejected: "rejected"
|
||||
prompt_format: "{prompt}"
|
||||
chosen_format: "{chosen}"
|
||||
rejected_format: "{rejected}"
|
||||
type:
|
||||
field_prompt: "prompt"
|
||||
field_system: "system"
|
||||
field_chosen: "chosen"
|
||||
field_rejected: "rejected"
|
||||
prompt_format: "{prompt}"
|
||||
chosen_format: "{chosen}"
|
||||
rejected_format: "{rejected}"
|
||||
```
|
||||
|
||||
The input format is a simple JSON input with customizable fields based on the above config.
|
||||
@@ -476,14 +474,13 @@ rl: kto
|
||||
datasets:
|
||||
- path: ...
|
||||
split: train
|
||||
type: user_defined.default
|
||||
|
||||
field_prompt: "prompt"
|
||||
field_system: "system"
|
||||
field_completion: "completion"
|
||||
field_label: "label"
|
||||
prompt_format: "{prompt}"
|
||||
completion_format: "{completion}"
|
||||
type:
|
||||
field_prompt: "prompt"
|
||||
field_system: "system"
|
||||
field_completion: "completion"
|
||||
field_label: "label"
|
||||
prompt_format: "{prompt}"
|
||||
completion_format: "{completion}"
|
||||
```
|
||||
|
||||
The input format is a simple JSON input with customizable fields based on the above config.
|
||||
|
||||
9
examples/alst/README.md
Normal file
9
examples/alst/README.md
Normal file
@@ -0,0 +1,9 @@
|
||||
# Arctic Long Sequence Training (ALST)
|
||||
|
||||
Artic Long Sequence Training (ALST) is a technique for training long context models using a variety of optimization
|
||||
techniques. It is a combination of:
|
||||
- TiledMLP: Leverage tiling over the sequence dimension on MLP layers to reduce memory usage
|
||||
- Tiled Loss: Using optimized loss functions like Liger-Kernel or Cut Cross Entropy to reduce memory usage
|
||||
- Activation Offloading: Offload activations to CPU RAM to reduce memory usage
|
||||
|
||||
For more information, you can check out the ALST paper [here](https://www.arxiv.org/abs/2506.13996).
|
||||
53
examples/alst/llama3-8b-deepspeed-alst.yaml
Normal file
53
examples/alst/llama3-8b-deepspeed-alst.yaml
Normal file
@@ -0,0 +1,53 @@
|
||||
base_model: meta-llama/Llama-3.1-8B
|
||||
# Automatically upload checkpoint and final model to HF
|
||||
# hub_model_id: username/custom_model_name
|
||||
|
||||
datasets:
|
||||
- path: togethercomputer/Long-Data-Collections
|
||||
type: completion
|
||||
field: text
|
||||
data_files:
|
||||
- pretrain/rp_sub.jsonl.zst
|
||||
- path: princeton-nlp/TextbookChapters
|
||||
type: completion
|
||||
field: chapter
|
||||
dataset_prepared_path: last_run_prepared
|
||||
val_set_size: 0.0
|
||||
output_dir: ./outputs/out
|
||||
|
||||
sequence_len: 500_000
|
||||
min_sample_len: 200_000
|
||||
sample_packing: true
|
||||
|
||||
tiled_mlp: true
|
||||
sequence_parallel_degree: 8
|
||||
plugins:
|
||||
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
|
||||
|
||||
gradient_accumulation_steps: 1
|
||||
micro_batch_size: 1
|
||||
num_epochs: 1
|
||||
optimizer: adamw_torch_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 2e-5
|
||||
|
||||
bf16: auto
|
||||
tf32: true
|
||||
|
||||
gradient_checkpointing: true
|
||||
activation_offloading: legacy
|
||||
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 100
|
||||
saves_per_epoch: 1
|
||||
evals_per_epoch: 2
|
||||
weight_decay: 0.0
|
||||
special_tokens:
|
||||
pad_token: <|end_of_text|>
|
||||
|
||||
deepspeed: deepspeed_configs/zero3_bf16_cpuoffload_all.json
|
||||
|
||||
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||
59
examples/alst/llama3-8b-fsdp2-alst.yaml
Normal file
59
examples/alst/llama3-8b-fsdp2-alst.yaml
Normal file
@@ -0,0 +1,59 @@
|
||||
base_model: meta-llama/Llama-3.1-8B
|
||||
# Automatically upload checkpoint and final model to HF
|
||||
# hub_model_id: username/custom_model_name
|
||||
|
||||
datasets:
|
||||
- path: togethercomputer/Long-Data-Collections
|
||||
type: completion
|
||||
field: text
|
||||
data_files:
|
||||
- pretrain/rp_sub.jsonl.zst
|
||||
- path: princeton-nlp/TextbookChapters
|
||||
type: completion
|
||||
field: chapter
|
||||
dataset_prepared_path: last_run_prepared
|
||||
val_set_size: 0.0
|
||||
output_dir: ./outputs/out
|
||||
|
||||
sequence_len: 500_000
|
||||
min_sample_len: 200_000
|
||||
sample_packing: true
|
||||
|
||||
tiled_mlp: true
|
||||
context_parallel_size: 8
|
||||
plugins:
|
||||
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
|
||||
|
||||
gradient_accumulation_steps: 1
|
||||
micro_batch_size: 1
|
||||
num_epochs: 1
|
||||
optimizer: adamw_torch_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 2e-5
|
||||
|
||||
bf16: auto
|
||||
tf32: true
|
||||
|
||||
gradient_checkpointing: true
|
||||
activation_offloading: legacy
|
||||
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 100
|
||||
saves_per_epoch: 1
|
||||
evals_per_epoch: 2
|
||||
weight_decay: 0.0
|
||||
special_tokens:
|
||||
pad_token: <|end_of_text|>
|
||||
|
||||
fsdp_version: 2
|
||||
fsdp_config:
|
||||
offload_params: false # offloading is currently not compatible with SP + torchao optimizer
|
||||
state_dict_type: SHARDED_STATE_DICT
|
||||
auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||
transformer_layer_cls_to_wrap: LlamaDecoderLayer
|
||||
reshard_after_forward: true
|
||||
|
||||
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||
5
examples/archived/README.md
Normal file
5
examples/archived/README.md
Normal file
@@ -0,0 +1,5 @@
|
||||
# Archived Examples
|
||||
|
||||
This directory contains examples that are no longer maintained and may no longer be functional.
|
||||
|
||||
We keep them around for archival purposes in case they are useful to others.
|
||||
@@ -66,7 +66,7 @@ flash_optimum:
|
||||
gptq_groupsize:
|
||||
gptq_model_v1:
|
||||
|
||||
warmup_steps: 32
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
save_total_limit:
|
||||
@@ -43,7 +43,7 @@ xformers_attention: true
|
||||
flash_attention:
|
||||
gptq_groupsize:
|
||||
gptq_model_v1:
|
||||
warmup_steps: 10
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
weight_decay: 0.1
|
||||
@@ -17,7 +17,7 @@ output_dir: ./outputs/lora-out
|
||||
|
||||
sequence_len: 4096
|
||||
sample_packing: true
|
||||
pad_to_sequence_len: true
|
||||
|
||||
|
||||
adapter: lora
|
||||
lora_model_dir:
|
||||
@@ -47,7 +47,7 @@ resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 10
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
weight_decay: 0.0
|
||||
@@ -20,7 +20,7 @@ lora_model_dir:
|
||||
|
||||
sequence_len: 4096
|
||||
sample_packing: true
|
||||
pad_to_sequence_len: true
|
||||
|
||||
|
||||
lora_r: 32
|
||||
lora_alpha: 16
|
||||
@@ -48,7 +48,7 @@ resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 10
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
weight_decay: 0.0
|
||||
@@ -17,7 +17,7 @@ output_dir: ./outputs/lora-out
|
||||
|
||||
sequence_len: 4096
|
||||
sample_packing: true
|
||||
pad_to_sequence_len: true
|
||||
|
||||
|
||||
adapter: lora
|
||||
lora_model_dir:
|
||||
@@ -47,7 +47,7 @@ resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 10
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
weight_decay: 0.0
|
||||
@@ -20,7 +20,7 @@ lora_model_dir:
|
||||
|
||||
sequence_len: 4096
|
||||
sample_packing: true
|
||||
pad_to_sequence_len: true
|
||||
|
||||
|
||||
lora_r: 32
|
||||
lora_alpha: 16
|
||||
@@ -48,7 +48,7 @@ resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 10
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
weight_decay: 0.0
|
||||
@@ -17,7 +17,7 @@ output_dir: ./outputs/lora-out
|
||||
|
||||
sequence_len: 4096
|
||||
sample_packing: true
|
||||
pad_to_sequence_len: true
|
||||
|
||||
|
||||
adapter: lora
|
||||
lora_model_dir:
|
||||
@@ -47,7 +47,7 @@ resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 10
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
weight_decay: 0.0
|
||||
@@ -20,7 +20,7 @@ lora_model_dir:
|
||||
|
||||
sequence_len: 4096
|
||||
sample_packing: true
|
||||
pad_to_sequence_len: true
|
||||
|
||||
|
||||
lora_r: 32
|
||||
lora_alpha: 16
|
||||
@@ -48,7 +48,7 @@ resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 10
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
weight_decay: 0.0
|
||||
@@ -54,7 +54,7 @@ resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 10
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch:
|
||||
saves_per_epoch: 1
|
||||
|
||||
@@ -57,7 +57,7 @@ resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 10
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch:
|
||||
saves_per_epoch: 1
|
||||
|
||||
@@ -41,7 +41,7 @@ resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 10
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch:
|
||||
saves_per_epoch: 1
|
||||
|
||||
@@ -21,7 +21,7 @@ output_dir: ./outputs/lora-out
|
||||
sequence_len: 4096
|
||||
sample_packing: true
|
||||
eval_sample_packing: false
|
||||
pad_to_sequence_len: true
|
||||
|
||||
|
||||
adapter: lora
|
||||
lora_model_dir:
|
||||
@@ -51,7 +51,7 @@ resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 10
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 1
|
||||
saves_per_epoch: 1
|
||||
weight_decay: 0.0
|
||||
@@ -47,7 +47,7 @@ xformers_attention: true
|
||||
flash_attention:
|
||||
gptq_groupsize:
|
||||
gptq_model_v1:
|
||||
warmup_steps: 40
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
weight_decay: 0.0
|
||||
@@ -77,7 +77,7 @@ xformers_attention: true
|
||||
flash_attention:
|
||||
gptq_groupsize:
|
||||
gptq_model_v1:
|
||||
warmup_steps: 10
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
weight_decay: 0.000001
|
||||
@@ -44,7 +44,7 @@ xformers_attention: true
|
||||
flash_attention:
|
||||
gptq_groupsize:
|
||||
gptq_model_v1:
|
||||
warmup_steps: 40
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
weight_decay: 0.0
|
||||
@@ -25,7 +25,7 @@ lora_target_linear: true
|
||||
sequence_len: 4096
|
||||
sample_packing: true
|
||||
eval_sample_packing: false
|
||||
pad_to_sequence_len: true
|
||||
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
@@ -40,7 +40,7 @@ xformers_attention: true
|
||||
flash_attention:
|
||||
gptq_groupsize:
|
||||
gptq_model_v1:
|
||||
warmup_steps: 10
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
weight_decay: 0.1
|
||||
@@ -41,7 +41,7 @@ xformers_attention: true
|
||||
flash_attention:
|
||||
gptq_groupsize:
|
||||
gptq_model_v1:
|
||||
warmup_steps: 20
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
weight_decay: 0.1
|
||||
@@ -42,7 +42,7 @@ logging_steps: 5
|
||||
flash_attention:
|
||||
gptq_groupsize:
|
||||
gptq_model_v1:
|
||||
warmup_steps: 20
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
weight_decay: 0.0001
|
||||
@@ -42,7 +42,7 @@ logging_steps: 1
|
||||
flash_attention: true
|
||||
gptq_groupsize:
|
||||
gptq_model_v1:
|
||||
warmup_steps: 20
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
weight_decay: 0.1
|
||||
@@ -50,7 +50,7 @@ logging_steps: 1
|
||||
flash_attention: true
|
||||
gptq_groupsize:
|
||||
gptq_model_v1:
|
||||
warmup_steps: 20
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
weight_decay: 0.1
|
||||
@@ -43,7 +43,7 @@ logging_steps: 1
|
||||
flash_attention: true
|
||||
gptq_groupsize:
|
||||
gptq_model_v1:
|
||||
warmup_steps: 20
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
weight_decay: 0.1
|
||||
@@ -49,7 +49,7 @@ resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention:
|
||||
|
||||
warmup_steps: 10
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
weight_decay: 0.0
|
||||
@@ -49,7 +49,7 @@ resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention:
|
||||
|
||||
warmup_steps: 10
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
weight_decay: 0.0
|
||||
@@ -45,7 +45,7 @@ resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 10
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
weight_decay: 0.0
|
||||
@@ -48,7 +48,7 @@ resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 10
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
weight_decay: 0.0
|
||||
@@ -43,7 +43,7 @@ logging_steps: 5
|
||||
flash_attention:
|
||||
gptq_groupsize:
|
||||
gptq_model_v1:
|
||||
warmup_steps: 20
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
weight_decay: 0.0001
|
||||
@@ -41,7 +41,7 @@ logging_steps: 1
|
||||
flash_attention:
|
||||
gptq_groupsize:
|
||||
gptq_model_v1:
|
||||
warmup_steps: 20
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
weight_decay: 0
|
||||
@@ -16,7 +16,7 @@ output_dir: ./outputs/out
|
||||
|
||||
sequence_len: 4096
|
||||
sample_packing: true
|
||||
pad_to_sequence_len: true
|
||||
|
||||
|
||||
adapter:
|
||||
lora_model_dir:
|
||||
@@ -50,7 +50,7 @@ flash_attn_rms_norm: true
|
||||
flash_attn_fuse_qkv: false
|
||||
flash_attn_fuse_mlp: true
|
||||
|
||||
warmup_steps: 100
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
|
||||
@@ -19,7 +19,7 @@ output_dir: ./outputs/lora-out
|
||||
|
||||
sequence_len: 4096
|
||||
sample_packing: true
|
||||
pad_to_sequence_len: true
|
||||
|
||||
|
||||
adapter: lora
|
||||
lora_model_dir:
|
||||
@@ -51,7 +51,7 @@ flash_attention: true
|
||||
flash_attn_cross_entropy: false
|
||||
flash_attn_rms_norm: true
|
||||
|
||||
warmup_steps: 10
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
weight_decay: 0.0
|
||||
@@ -19,7 +19,7 @@ lora_model_dir:
|
||||
|
||||
sequence_len: 8192
|
||||
sample_packing: true
|
||||
pad_to_sequence_len: true
|
||||
|
||||
|
||||
lora_r: 32
|
||||
lora_alpha: 16
|
||||
@@ -48,7 +48,7 @@ resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 20
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 4
|
||||
eval_steps:
|
||||
saves_per_epoch: 4
|
||||
@@ -17,7 +17,7 @@ output_dir: ./outputs/lora-out
|
||||
|
||||
sequence_len: 4096
|
||||
sample_packing: true
|
||||
pad_to_sequence_len: true
|
||||
|
||||
eval_sample_packing: false
|
||||
|
||||
adapter: lora
|
||||
@@ -49,7 +49,7 @@ resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: false
|
||||
|
||||
warmup_steps: 10
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 0
|
||||
saves_per_epoch: 1
|
||||
weight_decay: 0.0
|
||||
@@ -17,7 +17,7 @@ output_dir: ./outputs/lora-out
|
||||
sequence_len: 4096
|
||||
sample_packing: true
|
||||
eval_sample_packing: false
|
||||
pad_to_sequence_len: true
|
||||
|
||||
|
||||
adapter: lora
|
||||
lora_model_dir:
|
||||
@@ -47,7 +47,7 @@ resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 10
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
weight_decay: 0.0
|
||||
@@ -38,7 +38,7 @@ resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 10
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch:
|
||||
saves_per_epoch: 1
|
||||
weight_decay: 0.0
|
||||
@@ -21,7 +21,7 @@ lora_model_dir:
|
||||
sequence_len: 4096
|
||||
sample_packing: true
|
||||
eval_sample_packing: false
|
||||
pad_to_sequence_len: true
|
||||
|
||||
|
||||
lora_r: 32
|
||||
lora_alpha: 16
|
||||
@@ -49,7 +49,7 @@ resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 10
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
weight_decay: 0.0
|
||||
@@ -75,7 +75,7 @@ xformers_attention: true
|
||||
flash_attention:
|
||||
gptq_groupsize:
|
||||
gptq_model_v1:
|
||||
warmup_steps: 10
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
weight_decay: 0.0
|
||||
@@ -20,7 +20,7 @@ special_tokens:
|
||||
datasets:
|
||||
- path: mhenrichsen/alpaca_2k_test
|
||||
type: alpaca
|
||||
warmup_steps: 10
|
||||
warmup_ratio: 0.1
|
||||
|
||||
# Iterations
|
||||
num_epochs: 1
|
||||
@@ -27,7 +27,7 @@ lora_target_linear: true
|
||||
sequence_len: 2048
|
||||
sample_packing: true
|
||||
eval_sample_packing: false
|
||||
pad_to_sequence_len: true
|
||||
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
@@ -35,7 +35,6 @@ wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
|
||||
gradient_accumulation_steps: 4
|
||||
micro_batch_size: 1
|
||||
num_epochs: 4
|
||||
@@ -56,3 +55,5 @@ evals_per_epoch:
|
||||
saves_per_epoch: 1
|
||||
weight_decay: 0.0
|
||||
special_tokens:
|
||||
|
||||
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||
|
||||
@@ -40,7 +40,7 @@
|
||||
"%%capture\n",
|
||||
"# This step can take ~5-10 minutes to install dependencies\n",
|
||||
"!pip install --no-build-isolation axolotl[flash-attn]>=0.9.1\n",
|
||||
"!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@78b2a45713a54c9bedf8b33f5e31cf07a1a57154\""
|
||||
"!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@010c3ac3f1e725098961832830303eeb4142dd88\""
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
@@ -21,7 +21,7 @@ output_dir: ./outputs/lora-out
|
||||
sequence_len: 4096
|
||||
sample_packing: true
|
||||
eval_sample_packing: false
|
||||
pad_to_sequence_len: true
|
||||
|
||||
|
||||
adapter: lora
|
||||
lora_model_dir:
|
||||
@@ -51,8 +51,10 @@ resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 10
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 1
|
||||
saves_per_epoch: 1
|
||||
weight_decay: 0.0
|
||||
special_tokens:
|
||||
|
||||
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||
|
||||
@@ -21,7 +21,7 @@ output_dir: ./outputs/lora-out
|
||||
sequence_len: 4096
|
||||
sample_packing: true
|
||||
eval_sample_packing: false
|
||||
pad_to_sequence_len: true
|
||||
|
||||
|
||||
adapter: lora
|
||||
lora_model_dir:
|
||||
@@ -51,8 +51,10 @@ resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 10
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 1
|
||||
saves_per_epoch: 1
|
||||
weight_decay: 0.0
|
||||
special_tokens:
|
||||
|
||||
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||
|
||||
@@ -12,7 +12,7 @@ output_dir: ./outputs/out
|
||||
|
||||
sequence_len: 2048
|
||||
sample_packing: true
|
||||
pad_to_sequence_len: true
|
||||
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
@@ -37,7 +37,7 @@ resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 100
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 2
|
||||
saves_per_epoch: 1
|
||||
weight_decay: 0.0
|
||||
@@ -55,3 +55,5 @@ fsdp_config:
|
||||
fsdp_transformer_layer_cls_to_wrap: DeepseekV2DecoderLayer
|
||||
fsdp_state_dict_type: FULL_STATE_DICT
|
||||
fsdp_sharding_strategy: FULL_SHARD
|
||||
|
||||
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||
|
||||
@@ -30,7 +30,7 @@ output_dir: ./outputs/out
|
||||
|
||||
sequence_len: 4096
|
||||
sample_packing: true
|
||||
pad_to_sequence_len: true
|
||||
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
@@ -61,7 +61,7 @@ resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 100
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 2
|
||||
saves_per_epoch: 1
|
||||
weight_decay: 0.0
|
||||
@@ -79,3 +79,5 @@ fsdp_config:
|
||||
fsdp_transformer_layer_cls_to_wrap: DeepseekV2DecoderLayer
|
||||
fsdp_state_dict_type: FULL_STATE_DICT
|
||||
fsdp_sharding_strategy: FULL_SHARD
|
||||
|
||||
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||
|
||||
@@ -1,8 +1,12 @@
|
||||
# Finetune Devstral with Axolotl
|
||||
|
||||
Devstral Small is a 24B parameter opensource model from MistralAI found on HuggingFace [Devstral-Small-2505](https://huggingface.co/mistralai/Devstral-Small-2505). This guide shows how to fine-tune it with Axolotl with multi-turn conversations with proper masking.
|
||||
Devstral Small is a 24B parameter opensource model from MistralAI found on HuggingFace [Devstral-Small-2505](https://huggingface.co/mistralai/Devstral-Small-2505) and [Devstral-Small-2507](https://huggingface.co/mistralai/Devstral-Small-2507). `Devstral-Small-2507` is the latest version of the model and has [function calling](https://mistralai.github.io/mistral-common/usage/tools/) support.
|
||||
|
||||
The model was fine-tuned ontop of [Mistral-Small-3.1](https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Base-2503) without the vision layer and has a context of upto 128k tokens.
|
||||
This guide shows how to fine-tune it with Axolotl with multi-turn conversations with proper masking.
|
||||
|
||||
The model was fine-tuned ontop of [Mistral-Small-3.1](https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Base-2503) without the vision layer and has a context of up to 128k tokens.
|
||||
|
||||
Thanks to the team at MistralAI for giving us early access to prepare for this release.
|
||||
|
||||
## Getting started
|
||||
|
||||
@@ -17,11 +21,6 @@ cd axolotl
|
||||
|
||||
pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
|
||||
pip3 install --no-build-isolation -e '.[flash-attn]'
|
||||
|
||||
# Install the latest mistral-common from source
|
||||
pip3 uninstall mistral-common
|
||||
pip3 install git+https://github.com/mistralai/mistral-common.git@039465d
|
||||
|
||||
```
|
||||
|
||||
2. Run the finetuning example:
|
||||
@@ -39,6 +38,7 @@ Let us know how it goes. Happy finetuning! 🚀
|
||||
- You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
|
||||
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
|
||||
- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
|
||||
- Learn how to use function calling with Axolotl at [docs](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#using-tool-use).
|
||||
|
||||
## Optimization Guides
|
||||
|
||||
@@ -57,6 +57,7 @@ In addition, we do not support overriding tokens yet.
|
||||
## Related Resources
|
||||
|
||||
- [MistralAI Devstral Blog](https://mistral.ai/news/devstral)
|
||||
- [MistralAI Devstral 1.1 Blog](https://mistral.ai/news/devstral-2507)
|
||||
- [Axolotl Docs](https://docs.axolotl.ai)
|
||||
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
|
||||
- [Axolotl Website](https://axolotl.ai)
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
base_model: mistralai/Devstral-Small-2505
|
||||
base_model: mistralai/Devstral-Small-2507
|
||||
|
||||
# Automatically upload checkpoint and final model to HF
|
||||
# hub_model_id: username/custom_model_name
|
||||
@@ -25,7 +25,7 @@ lora_model_dir:
|
||||
|
||||
sequence_len: 2048
|
||||
sample_packing: true
|
||||
pad_to_sequence_len: true
|
||||
|
||||
|
||||
lora_r: 32
|
||||
lora_alpha: 16
|
||||
@@ -62,3 +62,5 @@ saves_per_epoch: 1
|
||||
|
||||
weight_decay: 0.0
|
||||
special_tokens:
|
||||
|
||||
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||
|
||||
@@ -38,7 +38,7 @@ lora_target_modules:
|
||||
sequence_len: 2048
|
||||
sample_packing: false
|
||||
eval_sample_packing: false
|
||||
pad_to_sequence_len: true
|
||||
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
@@ -69,3 +69,5 @@ evals_per_epoch:
|
||||
saves_per_epoch: 1
|
||||
weight_decay: 0.0
|
||||
special_tokens:
|
||||
|
||||
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user