Compare commits
73 Commits
fix/hpc-ro...dft

| SHA1 |
|---|
| 0a0115493d |
| 7a4f33802d |
| 170dca9bb9 |
| d282f32481 |
| 6331e4a130 |
| 1410e4474e |
| dc77b5bf42 |
| 359b7ad85e |
| 258ce8d4fa |
| 3e0bbd33ec |
| 4ae6f766ad |
| e7f0d4ba5b |
| 7bf6f70e96 |
| 8aab807e67 |
| ee59e4de97 |
| 4e61b8aa23 |
| b26ba3a5cb |
| afe18ace35 |
| 2b199f9915 |
| e73dab6df9 |
| f45a97a9ff |
| 11c0b5b256 |
| 66a3de3629 |
| a6080df73c |
| 4f5e8a328a |
| 418933f0d1 |
| 372f664c63 |
| 97f1b1758d |
| f2155eaf79 |
| 92ee4256f7 |
| efeb5a4e41 |
| faaff6c792 |
| 43cef27458 |
| 07c41a6c2a |
| bbd3486f57 |
| 3750d7dd64 |
| 2197b0bf89 |
| 3e51a680c2 |
| 2cf254b4af |
| 83d4d97dcc |
| a1d07f42e4 |
| 2a664dc8ad |
| 4ac78aa562 |
| b3f4aa149f |
| 75b20fb66f |
| 5992e607a2 |
| 2b66ee189c |
| 86d8cca149 |
| 4a0f98e612 |
| c6ddcdd06a |
| 7fb6a947d9 |
| b234532d9f |
| 8990ca3205 |
| 006f226270 |
| 0b635e69c5 |
| 0d27e14e45 |
| f5f21fb216 |
| 4e55871112 |
| a6bafb55cb |
| 0fbde69e9c |
| 301e22849f |
| dcf24fd24e |
| 49b8107989 |
| 9901ee5602 |
| dd78f2e0cc |
| b54f9c942b |
| 11eb36585a |
| d0c846fc5e |
| b5fcc2f14b |
| b62eed8809 |
| ed2e8cacd6 |
| 80270a92fa |
| bfdc9a8249 |
.github/FUNDING.yml (6 changes, vendored)

@@ -1,13 +1,13 @@
# These are supported funding model platforms

github: [winglian, OpenAccess-AI-Collective] # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
patreon: # Replace with a single Patreon username
open_collective: # Replace with a single Open Collective username
ko_fi: axolotl_ai # Replace with a single Ko-fi username
ko_fi: # Replace with a single Ko-fi username
tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
liberapay: # Replace with a single Liberapay username
issuehunt: # Replace with a single IssueHunt username
otechie: # Replace with a single Otechie username
lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry
custom: ['https://quickchart.io/qr?text=bitcoin%3Abc1qxlgwlqwfea5s2cxm42xqsfmwjct0rj8w8ea5np&size=480&centerImageUrl=https%3A%2F%2Fupload.wikimedia.org%2Fwikipedia%2Fcommons%2Fthumb%2F4%2F46%2FBitcoin.svg%2F64px-Bitcoin.svg.png'] # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
.github/PULL_REQUEST_TEMPLATE.md (5 changes, vendored)

@@ -15,6 +15,11 @@
<!--- Include details of your testing environment, tests ran to see how -->
<!--- your change affects other areas of the code, etc. -->

## AI Usage Disclaimer

<!--- Was AI (e.g., ChatGPT, Claude, Copilot) used to generate or assist with this PR? -->
<!--- Please indicate: No / Yes (specify which tool and to what extent) -->

## Screenshots (if appropriate)

## Types of changes
.github/workflows/base.yml (70 changes, vendored)
@@ -21,31 +21,12 @@ jobs:
|
||||
timeout-minutes: 480
|
||||
# this job needs to be run on self-hosted GPU runners...
|
||||
runs-on: ubuntu-latest-m
|
||||
env:
|
||||
HAS_DOCKERHUB_CREDS: ${{ secrets.DOCKERHUB_USERNAME != '' && secrets.DOCKERHUB_TOKEN != '' }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- cuda: "126"
|
||||
cuda_version: 12.6.3
|
||||
cudnn_version: ""
|
||||
python_version: "3.11"
|
||||
pytorch: 2.7.0
|
||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||
dockerfile: "Dockerfile-base"
|
||||
- cuda: "126"
|
||||
cuda_version: 12.6.3
|
||||
cudnn_version: ""
|
||||
python_version: "3.11"
|
||||
pytorch: 2.7.1
|
||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||
dockerfile: "Dockerfile-base"
|
||||
- cuda: "128"
|
||||
cuda_version: 12.8.1
|
||||
cudnn_version: ""
|
||||
python_version: "3.11"
|
||||
pytorch: 2.7.1
|
||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||
dockerfile: "Dockerfile-base"
|
||||
- cuda: "128"
|
||||
cuda_version: 12.8.1
|
||||
cudnn_version: ""
|
||||
@@ -53,6 +34,7 @@ jobs:
|
||||
pytorch: 2.8.0
|
||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||
dockerfile: "Dockerfile-base"
|
||||
platforms: "linux/amd64"
|
||||
- cuda: "128"
|
||||
cuda_version: 12.8.1
|
||||
cudnn_version: ""
|
||||
@@ -60,13 +42,23 @@ jobs:
|
||||
pytorch: 2.9.0
|
||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||
dockerfile: "Dockerfile-base"
|
||||
platforms: "linux/amd64,linux/arm64"
|
||||
- cuda: "128"
|
||||
cuda_version: 12.8.1
|
||||
cudnn_version: ""
|
||||
python_version: "3.11"
|
||||
pytorch: 2.9.1
|
||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||
dockerfile: "Dockerfile-base"
|
||||
platforms: "linux/amd64,linux/arm64"
|
||||
- cuda: "130"
|
||||
cuda_version: 13.0.0
|
||||
cudnn_version: ""
|
||||
python_version: "3.11"
|
||||
pytorch: 2.9.0
|
||||
pytorch: 2.9.1
|
||||
torch_cuda_arch_list: "9.0+PTX"
|
||||
dockerfile: "Dockerfile-base"
|
||||
platforms: "linux/amd64,linux/arm64"
|
||||
# - cuda: "128"
|
||||
# cuda_version: 12.8.1
|
||||
# cudnn_version: ""
|
||||
@@ -90,10 +82,10 @@ jobs:
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: |
|
||||
winglian/axolotl-base
|
||||
axolotlai/axolotl-base
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@v2
|
||||
if: ${{ github.event_name != 'pull_request' && env.HAS_DOCKERHUB_CREDS == 'true' }}
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
||||
@@ -104,6 +96,7 @@ jobs:
|
||||
with:
|
||||
context: .
|
||||
file: ./docker/${{ matrix.dockerfile }}
|
||||
platforms: ${{ matrix.platforms }}
|
||||
push: ${{ github.event_name != 'pull_request' }}
|
||||
tags: ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
|
||||
labels: ${{ steps.metadata.outputs.labels }}
|
||||
@@ -118,24 +111,12 @@ jobs:
|
||||
if: ${{ github.repository_owner == 'axolotl-ai-cloud' && (github.event_name != 'pull_request' || !github.event.pull_request.draft) }}
|
||||
timeout-minutes: 480
|
||||
runs-on: ubuntu-latest-m
|
||||
env:
|
||||
HAS_DOCKERHUB_CREDS: ${{ secrets.DOCKERHUB_USERNAME != '' && secrets.DOCKERHUB_TOKEN != '' }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- cuda: "126"
|
||||
cuda_version: 12.6.3
|
||||
cudnn_version: ""
|
||||
python_version: "3.11"
|
||||
pytorch: 2.7.1
|
||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||
dockerfile: "Dockerfile-uv-base"
|
||||
- cuda: "128"
|
||||
cuda_version: 12.8.1
|
||||
cudnn_version: ""
|
||||
python_version: "3.11"
|
||||
pytorch: 2.7.1
|
||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||
dockerfile: "Dockerfile-uv-base"
|
||||
- cuda: "128"
|
||||
cuda_version: 12.8.1
|
||||
cudnn_version: ""
|
||||
@@ -143,6 +124,15 @@ jobs:
|
||||
pytorch: 2.8.0
|
||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||
dockerfile: "Dockerfile-uv-base"
|
||||
platforms: "linux/amd64"
|
||||
- cuda: "128"
|
||||
cuda_version: 12.8.1
|
||||
cudnn_version: ""
|
||||
python_version: "3.11"
|
||||
pytorch: 2.9.1
|
||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||
dockerfile: "Dockerfile-uv-base"
|
||||
platforms: "linux/amd64,linux/arm64"
|
||||
- cuda: "128"
|
||||
cuda_version: 12.8.1
|
||||
cudnn_version: ""
|
||||
@@ -150,13 +140,15 @@ jobs:
|
||||
pytorch: 2.9.0
|
||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||
dockerfile: "Dockerfile-uv-base"
|
||||
platforms: "linux/amd64,linux/arm64"
|
||||
- cuda: "130"
|
||||
cuda_version: 13.0.0
|
||||
cudnn_version: ""
|
||||
python_version: "3.11"
|
||||
pytorch: 2.9.0
|
||||
pytorch: 2.9.1
|
||||
torch_cuda_arch_list: "9.0+PTX"
|
||||
dockerfile: "Dockerfile-uv-base"
|
||||
platforms: "linux/amd64,linux/arm64"
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
@@ -168,6 +160,7 @@ jobs:
|
||||
axolotlai/axolotl-base-uv
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@v2
|
||||
if: ${{ github.event_name != 'pull_request' && env.HAS_DOCKERHUB_CREDS == 'true' }}
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
||||
@@ -178,6 +171,7 @@ jobs:
|
||||
with:
|
||||
context: .
|
||||
file: ./docker/${{ matrix.dockerfile }}
|
||||
platforms: ${{ matrix.platforms }}
|
||||
push: ${{ github.event_name != 'pull_request' }}
|
||||
tags: ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
|
||||
labels: ${{ steps.metadata.outputs.labels }}
|
||||
|
||||
.github/workflows/docs.yml (3 changes, vendored)

@@ -12,6 +12,9 @@ jobs:
build-deploy:
runs-on: ubuntu-latest
steps:
- name: cleanup node
run: |
sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /opt/hostedtoolcache/CodeQL
- name: Check out repository
uses: actions/checkout@v4
- name: Set up Quarto
.github/workflows/main.yml (104 changes, vendored)
@@ -15,27 +15,31 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- cuda: 126
|
||||
cuda_version: 12.6.3
|
||||
python_version: "3.11"
|
||||
pytorch: 2.7.0
|
||||
axolotl_extras:
|
||||
- cuda: 126
|
||||
cuda_version: 12.6.3
|
||||
python_version: "3.11"
|
||||
pytorch: 2.7.1
|
||||
axolotl_extras: vllm
|
||||
is_latest: true
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.7.1
|
||||
axolotl_extras:
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.8.0
|
||||
axolotl_extras:
|
||||
platforms: "linux/amd64"
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.9.0
|
||||
axolotl_extras:
|
||||
platforms: "linux/amd64,linux/arm64"
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.9.1
|
||||
axolotl_extras:
|
||||
platforms: "linux/amd64,linux/arm64"
|
||||
is_latest: true
|
||||
- cuda: 130
|
||||
cuda_version: 13.0.0
|
||||
python_version: "3.11"
|
||||
pytorch: 2.9.1
|
||||
axolotl_extras:
|
||||
platforms: "linux/amd64,linux/arm64"
|
||||
runs-on: axolotl-gpu-runner
|
||||
steps:
|
||||
- name: Checkout
|
||||
@@ -45,7 +49,6 @@ jobs:
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: |
|
||||
winglian/axolotl
|
||||
axolotlai/axolotl
|
||||
tags: |
|
||||
type=ref,event=branch
|
||||
@@ -62,6 +65,7 @@ jobs:
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
platforms: ${{ matrix.platforms }}
|
||||
build-args: |
|
||||
BASE_TAG=${{ github.ref_type == 'tag' && 'main' || github.ref_name }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
|
||||
CUDA=${{ matrix.cuda }}
|
||||
@@ -83,33 +87,31 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- cuda: 126
|
||||
cuda_version: 12.6.3
|
||||
python_version: "3.11"
|
||||
pytorch: 2.7.0
|
||||
axolotl_extras:
|
||||
- cuda: 126
|
||||
cuda_version: 12.6.3
|
||||
python_version: "3.11"
|
||||
pytorch: 2.7.1
|
||||
axolotl_extras:
|
||||
is_latest:
|
||||
- cuda: 126
|
||||
cuda_version: 12.6.3
|
||||
python_version: "3.11"
|
||||
pytorch: 2.7.1
|
||||
axolotl_extras: vllm
|
||||
is_latest: true
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.7.1
|
||||
axolotl_extras:
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.8.0
|
||||
axolotl_extras:
|
||||
platforms: "linux/amd64"
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.9.0
|
||||
axolotl_extras:
|
||||
platforms: "linux/amd64,linux/arm64"
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.9.1
|
||||
axolotl_extras:
|
||||
is_latest: true
|
||||
platforms: "linux/amd64,linux/arm64"
|
||||
- cuda: 130
|
||||
cuda_version: 13.0.0
|
||||
python_version: "3.11"
|
||||
pytorch: 2.9.1
|
||||
axolotl_extras:
|
||||
platforms: "linux/amd64,linux/arm64"
|
||||
runs-on: axolotl-gpu-runner
|
||||
steps:
|
||||
- name: Checkout
|
||||
@@ -119,7 +121,6 @@ jobs:
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: |
|
||||
winglian/axolotl-cloud
|
||||
axolotlai/axolotl-cloud
|
||||
tags: |
|
||||
type=ref,event=branch
|
||||
@@ -135,6 +136,7 @@ jobs:
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
platforms: ${{ matrix.platforms }}
|
||||
build-args: |
|
||||
BASE_TAG=${{ github.ref_type == 'tag' && 'main' || github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
|
||||
CUDA=${{ matrix.cuda }}
|
||||
@@ -152,22 +154,16 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- cuda: 126
|
||||
cuda_version: 12.6.3
|
||||
python_version: "3.11"
|
||||
pytorch: 2.7.1
|
||||
axolotl_extras:
|
||||
is_latest:
|
||||
- cuda: 126
|
||||
cuda_version: 12.6.3
|
||||
python_version: "3.11"
|
||||
pytorch: 2.7.1
|
||||
axolotl_extras: vllm
|
||||
is_latest: true
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.8.0
|
||||
pytorch: 2.9.1
|
||||
axolotl_extras:
|
||||
is_latest: true
|
||||
- cuda: 130
|
||||
cuda_version: 13.0.0
|
||||
python_version: "3.11"
|
||||
pytorch: 2.9.1
|
||||
axolotl_extras:
|
||||
is_latest:
|
||||
runs-on: axolotl-gpu-runner
|
||||
@@ -179,7 +175,6 @@ jobs:
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: |
|
||||
winglian/axolotl-cloud-term
|
||||
axolotlai/axolotl-cloud-term
|
||||
tags: |
|
||||
type=ref,event=branch
|
||||
@@ -195,6 +190,7 @@ jobs:
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
platforms: linux/amd64,linux/arm64
|
||||
build-args: |
|
||||
BASE_TAG=${{ github.ref_type == 'tag' && 'main' || github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
|
||||
CUDA=${{ matrix.cuda }}
|
||||
|
||||
.github/workflows/multi-gpu-e2e.yml (23 changes, vendored)
@@ -19,6 +19,9 @@ concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}
|
||||
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
|
||||
|
||||
env:
|
||||
MODAL_IMAGE_BUILDER_VERSION: "2025.06"
|
||||
|
||||
jobs:
|
||||
test-axolotl-multigpu:
|
||||
if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' && (github.event_name != 'pull_request' || !github.event.pull_request.draft) }}
|
||||
@@ -26,13 +29,6 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- cuda: 126
|
||||
cuda_version: 12.6.3
|
||||
python_version: "3.11"
|
||||
pytorch: 2.7.1
|
||||
axolotl_extras: vllm
|
||||
num_gpus: 2
|
||||
nightly_build: "true"
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
@@ -43,7 +39,14 @@ jobs:
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.9.0
|
||||
pytorch: 2.9.1
|
||||
axolotl_extras: fbgemm-gpu
|
||||
num_gpus: 2
|
||||
nightly_build: "true"
|
||||
- cuda: 130
|
||||
cuda_version: 13.0.0
|
||||
python_version: "3.11"
|
||||
pytorch: 2.9.1
|
||||
axolotl_extras: fbgemm-gpu
|
||||
num_gpus: 2
|
||||
nightly_build: "true"
|
||||
@@ -59,7 +62,7 @@ jobs:
|
||||
- name: Install Modal
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install modal==1.0.2 jinja2
|
||||
pip install modal==1.3.0.post1 jinja2
|
||||
- name: Update env vars
|
||||
run: |
|
||||
echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
|
||||
@@ -72,4 +75,4 @@ jobs:
|
||||
echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
|
||||
- name: Run tests job on Modal
|
||||
run: |
|
||||
modal run cicd.multigpu
|
||||
modal run -m cicd.multigpu
|
||||
|
||||
.github/workflows/nightlies.yml (22 changes, vendored)
@@ -12,16 +12,16 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- cuda: 126
|
||||
cuda_version: 12.6.3
|
||||
python_version: "3.11"
|
||||
pytorch: 2.7.1
|
||||
axolotl_extras:
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.8.0
|
||||
axolotl_extras:
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.9.1
|
||||
axolotl_extras:
|
||||
runs-on: axolotl-gpu-runner
|
||||
steps:
|
||||
- name: Checkout
|
||||
@@ -31,7 +31,6 @@ jobs:
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: |
|
||||
winglian/axolotl
|
||||
axolotlai/axolotl
|
||||
tags: |
|
||||
type=raw,value={{ branch }}-{{ date 'YYYYMMDD' }}
|
||||
@@ -65,16 +64,16 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- cuda: 126
|
||||
cuda_version: 12.6.3
|
||||
python_version: "3.11"
|
||||
pytorch: 2.7.1
|
||||
axolotl_extras:
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.8.0
|
||||
axolotl_extras:
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.9.1
|
||||
axolotl_extras:
|
||||
runs-on: axolotl-gpu-runner
|
||||
steps:
|
||||
- name: Checkout
|
||||
@@ -84,7 +83,6 @@ jobs:
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: |
|
||||
winglian/axolotl-cloud
|
||||
axolotlai/axolotl-cloud
|
||||
tags: |
|
||||
type=raw,value={{ branch }}-{{ date 'YYYYMMDD' }}
|
||||
|
||||
.github/workflows/preview-docs.yml (5 changes, vendored)

@@ -11,6 +11,7 @@ on:
- '_quarto.yml'
- docs/scripts/generate_config_docs.py
- src/axolotl/utils/schemas/**.py
- .github/workflows/preview-docs.yml

permissions:
checks: write
@@ -27,6 +28,10 @@ jobs:
runs-on: ubuntu-latest
if: ${{ !github.event.pull_request.draft }}
steps:
- name: cleanup node
run: |
sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /opt/hostedtoolcache/CodeQL

- name: Check out repository
uses: actions/checkout@v4
with:
.github/workflows/tests-nightly.yml (20 changes, vendored)
@@ -26,7 +26,7 @@ jobs:
|
||||
max-parallel: 2
|
||||
matrix:
|
||||
python_version: ["3.11"]
|
||||
pytorch_version: ["2.7.1", "2.8.0"]
|
||||
pytorch_version: ["2.8.0", "2.9.0", "2.9.1"]
|
||||
timeout-minutes: 20
|
||||
|
||||
steps:
|
||||
@@ -99,17 +99,17 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- cuda: 126
|
||||
cuda_version: 12.6.3
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.7.1
|
||||
pytorch: 2.8.0
|
||||
num_gpus: 1
|
||||
axolotl_extras:
|
||||
nightly_build: "true"
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.8.0
|
||||
pytorch: 2.9.1
|
||||
num_gpus: 1
|
||||
axolotl_extras:
|
||||
nightly_build: "true"
|
||||
@@ -123,7 +123,7 @@ jobs:
|
||||
- name: Install Modal
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install modal==1.0.2 jinja2
|
||||
pip install modal==1.3.0.post1 jinja2
|
||||
- name: Update env vars
|
||||
run: |
|
||||
echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
|
||||
@@ -148,10 +148,10 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- cuda: 126
|
||||
cuda_version: 12.6.3
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.7.1
|
||||
pytorch: 2.9.1
|
||||
num_gpus: 2
|
||||
axolotl_extras:
|
||||
nightly_build: "true"
|
||||
@@ -165,7 +165,7 @@ jobs:
|
||||
- name: Install Modal
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install modal==1.0.2 jinja2
|
||||
pip install modal==1.3.0.post1 jinja2
|
||||
- name: Update env vars
|
||||
run: |
|
||||
echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
|
||||
|
||||
.github/workflows/tests.yml (87 changes, vendored)
@@ -55,18 +55,23 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
python_version: ["3.11"]
|
||||
pytorch_version: ["2.7.1", "2.8.0", "2.9.0"]
|
||||
pytorch_version: ["2.8.0", "2.9.0", "2.9.1"]
|
||||
timeout-minutes: 20
|
||||
|
||||
steps:
|
||||
- name: cleanup node
|
||||
run: |
|
||||
sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /opt/hostedtoolcache/CodeQL
|
||||
|
||||
- name: Check out repository code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Restore Cache from S3
|
||||
id: hf-cache-restore-s3
|
||||
run: |
|
||||
mkdir -p /home/runner/.cache/huggingface/hub
|
||||
curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xf - -C /home/runner/.cache/huggingface/hub/ --use-compress-program unzstd
|
||||
mkdir -p ~/.cache/huggingface/hub
|
||||
curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xpf - -C ~/.cache/huggingface/hub/ --use-compress-program unzstd --strip-components=1
|
||||
ls -ltr ~/.cache/huggingface/hub/
|
||||
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v5
|
||||
@@ -91,6 +96,10 @@ jobs:
|
||||
python scripts/cutcrossentropy_install.py | sh
|
||||
pip3 install -r requirements-dev.txt -r requirements-tests.txt
|
||||
|
||||
- name: cleanup pip cache
|
||||
run: |
|
||||
find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
|
||||
|
||||
- name: Make sure PyTorch version wasn't clobbered
|
||||
run: |
|
||||
python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
|
||||
@@ -103,13 +112,23 @@ jobs:
|
||||
run: |
|
||||
huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
|
||||
|
||||
- name: Show HF cache
|
||||
run: hf cache scan
|
||||
|
||||
- name: Run tests
|
||||
run: |
|
||||
pytest -v --durations=10 -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ --ignore=tests/monkeypatch/ tests/ --cov=axolotl --cov-report=xml
|
||||
df -h
|
||||
pytest -v --durations=10 -n4 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ --ignore=tests/monkeypatch/ tests/ --cov=axolotl --cov-report=xml
|
||||
df -h
|
||||
pytest -v --durations=10 tests/monkeypatch/ --cov=axolotl --cov-append --cov-report=xml
|
||||
df -h
|
||||
pytest -v --durations=10 tests/patched/ --cov=axolotl --cov-append --cov-report=xml
|
||||
df -h
|
||||
pytest -v --durations=10 tests/cli/ --cov=axolotl --cov-append --cov-report=xml
|
||||
|
||||
- name: Show HF cache
|
||||
run: hf cache scan
|
||||
|
||||
- name: Upload coverage to Codecov
|
||||
uses: codecov/codecov-action@v5
|
||||
with:
|
||||
@@ -118,10 +137,6 @@ jobs:
|
||||
flags: unittests,pytorch-${{ matrix.pytorch_version }}
|
||||
fail_ci_if_error: false
|
||||
|
||||
- name: cleanup pip cache
|
||||
run: |
|
||||
find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
|
||||
|
||||
pytest-sdist:
|
||||
name: PyTest from Source Dist
|
||||
runs-on: ubuntu-latest
|
||||
@@ -130,18 +145,23 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
python_version: ["3.11"]
|
||||
pytorch_version: ["2.7.1", "2.8.0", "2.9.0"]
|
||||
pytorch_version: ["2.8.0", "2.9.0", "2.9.1"]
|
||||
timeout-minutes: 20
|
||||
|
||||
steps:
|
||||
- name: cleanup node
|
||||
run: |
|
||||
sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /opt/hostedtoolcache/CodeQL
|
||||
|
||||
- name: Check out repository code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Restore Cache from S3
|
||||
id: hf-cache-restore-s3
|
||||
run: |
|
||||
mkdir -p /home/runner/.cache/huggingface/hub
|
||||
curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xf - -C /home/runner/.cache/huggingface/hub/ --use-compress-program unzstd
|
||||
mkdir -p ~/.cache/huggingface/hub
|
||||
curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xpf - -C ~/.cache/huggingface/hub/ --use-compress-program unzstd --strip-components=1
|
||||
ls -ltr ~/.cache/huggingface/hub/
|
||||
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v5
|
||||
@@ -167,6 +187,10 @@ jobs:
|
||||
python scripts/cutcrossentropy_install.py | sh
|
||||
pip3 install -r requirements-dev.txt -r requirements-tests.txt
|
||||
|
||||
- name: cleanup pip cache
|
||||
run: |
|
||||
find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
|
||||
|
||||
- name: Make sure PyTorch version wasn't clobbered
|
||||
run: |
|
||||
python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
|
||||
@@ -176,17 +200,16 @@ jobs:
|
||||
axolotl --help
|
||||
|
||||
- name: Show HF cache
|
||||
run: huggingface-cli scan-cache
|
||||
run: hf cache scan
|
||||
|
||||
- name: Run tests
|
||||
run: |
|
||||
pytest -v --durations=10 -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ --ignore=tests/monkeypatch/ tests/ --cov=axolotl --cov-report=xml
|
||||
pytest -v --durations=10 -n4 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ --ignore=tests/monkeypatch/ tests/ --cov=axolotl --cov-report=xml
|
||||
pytest -v --durations=10 tests/monkeypatch/ --cov=axolotl --cov-append --cov-report=xml
|
||||
pytest -v --durations=10 tests/cli/
|
||||
|
||||
- name: cleanup pip cache
|
||||
run: |
|
||||
find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
|
||||
- name: Show HF cache
|
||||
run: hf cache scan
|
||||
|
||||
gate-skip-e2e:
|
||||
needs: [pre-commit, pytest, pytest-sdist]
|
||||
@@ -248,7 +271,7 @@ jobs:
|
||||
- name: Install Modal
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install modal==1.0.2 jinja2
|
||||
pip install modal==1.3.0.post1 jinja2
|
||||
- name: Update env vars
|
||||
run: |
|
||||
echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
|
||||
@@ -280,18 +303,6 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- cuda: 126
|
||||
cuda_version: 12.6.3
|
||||
python_version: "3.11"
|
||||
pytorch: 2.7.1
|
||||
num_gpus: 1
|
||||
axolotl_extras:
|
||||
# - cuda: 128
|
||||
# cuda_version: 12.8.1
|
||||
# python_version: "3.11"
|
||||
# pytorch: 2.7.1
|
||||
# num_gpus: 1
|
||||
# axolotl_extras:
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
@@ -302,7 +313,13 @@ jobs:
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.9.0
|
||||
pytorch: 2.9.1
|
||||
num_gpus: 1
|
||||
axolotl_extras:
|
||||
- cuda: 130
|
||||
cuda_version: 13.0.0
|
||||
python_version: "3.11"
|
||||
pytorch: 2.9.1
|
||||
num_gpus: 1
|
||||
axolotl_extras:
|
||||
steps:
|
||||
@@ -315,7 +332,7 @@ jobs:
|
||||
- name: Install Modal
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install modal==1.0.2 jinja2
|
||||
pip install modal==1.3.0.post1 jinja2
|
||||
- name: Update env vars
|
||||
run: |
|
||||
echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
|
||||
@@ -342,10 +359,10 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- cuda: 126
|
||||
cuda_version: 12.6.3
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.7.1
|
||||
pytorch: 2.9.1
|
||||
num_gpus: 1
|
||||
axolotl_extras:
|
||||
steps:
|
||||
@@ -358,7 +375,7 @@ jobs:
|
||||
- name: Install Modal
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install modal==1.0.2 jinja2
|
||||
pip install modal==1.3.0.post1 jinja2
|
||||
- name: Update env vars
|
||||
run: |
|
||||
echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
|
||||
|
||||
@@ -11,13 +11,13 @@ repos:
- id: no-commit-to-branch
args: ['--branch', 'main']
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.14.3
rev: v0.14.10
hooks:
- id: ruff
args: [--fix]
- id: ruff-format
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.18.2
rev: v1.19.1
hooks:
- id: mypy
additional_dependencies:
@@ -26,7 +26,7 @@ repos:
'pydantic>=2.5.3',
]
- repo: https://github.com/PyCQA/bandit
rev: 1.8.6
rev: 1.9.2
hooks:
- id: bandit
args: [
@@ -10,6 +10,7 @@ ARG BASE_VOLUME="/runpod-volume"
ENV BASE_VOLUME=$BASE_VOLUME
ENV HF_DATASETS_CACHE="${BASE_VOLUME}/huggingface-cache/datasets"
ENV HUGGINGFACE_HUB_CACHE="${BASE_VOLUME}/huggingface-cache/hub"
ENV HF_HUB_CACHE="${BASE_VOLUME}/huggingface-cache/hub"
ENV TRANSFORMERS_CACHE="${BASE_VOLUME}/huggingface-cache/hub"

COPY .runpod/src /src
README.md (23 changes)

@@ -29,21 +29,25 @@

## 🎉 Latest Updates

- 2025/12: Axolotl now includes support for [Kimi-Linear](https://docs.axolotl.ai/docs/models/kimi-linear.html), [Plano-Orchestrator](https://docs.axolotl.ai/docs/models/plano.html), [MiMo](https://docs.axolotl.ai/docs/models/mimo.html), [InternVL 3.5](https://docs.axolotl.ai/docs/models/internvl3_5.html), [Olmo3](https://docs.axolotl.ai/docs/models/olmo3.html), [Trinity](https://docs.axolotl.ai/docs/models/trinity.html), and [Ministral3](https://docs.axolotl.ai/docs/models/ministral3.html).
- 2025/10: New model support has been added in Axolotl for: [Qwen3 Next](https://docs.axolotl.ai/docs/models/qwen3-next.html), [Qwen2.5-vl, Qwen3-vl](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/qwen2_5-vl), [Qwen3, Qwen3MoE](https://docs.axolotl.ai/docs/models/qwen3.html), [Granite 4](https://docs.axolotl.ai/docs/models/granite4.html), [HunYuan](https://docs.axolotl.ai/docs/models/hunyuan.html), [Magistral 2509](https://docs.axolotl.ai/docs/models/magistral/vision.html), [Apertus](https://docs.axolotl.ai/docs/models/apertus.html), and [Seed-OSS](https://docs.axolotl.ai/docs/models/seed-oss.html).
- 2025/09: Axolotl now has text diffusion training. Read more [here](https://github.com/axolotl-ai-cloud/axolotl/tree/main/src/axolotl/integrations/diffusion).
- 2025/08: QAT has been updated to include NVFP4 support. See [PR](https://github.com/axolotl-ai-cloud/axolotl/pull/3107).
- 2025/07:
  - ND Parallelism support has been added into Axolotl. Compose Context Parallelism (CP), Tensor Parallelism (TP), and Fully Sharded Data Parallelism (FSDP) within a single node and across multiple nodes. Check out the [blog post](https://huggingface.co/blog/accelerate-nd-parallel) for more info.
  - Axolotl adds more models: [GPT-OSS](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/gpt-oss), [Gemma 3n](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/gemma3n), [Liquid Foundation Model 2 (LFM2)](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/lfm2), and [Arcee Foundation Models (AFM)](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/afm).
  - Axolotl adds more models: [GPT-OSS](https://docs.axolotl.ai/docs/models/gpt-oss.html), [Gemma 3n](https://docs.axolotl.ai/docs/models/gemma3n.html), [Liquid Foundation Model 2 (LFM2)](https://docs.axolotl.ai/docs/models/LiquidAI.html), and [Arcee Foundation Models (AFM)](https://docs.axolotl.ai/docs/models/arcee.html).
  - FP8 finetuning with fp8 gather op is now possible in Axolotl via `torchao`. Get started [here](https://docs.axolotl.ai/docs/mixed_precision.html#sec-fp8)!
  - [Voxtral](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/voxtral), [Magistral 1.1](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/magistral), and [Devstral](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/devstral) with mistral-common tokenizer support has been integrated in Axolotl!
  - [Voxtral](https://docs.axolotl.ai/docs/models/voxtral.html), [Magistral 1.1](https://docs.axolotl.ai/docs/models/magistral.html), and [Devstral](https://docs.axolotl.ai/docs/models/devstral.html) with mistral-common tokenizer support has been integrated in Axolotl!
  - TiledMLP support for single-GPU to multi-GPU training with DDP, DeepSpeed and FSDP support has been added to support Arctic Long Sequence Training. (ALST). See [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/alst) for using ALST with Axolotl!
- 2025/05: Quantization Aware Training (QAT) support has been added to Axolotl. Explore the [docs](https://docs.axolotl.ai/docs/qat.html) to learn more!
- 2025/03: Axolotl has implemented Sequence Parallelism (SP) support. Read the [blog](https://huggingface.co/blog/axolotl-ai-co/long-context-with-sequence-parallelism-in-axolotl) and [docs](https://docs.axolotl.ai/docs/sequence_parallelism.html) to learn how to scale your context length when fine-tuning.

<details>

<summary>Expand older updates</summary>

- 2025/06: Magistral with mistral-common tokenizer support has been added to Axolotl. See [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/magistral) to start training your own Magistral models with Axolotl!
- 2025/04: Llama 4 support has been added in Axolotl. See [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/llama-4) to start training your own Llama 4 models with Axolotl's linearized version!
- 2025/03: Axolotl has implemented Sequence Parallelism (SP) support. Read the [blog](https://huggingface.co/blog/axolotl-ai-co/long-context-with-sequence-parallelism-in-axolotl) and [docs](https://docs.axolotl.ai/docs/sequence_parallelism.html) to learn how to scale your context length when fine-tuning.
- 2025/06: Magistral with mistral-common tokenizer support has been added to Axolotl. See [docs](https://docs.axolotl.ai/docs/models/magistral.html) to start training your own Magistral models with Axolotl!
- 2025/04: Llama 4 support has been added in Axolotl. See [docs](https://docs.axolotl.ai/docs/models/llama-4.html) to start training your own Llama 4 models with Axolotl's linearized version!
- 2025/03: (Beta) Fine-tuning Multimodal models is now supported in Axolotl. Check out the [docs](https://docs.axolotl.ai/docs/multimodal.html) to fine-tune your own!
- 2025/02: Axolotl has added LoRA optimizations to reduce memory usage and improve training speed for LoRA and QLoRA in single GPU and multi-GPU training (DDP and DeepSpeed). Jump into the [docs](https://docs.axolotl.ai/docs/lora_optims.html) to give it a try.
- 2025/02: Axolotl has added GRPO support. Dive into our [blog](https://huggingface.co/blog/axolotl-ai-co/training-llms-w-interpreter-feedback-wasm) and [GRPO example](https://github.com/axolotl-ai-cloud/grpo_code) and have some fun!
@@ -73,7 +77,7 @@ Features:

- NVIDIA GPU (Ampere or newer for `bf16` and Flash Attention) or AMD GPU
- Python 3.11
- PyTorch ≥2.7.1
- PyTorch ≥2.8.0

### Google Colab

@@ -154,6 +158,13 @@ That's it! Check out our [Getting Started Guide](https://docs.axolotl.ai/docs/ge

Contributions are welcome! Please see our [Contributing Guide](https://github.com/axolotl-ai-cloud/axolotl/blob/main/.github/CONTRIBUTING.md) for details.

## 📈 Telemetry

Axolotl has opt-out telemetry that helps us understand how the project is being used
and prioritize improvements. We collect basic system information, model types, and
error rates—never personal data or file paths. Telemetry is enabled by default. To
disable it, set AXOLOTL_DO_NOT_TRACK=1. For more details, see our [telemetry documentation](https://docs.axolotl.ai/docs/telemetry.html).
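
A minimal shell example of the opt-out described above; the environment variable is taken from the README text, while the `axolotl train` invocation is only illustrative:

```bash
# Disable telemetry for the current shell session
export AXOLOTL_DO_NOT_TRACK=1

# Or scope it to a single run (command shown for illustration)
AXOLOTL_DO_NOT_TRACK=1 axolotl train config.yml
```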

## ❤️ Sponsors

Interested in sponsoring? Contact us at [wing@axolotl.ai](mailto:wing@axolotl.ai)
_quarto.yml (45 changes)
@@ -1,6 +1,8 @@
|
||||
project:
|
||||
type: website
|
||||
pre-render: docs/scripts/generate_config_docs.py
|
||||
pre-render:
|
||||
- docs/scripts/generate_config_docs.py
|
||||
- docs/scripts/generate_examples_docs.py
|
||||
|
||||
quartodoc:
|
||||
dir: docs/api
|
||||
@@ -240,7 +242,48 @@ website:
|
||||
- docs/getting-started.qmd
|
||||
- docs/installation.qmd
|
||||
- docs/inference.qmd
|
||||
- section: "Model Guides"
|
||||
contents:
|
||||
- docs/models/kimi-linear.qmd
|
||||
- docs/models/plano.qmd
|
||||
- docs/models/mimo.qmd
|
||||
- docs/models/internvl3_5.qmd
|
||||
- docs/models/olmo3.qmd
|
||||
- docs/models/trinity.qmd
|
||||
- docs/models/arcee.qmd
|
||||
- docs/models/mistral.qmd
|
||||
- section: "Ministral3"
|
||||
contents:
|
||||
- docs/models/ministral3.qmd
|
||||
- docs/models/ministral3/think.qmd
|
||||
- docs/models/ministral3/vision.qmd
|
||||
- section: "Magistral"
|
||||
contents:
|
||||
- docs/models/magistral.qmd
|
||||
- docs/models/magistral/think.qmd
|
||||
- docs/models/magistral/vision.qmd
|
||||
- docs/models/ministral.qmd
|
||||
- docs/models/mistral-small.qmd
|
||||
- docs/models/voxtral.qmd
|
||||
- docs/models/devstral.qmd
|
||||
- docs/models/llama-4.qmd
|
||||
- docs/models/llama-2.qmd
|
||||
- docs/models/qwen3-next.qmd
|
||||
- docs/models/qwen3.qmd
|
||||
- docs/models/gemma3n.qmd
|
||||
- docs/models/apertus.qmd
|
||||
- docs/models/gpt-oss.qmd
|
||||
- docs/models/seed-oss.qmd
|
||||
- docs/models/phi.qmd
|
||||
- docs/models/smolvlm2.qmd
|
||||
- docs/models/granite4.qmd
|
||||
- docs/models/LiquidAI.qmd
|
||||
- docs/models/hunyuan.qmd
|
||||
- docs/models/jamba.qmd
|
||||
- docs/models/orpheus.qmd
|
||||
|
||||
- docs/cli.qmd
|
||||
- docs/telemetry.qmd
|
||||
- docs/config-reference.qmd
|
||||
- text: "API Reference"
|
||||
href: docs/api
|
||||
|
||||
@@ -2,7 +2,7 @@
set -e

# Only run two tests at a time to avoid OOM on GPU (with coverage collection)
pytest -v --durations=10 -n2 \
pytest -v --durations=10 -n2 --maxfail=4 \
--ignore=/workspace/axolotl/tests/e2e/multigpu/solo/ \
--ignore=/workspace/axolotl/tests/e2e/multigpu/patched/ \
/workspace/axolotl/tests/e2e/multigpu/ \
@@ -6,6 +6,7 @@ ARG AXOLOTL_EXTRAS=""
|
||||
ARG AXOLOTL_ARGS=""
|
||||
ARG CUDA="118"
|
||||
ARG PYTORCH_VERSION="2.1.2"
|
||||
ARG TARGETARCH
|
||||
|
||||
ENV PYTORCH_VERSION=$PYTORCH_VERSION
|
||||
|
||||
@@ -20,13 +21,17 @@ RUN git clone --depth=1 https://github.com/axolotl-ai-cloud/axolotl.git
|
||||
|
||||
WORKDIR /workspace/axolotl
|
||||
|
||||
# If AXOLOTL_EXTRAS is set, append it in brackets
|
||||
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
|
||||
pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
|
||||
# If AXOLOTL_EXTRAS is set, append it in brackets; don't install deepspeed with arm64
|
||||
RUN if [ "$TARGETARCH" = "arm64" ]; then \
|
||||
BASE_EXTRAS="flash-attn,ring-flash-attn,optimizers,ray"; \
|
||||
else \
|
||||
pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray] $AXOLOTL_ARGS; \
|
||||
BASE_EXTRAS="deepspeed,flash-attn,ring-flash-attn,optimizers,ray"; \
|
||||
fi && \
|
||||
python scripts/unsloth_install.py | sh && \
|
||||
if [ "$AXOLOTL_EXTRAS" != "" ]; then \
|
||||
pip install --no-build-isolation -e .[$BASE_EXTRAS,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
|
||||
else \
|
||||
pip install --no-build-isolation -e .[$BASE_EXTRAS] $AXOLOTL_ARGS; \
|
||||
fi && \ python scripts/unsloth_install.py | sh && \
|
||||
python scripts/cutcrossentropy_install.py | sh && \
|
||||
pip install pytest && \
|
||||
pip cache purge
|
||||
|
||||
@@ -2,14 +2,16 @@ ARG CUDA_VERSION="11.8.0"
|
||||
ARG CUDNN_VERSION="8"
|
||||
ARG UBUNTU_VERSION="22.04"
|
||||
ARG MAX_JOBS=4
|
||||
ARG TARGETARCH
|
||||
|
||||
FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION AS base-builder
|
||||
|
||||
ENV PATH="/root/miniconda3/bin:${PATH}"
|
||||
|
||||
ARG PYTHON_VERSION="3.10"
|
||||
ARG TARGETARCH
|
||||
ARG PYTHON_VERSION="3.11"
|
||||
ARG PYTORCH_VERSION="2.1.2"
|
||||
ARG CUDA="118"
|
||||
ARG CUDA="128"
|
||||
ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
|
||||
|
||||
ENV PYTHON_VERSION=$PYTHON_VERSION
|
||||
@@ -22,11 +24,17 @@ RUN apt-get update \
|
||||
librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm \
|
||||
&& rm -rf /var/cache/apt/archives \
|
||||
&& rm -rf /var/lib/apt/lists/* \
|
||||
&& wget \
|
||||
https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
|
||||
&& if [ "$TARGETARCH" = "amd64" ]; then \
|
||||
MINICONDA_ARCH="x86_64"; \
|
||||
elif [ "$TARGETARCH" = "arm64" ]; then \
|
||||
MINICONDA_ARCH="aarch64"; \
|
||||
else \
|
||||
echo "Unsupported architecture: $TARGETARCH"; exit 1; \
|
||||
fi \
|
||||
&& wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${MINICONDA_ARCH}.sh \
|
||||
&& mkdir /root/.conda \
|
||||
&& bash Miniconda3-latest-Linux-x86_64.sh -b \
|
||||
&& rm -f Miniconda3-latest-Linux-x86_64.sh \
|
||||
&& bash Miniconda3-latest-Linux-${MINICONDA_ARCH}.sh -b \
|
||||
&& rm -f Miniconda3-latest-Linux-${MINICONDA_ARCH}.sh \
|
||||
&& conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main \
|
||||
&& conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r \
|
||||
&& conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}"
|
||||
@@ -51,8 +59,34 @@ RUN git lfs install --skip-repo && \
|
||||
pip3 install -U --no-cache-dir pydantic==1.10.10 && \
|
||||
pip3 cache purge
|
||||
|
||||
RUN if [ "$PYTORCH_VERSION" = "2.9.0" ] && [ "$CUDA" = "128" ] ; then \
|
||||
wget https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.4.17/flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \
|
||||
pip3 install --no-cache-dir flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \
|
||||
rm flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \
|
||||
fi
|
||||
RUN case "$PYTORCH_VERSION" in \
|
||||
2.9.[0-9]*) \
|
||||
if [ "$CUDA" = "128" ]; then \
|
||||
if [ "$TARGETARCH" = "amd64" ]; then \
|
||||
WHL_FILE="flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl"; \
|
||||
WHL_VERSION="v0.5.4"; \
|
||||
elif [ "$TARGETARCH" = "arm64" ]; then \
|
||||
WHL_FILE="flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_aarch64.whl"; \
|
||||
WHL_VERSION="v0.6.4"; \
|
||||
else \
|
||||
echo "Unsupported architecture: $TARGETARCH"; exit 1; \
|
||||
fi; \
|
||||
wget -nv https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/${WHL_VERSION}/${WHL_FILE}; \
|
||||
pip3 install --no-cache-dir ${WHL_FILE}; \
|
||||
rm ${WHL_FILE}; \
|
||||
elif [ "$CUDA" = "130" ]; then \
|
||||
if [ "$TARGETARCH" = "amd64" ]; then \
|
||||
WHL_FILE="flash_attn-2.8.3+cu130torch2.9-cp311-cp311-linux_x86_64.whl"; \
|
||||
WHL_VERSION="v0.5.4"; \
|
||||
elif [ "$TARGETARCH" = "arm64" ]; then \
|
||||
WHL_FILE="flash_attn-2.8.3+cu130torch2.9-cp311-cp311-linux_aarch64.whl"; \
|
||||
WHL_VERSION="v0.6.4"; \
|
||||
else \
|
||||
echo "Unsupported architecture: $TARGETARCH"; exit 1; \
|
||||
fi; \
|
||||
wget -nv https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/${WHL_VERSION}/${WHL_FILE}; \
|
||||
pip3 install --no-cache-dir ${WHL_FILE}; \
|
||||
rm ${WHL_FILE}; \
|
||||
fi \
|
||||
;; \
|
||||
esac
|
||||
|
||||
@@ -2,6 +2,7 @@ ARG CUDA_VERSION="12.6.3"
|
||||
ARG CUDNN_VERSION=""
|
||||
ARG UBUNTU_VERSION="22.04"
|
||||
ARG MAX_JOBS=4
|
||||
ARG TARGETARCH
|
||||
|
||||
FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION AS base-builder
|
||||
|
||||
@@ -31,12 +32,35 @@ ENV PATH="/workspace/axolotl-venv/bin:${PATH}"
|
||||
|
||||
RUN uv pip install packaging setuptools wheel psutil \
|
||||
&& uv pip install torch==${PYTORCH_VERSION} torchvision \
|
||||
&& uv pip install --no-build-isolation "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" \
|
||||
&& uv pip install "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" \
|
||||
&& uv pip install awscli pydantic
|
||||
|
||||
RUN if [ "$PYTORCH_VERSION" = "2.9.0" ] && [ "$CUDA" = "128" ] ; then \
|
||||
wget https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.4.17/flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \
|
||||
uv pip install --no-cache-dir flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \
|
||||
rm flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \
|
||||
RUN if [ "$TARGETARCH" = "amd64" ]; then \
|
||||
uv pip install --no-build-isolation "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main"; \
|
||||
uv pip install "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main"; \
|
||||
fi
|
||||
|
||||
RUN case "$PYTORCH_VERSION" in \
|
||||
2.9.[0-9]*) \
|
||||
if [ "$TARGETARCH" = "amd64" ]; then \
|
||||
if [ "$CUDA" = "128" ]; then \
|
||||
wget -nv https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.5.4/flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \
|
||||
uv pip install --no-cache-dir flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \
|
||||
rm flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \
|
||||
elif [ "$CUDA" = "130" ]; then \
|
||||
wget -nv https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.5.4/flash_attn-2.8.3+cu130torch2.9-cp311-cp311-linux_x86_64.whl; \
|
||||
uv pip install --no-cache-dir flash_attn-2.8.3+cu130torch2.9-cp311-cp311-linux_x86_64.whl; \
|
||||
rm flash_attn-2.8.3+cu130torch2.9-cp311-cp311-linux_x86_64.whl; \
|
||||
fi \
|
||||
elif [ "$TARGETARCH" = "arm64" ]; then \
|
||||
if [ "$CUDA" = "128" ]; then \
|
||||
wget -nv https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.6.4/flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_aarch64.whl; \
|
||||
uv pip install --no-cache-dir flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_aarch64.whl; \
|
||||
rm flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_aarch64.whl; \
|
||||
elif [ "$CUDA" = "130" ]; then \
|
||||
wget -nv https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.6.4/flash_attn-2.8.3+cu130torch2.9-cp311-cp311-linux_aarch64.whl; \
|
||||
uv pip install --no-cache-dir flash_attn-2.8.3+cu130torch2.9-cp311-cp311-linux_aarch64.whl; \
|
||||
rm flash_attn-2.8.3+cu130torch2.9-cp311-cp311-linux_aarch64.whl; \
|
||||
fi \
|
||||
fi \
|
||||
;; \
|
||||
esac
|
||||
|
||||
docs/.gitignore (2 changes, vendored)

@@ -3,3 +3,5 @@ _site/
/api/*.qmd
/api/*.html
config-reference.qmd
models/**/*.qmd
models/**/*.html
docs/checkpoint_saving.qmd (new file, 86 lines)

@@ -0,0 +1,86 @@
---
title: "Checkpoint Saving"
format:
  html:
    toc: true
    toc-depth: 2
    number-sections: true
execute:
  enabled: false
---

## Overview

Axolotl supports on-demand checkpoint saving during training. You can trigger checkpoints via file-based triggers (for programmatic control) or Control+C (for interactive use).

## File-Based Checkpoint Trigger

### Configuration

Enable in your config:

```yaml
dynamic_checkpoint:
  enabled: true
  check_interval: 100 # Optional: check every N steps (default: 100)
  trigger_file_path: "axolotl_checkpoint.save" # Optional: custom filename
```

**Options:**
- `enabled`: `true` to enable (required)
- `check_interval`: Steps between file checks. Default: 100. Lower = faster response, higher I/O overhead.
- `trigger_file_path`: Custom trigger filename. Default: `axolotl_checkpoint.save`

### How It Works

1. Rank 0 checks for trigger file every `check_interval` steps in `output_dir`
2. When detected, file is deleted and checkpoint is saved
3. In distributed training, rank 0 broadcasts to synchronize all ranks

### Usage

**Command line:**
```bash
touch /path/to/output_dir/axolotl_checkpoint.save
```

**Programmatic:**
```python
from pathlib import Path
Path("/path/to/output_dir/axolotl_checkpoint.save").touch()
```

Checkpoint saves within the next `check_interval` steps. The trigger file is auto-deleted after detection, so you can create it multiple times.

**Custom filename:**
```yaml
dynamic_checkpoint:
  enabled: true
  trigger_file_path: "my_trigger.save"
```
```bash
touch /path/to/output_dir/my_trigger.save
```

## Control+C (SIGINT) Checkpoint

Pressing `Ctrl+C` during training saves the model state and exits gracefully. **Note:** This saves only the model weights, not optimizer state. For resumable checkpoints, use the file-based trigger.

## Best Practices

- **Check interval**: Lower values (10-50) for fast training, default 100 for slower training
- **Distributed training**: Create trigger file once; rank 0 handles synchronization
- **Resume**: Dynamic checkpoints can be resumed like regular checkpoints via `resume_from_checkpoint`

## Example

```yaml
output_dir: ./outputs/lora-out
save_steps: 500 # Scheduled checkpoints

dynamic_checkpoint:
  enabled: true
  check_interval: 50
```

This enables scheduled checkpoints every 500 steps plus on-demand saves via file trigger (checked every 50 steps).
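
To make the rank-0 check/broadcast flow above concrete, here is a minimal sketch of that pattern; it is illustrative only and does not reproduce Axolotl's actual implementation (the function and variable names are made up):

```python
# Illustrative sketch of the trigger-file pattern described above -- not Axolotl's code.
import os
import torch.distributed as dist


def should_save_checkpoint(output_dir: str, trigger_file: str = "axolotl_checkpoint.save") -> bool:
    """Return True when a save was requested, keeping all ranks in agreement."""
    save = False
    is_rank_zero = not dist.is_initialized() or dist.get_rank() == 0

    if is_rank_zero:
        trigger_path = os.path.join(output_dir, trigger_file)
        if os.path.exists(trigger_path):
            os.remove(trigger_path)  # delete so the trigger can be reused later
            save = True

    if dist.is_initialized():
        # Rank 0 broadcasts its decision so every rank saves (or skips) together.
        flag = [save]
        dist.broadcast_object_list(flag, src=0)
        save = flag[0]

    return save
```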
@@ -218,6 +218,13 @@ If you have tool arguments with same name but different dtypes (like `"time": st
```
"arguments": "{\"...\": \"...\"}"
```

The same is applicable for tool parameters.

```
"parameters": "{\"...\": \"...\"}"
```

:::

Example config for Llama4:
@@ -32,11 +32,8 @@ main-base-py{python_version}-cu{cuda_version}-{pytorch_version}

Tags examples:

- `main-base-py3.11-cu128-2.7.1`
- `main-base-py3.11-cu126-2.7.1`
- `main-base-py3.11-cu126-2.7.0`
- `main-base-py3.11-cu126-2.6.0`
- `main-base-py3.11-cu124-2.6.0`
- `main-base-py3.11-cu128-2.8.0`
- `main-base-py3.11-cu128-2.9.1`

## Main

@@ -74,15 +71,12 @@ There may be some extra tags appended to the image, like `-vllm` which installs

Tags examples:

- `main-py3.11-cu128-2.7.1`
- `main-py3.11-cu126-2.7.1`
- `main-py3.11-cu126-2.7.0`
- `main-py3.11-cu126-2.6.0`
- `main-py3.11-cu124-2.6.0`
- `main-py3.11-cu128-2.8.0`
- `main-py3.11-cu128-2.9.1`
- `main-latest`
- `main-20250303-py3.11-cu124-2.6.0`
- `main-20250303-py3.11-cu126-2.6.0`
- `0.10.1`
- `0.12.0`

## Cloud
@@ -26,7 +26,7 @@ Follow the instructions at: [https://pytorch.org/get-started/locally/](https://p
:::

::: {.callout-important}
For Blackwell GPUs, please use Pytorch 2.7.0 and CUDA 12.8.
For Blackwell GPUs, please use Pytorch 2.9.1 and CUDA 12.8.
:::
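
As a hedged, concrete example of that pairing, the PyTorch CUDA 12.8 wheel index can be used; verify the exact command against the PyTorch get-started page linked above:

```bash
# Example only -- confirm against https://pytorch.org/get-started/locally/
pip3 install torch==2.9.1 torchvision --index-url https://download.pytorch.org/whl/cu128
```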

### PyPI Installation (Recommended) {#sec-pypi}
@@ -111,7 +111,7 @@ docker run --privileged --gpus '"all"' --shm-size 10g --rm -it \
:::

::: {.callout-important}
For Blackwell GPUs, please use `axolotlai/axolotl:main-py3.11-cu128-2.7.0` or the cloud variant `axolotlai/axolotl-cloud:main-py3.11-cu128-2.7.0`.
For Blackwell GPUs, please use `axolotlai/axolotl:main-py3.11-cu128-2.9.1` or the cloud variant `axolotlai/axolotl-cloud:main-py3.11-cu128-2.9.1`.
:::

Please refer to the [Docker documentation](docker.qmd) for more information on the different Docker images that are available.
@@ -4,7 +4,7 @@ format:
  html:
    toc: true
    toc-depth: 3
    number-sections: true
    # number-sections: true
    code-tools: true
execute:
  enabled: false
@@ -14,12 +14,18 @@ This guide covers advanced training configurations for multi-GPU setups using Ax

## Overview {#sec-overview}

Axolotl supports several methods for multi-GPU training:
When training on multiple GPUs, Axolotl supports 3 sharding/parallelism strategies. Additionally, you can layer specific optimization features on top of that strategy.

- DeepSpeed (recommended)
- FSDP (Fully Sharded Data Parallel)
- Sequence parallelism
- FSDP + QLoRA
You generally cannot combine these strategies; they are mutually exclusive.

1. **DeepSpeed**: Powerful optimization library, supports ZeRO stages 1-3.
2. **FSDP (Fully Sharded Data Parallel)**: PyTorch's native sharding implementation (Recommended).
3. **DDP (Distributed Data Parallel)**: PyTorch's native parallelism implementation (Default if neither of the above are selected).

These features can often be combined with the strategies above:

* **Sequence Parallelism**: Splits long sequences across GPUs (Compatible with DDP, DeepSpeed, and FSDP).
* **FSDP + QLoRA**: Combines 4-bit quantization with FSDP (Specific to FSDP).
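
A rough config sketch of "pick one strategy", under the assumptions that `deepspeed:` points at one of the JSON configs bundled with the repo and that `fsdp_version` is the field described in the FSDP migration section below; treat the config reference as authoritative:

```yaml
# Pick ONE sharding strategy per run -- they are mutually exclusive.

# Option A: DeepSpeed (ZeRO stage 2 config shipped with the repo)
deepspeed: deepspeed_configs/zero2.json

# Option B: FSDP2 (remove the deepspeed line above)
# fsdp_version: 2

# Leaving both unset falls back to plain DDP.
```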

## DeepSpeed {#sec-deepspeed}

@@ -65,12 +71,18 @@ Start from Stage 1 -> Stage 2 -> Stage 3.

## Fully Sharded Data Parallel (FSDP) {#sec-fsdp}

FSDP allows you to shard model parameters, gradients, and optimizer states across data parallel workers.

::: {.callout-note}

FSDP2 is recommended for new users. FSDP1 is deprecated and will be removed in an upcoming release of Axolotl.

:::

### FSDP + QLoRA {#sec-fsdp-qlora}

For combining FSDP with QLoRA, see our [dedicated guide](fsdp_qlora.qmd).

### Migrating from FSDP1 to FSDP2 {#sec-migrate-fsdp1-fsdp2}

To migrate your config from FSDP1 to FSDP2, you must use the `fsdp_version` top-level config field to specify the FSDP version, and
@@ -145,10 +157,6 @@ single sequence causes OOM errors during model training.

See our [dedicated guide](sequence_parallelism.qmd) for more information.

### FSDP + QLoRA {#sec-fsdp-qlora}

For combining FSDP with QLoRA, see our [dedicated guide](fsdp_qlora.qmd).

## Performance Optimization {#sec-performance}

### Liger Kernel Integration {#sec-liger}
@@ -21,6 +21,7 @@ format:
- [Qwen2.5-VL](#sec-qwen25-vl)
- [SmolVLM2](#sec-smolvlm2)
- [LFM2-VL](#sec-lfm2-vl)
- [Intern-VL](#sec-intern-vl)

## Usage

@@ -124,6 +125,8 @@ Please make sure to install audio lib via `pip3 install librosa==0.11.0 'mistral

```yaml
base_model: mistralai/Voxtral-Mini-3B-2507

processor_type: VoxtralProcessor
```

### Gemma-3 {#sec-gemma-3}

@@ -200,6 +203,16 @@ Please uninstall `causal-conv1d` via `pip3 uninstall -y causal-conv1d`
base_model: LiquidAI/LFM2-VL-450M
```

### Intern-VL {#sec-intern-vl}

::: {.callout-tip}
Please make sure to install `timm` via `pip3 install timm==1.0.19`
:::

```yaml
base_model: OpenGVLab/InternVL3_5-8B
```

## Dataset Format

For multi-modal datasets, we adopt an extended `chat_template` format similar to OpenAI's Message format.
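For instance, a single multi-modal sample might look like the sketch below; the exact content keys (`type`, `path`, and so on) are illustrative here, and the rest of this section describes the authoritative schema:

```json
{
  "messages": [
    {
      "role": "user",
      "content": [
        { "type": "image", "path": "/path/to/receipt.jpg" },
        { "type": "text", "text": "What is the total amount on this receipt?" }
      ]
    },
    {
      "role": "assistant",
      "content": [
        { "type": "text", "text": "The total amount is $42.17." }
      ]
    }
  ]
}
```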
110 docs/rlhf.qmd

@@ -597,6 +597,116 @@ To see other examples of custom reward functions, please see [TRL GRPO Docs](htt

To see all configs, please see [TRLConfig](https://github.com/axolotl-ai-cloud/axolotl/blob/v0.9.2/src/axolotl/utils/schemas/trl.py).

#### OpenEnv Rollout Functions

GRPO supports custom rollout functions for OpenEnv-style environments, enabling interactive tasks like web browsing, code execution, or tool use. This allows you to implement custom generation logic that interacts with external environments.

For example, to implement a simple math-solving environment with step-by-step verification:

```python
# math_env.py
import re


def math_solver_rollout(model, processing_class, prompts, generation_config=None):
    """
    Custom rollout function that generates step-by-step math solutions.

    Args:
        model: The language model
        processing_class: The tokenizer/processing class
        prompts: List of prompt dicts (with 'messages' key for chat format)
        generation_config: Optional generation configuration

    Returns:
        List of completion strings
    """
    completions = []

    for prompt in prompts:
        # Apply chat template to prompt
        messages = prompt.get("messages", [])
        formatted_prompt = processing_class.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )

        # Generate step-by-step solution
        full_response = ""
        for step in range(5):  # Max 5 reasoning steps
            current_input = formatted_prompt + full_response + "\nNext step:"
            inputs = processing_class(current_input, return_tensors="pt").to(model.device)

            outputs = model.generate(
                **inputs,
                max_new_tokens=100,
                generation_config=generation_config,
            )
            step_text = processing_class.decode(
                outputs[0][inputs.input_ids.shape[1]:],
                skip_special_tokens=True,
            )

            # Check if solution is complete
            if "FINAL ANSWER:" in step_text:
                full_response += step_text
                break
            full_response += step_text + "\n"

        completions.append(full_response)

    return completions


def math_reward(prompts, completions, answers, **kwargs):
    """Reward function that checks mathematical correctness"""
    rewards = []
    for completion, correct_answer in zip(completions, answers):
        # Extract predicted answer
        match = re.search(r"FINAL ANSWER:\s*(.+)", completion)
        predicted = match.group(1).strip() if match else ""

        # Compare with correct answer
        reward = 1.0 if predicted == str(correct_answer) else 0.0
        rewards.append(reward)

    return rewards


def math_transform(cfg, *args, **kwargs):
    """Transform dataset to GRPO format with answer field"""
    def transform_fn(example, processing_class=None):
        return {
            "prompt": [{"role": "user", "content": example["question"]}],
            "answer": str(example["answer"]),
        }
    return transform_fn, {"remove_columns": ["question"]}
```

```yaml
rl: grpo

trl:
  beta: 0.001
  max_completion_length: 512
  num_generations: 4
  rollout_func: "math_env.math_solver_rollout"  # Custom rollout function
  reward_funcs: ["math_env.math_reward"]
  reward_weights: [1.0]

datasets:
  - path: openai/gsm8k
    name: main
    type: math_env.math_transform
```

The `rollout_func` parameter accepts a fully qualified name (e.g., `module_name.function_name`) that points to a callable function in your local directory. The function receives:

- `model`: The language model
- `processing_class`: The tokenizer/processing class
- `prompts`: List of prompt dictionaries
- `generation_config` (optional): Generation configuration

And should return a list of completion strings.
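Before launching a full GRPO run, it can help to smoke-test the rollout and reward functions directly. The snippet below is a minimal, illustrative check; the model ID is a placeholder small chat model and is not part of the training config:

```python
# quick local sanity check for math_env.py (illustrative only)
from transformers import AutoModelForCausalLM, AutoTokenizer

from math_env import math_reward, math_solver_rollout

model_id = "Qwen/Qwen2.5-0.5B-Instruct"  # placeholder; any small chat model works
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")

prompts = [{"messages": [{"role": "user", "content": "What is 15% of 240?"}]}]
completions = math_solver_rollout(model, tokenizer, prompts)

print(completions[0])
# expect [1.0] only if the model actually produced "FINAL ANSWER: 36"
print(math_reward(prompts, completions, answers=["36"]))
```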
For more OpenEnv examples, see [TRL OpenEnv Documentation](https://huggingface.co/docs/trl/main/en/openenv).

#### GRPO with DAPO/Dr. GRPO loss

The DAPO paper, and subsequently the Dr. GRPO paper, proposed an alternative loss function for GRPO to remediate the penalty on longer responses.
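TRL exposes the loss variant through `GRPOConfig.loss_type` (with values such as `dr_grpo` in recent releases). Whether Axolotl's `trl` block passes this field through depends on the TRLConfig schema linked above, so treat the sketch below as an assumption to verify rather than a documented option:

```yaml
rl: grpo

trl:
  # assumed passthrough of TRL GRPOConfig.loss_type; verify against TRLConfig and your TRL version
  loss_type: dr_grpo
```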
90 docs/scripts/examples-allowlist.yml (new file)

@@ -0,0 +1,90 @@
examples:
  # December 2025
  - name: kimi-linear
    title: Kimi Linear
  - name: plano
    title: Plano Orchestrator
  - name: mimo
    title: MiMo
  - name: internvl3_5
    title: InternVL 3.5

  # AllenAI
  - name: olmo3
    title: OLMo 3

  # ArceeAI
  - name: trinity
    title: Trinity
  - name: arcee
    title: Arcee AFM

  # MistralAI
  - name: ministral3/think
    title: Ministral 3 Thinking
  - name: ministral3/vision
    title: Ministral 3 Vision
  - name: magistral/think
    title: Magistral Thinking
  - name: magistral/vision
    title: Magistral Vision
  - name: ministral
    title: Ministral
  - name: mistral-small
    title: Mistral Small 3.1/3.2
  - name: voxtral
    title: Voxtral
  - name: devstral
    title: Devstral
  - name: mistral
    title: Mistral 7B

  # Meta
  - name: llama-4
    title: Llama 4
  - name: llama-2
    title: Llama 2

  # Alibaba
  - name: qwen3-next
    title: Qwen 3 Next
  - name: qwen3
    title: Qwen 3

  # Google
  - name: gemma3n
    title: Gemma 3n

  # Swiss AI
  - name: apertus
    title: Apertus

  # GPT-OSS
  - name: gpt-oss
    title: GPT-OSS
  - name: seed-oss
    title: Seed-OSS

  # Microsoft
  - name: phi
    title: Phi

  # SmolVLM
  - name: smolvlm2
    title: SmolVLM 2

  # IBM
  - name: granite4
    title: Granite 4

  # LiquidAI
  - name: LiquidAI
    title: Liquid Foundation Models 2

  # Other
  - name: hunyuan
    title: Hunyuan
  - name: jamba
    title: Jamba
  - name: orpheus
    title: Orpheus
424 docs/scripts/generate_examples_docs.py (new executable file)
@@ -0,0 +1,424 @@
|
||||
"""
|
||||
auto generate example docs from allowlist
|
||||
"""
|
||||
|
||||
import re
|
||||
import shutil
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import yaml
|
||||
|
||||
# Paths
|
||||
THIS = Path(__file__).resolve()
|
||||
ROOT = THIS.parents[2] # repo root (docs/scripts -> docs -> ROOT)
|
||||
EXAMPLES_DIR = ROOT / "examples"
|
||||
OUTPUT_DIR = ROOT / "docs" / "models"
|
||||
ALLOWLIST_YML = THIS.parent / "examples-allowlist.yml"
|
||||
|
||||
|
||||
def slugify(name: str) -> str:
|
||||
"""Convert a name to a slug (lowercase, hyphens for spaces)."""
|
||||
s = re.sub(r"[^a-zA-Z0-9\s\-]+", "", name.strip())
|
||||
s = re.sub(r"\s+", "-", s).strip("-").lower()
|
||||
return s or "example"
|
||||
|
||||
|
||||
def read_allowlist():
|
||||
with open(ALLOWLIST_YML, "r", encoding="utf-8") as f:
|
||||
data = yaml.safe_load(f) or {}
|
||||
items = data.get("examples", [])
|
||||
if not isinstance(items, list):
|
||||
raise ValueError("`examples` must be a list in examples-allowlist.yml")
|
||||
return items
|
||||
|
||||
|
||||
def find_readme(folder: Path) -> Path | None:
|
||||
for name in ("README.md", "Readme.md", "readme.md"):
|
||||
p = folder / name
|
||||
if p.exists():
|
||||
return p
|
||||
return None
|
||||
|
||||
|
||||
def remove_first_h1(md: str) -> tuple[str, str | None]:
|
||||
"""
|
||||
Remove the first H1 from markdown and return (modified_md, h1_title).
|
||||
The H1 is removed since we use the frontmatter title instead.
|
||||
"""
|
||||
lines = md.splitlines()
|
||||
result = []
|
||||
h1_title = None
|
||||
skipped_first = False
|
||||
|
||||
for line in lines:
|
||||
if not skipped_first and line.startswith("# "):
|
||||
h1_title = line[2:].strip()
|
||||
skipped_first = True
|
||||
continue
|
||||
result.append(line)
|
||||
|
||||
return "\n".join(result), h1_title
|
||||
|
||||
|
||||
IMG_RE = re.compile(r"!\[[^\]]*\]\(([^)]+)\)")
|
||||
LINK_RE = re.compile(r"\[([^\]]+)\]\(([^)]+)\)")
|
||||
|
||||
|
||||
def rewrite_and_copy_assets(md: str, src_dir: Path, dest_assets_root: Path) -> str:
|
||||
"""
|
||||
Copy local image assets referenced in markdown to
|
||||
docs/examples/assets/... and rewrite the links.
|
||||
"""
|
||||
dest_assets = dest_assets_root / "assets"
|
||||
|
||||
def repl(m):
|
||||
url = m.group(1).strip()
|
||||
if re.match(r"^(https?:)?//", url):
|
||||
return m.group(0) # leave remote URLs
|
||||
src_path = (src_dir / url).resolve()
|
||||
if not src_path.exists():
|
||||
return m.group(0) # leave as-is if not found
|
||||
rel = src_path.relative_to(src_dir)
|
||||
# Create a unique asset path based on source directory name
|
||||
asset_name = src_dir.name.replace("/", "-")
|
||||
dest_path = dest_assets / asset_name / rel
|
||||
dest_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
shutil.copy2(src_path, dest_path)
|
||||
new_rel = f"assets/{asset_name}/{rel.as_posix()}"
|
||||
return m.group(0).replace(url, new_rel)
|
||||
|
||||
return IMG_RE.sub(repl, md)
|
||||
|
||||
|
||||
def rewrite_readme_links(
|
||||
md: str,
|
||||
src_dir: Path,
|
||||
examples_dir: Path,
|
||||
parent_index_only: set,
|
||||
current_src_path: str,
|
||||
allowlist_entries: set,
|
||||
current_output_path: str,
|
||||
) -> str:
|
||||
"""
|
||||
Rewrite links between README.md files to point to the correct .qmd files.
|
||||
"""
|
||||
|
||||
def repl(m):
|
||||
text = m.group(1)
|
||||
url = m.group(2).strip()
|
||||
|
||||
# Skip remote URLs and anchor links
|
||||
if re.match(r"^(https?:)?//", url) or url.startswith("#"):
|
||||
return m.group(0)
|
||||
|
||||
# Skip non-markdown files
|
||||
if not url.lower().endswith(".md"):
|
||||
return m.group(0)
|
||||
|
||||
# Resolve the target path
|
||||
try:
|
||||
target_path = (src_dir / url).resolve()
|
||||
|
||||
# Check if target is outside examples_dir
|
||||
try:
|
||||
rel_path = target_path.relative_to(examples_dir)
|
||||
except ValueError:
|
||||
# Target is outside examples_dir, leave as-is
|
||||
return m.group(0)
|
||||
|
||||
parts = list(rel_path.parts)
|
||||
|
||||
# Determine the output path for the target
|
||||
if len(parts) > 0 and parts[-1].lower() in ("readme.md", "readme"):
|
||||
# This is a README link
|
||||
if len(parts) == 1:
|
||||
# Link to root README -> index.qmd
|
||||
target_output = "index.qmd"
|
||||
elif len(parts) == 2:
|
||||
if parts[0] == ".":
|
||||
# Current directory README
|
||||
target_output = "index.qmd"
|
||||
else:
|
||||
# subdir/README.md
|
||||
parent_dir = parts[0]
|
||||
if parent_dir in parent_index_only:
|
||||
target_output = f"{parent_dir}/index.qmd"
|
||||
else:
|
||||
target_output = f"{parent_dir}.qmd"
|
||||
else:
|
||||
# Deeper nesting: parent/subdir/README.md
|
||||
# Build the full path like "parent/subdir"
|
||||
full_path = "/".join(parts[:-1]) # Remove README.md
|
||||
# Check if this exact path is in allowlist
|
||||
if full_path in allowlist_entries:
|
||||
# This is a sub-entry with its own entry -> use .qmd
|
||||
target_output = f"{full_path}.qmd"
|
||||
elif parts[0] == ".":
|
||||
# ./subdir/README.md -> check if subdir has own entry
|
||||
subdir = parts[1]
|
||||
if subdir in parent_index_only:
|
||||
target_output = f"{subdir}/index.qmd"
|
||||
else:
|
||||
target_output = f"{subdir}.qmd"
|
||||
else:
|
||||
# parent/subdir where parent doesn't have own entry
|
||||
target_output = f"{full_path}/index.qmd"
|
||||
else:
|
||||
# Regular .md file -> convert to .qmd, keep path structure
|
||||
target_output = "/".join(parts)[:-2] + "qmd"
|
||||
|
||||
# Compute relative path from current output file to target
|
||||
current_parts = current_output_path.split("/")
|
||||
target_parts = target_output.split("/")
|
||||
|
||||
# Special case: if current is a subdir file and target is a single-component file at root
|
||||
# Example: current="magistral/vision", target="magistral.qmd"
|
||||
if len(current_parts) > 1 and len(target_parts) == 1:
|
||||
# Current is in subdir, target is at root level
|
||||
# Go up to root: ../ for each level
|
||||
up_count = len(current_parts) - 1
|
||||
rel_parts = [".."] * up_count + [target_parts[0]]
|
||||
new_url = "/".join(rel_parts)
|
||||
else:
|
||||
# Find common prefix
|
||||
i = 0
|
||||
while (
|
||||
i < min(len(current_parts) - 1, len(target_parts))
|
||||
and current_parts[i] == target_parts[i]
|
||||
):
|
||||
i += 1
|
||||
|
||||
# Build relative path: go up (../) then down to target
|
||||
up_count = len(current_parts) - 1 - i
|
||||
rel_parts = [".."] * up_count + target_parts[i:]
|
||||
|
||||
if not rel_parts or rel_parts == [".."]:
|
||||
# Points to same directory or parent
|
||||
new_url = "/".join(rel_parts) if rel_parts else "."
|
||||
else:
|
||||
new_url = "/".join(rel_parts)
|
||||
|
||||
return f"[{text}]({new_url})"
|
||||
except (ValueError, IndexError):
|
||||
return m.group(0)
|
||||
|
||||
return LINK_RE.sub(repl, md)
|
||||
|
||||
|
||||
def write_qmd(out_path: Path, title: str, body_md: str):
|
||||
out_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
fm = f"---\ntitle: {title!r}\nexecute:\n eval: false\nformat:\n html:\n toc: true\n---\n\n"
|
||||
out_path.write_text(fm + body_md, encoding="utf-8")
|
||||
|
||||
|
||||
def update_quarto_yml(generated: list[tuple[str, str, str]]):
|
||||
"""
|
||||
Update _quarto.yml with the generated example files in the correct order.
|
||||
This keeps the sidebar in sync with the allowlist.
|
||||
|
||||
Model Guides is now nested under "Getting Started" section.
|
||||
Creates nested sections for models with sub-entries (e.g., magistral, ministral3).
|
||||
Parent pages are now flat files (e.g., ministral3.qmd) with sub-pages in subdirs.
|
||||
"""
|
||||
quarto_yml = ROOT / "_quarto.yml"
|
||||
if not quarto_yml.exists():
|
||||
print(f"[WARN] {quarto_yml} not found, skipping update", file=sys.stderr)
|
||||
return
|
||||
|
||||
content = quarto_yml.read_text(encoding="utf-8")
|
||||
|
||||
# First pass: find all parents that have sub-entries
|
||||
parents_with_subs = set()
|
||||
for path, _name, _title in generated:
|
||||
if "/" in path:
|
||||
parent = path.split("/")[0]
|
||||
parents_with_subs.add(parent)
|
||||
|
||||
# Build the YAML contents while preserving allowlist order
|
||||
lines = []
|
||||
processed_sections = set()
|
||||
|
||||
for path, _name, title in generated:
|
||||
# Check if this is a parent page that has sub-pages
|
||||
if path in parents_with_subs:
|
||||
# This is a parent page with sub-pages - create a nested section
|
||||
if path not in processed_sections:
|
||||
processed_sections.add(path)
|
||||
section_title = (
|
||||
title or path.replace("-", " ").replace("_", " ").title()
|
||||
)
|
||||
lines.append(f' - section: "{section_title}"')
|
||||
lines.append(" contents:")
|
||||
# Add the parent page first
|
||||
lines.append(f" - docs/models/{path}.qmd")
|
||||
# Then add all sub-pages
|
||||
for sub_path, _sub_name, _sub_title in generated:
|
||||
if "/" in sub_path and sub_path.split("/")[0] == path:
|
||||
lines.append(
|
||||
f" - docs/models/{sub_path}.qmd"
|
||||
)
|
||||
elif "/" not in path:
|
||||
# This is a flat item with no sub-pages
|
||||
# Skip if it was already included as part of a parent section
|
||||
if path not in processed_sections:
|
||||
lines.append(f" - docs/models/{path}.qmd")
|
||||
|
||||
yaml_content = "\n".join(lines) + "\n"
|
||||
|
||||
# Pattern to match only the Model Guides contents, stopping at the next item
|
||||
# in Getting Started (lines starting with 12 spaces: same level as the section)
|
||||
pattern = r'( - section: "Model Guides"\n contents:)([^\n]*|.*?)(?=\n - |\n - section:|\n\nformat:)'
|
||||
|
||||
def replacement(match):
|
||||
prefix = match.group(1)
|
||||
return prefix + "\n" + yaml_content
|
||||
|
||||
new_content = re.sub(pattern, replacement, content, flags=re.DOTALL)
|
||||
|
||||
if new_content != content:
|
||||
quarto_yml.write_text(new_content, encoding="utf-8")
|
||||
print(f"Updated {quarto_yml}")
|
||||
else:
|
||||
print(f"No changes needed for {quarto_yml}")
|
||||
|
||||
|
||||
def main():
|
||||
allow = read_allowlist()
|
||||
if not EXAMPLES_DIR.exists():
|
||||
print(f"[WARN] {EXAMPLES_DIR} not found", file=sys.stderr)
|
||||
return
|
||||
|
||||
(OUTPUT_DIR / "assets").mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# First pass: identify which parents have their own entry vs only sub-entries
|
||||
parent_entries = set() # Parents that have their own entry
|
||||
parent_with_subs = set() # Parents that have sub-entries
|
||||
allowlist_entries = set() # All entries in allowlist
|
||||
|
||||
for item in allow:
|
||||
if isinstance(item, str):
|
||||
name = item
|
||||
else:
|
||||
name = item.get("name")
|
||||
|
||||
allowlist_entries.add(name)
|
||||
|
||||
if "/" in name:
|
||||
parent = name.split("/")[0]
|
||||
parent_with_subs.add(parent)
|
||||
else:
|
||||
parent_entries.add(name)
|
||||
|
||||
# Parents with subs that DON'T have their own entry -> use index.qmd
|
||||
parent_index_only = parent_with_subs - parent_entries
|
||||
|
||||
generated = []
|
||||
seen_dirs = set() # Track which parent directories we've created index for
|
||||
|
||||
for item in allow:
|
||||
if isinstance(item, str):
|
||||
name = item
|
||||
title = None
|
||||
else:
|
||||
name = item.get("name")
|
||||
title = item.get("title")
|
||||
|
||||
if not name:
|
||||
print(f"[WARN] Skipping item without name: {item}", file=sys.stderr)
|
||||
continue
|
||||
|
||||
src_dir = EXAMPLES_DIR / name
|
||||
if not src_dir.exists() or not src_dir.is_dir():
|
||||
print(f"[WARN] Skipping {name} (not a directory)", file=sys.stderr)
|
||||
continue
|
||||
|
||||
readme = find_readme(src_dir)
|
||||
if not readme:
|
||||
print(f"[WARN] Skipping {name} (no README.md)", file=sys.stderr)
|
||||
continue
|
||||
|
||||
md = readme.read_text(encoding="utf-8")
|
||||
|
||||
# Determine output path first (needed for link rewriting)
|
||||
parts = name.split("/")
|
||||
if len(parts) == 1:
|
||||
# Simple case: no subdirectory
|
||||
out_path = OUTPUT_DIR / f"{parts[0]}.qmd"
|
||||
sidebar_path = parts[0]
|
||||
else:
|
||||
# Has subdirectory: e.g., magistral/think
|
||||
parent = parts[0]
|
||||
child = "-".join(parts[1:]) # handle nested subdirs
|
||||
out_path = OUTPUT_DIR / parent / f"{child}.qmd"
|
||||
sidebar_path = f"{parent}/{child}"
|
||||
|
||||
# Remove the first H1 (we use frontmatter title instead)
|
||||
md, _ = remove_first_h1(md)
|
||||
# Rewrite links between README files
|
||||
md = rewrite_readme_links(
|
||||
md,
|
||||
src_dir,
|
||||
EXAMPLES_DIR,
|
||||
parent_index_only,
|
||||
name,
|
||||
allowlist_entries,
|
||||
sidebar_path,
|
||||
)
|
||||
md = rewrite_and_copy_assets(md, src_dir, OUTPUT_DIR)
|
||||
|
||||
# Handle parent page generation for sub-entries
|
||||
if len(parts) > 1:
|
||||
# Has subdirectory: e.g., magistral/think
|
||||
parent = parts[0]
|
||||
|
||||
# Create parent.qmd if not already done and parent doesn't have own entry
|
||||
if parent not in seen_dirs and parent in parent_index_only:
|
||||
parent_readme = find_readme(EXAMPLES_DIR / parent)
|
||||
if parent_readme:
|
||||
parent_md = parent_readme.read_text(encoding="utf-8")
|
||||
parent_md, _ = remove_first_h1(parent_md)
|
||||
parent_md = rewrite_readme_links(
|
||||
parent_md,
|
||||
EXAMPLES_DIR / parent,
|
||||
EXAMPLES_DIR,
|
||||
parent_index_only,
|
||||
parent,
|
||||
allowlist_entries,
|
||||
parent,
|
||||
)
|
||||
parent_md = rewrite_and_copy_assets(
|
||||
parent_md, EXAMPLES_DIR / parent, OUTPUT_DIR
|
||||
)
|
||||
parent_title = parent.replace("-", " ").replace("_", " ").title()
|
||||
write_qmd(OUTPUT_DIR / f"{parent}.qmd", parent_title, parent_md)
|
||||
generated.append((parent, parent, parent_title))
|
||||
seen_dirs.add(parent)
|
||||
|
||||
if not title:
|
||||
title = name.replace("/", " ").replace("-", " ").title()
|
||||
|
||||
write_qmd(out_path, title, md)
|
||||
generated.append((sidebar_path, name, title))
|
||||
|
||||
# Index page - preserve allowlist order
|
||||
if generated:
|
||||
listing = "\n".join(
|
||||
[f"- [{title}]({path}.qmd)" for path, name, title in generated]
|
||||
)
|
||||
index_md = (
|
||||
"# Model Guides\n\nBelow are the curated examples for training various model architectures:\n\n"
|
||||
+ listing
|
||||
+ "\n"
|
||||
)
|
||||
index_fm = (
|
||||
"---\nexecute:\n eval: false\nformat:\n html:\n toc: true\n---\n\n"
|
||||
)
|
||||
(OUTPUT_DIR / "index.qmd").write_text(index_fm + index_md, encoding="utf-8")
|
||||
|
||||
# Auto-update _quarto.yml to keep sidebar in sync
|
||||
update_quarto_yml(generated)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
61 docs/telemetry.qmd (new file)

@@ -0,0 +1,61 @@
---
title: Telemetry
description: A description of the telemetry implementation in Axolotl.
---

# Telemetry in Axolotl

Axolotl implements anonymous telemetry to help maintainers understand how the library
is used and where users encounter issues. This data helps prioritize features, optimize
performance, and fix bugs.

## Data Collection

We collect:

- System info: OS, Python version, Axolotl version, PyTorch version, Transformers
  version, etc.
- Hardware info: CPU count, memory, GPU count and models
- Runtime metrics: Training progress, memory usage, timing information
- Usage patterns: Models (from a whitelist) and configurations used
- Error tracking: Stack traces and error messages (sanitized to remove personal
  information)

Personally identifiable information (PII) is not collected.

## Implementation

Telemetry is implemented using PostHog and consists of:

- `axolotl.telemetry.TelemetryManager`: A singleton class that initializes the
  telemetry system and provides methods for tracking events.
- `axolotl.telemetry.errors.send_errors`: A decorator that captures exceptions and
  sends sanitized stack traces.
- `axolotl.telemetry.runtime_metrics.RuntimeMetricsTracker`: A class that tracks
  runtime metrics during training.
- `axolotl.telemetry.callbacks.TelemetryCallback`: A Trainer callback that sends
  runtime metrics telemetry.

The telemetry system will block training startup for 10 seconds to ensure users are
aware of data collection, unless telemetry is explicitly enabled or disabled.

## Opt-Out Mechanism

Telemetry is **enabled by default** on an opt-out basis. To disable it, set
`AXOLOTL_DO_NOT_TRACK=1` or `DO_NOT_TRACK=1`.

A warning message is logged on startup to clearly inform users about telemetry.
We plan to remove this warning after an initial rollout period.

To hide the telemetry warning displayed when starting `train` and other commands,
explicitly set `AXOLOTL_DO_NOT_TRACK=0` (enable telemetry) or `AXOLOTL_DO_NOT_TRACK=1`
(explicitly disable telemetry).
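For example, to opt out from the shell using the environment variables described above (`config.yml` is a placeholder for your own config file):

```bash
# disable telemetry for a single run
AXOLOTL_DO_NOT_TRACK=1 axolotl train config.yml

# or disable it for the whole shell session
export AXOLOTL_DO_NOT_TRACK=1
```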
## Privacy

- All path-like config information is automatically redacted from telemetry data
- Model information is only collected for whitelisted organizations
  - See `axolotl/telemetry/whitelist.yaml` for the set of whitelisted organizations
- Each run generates a unique anonymous ID
  - This allows us to link the different telemetry events within a single training run
- Telemetry is only sent from the main process to avoid duplicate events
@@ -40,7 +40,7 @@
"%%capture\n",
"# This step can take ~5-10 minutes to install dependencies\n",
"!pip install --no-build-isolation axolotl[flash-attn]>=0.9.1\n",
"!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@8a1a0ec\""
"!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@318b7e2\""
]
},
{
@@ -253,7 +253,6 @@
"source": [
"from axolotl.utils import set_pytorch_cuda_alloc_conf\n",
"\n",
"# Set \"PYTORCH_CUDA_ALLOC_CONF\" env to save memory\n",
"set_pytorch_cuda_alloc_conf()"
]
},
@@ -52,6 +52,7 @@ gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
scaling_softmax: true

loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3
53 examples/gemma3/gemma-3-1b-fft-dft.yml (new file)
@@ -0,0 +1,53 @@
|
||||
base_model: google/gemma-3-1b-it
|
||||
|
||||
model_type: Gemma3ForCausalLM
|
||||
cls_model_config: Gemma3TextConfig
|
||||
|
||||
# gemma3 doesn't seem to play nice with ddp
|
||||
ddp_find_unused_parameters: true
|
||||
|
||||
chat_template: gemma3
|
||||
eot_tokens:
|
||||
- <end_of_turn>
|
||||
datasets:
|
||||
- path: cgato/SlimOrcaDedupCleaned
|
||||
type: chat_template
|
||||
field_messages: conversations
|
||||
message_property_mappings:
|
||||
role: from
|
||||
content: value
|
||||
|
||||
val_set_size: 0.05
|
||||
output_dir: ./outputs/gemma-3-1b-fft-dft
|
||||
|
||||
sequence_len: 2048
|
||||
|
||||
use_dynamic_finetuning: true
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
gradient_accumulation_steps: 4
|
||||
micro_batch_size: 2
|
||||
num_epochs: 1
|
||||
optimizer: adamw_torch_fused
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 5e-5
|
||||
|
||||
bf16: auto
|
||||
tf32: true
|
||||
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 2
|
||||
saves_per_epoch: 1
|
||||
weight_decay: 0.0
|
||||
@@ -1,6 +1,7 @@
|
||||
base_model: google/gemma-3-1b-it
|
||||
|
||||
model_type: Gemma3ForCausalLM
|
||||
cls_model_config: Gemma3TextConfig
|
||||
|
||||
# Automatically upload checkpoint and final model to HF
|
||||
# hub_model_id: username/custom_model_name
|
||||
@@ -29,7 +30,7 @@ output_dir: ./outputs/out
|
||||
adapter: qlora
|
||||
lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_dropout: 0
|
||||
lora_target_linear: true
|
||||
|
||||
sequence_len: 2048
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
base_model: google/gemma-3-270m-it
|
||||
|
||||
model_type: Gemma3ForCausalLM
|
||||
cls_model_config: Gemma3TextConfig
|
||||
|
||||
# Automatically upload checkpoint and final model to HF
|
||||
# hub_model_id: username/custom_model_name
|
||||
@@ -29,7 +30,7 @@ output_dir: ./outputs/out
|
||||
adapter: qlora
|
||||
lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_dropout: 0
|
||||
lora_target_linear: true
|
||||
|
||||
sequence_len: 2048
|
||||
|
||||
@@ -2,6 +2,7 @@ base_model: google/gemma-3-4b-it
|
||||
|
||||
# Need to set else transformers tries to load vision too
|
||||
model_type: Gemma3ForCausalLM
|
||||
cls_model_config: Gemma3TextConfig
|
||||
|
||||
load_in_4bit: true
|
||||
|
||||
@@ -32,8 +33,8 @@ sample_packing: true
|
||||
|
||||
lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
|
||||
lora_dropout: 0
|
||||
lora_target_linear: true
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
|
||||
@@ -31,7 +31,7 @@ pad_to_sequence_len: false
|
||||
|
||||
lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_dropout: 0
|
||||
lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
|
||||
|
||||
wandb_project:
|
||||
|
||||
@@ -32,6 +32,10 @@ wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
trackio_project_name:
|
||||
trackio_run_name:
|
||||
trackio_space_id:
|
||||
|
||||
gradient_accumulation_steps: 2
|
||||
micro_batch_size: 1
|
||||
num_epochs: 1
|
||||
|
||||
@@ -28,6 +28,10 @@ wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
trackio_project_name:
|
||||
trackio_run_name:
|
||||
trackio_space_id:
|
||||
|
||||
gradient_accumulation_steps: 2
|
||||
micro_batch_size: 1
|
||||
num_epochs: 1
|
||||
|
||||
@@ -29,6 +29,10 @@ wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
trackio_project_name:
|
||||
trackio_run_name:
|
||||
trackio_space_id:
|
||||
|
||||
gradient_accumulation_steps: 2
|
||||
micro_batch_size: 1
|
||||
num_epochs: 1
|
||||
|
||||
@@ -28,6 +28,10 @@ wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
trackio_project_name:
|
||||
trackio_run_name:
|
||||
trackio_space_id:
|
||||
|
||||
gradient_accumulation_steps: 2
|
||||
micro_batch_size: 1
|
||||
num_epochs: 1
|
||||
|
||||
@@ -41,6 +41,10 @@ wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
trackio_project_name:
|
||||
trackio_run_name:
|
||||
trackio_space_id:
|
||||
|
||||
gradient_accumulation_steps: 8
|
||||
micro_batch_size: 1
|
||||
num_epochs: 1
|
||||
|
||||
@@ -41,6 +41,10 @@ wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
trackio_project_name:
|
||||
trackio_run_name:
|
||||
trackio_space_id:
|
||||
|
||||
gradient_accumulation_steps: 8
|
||||
micro_batch_size: 1
|
||||
num_epochs: 1
|
||||
|
||||
65 examples/granite4/README.md (new file)
@@ -0,0 +1,65 @@
|
||||
# Finetune IBM's Granite 4.0 with Axolotl
|
||||
|
||||
[Granite 4.0](https://huggingface.co/collections/ibm-granite/granite-40-language-models) is a family of open-source models trained by IBM Research.
|
||||
|
||||
This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking.
|
||||
|
||||
## Getting started
|
||||
|
||||
1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). You need to install from `main` (Granite 4 support is currently only in nightly builds) or use our latest [Docker images](https://docs.axolotl.ai/docs/docker.html).
|
||||
|
||||
Here is an example of how to install from main for pip:
|
||||
|
||||
```bash
|
||||
# Ensure you have Pytorch installed (Pytorch 2.7.1 min)
|
||||
git clone https://github.com/axolotl-ai-cloud/axolotl.git
|
||||
cd axolotl
|
||||
|
||||
pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
|
||||
pip3 install --no-build-isolation -e '.[flash-attn]'
|
||||
|
||||
# Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
|
||||
python scripts/cutcrossentropy_install.py | sh
|
||||
```
|
||||
|
||||
2. Run the finetuning example:
|
||||
|
||||
```bash
|
||||
axolotl train examples/granite4/granite-4.0-tiny-fft.yaml
|
||||
```
|
||||
|
||||
This config uses about 40.8GiB VRAM.
|
||||
|
||||
Let us know how it goes. Happy finetuning! 🚀
|
||||
|
||||
### TIPS
|
||||
|
||||
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
|
||||
- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
|
||||
|
||||
### Limitation
|
||||
|
||||
Adapter finetuning does not work at the moment; it errors with:
|
||||
|
||||
```bash
|
||||
RuntimeError: mat1 and mat2 shapes cannot be multiplied (4096x3072 and 1x1179648)
|
||||
```
|
||||
|
||||
In addition, even once adapter training works, `lora_target_linear: true` will not work due to:
|
||||
```bash
|
||||
ValueError: Target module GraniteMoeHybridParallelExperts() is not supported.
|
||||
```
|
||||
|
||||
## Optimization Guides
|
||||
|
||||
- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
|
||||
- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
|
||||
- [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html)
|
||||
|
||||
## Related Resources
|
||||
|
||||
- [Granite Docs](https://www.ibm.com/granite/docs/models/granite)
|
||||
- [Axolotl Docs](https://docs.axolotl.ai)
|
||||
- [Axolotl Website](https://axolotl.ai)
|
||||
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
|
||||
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
|
||||
45 examples/granite4/granite-4.0-tiny-fft.yaml (new file)
@@ -0,0 +1,45 @@
|
||||
base_model: ibm-granite/granite-4.0-tiny-preview
|
||||
|
||||
# Automatically upload checkpoint and final model to HF
|
||||
# hub_model_id: username/custom_model_name
|
||||
|
||||
plugins:
|
||||
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
|
||||
|
||||
datasets:
|
||||
- path: fozziethebeat/alpaca_messages_2k_test
|
||||
type: chat_template
|
||||
|
||||
dataset_prepared_path: last_run_prepared
|
||||
val_set_size: 0.1
|
||||
output_dir: ./outputs/model-out
|
||||
|
||||
sequence_len: 2048
|
||||
sample_packing: true
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
gradient_accumulation_steps: 4
|
||||
micro_batch_size: 2
|
||||
num_epochs: 1
|
||||
optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
bf16: auto
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 1
|
||||
saves_per_epoch: 1
|
||||
|
||||
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||
43 examples/internvl3_5/README.md (new file)
@@ -0,0 +1,43 @@
|
||||
# Finetune OpenGVLab's InternVL with Axolotl
|
||||
|
||||
[InternVL 3.5](https://huggingface.co/OpenGVLab/InternVL3_5-8B-HF) is a family of powerful vision-language models from OpenGVLab, supporting dynamic resolution and multi-image understanding. It features a ViT-style vision encoder and a strong language-model backbone for tasks like visual question answering, OCR, and scene-text understanding.
|
||||
|
||||
This guide shows how to fine-tune it with Axolotl.
|
||||
|
||||
## Getting started
|
||||
|
||||
1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).
|
||||
|
||||
2. Install `timm` for vision model support:
|
||||
|
||||
```bash
|
||||
pip install timm==1.0.19
|
||||
```
|
||||
|
||||
3. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage.
|
||||
|
||||
4. Run the finetuning example:
|
||||
|
||||
```bash
|
||||
axolotl train examples/internvl3_5/internvl3_5-8b-qlora.yml
|
||||
```
|
||||
|
||||
This config uses about 8.21 GiB VRAM. Let us know how it goes. Happy finetuning! 🚀
|
||||
|
||||
### Tips
|
||||
|
||||
- You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
|
||||
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
|
||||
- The dataset format follows the multi-modal format as seen [here](https://docs.axolotl.ai/docs/multimodal.html#dataset-format).
|
||||
|
||||
## Optimization Guides
|
||||
|
||||
Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html).
|
||||
|
||||
## Related Resources
|
||||
|
||||
- [InternVL Paper](https://huggingface.co/papers/2508.18265)
|
||||
- [Axolotl Docs](https://docs.axolotl.ai)
|
||||
- [Axolotl Website](https://axolotl.ai)
|
||||
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
|
||||
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
|
||||
61 examples/internvl3_5/internvl3_5-8b-qlora.yml (new file)
@@ -0,0 +1,61 @@
|
||||
base_model: OpenGVLab/InternVL3_5-8B-HF
|
||||
processor_type: AutoProcessor
|
||||
|
||||
plugins:
|
||||
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
|
||||
|
||||
load_in_4bit: true
|
||||
|
||||
# these 3 lines are needed for now to handle vision chat templates w images
|
||||
skip_prepare_dataset: true
|
||||
remove_unused_columns: false
|
||||
sample_packing: false
|
||||
|
||||
datasets:
|
||||
- path: HuggingFaceH4/llava-instruct-mix-vsft
|
||||
type: chat_template
|
||||
split: train[:1%]
|
||||
field_messages: messages
|
||||
|
||||
dataset_prepared_path: last_run_prepared
|
||||
val_set_size: 0.01
|
||||
output_dir: ./outputs/out
|
||||
|
||||
adapter: qlora
|
||||
lora_model_dir:
|
||||
|
||||
sequence_len: 2048
|
||||
|
||||
lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
gradient_accumulation_steps: 4
|
||||
micro_batch_size: 2
|
||||
num_epochs: 1
|
||||
optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
bf16: true
|
||||
fp16:
|
||||
tf32: true
|
||||
|
||||
gradient_checkpointing: true
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
eager_attention:
|
||||
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 1
|
||||
saves_per_epoch: 1
|
||||
weight_decay: 0.0
|
||||
|
||||
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||
47 examples/kimi-linear/README.md (new file)
@@ -0,0 +1,47 @@
|
||||
# Finetune MoonshotAI's Kimi Linear with Axolotl
|
||||
|
||||
[Kimi Linear](https://huggingface.co/collections/moonshotai/kimi-linear-a3b) is a MoE model (48B total, 3B active) by MoonshotAI using a hybrid linear attention architecture to achieve a 1M token context length. It uses Kimi Delta Attention (KDA), a refined version of Gated DeltaNet that reduces KV cache size by up to 75% and boosts decoding throughput by up to 6x for long contexts.
|
||||
|
||||
This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking.
|
||||
|
||||
**Note:** Axolotl uses experimental training code for Kimi Linear as their original modeling code is inference-only.
|
||||
|
||||
## Getting started
|
||||
|
||||
1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).
|
||||
|
||||
2. Install CCE via [docs](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy)
|
||||
|
||||
3. Run the finetuning example:
|
||||
|
||||
```bash
|
||||
axolotl train examples/kimi-linear/kimi-48b-lora.yaml
|
||||
```
|
||||
|
||||
This config uses about 98.7GiB VRAM.
|
||||
|
||||
Let us know how it goes. Happy finetuning!
|
||||
|
||||
### TIPS
|
||||
|
||||
- Kimi Linear requires `trust_remote_code: true`.
|
||||
- You can run a full finetuning by removing the `adapter: lora` and `load_in_8bit: true`.
|
||||
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html)
|
||||
- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template)
|
||||
|
||||
## Optimization Guides
|
||||
|
||||
See 👉 [docs](https://docs.axolotl.ai/docs/optimizations.html).
|
||||
|
||||
## Limitations
|
||||
|
||||
This is not yet compatible with MoE kernels from transformers v5.
|
||||
|
||||
## Related Resources
|
||||
|
||||
- [Kimi Linear Paper](https://huggingface.co/papers/2510.26692)
|
||||
- [Kimi Linear GitHub](https://github.com/MoonshotAI/Kimi-Linear)
|
||||
- [Axolotl Docs](https://docs.axolotl.ai)
|
||||
- [Axolotl Website](https://axolotl.ai)
|
||||
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
|
||||
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
|
||||
81 examples/kimi-linear/kimi-48b-lora.yaml (new file)
@@ -0,0 +1,81 @@
|
||||
base_model: moonshotai/Kimi-Linear-48B-A3B-Instruct
|
||||
|
||||
# Automatically upload checkpoint and final model to HF
|
||||
# hub_model_id: username/custom_model_name
|
||||
|
||||
trust_remote_code: true
|
||||
|
||||
plugins:
|
||||
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
|
||||
|
||||
load_in_8bit: true
|
||||
load_in_4bit: false
|
||||
strict: false
|
||||
|
||||
datasets:
|
||||
- path: fozziethebeat/alpaca_messages_2k_test
|
||||
type: chat_template
|
||||
split: train
|
||||
|
||||
dataset_prepared_path: last_run_prepared
|
||||
val_set_size: 0.2
|
||||
output_dir: ./outputs/lora-out
|
||||
|
||||
adapter: lora
|
||||
lora_model_dir:
|
||||
|
||||
sequence_len: 2048
|
||||
sample_packing: true
|
||||
pad_to_sequence_len: true
|
||||
|
||||
lora_r: 16
|
||||
lora_alpha: 32
|
||||
lora_dropout: 0.05
|
||||
lora_fan_in_fan_out:
|
||||
lora_target_modules:
|
||||
- gate_proj
|
||||
- down_proj
|
||||
- up_proj
|
||||
- q_proj
|
||||
- v_proj
|
||||
- k_proj
|
||||
- o_proj
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
gradient_accumulation_steps: 2
|
||||
micro_batch_size: 2
|
||||
num_epochs: 1
|
||||
optimizer: adamw_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
loss_watchdog_threshold: 5.0
|
||||
loss_watchdog_patience: 3
|
||||
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 2
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
@@ -29,7 +29,6 @@ flex_attention: true
|
||||
flex_attn_compile_kwargs:
|
||||
dynamic: false
|
||||
mode: max-autotune-no-cudagraphs
|
||||
save_strategy: no
|
||||
torch_compile: true
|
||||
|
||||
wandb_project:
|
||||
|
||||
@@ -13,7 +13,7 @@ Thanks to the team at MistralAI for giving us early access to prepare for these
|
||||
Here is an example of how to install from pip:
|
||||
|
||||
```bash
|
||||
# Ensure you have Pytorch installed (Pytorch 2.6.0 min)
|
||||
# Ensure you have Pytorch installed (Pytorch 2.7.0 min)
|
||||
pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
|
||||
pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
|
||||
```
|
||||
|
||||
@@ -5,6 +5,7 @@ This guide covers fine-tuning [Magistral Small 2507](https://huggingface.co/mist
|
||||
## Prerequisites
|
||||
|
||||
Before starting, ensure you have:
|
||||
|
||||
- Installed Axolotl (see [main README](../README.md))
|
||||
|
||||
## Getting Started
|
||||
|
||||
@@ -5,7 +5,8 @@ This guide covers fine-tuning [Magistral Small 2509](https://huggingface.co/mist
|
||||
## Prerequisites
|
||||
|
||||
Before starting, ensure you have:
|
||||
- Installed Axolotl from source (see [main README](../README.md#getting-started))
|
||||
|
||||
- Installed Axolotl from source (see [main README](../README.md))
|
||||
|
||||
## Getting started
|
||||
|
||||
|
||||
39 examples/mimo/README.md (new file)
@@ -0,0 +1,39 @@
|
||||
# Finetune Xiaomi's MiMo with Axolotl
|
||||
|
||||
[MiMo](https://huggingface.co/XiaomiMiMo/MiMo-7B-RL) is a family of models trained from scratch for reasoning tasks, incorporating **Multiple-Token Prediction (MTP)** as an additional training objective for enhanced performance and faster inference. Pre-trained on ~25T tokens with a three-stage data mixture strategy and optimized reasoning pattern density.
|
||||
|
||||
This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking.
|
||||
|
||||
## Getting started
|
||||
|
||||
1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).
|
||||
|
||||
2. Run the finetuning example:
|
||||
|
||||
```bash
|
||||
axolotl train examples/mimo/mimo-7b-qlora.yaml
|
||||
```
|
||||
|
||||
This config uses about 17.2 GiB VRAM. Let us know how it goes. Happy finetuning! 🚀
|
||||
|
||||
### Tips
|
||||
|
||||
- You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
|
||||
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
|
||||
- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
|
||||
|
||||
## Optimization Guides
|
||||
|
||||
Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html).
|
||||
|
||||
## Limitations
|
||||
|
||||
**Cut Cross Entropy (CCE)**: Currently not supported. We plan to include CCE support for MiMo in the near future.
|
||||
|
||||
## Related Resources
|
||||
|
||||
- [MiMo Paper](https://arxiv.org/abs/2505.07608)
|
||||
- [Axolotl Docs](https://docs.axolotl.ai)
|
||||
- [Axolotl Website](https://axolotl.ai)
|
||||
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
|
||||
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
|
||||
67 examples/mimo/mimo-7b-qlora.yaml (new file)
@@ -0,0 +1,67 @@
|
||||
base_model: XiaomiMiMo/MiMo-7B-RL
|
||||
trust_remote_code: true
|
||||
revision_of_model: 6299b5a
|
||||
|
||||
# Automatically upload checkpoint and final model to HF
|
||||
# hub_model_id: username/custom_model_name
|
||||
|
||||
# CCE - N/A as of now
|
||||
# plugins:
|
||||
# - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
|
||||
|
||||
load_in_8bit: false
|
||||
load_in_4bit: true
|
||||
|
||||
datasets:
|
||||
- path: fozziethebeat/alpaca_messages_2k_test
|
||||
type: chat_template
|
||||
|
||||
dataset_prepared_path: last_run_prepared
|
||||
val_set_size: 0.1
|
||||
output_dir: ./outputs/lora-out
|
||||
|
||||
adapter: qlora
|
||||
lora_model_dir:
|
||||
|
||||
sequence_len: 2048
|
||||
sample_packing: true
|
||||
|
||||
lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_linear: true
|
||||
lora_target_modules:
|
||||
- gate_proj
|
||||
- down_proj
|
||||
- up_proj
|
||||
- q_proj
|
||||
- v_proj
|
||||
- k_proj
|
||||
- o_proj
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
gradient_accumulation_steps: 4
|
||||
micro_batch_size: 2
|
||||
num_epochs: 1
|
||||
optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
bf16: auto
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 1
|
||||
saves_per_epoch: 1
|
||||
|
||||
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||
50 examples/ministral/README.md (new file)
@@ -0,0 +1,50 @@
|
||||
# Finetune Ministral with Axolotl
|
||||
|
||||
Ministral is a family of open-weight models from MistralAI, found on [HuggingFace](https://huggingface.co/mistralai/Ministral-8B-Instruct-2410). This guide shows how to fine-tune it with Axolotl on multi-turn conversations with proper masking.
|
||||
|
||||
## Getting started
|
||||
|
||||
1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).
|
||||
|
||||
2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage.
|
||||
|
||||
3. Run the finetuning example:
|
||||
|
||||
```bash
|
||||
axolotl train examples/ministral/ministral-small-qlora.yaml
|
||||
```
|
||||
|
||||
This config uses about 8.76 GiB VRAM.
|
||||
|
||||
Let us know how it goes. Happy finetuning! 🚀
|
||||
|
||||
### Tips
|
||||
|
||||
- We recommend adding the same/similar SystemPrompt that the model is tuned for. You can find this within the repo's files titled `SYSTEM_PROMPT.txt`.
|
||||
- You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
|
||||
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
|
||||
- The text dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
|
||||
|
||||
## Optimization Guides
|
||||
|
||||
Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html).
|
||||
|
||||
## Limitations
|
||||
|
||||
We only support the `mistral-common` tokenizer for Supervised Fine-tuning at the moment and for `type: chat_template` only.
|
||||
|
||||
In addition, we do not support overriding tokens yet.
|
||||
|
||||
## Related Resources
|
||||
|
||||
- [MistralAI Ministral Blog](https://mistral.ai/news/ministraux)
|
||||
- [Axolotl Docs](https://docs.axolotl.ai)
|
||||
- [Axolotl Website](https://axolotl.ai)
|
||||
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
|
||||
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
|
||||
|
||||
|
||||
## Future Work
|
||||
|
||||
- Add parity to Preference Tuning, RL, etc.
|
||||
- Add parity to other tokenizer configs like overriding tokens.
|
||||
67 examples/ministral/ministral-small-qlora.yaml (new file)
@@ -0,0 +1,67 @@
|
||||
base_model: mistralai/Ministral-8B-Instruct-2410
|
||||
|
||||
# Enable to use mistral-common tokenizer
|
||||
tokenizer_use_mistral_common: true
|
||||
|
||||
# Automatically upload checkpoint and final model to HF
|
||||
# hub_model_id: username/custom_model_name
|
||||
|
||||
plugins:
|
||||
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
|
||||
|
||||
load_in_8bit: false
|
||||
load_in_4bit: true
|
||||
|
||||
datasets:
|
||||
- path: fozziethebeat/alpaca_messages_2k_test
|
||||
type: chat_template
|
||||
|
||||
dataset_prepared_path: last_run_prepared
|
||||
val_set_size: 0.1
|
||||
output_dir: ./outputs/lora-out
|
||||
|
||||
adapter: qlora
|
||||
lora_model_dir:
|
||||
|
||||
sequence_len: 2048
|
||||
sample_packing: true
|
||||
|
||||
lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_linear: true
|
||||
lora_target_modules:
|
||||
- gate_proj
|
||||
- down_proj
|
||||
- up_proj
|
||||
- q_proj
|
||||
- v_proj
|
||||
- k_proj
|
||||
- o_proj
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
gradient_accumulation_steps: 4
|
||||
micro_batch_size: 2
|
||||
num_epochs: 1
|
||||
optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
bf16: auto
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 1
|
||||
saves_per_epoch: 1
|
||||
|
||||
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||
79 examples/ministral3/README.md (new file)
@@ -0,0 +1,79 @@
|
||||
# Finetune Ministral3 with Axolotl
|
||||
|
||||
Ministral3 is a family of open-weight models from MistralAI found on [HuggingFace](https://huggingface.co/collections/mistralai/ministral-3). This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking.
|
||||
|
||||
Please see [Thinking](#thinking) and [Vision](#vision) for their respective fine-tuning.
|
||||
|
||||
Thanks to the team at MistralAI for giving us early access to prepare for these releases.
|
||||
|
||||
Note: This is still experimental, as it is based on the transformers v5 release candidate.
|
||||
|
||||
## Getting started
|
||||
|
||||
1. Install Axolotl from source following the [installation guide](https://docs.axolotl.ai/docs/installation.html#sec-edge-build).
|
||||
|
||||
2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage.
|
||||
|
||||
3. Swap to the Axolotl transformers v5 branch
|
||||
|
||||
```bash
|
||||
cp examples/ministral3/ministral3-3b-qlora.yaml ministral3-3b-qlora.yaml
|
||||
|
||||
git fetch
|
||||
git checkout transformers-v5
|
||||
|
||||
# Install packages for transformers v5
|
||||
pip install -e .
|
||||
```
|
||||
|
||||
4. Run the fine-tuning:
|
||||
|
||||
```bash
|
||||
axolotl train ministral3-3b-qlora.yaml
|
||||
```
|
||||
|
||||
Let us know how it goes. Happy finetuning! 🚀
|
||||
|
||||
|
||||
### Tips
|
||||
|
||||
- We recommend adding the same/similar SystemPrompt that the model is tuned for. You can find this within the repo's files titled `SYSTEM_PROMPT.txt`.
|
||||
- You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
|
||||
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
|
||||
- The text dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
|
||||
|
||||
### Thinking
|
||||
|
||||
Ministral3 2512 model supports thinking capabilities, enabling Chain-of-Thought reasoning with explicit thinking steps.
|
||||
|
||||
📚 **[See the Thinking fine-tuning guide →](./think/README.md)**
|
||||
|
||||
### Vision
|
||||
|
||||
Ministral3 2512 model also supports vision capabilities.
|
||||
|
||||
📚 **[See the Vision fine-tuning guide →](./vision/README.md)**
|
||||
|
||||
## Optimization Guides
|
||||
|
||||
Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html).
|
||||
|
||||
## Limitations
|
||||
|
||||
We only support the `mistral-common` tokenizer for Supervised Fine-tuning at the moment and for `type: chat_template` only.
|
||||
|
||||
In addition, we do not support overriding tokens yet.
|
||||
|
||||
## Related Resources
|
||||
|
||||
- [MistralAI Mistral3 Blog](https://mistral.ai/news/mistral-3)
|
||||
- [Axolotl Docs](https://docs.axolotl.ai)
|
||||
- [Axolotl Website](https://axolotl.ai)
|
||||
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
|
||||
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
|
||||
|
||||
|
||||
## Future Work
|
||||
|
||||
- Add parity to Preference Tuning, RL, etc.
|
||||
- Add parity to other tokenizer configs like overriding tokens.
|
||||
68 examples/ministral3/ministral3-3b-qlora.yaml (new file)
@@ -0,0 +1,68 @@
|
||||
base_model: mistralai/Ministral-3-3B-Reasoning-2512
|
||||
|
||||
# Enable to use mistral-common tokenizer
|
||||
tokenizer_use_mistral_common: true
|
||||
|
||||
# Automatically upload checkpoint and final model to HF
|
||||
# hub_model_id: username/custom_model_name
|
||||
|
||||
plugins:
|
||||
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
|
||||
|
||||
load_in_8bit: false
|
||||
load_in_4bit: true
|
||||
|
||||
datasets:
|
||||
- path: fozziethebeat/alpaca_messages_2k_test
|
||||
type: chat_template
|
||||
|
||||
dataset_prepared_path: last_run_prepared
|
||||
val_set_size: 0.1
|
||||
output_dir: ./outputs/lora-out
|
||||
|
||||
adapter: qlora
|
||||
lora_model_dir:
|
||||
|
||||
sequence_len: 2048
|
||||
sample_packing: true
|
||||
|
||||
lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_linear: true
|
||||
lora_target_modules:
|
||||
- gate_proj
|
||||
- down_proj
|
||||
- up_proj
|
||||
- q_proj
|
||||
- v_proj
|
||||
- k_proj
|
||||
- o_proj
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
gradient_accumulation_steps: 4
|
||||
micro_batch_size: 2
|
||||
num_epochs: 1
|
||||
optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
bf16: auto
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
scaling_softmax: true
|
||||
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 1
|
||||
saves_per_epoch: 1
|
||||
|
||||
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||
74
examples/ministral3/think/README.md
Normal file
@@ -0,0 +1,74 @@
# Ministral3 2512 Thinking Fine-tuning

This guide covers fine-tuning [Ministral3 2512](https://huggingface.co/collections/mistralai/ministral-3) with thinking capabilities using Axolotl. The thinking model enables explicit Chain-of-Thought reasoning with separate thinking and response sections.

## Prerequisites

Before starting, ensure you have:

- Installed Axolotl (see [main README](../README.md))

## Getting Started

Run the thinking model fine-tuning:

```bash
axolotl train examples/ministral3/think/ministral3-3b-think-qlora.yaml
```

This config uses about 4.76 GiB VRAM.

### Tips

- The dataset uses the multi-content format with `type: thinking` support. See [Dataset Format](#dataset-format) below.
- You cannot mix `content: str` and `content: list[dict]` entries; otherwise, dataset loading will fail. Keep the format consistent.

## Dataset Format

The thinking model requires the multi-content dataset format, with support for an extra `type: thinking` entry within system and assistant messages.

Example format:

```json
{
  "messages": [
    {
      "role": "system",
      "content": [
        { "type": "text", "text": "{SYSTEM_PROMPT}" }
      ]
    },
    {
      "role": "user",
      "content": [
        { "type": "text", "text": "Solve this step by step: What is 15% of 240?" }
      ]
    },
    {
      "role": "assistant",
      "content": [
        {
          "type": "thinking",
          "thinking": "I need to calculate 15% of 240. First, I'll convert 15% to decimal: 0.15. Then multiply: 0.15 × 240 = 36."
        },
        {
          "type": "text",
          "text": "To find 15% of 240, I'll multiply 240 by 0.15:\n\n240 × 0.15 = 36\n\nTherefore, 15% of 240 is 36."
        }
      ]
    }
  ]
}
```

### Advanced Options

The `thinking` section supports an optional `closed` parameter:

```json
{
  "type": "thinking",
  "thinking": "Internal reasoning here...",
  "closed": true // Default: true, controls adding the closing [/THINK] tag
}
```
67
examples/ministral3/think/ministral3-3b-think-qlora.yaml
Normal file
@@ -0,0 +1,67 @@
base_model: mistralai/Ministral-3-3B-Reasoning-2512

# Enable to use mistral-common tokenizer
tokenizer_use_mistral_common: true

# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

load_in_8bit: false
load_in_4bit: true

datasets:
  - path: Nanobit/text-think-2k-test
    type: chat_template

dataset_prepared_path: last_run_prepared
val_set_size: 0
output_dir: ./outputs/lora-out

adapter: qlora
lora_model_dir:

sequence_len: 2048
sample_packing: true

lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1

# save_first_step: true # uncomment this to validate checkpoint saving works with your config
58
examples/ministral3/vision/README.md
Normal file
@@ -0,0 +1,58 @@
# Ministral3 2512 Vision Fine-tuning

This guide covers fine-tuning [Ministral3 2512](https://huggingface.co/collections/mistralai/ministral-3) with vision capabilities using Axolotl.

## Prerequisites

Before starting, ensure you have:

- Installed Axolotl from source (see [main README](../README.md))

## Getting started

1. Install the required vision lib:
```bash
pip install 'mistral-common[opencv]==1.8.6'
```

2. Download the example dataset image:
```bash
wget https://huggingface.co/datasets/Nanobit/text-vision-2k-test/resolve/main/African_elephant.jpg
```

3. Run the fine-tuning:
```bash
axolotl train examples/ministral3/vision/ministral3-3b-vision-qlora.yml
```

WARNING: The loss and grad norm will be much higher than normal at first. We suspect this is inherent to the model at the moment. If anyone would like to submit a fix for this, we are happy to take a look.

### Tips

Key differences from the text-only model:
- Multi-modal dataset format required
- Sample packing not supported

## Dataset Format

The vision model requires the multi-modal dataset format as documented [here](https://docs.axolotl.ai/docs/multimodal.html#dataset-format).

One exception: passing `"image": PIL.Image` is not supported. MistralTokenizer only supports `path`, `url`, and `base64` for now.

Example:
```json
{
  "messages": [
    {"role": "system", "content": [{ "type": "text", "text": "{SYSTEM_PROMPT}" }]},
    {"role": "user", "content": [
      { "type": "text", "text": "What's in this image?" },
      { "type": "image", "path": "path/to/image.jpg" }
    ]},
    {"role": "assistant", "content": [{ "type": "text", "text": "..." }]}
  ]
}
```
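For illustration, the same image content block could instead point at a hosted image or inline base64 data; the key names follow the `path`/`url`/`base64` options mentioned above, and the values shown here are placeholders:

```json
[
  { "type": "image", "url": "https://example.com/African_elephant.jpg" },
  { "type": "image", "base64": "<base64-encoded image bytes>" }
]
```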
## Limitations

- Sample Packing is not supported for multi-modality training currently.
64
examples/ministral3/vision/ministral3-3b-vision-qlora.yml
Normal file
@@ -0,0 +1,64 @@
base_model: mistralai/Ministral-3-3B-Reasoning-2512
processor_type: AutoProcessor

# Enable to use mistral-common tokenizer
tokenizer_use_mistral_common: true

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

load_in_4bit: true

# these 3 lines are needed for now to handle vision chat templates w images
skip_prepare_dataset: true
remove_unused_columns: false
sample_packing: false

# sample dataset below requires downloading image in advance
# wget https://huggingface.co/datasets/Nanobit/text-vision-2k-test/resolve/main/African_elephant.jpg
datasets:
  - path: Nanobit/text-vision-2k-test
    type: chat_template

dataset_prepared_path: last_run_prepared
val_set_size: 0.01
output_dir: ./outputs/out

adapter: qlora
lora_model_dir:

sequence_len: 2048

lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: true
fp16:
tf32: true

gradient_checkpointing: true
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:

# save_first_step: true # uncomment this to validate checkpoint saving works with your config
@@ -5,6 +5,7 @@ This guide covers fine-tuning [Mistral Small 3.1](mistralai/Mistral-Small-3.1-24

## Prerequisites

Before starting, ensure you have:

- Installed Axolotl (see [Installation docs](https://docs.axolotl.ai/docs/installation.html))

## Getting Started
38
examples/olmo3/README.md
Normal file
@@ -0,0 +1,38 @@
# Finetune Allenai's Olmo 3 with Axolotl

[Olmo 3](https://huggingface.co/collections/allenai/olmo-3) is a family of open-source 7B and 32B models trained by The Allen Institute for Artificial Intelligence.

This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking.

## Getting started

1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).

2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage.

3. Run the finetuning example:

```bash
axolotl train examples/olmo3/olmo3-7b-qlora.yaml
```

This uses about 11.3 GiB VRAM. Let us know how it goes. Happy finetuning! 🚀

### TIPS

- The example config can be re-used for Olmo and Olmo 2.
- You can run a full finetuning by removing `adapter: qlora` and `load_in_4bit: true` from the config.
- Read more on how to load your own dataset in the [docs](https://docs.axolotl.ai/docs/dataset_loading.html) (see the sketch after this list).
- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
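A minimal sketch of what swapping in your own conversation dataset might look like — the dataset names below are placeholders, and the dataset loading docs linked above cover all options:

```yaml
datasets:
  - path: your-username/your-chat-dataset   # placeholder HF dataset repo in OpenAI Messages format
    type: chat_template
  # or load a local JSONL file instead:
  # - path: data/my_conversations.jsonl
  #   ds_type: json
  #   type: chat_template
```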
## Optimization Guides

Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html).

## Related Resources

- [Olmo 3 Blog](https://allenai.org/blog/olmo3)
- [Axolotl Docs](https://docs.axolotl.ai)
- [Axolotl Website](https://axolotl.ai)
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
64
examples/olmo3/olmo3-7b-qlora.yaml
Normal file
@@ -0,0 +1,64 @@
base_model: allenai/Olmo-3-7B-Instruct-SFT

# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

load_in_8bit: false
load_in_4bit: true

datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template

dataset_prepared_path: last_run_prepared
val_set_size: 0.1
output_dir: ./outputs/lora-out

adapter: qlora
lora_model_dir:

sequence_len: 2048
sample_packing: true

lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 2
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1

# save_first_step: true # uncomment this to validate checkpoint saving works with your config
42
examples/plano/README.md
Normal file
@@ -0,0 +1,42 @@
# Finetune Katanemo's Plano-Orchestrator with Axolotl

[Plano-Orchestrator](https://huggingface.co/collections/katanemo/plano-orchestrator) is a family of 4B and 30B-A3B routing and orchestration models designed for multi-agent systems. It analyzes user intent and conversation context to make precise routing decisions, excelling at multi-turn context understanding, multi-intent detection, and context-dependent routing.

This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking.

## Getting started

1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).

2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage.

3. Run the finetuning example:

```bash
axolotl train examples/plano/plano-4b-qlora.yaml
```

This config uses about 5.1 GiB VRAM. Let us know how it goes. Happy finetuning! 🚀

### Orchestration Prompt

Plano-Orchestrator uses a specific orchestration prompt format for routing/agent decisions. Please check the [official model card](https://huggingface.co/katanemo/Plano-Orchestrator-4B) for proper prompt formatting and the `ORCHESTRATION_PROMPT` template.

### Tips

- To use the larger [Plano-Orchestrator-30B-A3B](https://huggingface.co/katanemo/Plano-Orchestrator-30B-A3B) MoE model, simply change `base_model: katanemo/Plano-Orchestrator-30B-A3B` in the config and enable multi-GPU training if needed (see the sketch after this list).
- You can run a full finetuning by removing `adapter: qlora` and `load_in_4bit: true` from the config.
- Read more on how to load your own dataset in the [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
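A minimal sketch of the only config change needed for the larger MoE model, assuming you start from a copy of the 4B example config; everything else can stay the same to begin with:

```yaml
# plano-30b-qlora.yaml (hypothetical copy of the 4B example config)
base_model: katanemo/Plano-Orchestrator-30B-A3B   # was katanemo/Plano-Orchestrator-4B
```

For multi-GPU training, the same config can be launched with `accelerate launch -m axolotl.cli.train <config>.yaml`, as in the SwanLab examples further down.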
## Optimization Guides

Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html).

## Related Resources

- [Plano GitHub](https://github.com/katanemo/plano)
- [Axolotl Docs](https://docs.axolotl.ai)
- [Axolotl Website](https://axolotl.ai)
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
65
examples/plano/plano-4b-qlora.yaml
Normal file
@@ -0,0 +1,65 @@
base_model: katanemo/Plano-Orchestrator-4B

# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

load_in_8bit: false
load_in_4bit: true

chat_template: qwen3
datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template

dataset_prepared_path: last_run_prepared
val_set_size: 0.1
output_dir: ./outputs/lora-out

adapter: qlora
lora_model_dir:

sequence_len: 2048
sample_packing: true

lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1

# save_first_step: true # uncomment this to validate checkpoint saving works with your config
67
examples/qat_nvfp4/Gemma3-12B_baseline.yml
Normal file
@@ -0,0 +1,67 @@
base_model: google/gemma-3-12b-it
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false
strict: false

plugins:
  - axolotl.integrations.liger.LigerPlugin

liger_rope: true
liger_rms_norm: true
liger_glu_activation: true
liger_layer_norm: true
liger_fused_linear_cross_entropy: true
seed: 42
chat_template: gemma3
datasets:
  - path: tatsu-lab/alpaca
    type: alpaca

output_dir: ./outputs/out_gemma/

sequence_len: 8096
sample_packing: true
flash_attention: true

wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 16

num_epochs: 1
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 4e-5

bf16: true
tf32: true

resume_from_checkpoint:
logging_steps: 1

# evals_per_epoch: 1
saves_per_epoch: 1

warmup_ratio: 0.1
weight_decay: 0.0
fsdp_version: 2

fsdp_config:
  offload_params: false
  cpu_ram_efficient_loading: true
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: Gemma3DecoderLayer
  state_dict_type: FULL_STATE_DICT
  sharding_strategy: FULL_SHARD
  reshard_after_forward: true
  activation_checkpointing: true

special_tokens:

# save_first_step: true # uncomment this to validate checkpoint saving works with your config
72
examples/qat_nvfp4/Gemma3-12B_qat.yml
Normal file
@@ -0,0 +1,72 @@
base_model: google/gemma-3-12b-it
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false
strict: false

plugins:
  - axolotl.integrations.liger.LigerPlugin

liger_rope: true
liger_rms_norm: true
liger_glu_activation: true
liger_layer_norm: true
liger_fused_linear_cross_entropy: true
seed: 42
chat_template: gemma3
datasets:
  - path: tatsu-lab/alpaca
    type: alpaca

output_dir: ./outputs/qat_out_gemma/

sequence_len: 8096
sample_packing: true
flash_attention: true

qat:
  activation_dtype: nvfp4
  weight_dtype: nvfp4
  group_size: 16 # only group_size of 16 is supported with nvfp4

wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 16

num_epochs: 1
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 4e-5

bf16: true
tf32: true

resume_from_checkpoint:
logging_steps: 1

evals_per_epoch: 1
saves_per_epoch: 1

warmup_ratio: 0.1
weight_decay: 0.0
fsdp_version: 2

fsdp_config:
  offload_params: false
  cpu_ram_efficient_loading: true
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: Gemma3DecoderLayer
  state_dict_type: FULL_STATE_DICT
  sharding_strategy: FULL_SHARD
  reshard_after_forward: true
  activation_checkpointing: true

special_tokens:

# save_first_step: true # uncomment this to validate checkpoint saving works with your config
67
examples/qat_nvfp4/Math-Gemma3-12B_baseline.yml
Normal file
@@ -0,0 +1,67 @@
base_model: google/gemma-3-12b-it
# Math finetuning configuration for Gemma3-12B
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false
strict: false

plugins:
  - axolotl.integrations.liger.LigerPlugin

liger_rope: true
liger_rms_norm: true
liger_glu_activation: true
liger_layer_norm: true
liger_fused_linear_cross_entropy: true
seed: 42
chat_template: gemma3
datasets:
  - path: AI-MO/NuminaMath-CoT
    type: chat_template

output_dir: ./outputs/out_math_gemma/

sequence_len: 4096
sample_packing: true
flash_attention: true

wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 8

num_epochs: 1
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 3e-5

bf16: true
tf32: true

resume_from_checkpoint:
logging_steps: 1

# evals_per_epoch: 1
saves_per_epoch: 1

warmup_ratio: 0.1
weight_decay: 0.0
fsdp_version: 2

fsdp_config:
  offload_params: false
  cpu_ram_efficient_loading: true
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: Gemma3DecoderLayer
  state_dict_type: FULL_STATE_DICT
  sharding_strategy: FULL_SHARD
  reshard_after_forward: true
  activation_checkpointing: true

special_tokens:

# save_first_step: true # uncomment this to validate checkpoint saving works with your config
72
examples/qat_nvfp4/Math-Gemma3-12B_qat.yml
Normal file
@@ -0,0 +1,72 @@
base_model: google/gemma-3-12b-it
# Math finetuning configuration for Gemma3-12B
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false
strict: false

plugins:
  - axolotl.integrations.liger.LigerPlugin

liger_rope: true
liger_rms_norm: true
liger_glu_activation: true
liger_layer_norm: true
liger_fused_linear_cross_entropy: true
seed: 42
chat_template: gemma3
datasets:
  - path: AI-MO/NuminaMath-CoT
    type: chat_template

output_dir: ./outputs/qat_out_math_gemma/

sequence_len: 4096
sample_packing: true
flash_attention: true

qat:
  activation_dtype: nvfp4
  weight_dtype: nvfp4
  group_size: 16 # only group_size of 16 is supported with nvfp4

wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 8

num_epochs: 1
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 3e-5

bf16: true
tf32: true

resume_from_checkpoint:
logging_steps: 1

# evals_per_epoch: 1
saves_per_epoch: 1

warmup_ratio: 0.1
weight_decay: 0.0
fsdp_version: 2

fsdp_config:
  offload_params: false
  cpu_ram_efficient_loading: true
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: Gemma3DecoderLayer
  state_dict_type: FULL_STATE_DICT
  sharding_strategy: FULL_SHARD
  reshard_after_forward: true
  activation_checkpointing: true

special_tokens:

# save_first_step: true # uncomment this to validate checkpoint saving works with your config
68
examples/qat_nvfp4/Math-Gemma3-27B_baseline.yml
Normal file
@@ -0,0 +1,68 @@
base_model: google/gemma-3-27b-it
# Math finetuning configuration for Gemma3-27B
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false
strict: false

plugins:
  - axolotl.integrations.liger.LigerPlugin

liger_rope: true
liger_rms_norm: true
liger_glu_activation: true
liger_layer_norm: true
liger_fused_linear_cross_entropy: true
seed: 42
chat_template: gemma3
datasets:
  - path: AI-MO/NuminaMath-CoT
    type: chat_template

output_dir: ./outputs/out_math_gemma27/

sequence_len: 4096
sample_packing: true
flash_attention: true

wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 16

num_epochs: 1
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 5e-6
eta_min: 7e-7

bf16: true
tf32: true

resume_from_checkpoint:
logging_steps: 1

# evals_per_epoch: 1
saves_per_epoch: 1

warmup_ratio: 0.1
weight_decay: 0.0
fsdp_version: 2

fsdp_config:
  offload_params: false
  cpu_ram_efficient_loading: true
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: Gemma3DecoderLayer
  state_dict_type: FULL_STATE_DICT
  sharding_strategy: FULL_SHARD
  reshard_after_forward: true
  activation_checkpointing: true

special_tokens:

# save_first_step: true # uncomment this to validate checkpoint saving works with your config
73
examples/qat_nvfp4/Math-Gemma3-27B_qat.yml
Normal file
@@ -0,0 +1,73 @@
base_model: google/gemma-3-27b-it
# Math finetuning configuration for Gemma3-27B
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false
strict: false

plugins:
  - axolotl.integrations.liger.LigerPlugin

liger_rope: true
liger_rms_norm: true
liger_glu_activation: true
liger_layer_norm: true
liger_fused_linear_cross_entropy: true
seed: 42
chat_template: gemma3
datasets:
  - path: AI-MO/NuminaMath-CoT
    type: chat_template

output_dir: ./outputs/qat_out_math_gemma27/

sequence_len: 4096
sample_packing: true
flash_attention: true

qat:
  activation_dtype: nvfp4
  weight_dtype: nvfp4
  group_size: 16 # only group_size of 16 is supported with nvfp4

wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 16

num_epochs: 1
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 5e-6
eta_min: 7e-7

bf16: true
tf32: true

resume_from_checkpoint:
logging_steps: 1

# evals_per_epoch: 1
saves_per_epoch: 1

warmup_ratio: 0.1
weight_decay: 0.0
fsdp_version: 2

fsdp_config:
  offload_params: false
  cpu_ram_efficient_loading: true
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: Gemma3DecoderLayer
  state_dict_type: FULL_STATE_DICT
  sharding_strategy: FULL_SHARD
  reshard_after_forward: true
  activation_checkpointing: true

special_tokens:

# save_first_step: true # uncomment this to validate checkpoint saving works with your config
67
examples/qat_nvfp4/Math-Qwen2.5-72B_baseline.yml
Normal file
@@ -0,0 +1,67 @@
base_model: Qwen/Qwen2.5-72B
# Math finetuning configuration for Qwen2.5-72B (non-instruct)
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false
strict: false

plugins:
  - axolotl.integrations.liger.LigerPlugin

liger_rope: true
liger_rms_norm: true
liger_glu_activation: true
liger_layer_norm: true
liger_fused_linear_cross_entropy: true
seed: 42
chat_template: qwen_25
datasets:
  - path: AI-MO/NuminaMath-CoT
    type: chat_template

output_dir: ./outputs/out_math_72b/

sequence_len: 4096
sample_packing: true
flash_attention: true

wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 8
num_epochs: 1
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 5e-6
eta_min: 7e-7

bf16: true
tf32: true

resume_from_checkpoint:
logging_steps: 1

# evals_per_epoch: 1
saves_per_epoch: 1

warmup_ratio: 0.1
weight_decay: 0.0
fsdp_version: 2

fsdp_config:
  offload_params: false
  cpu_ram_efficient_loading: true
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: Qwen2DecoderLayer
  state_dict_type: FULL_STATE_DICT
  sharding_strategy: FULL_SHARD
  reshard_after_forward: true
  activation_checkpointing: true

special_tokens:

# save_first_step: true # uncomment this to validate checkpoint saving works with your config
72
examples/qat_nvfp4/Math-Qwen2.5-72B_qat.yml
Normal file
@@ -0,0 +1,72 @@
base_model: Qwen/Qwen2.5-72B
# Math finetuning configuration for Qwen2.5-72B (non-instruct)
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false
strict: false

plugins:
  - axolotl.integrations.liger.LigerPlugin

liger_rope: true
liger_rms_norm: true
liger_glu_activation: true
liger_layer_norm: true
liger_fused_linear_cross_entropy: true
seed: 42
chat_template: qwen_25
datasets:
  - path: AI-MO/NuminaMath-CoT
    type: chat_template

output_dir: ./outputs/qat_out_math_72b/

sequence_len: 4096
sample_packing: true
flash_attention: true

qat:
  activation_dtype: nvfp4
  weight_dtype: nvfp4
  group_size: 16 # only group_size of 16 is supported with nvfp4

wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 8
num_epochs: 1
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 5e-6
eta_min: 7e-7

bf16: true
tf32: true

resume_from_checkpoint:
logging_steps: 1

# evals_per_epoch: 1
saves_per_epoch: 1

warmup_ratio: 0.1
weight_decay: 0.0
fsdp_version: 2

fsdp_config:
  offload_params: false
  cpu_ram_efficient_loading: true
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: Qwen2DecoderLayer
  state_dict_type: FULL_STATE_DICT
  sharding_strategy: FULL_SHARD
  reshard_after_forward: true
  activation_checkpointing: true

special_tokens:

# save_first_step: true # uncomment this to validate checkpoint saving works with your config
67
examples/qat_nvfp4/Qwen2.5-72B_baseline.yml
Normal file
@@ -0,0 +1,67 @@
base_model: Qwen/Qwen2.5-72B
# Alpaca finetuning configuration for Qwen2.5-72B
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false
strict: false

plugins:
  - axolotl.integrations.liger.LigerPlugin

liger_rope: true
liger_rms_norm: true
liger_glu_activation: true
liger_layer_norm: true
liger_fused_linear_cross_entropy: true
seed: 42
chat_template: qwen_25
datasets:
  - path: tatsu-lab/alpaca
    type: alpaca

output_dir: ./outputs/out_qwen72b/

sequence_len: 8096
sample_packing: true
flash_attention: true

wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 16

num_epochs: 1
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 2e-5

bf16: true
tf32: true

resume_from_checkpoint:
logging_steps: 1

# evals_per_epoch: 1
saves_per_epoch: 1

warmup_ratio: 0.1
weight_decay: 0.0
fsdp_version: 2

fsdp_config:
  offload_params: false
  cpu_ram_efficient_loading: true
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: Qwen2DecoderLayer
  state_dict_type: FULL_STATE_DICT
  sharding_strategy: FULL_SHARD
  reshard_after_forward: true
  activation_checkpointing: true

special_tokens:

# save_first_step: true # uncomment this to validate checkpoint saving works with your config
72
examples/qat_nvfp4/Qwen2.5-72B_qat.yml
Normal file
@@ -0,0 +1,72 @@
base_model: Qwen/Qwen2.5-72B
# Alpaca finetuning configuration for Qwen2.5-72B
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false
strict: false

plugins:
  - axolotl.integrations.liger.LigerPlugin

liger_rope: true
liger_rms_norm: true
liger_glu_activation: true
liger_layer_norm: true
liger_fused_linear_cross_entropy: true
seed: 42
chat_template: qwen_25
datasets:
  - path: tatsu-lab/alpaca
    type: alpaca

output_dir: ./outputs/qat_out_qwen72b/

sequence_len: 8096
sample_packing: true
flash_attention: true

qat:
  activation_dtype: nvfp4
  weight_dtype: nvfp4
  group_size: 16 # only group_size of 16 is supported with nvfp4

wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 16

num_epochs: 1
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 2e-5

bf16: true
tf32: true

resume_from_checkpoint:
logging_steps: 1

# evals_per_epoch: 1
saves_per_epoch: 1

warmup_ratio: 0.1
weight_decay: 0.0
fsdp_version: 2

fsdp_config:
  offload_params: false
  cpu_ram_efficient_loading: true
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: Qwen2DecoderLayer
  state_dict_type: FULL_STATE_DICT
  sharding_strategy: FULL_SHARD
  reshard_after_forward: true
  activation_checkpointing: true

special_tokens:

# save_first_step: true # uncomment this to validate checkpoint saving works with your config
70
examples/qwen2/adamw-pretrain-fsdp2.yaml
Normal file
@@ -0,0 +1,70 @@
base_model: Qwen/Qwen2.5-0.5B
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer

# Use random initialization for fair comparison
reinit_weights: true

load_in_8bit: false
load_in_4bit: false
strict: false

# Pretraining dataset
pretraining_dataset:
  - path: allenai/c4
    name: en
    type: pretrain
    split: train

dataset_prepared_path:
val_set_size: 0.0
output_dir: ./outputs/compare-adamw-pretrain

sequence_len: 2048
sample_packing: true
pad_to_sequence_len: true

wandb_project: dist_muon
wandb_entity:
wandb_watch:
wandb_name: adamw
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 4
num_epochs: 1
max_steps: 305

# AdamW optimizer settings (standard LR for AdamW)
optimizer: adamw_torch_fused
learning_rate: 0.0002
weight_decay: 0.01
lr_scheduler: cosine

train_on_inputs: true
group_by_length: false
bf16: auto
fp16: false
tf32: false

gradient_checkpointing: false
logging_steps: 1
flash_attention: true

warmup_steps: 10
evals_per_epoch: 0
saves_per_epoch: 1

# Reproducibility
seed: 42

fsdp_config:
  fsdp_version: 2
  fsdp_offload_params: false
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_transformer_layer_cls_to_wrap: Qwen2DecoderLayer
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_cpu_ram_efficient_loading: false
  fsdp_reshard_after_forward: true

special_tokens:
70
examples/qwen2/muon-pretrain-fsdp2.yaml
Normal file
@@ -0,0 +1,70 @@
base_model: Qwen/Qwen2.5-0.5B
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer

# Use random initialization for fair comparison
reinit_weights: true

load_in_8bit: false
load_in_4bit: false
strict: false

# Pretraining dataset
pretraining_dataset:
  - path: allenai/c4
    name: en
    type: pretrain
    split: train

dataset_prepared_path:
val_set_size: 0.0
output_dir: ./outputs/compare-muon-pretrain

sequence_len: 2048
sample_packing: true
pad_to_sequence_len: true

wandb_project: dist_muon
wandb_entity:
wandb_watch:
wandb_name: muon
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 4
num_epochs: 1
max_steps: 305

# Muon optimizer settings
optimizer: muon
learning_rate: 0.02
weight_decay: 0.01
lr_scheduler: cosine

train_on_inputs: true
group_by_length: false
bf16: auto
fp16: false
tf32: false

gradient_checkpointing: false
logging_steps: 1
flash_attention: true

warmup_steps: 10
evals_per_epoch: 0
saves_per_epoch: 1

# Reproducibility
seed: 42

fsdp_config:
  fsdp_version: 2
  fsdp_offload_params: false
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_transformer_layer_cls_to_wrap: Qwen2DecoderLayer
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_cpu_ram_efficient_loading: false
  fsdp_reshard_after_forward: true

special_tokens:
46
examples/qwen3/README.md
Normal file
@@ -0,0 +1,46 @@
# Finetune Qwen3 with Axolotl

[Qwen3](https://huggingface.co/collections/Qwen/qwen3) is a family of open-source models trained by Alibaba.

This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking.

## Getting started

1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).

2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage.

3. Run the finetuning example:

```bash
axolotl train examples/qwen3/32b-qlora.yaml
```

Let us know how it goes. Happy finetuning! 🚀

### Chat template masking a few tokens off

If you notice that the `chat_template` masking for assistant prompts is off by a few tokens, please ensure that you add the following to the yaml:

```yaml
chat_template: qwen3
```

### TIPS

- For inference, please check the official model card, as the setup depends on your reasoning mode.
- You can run a full finetuning by removing `adapter: qlora` and `load_in_4bit: true` from the config.
- Read more on how to load your own dataset in the [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).

## Optimization Guides

Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html).

## Related Resources

- [Qwen3 Blog](https://qwenlm.github.io/blog/qwen3/)
- [Axolotl Docs](https://docs.axolotl.ai)
- [Axolotl Website](https://axolotl.ai)
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
@@ -6,21 +6,17 @@ This guide shows how to fine-tune it with Axolotl with multi-turn conversations

## Getting started

1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). You need to install from main as Seed-OSS is only on nightly or use our latest [Docker images](https://docs.axolotl.ai/docs/docker.html).
1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).

Here is an example of how to install from main for pip:
Here is an example of how to install from pip:
```bash
# Ensure you have a compatible version of Pytorch installed
pip3 install packaging setuptools wheel ninja
pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'

```bash
# Ensure you have Pytorch installed (Pytorch 2.6.0 min)
git clone https://github.com/axolotl-ai-cloud/axolotl.git
cd axolotl

pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
pip3 install --no-build-isolation -e '.[flash-attn]'

# Install Cut Cross Entropy
python scripts/cutcrossentropy_install.py | sh
```
# Install Cut Cross Entropy
python scripts/cutcrossentropy_install.py | sh
```

2. Run the finetuning example:
@@ -41,9 +37,7 @@ Let us know how it goes. Happy finetuning! 🚀

## Optimization Guides

- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
- [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html)
Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html).

## Related Resources
@@ -37,9 +37,7 @@ This guide shows how to fine-tune SmolVLM2 models with Axolotl.

## Optimization Guides

- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
- [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html)
- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html).

## Related Resources
285
examples/swanlab/README.md
Normal file
@@ -0,0 +1,285 @@
# SwanLab Integration Examples

This directory contains example configurations demonstrating SwanLab integration with Axolotl.

## Examples Overview

### 1. DPO with Completion Logging
**File**: `dpo-swanlab-completions.yml`

Demonstrates DPO (Direct Preference Optimization) training with RLHF completion table logging.

**Features**:
- Basic SwanLab experiment tracking
- Completion table logging (prompts, chosen/rejected responses, rewards)
- Memory-bounded buffer for long training runs
- Cloud sync configuration

**Best for**: RLHF practitioners who want to analyze model outputs qualitatively

**Quick start**:
```bash
export SWANLAB_API_KEY=your-api-key
accelerate launch -m axolotl.cli.train examples/swanlab/dpo-swanlab-completions.yml
```

---

### 2. LoRA with Performance Profiling
**File**: `lora-swanlab-profiling.yml`

Demonstrates standard LoRA fine-tuning with performance profiling enabled.

**Features**:
- SwanLab experiment tracking
- Automatic profiling of trainer methods
- Profiling metrics visualization
- Performance optimization guidance

**Best for**: Engineers optimizing training performance and comparing different configurations

**Quick start**:
```bash
export SWANLAB_API_KEY=your-api-key
accelerate launch -m axolotl.cli.train examples/swanlab/lora-swanlab-profiling.yml
```

---

### 3. Full-Featured DPO Production Setup
**File**: `dpo-swanlab-full-featured.yml`

Comprehensive production-ready configuration with ALL SwanLab features enabled.

**Features**:
- Experiment tracking with team workspace
- RLHF completion logging
- Performance profiling
- Lark (Feishu) team notifications
- Private deployment support
- Production checklist and troubleshooting

**Best for**: Production RLHF training with team collaboration

**Quick start**:
```bash
export SWANLAB_API_KEY=your-api-key
export SWANLAB_LARK_WEBHOOK_URL=https://open.feishu.cn/...
export SWANLAB_LARK_SECRET=your-webhook-secret
accelerate launch -m axolotl.cli.train examples/swanlab/dpo-swanlab-full-featured.yml
```

---

### 4. Custom Trainer Profiling (Python)
**File**: `custom_trainer_profiling.py`

Python code examples showing how to add SwanLab profiling to custom trainers.

**Features**:
- `@swanlab_profile` decorator examples
- Context manager profiling for fine-grained timing
- `ProfilingConfig` for advanced filtering and throttling
- Multiple profiling patterns and best practices

**Best for**: Advanced users creating custom trainers

**Usage**:
```python
from custom_trainer_profiling import CustomTrainerWithProfiling
# See file for detailed examples and patterns
```

---

## Feature Matrix

| Example | Tracking | Completion Logging | Profiling | Lark Notifications | Team Workspace |
|---------|----------|-------------------|-----------|-------------------|----------------|
| dpo-swanlab-completions.yml | ✅ | ✅ | ✅ (auto) | ➖ (commented) | ➖ (commented) |
| lora-swanlab-profiling.yml | ✅ | ➖ (disabled) | ✅ (auto) | ➖ (commented) | ➖ (commented) |
| dpo-swanlab-full-featured.yml | ✅ | ✅ | ✅ (auto) | ✅ | ✅ |
| custom_trainer_profiling.py | N/A | N/A | ✅ (manual) | N/A | N/A |

---

## Configuration Quick Reference

### Basic SwanLab Setup
```yaml
plugins:
  - axolotl.integrations.swanlab.SwanLabPlugin

use_swanlab: true
swanlab_project: my-project
swanlab_experiment_name: my-experiment
swanlab_mode: cloud # cloud, local, offline, disabled
```

### RLHF Completion Logging
```yaml
swanlab_log_completions: true
swanlab_completion_log_interval: 100 # Log every 100 steps
swanlab_completion_max_buffer: 128 # Memory-bounded buffer
```

### Lark Team Notifications
```yaml
swanlab_lark_webhook_url: https://open.feishu.cn/...
swanlab_lark_secret: your-webhook-secret # Required for production
```

### Team Workspace
```yaml
swanlab_workspace: my-research-team
```

### Private Deployment
```yaml
swanlab_web_host: https://swanlab.yourcompany.com
swanlab_api_host: https://api.swanlab.yourcompany.com
```

---

## Authentication

### Recommended: Environment Variable
```bash
export SWANLAB_API_KEY=your-api-key
export SWANLAB_LARK_WEBHOOK_URL=https://open.feishu.cn/...
export SWANLAB_LARK_SECRET=your-webhook-secret
```

### Alternative: Config File (less secure)
```yaml
swanlab_api_key: your-api-key
swanlab_lark_webhook_url: https://open.feishu.cn/...
swanlab_lark_secret: your-webhook-secret
```

---

## Common Use Cases

### Use Case 1: Migrate from WandB to SwanLab
Start with `lora-swanlab-profiling.yml`, add your model/dataset config, and disable WandB:
```yaml
use_swanlab: true
use_wandb: false
```

### Use Case 2: Analyze DPO Model Outputs
Use `dpo-swanlab-completions.yml` and adjust the completion logging interval based on your training length (pick one):
```yaml
swanlab_completion_log_interval: 50    # More frequent for short training
# swanlab_completion_log_interval: 200 # Less frequent for long training
```

### Use Case 3: Optimize Training Performance
Use `lora-swanlab-profiling.yml` and run multiple experiments with different optimizations:
- Baseline: `flash_attention: false, gradient_checkpointing: false`
- Flash Attention: `flash_attention: true`
- Gradient Checkpointing: `gradient_checkpointing: true`
- Both: `flash_attention: true, gradient_checkpointing: true`

Compare profiling metrics in the SwanLab dashboard.

### Use Case 4: Production RLHF with Team Collaboration
Use `dpo-swanlab-full-featured.yml` and set up a team workspace and Lark notifications:
```yaml
swanlab_workspace: ml-team
swanlab_lark_webhook_url: ...
swanlab_lark_secret: ...
```

---

## Viewing Your Experiments

### Cloud Mode
Visit [https://swanlab.cn](https://swanlab.cn) and navigate to your project.

**Dashboard sections**:
- **Metrics**: Training loss, learning rate, profiling metrics
- **Tables**: RLHF completions (for DPO/KTO/ORPO/GRPO)
- **Config**: Hyperparameters and configuration
- **System**: Resource usage (GPU, memory, CPU)
- **Files**: Logged artifacts

### Local Mode
```bash
swanlab watch ./swanlog
# Open browser to http://localhost:5092
```

---

## Troubleshooting

### SwanLab not initializing
```bash
# Check API key
echo $SWANLAB_API_KEY

# Verify SwanLab is installed
pip show swanlab

# Check config
grep -A 5 "use_swanlab" your-config.yml
```

### Completions not appearing
- Verify you're using an RLHF trainer (DPO/KTO/ORPO/GRPO)
- Check `swanlab_log_completions: true`
- Wait for `swanlab_completion_log_interval` steps
- Look for "Registered SwanLab RLHF completion logging" in logs

### Lark notifications not working
- Test the webhook manually: `curl -X POST "$SWANLAB_LARK_WEBHOOK_URL" ...` (see the sketch below)
- Verify `SWANLAB_LARK_SECRET` is set correctly
- Check the bot is added to the Lark group chat
- Look for "Registered Lark notification callback" in logs
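A minimal sketch of such a manual webhook smoke test; the message body below follows the usual Lark/Feishu custom-bot text payload, and the text itself is a placeholder:

```bash
# Hypothetical smoke test for the Lark webhook; requires SWANLAB_LARK_WEBHOOK_URL to be set
curl -X POST "$SWANLAB_LARK_WEBHOOK_URL" \
  -H "Content-Type: application/json" \
  -d '{"msg_type": "text", "content": {"text": "Axolotl SwanLab webhook test"}}'
```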
### Profiling metrics not appearing
- Verify `use_swanlab: true`
- Check SwanLab is initialized (look for the init log message)
- Profiling metrics are under the "profiling/" namespace
- Profiling is auto-enabled when SwanLab is enabled

---

## Performance Notes

### Overhead Comparison

| Feature | Overhead per Step | Memory Usage |
|---------|------------------|--------------|
| Basic tracking | < 0.1% | ~10 MB |
| Completion logging | < 0.5% | ~64 KB (buffer=128) |
| Profiling | < 0.1% | ~1 KB |
| **Total** | **< 0.7%** | **~10 MB** |

### Best Practices
1. Use ONE logging tool in production (disable WandB/MLflow when using SwanLab)
2. Adjust the completion log interval based on training length (100-200 steps)
3. Keep the completion buffer size reasonable (128-512)
4. Profile critical-path methods first (training_step, compute_loss)
5. Use ProfilingConfig to throttle high-frequency operations

---

## Further Reading

- **Full Documentation**: [src/axolotl/integrations/swanlab/README.md](../../src/axolotl/integrations/swanlab/README.md)
- **SwanLab Docs**: [https://docs.swanlab.cn](https://docs.swanlab.cn)
- **Axolotl Docs**: [https://axolotl-ai-cloud.github.io/axolotl/](https://axolotl-ai-cloud.github.io/axolotl/)
- **DPO Paper**: [Direct Preference Optimization](https://arxiv.org/abs/2305.18290)

---

## Contributing

Found an issue or have an improvement? Please submit a PR or open an issue:
- [Axolotl Issues](https://github.com/axolotl-ai-cloud/axolotl/issues)
- [SwanLab Issues](https://github.com/SwanHubX/SwanLab/issues)
299
examples/swanlab/custom_trainer_profiling.py
Normal file
@@ -0,0 +1,299 @@
"""Example: Custom Trainer with SwanLab Profiling

This example demonstrates how to add SwanLab profiling to your custom trainer.

Features:
- @swanlab_profile decorator for automatic profiling
- swanlab_profiling_context for fine-grained profiling
- ProfilingConfig for advanced filtering and throttling

Usage:
    1. Create your custom trainer extending AxolotlTrainer
    2. Add @swanlab_profile decorators to methods you want to profile
    3. Use swanlab_profiling_context for fine-grained profiling within methods
    4. Enable SwanLab in your config (use_swanlab: true)

See also:
    - examples/swanlab/lora-swanlab-profiling.yml for config
    - src/axolotl/integrations/swanlab/profiling.py for implementation
"""

from axolotl.core.trainers.base import AxolotlTrainer
from axolotl.integrations.swanlab.profiling import (
    ProfilingConfig,
    swanlab_profile,
    swanlab_profiling_context,
    swanlab_profiling_context_advanced,
)


class CustomTrainerWithProfiling(AxolotlTrainer):
    """Custom trainer with SwanLab profiling enabled.

    This trainer demonstrates three profiling patterns:
    1. Decorator-based profiling (@swanlab_profile)
    2. Context manager profiling (swanlab_profiling_context)
    3. Advanced profiling with filtering (ProfilingConfig)
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Create custom profiling config for high-frequency operations
        self.fast_op_config = ProfilingConfig(
            enabled=True,
            min_duration_ms=0.5,  # Only log if duration > 0.5ms
            log_interval=50,  # Log every 50th call
        )

    # ========================================================================
    # Pattern 1: Decorator-based Profiling
    # ========================================================================
    # Best for: Methods you always want to profile
    # Overhead: ~2-5 microseconds per call (negligible)

    @swanlab_profile
    def training_step(self, model, inputs):
        """Main training step - always profile.

        Profiling metric: profiling/Time taken: CustomTrainerWithProfiling.training_step
        """
        return super().training_step(model, inputs)

    @swanlab_profile
    def compute_loss(self, model, inputs, return_outputs=False):
        """Loss computation - always profile.

        Profiling metric: profiling/Time taken: CustomTrainerWithProfiling.compute_loss
        """
        return super().compute_loss(model, inputs, return_outputs)

    @swanlab_profile
    def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys=None):
        """Prediction step - always profile.

        Profiling metric: profiling/Time taken: CustomTrainerWithProfiling.prediction_step
        """
        return super().prediction_step(model, inputs, prediction_loss_only, ignore_keys)

    # ========================================================================
    # Pattern 2: Fine-grained Context Manager Profiling
    # ========================================================================
    # Best for: Profiling specific code blocks within a method
    # Use case: When you want to profile forward vs backward separately

    def complex_training_step(self, model, inputs):
        """Training step with fine-grained profiling.

        Profiling metrics:
        - profiling/Time taken: CustomTrainerWithProfiling.forward_pass
        - profiling/Time taken: CustomTrainerWithProfiling.backward_pass
        - profiling/Time taken: CustomTrainerWithProfiling.optimizer_step
        """
        # Profile just the forward pass
        with swanlab_profiling_context(self, "forward_pass"):
            outputs = model(**inputs)
            loss = outputs.loss

        # Profile just the backward pass
        with swanlab_profiling_context(self, "backward_pass"):
            loss.backward()

        # Profile optimizer step
        with swanlab_profiling_context(self, "optimizer_step"):
            self.optimizer.step()
            self.optimizer.zero_grad()
|
||||
|
||||
return outputs
|
||||
|
||||
# ========================================================================
|
||||
# Pattern 3: Advanced Profiling with Filtering
|
||||
# ========================================================================
|
||||
# Best for: High-frequency operations where you want to throttle logging
|
||||
# Use case: Methods called 100+ times per step
|
||||
|
||||
def _prepare_inputs(self, inputs):
|
||||
"""Prepare inputs - throttled profiling.
|
||||
|
||||
This method is called frequently (once per batch), so we throttle
|
||||
profiling to reduce overhead:
|
||||
- Only log if duration > 0.5ms (skip very fast operations)
|
||||
- Only log every 50th call (reduce logging frequency)
|
||||
|
||||
Profiling metric: profiling/Time taken: CustomTrainerWithProfiling.prepare_inputs
|
||||
"""
|
||||
with swanlab_profiling_context_advanced(
|
||||
self, "prepare_inputs", config=self.fast_op_config
|
||||
):
|
||||
return super()._prepare_inputs(inputs)
|
||||
|
||||
def _prepare_input_for_model(self, input_ids):
|
||||
"""Another high-frequency operation - throttled profiling.
|
||||
|
||||
Profiling metric: profiling/Time taken: CustomTrainerWithProfiling.prepare_input_for_model
|
||||
"""
|
||||
with swanlab_profiling_context_advanced(
|
||||
self, "prepare_input_for_model", config=self.fast_op_config
|
||||
):
|
||||
# Your custom input preparation logic
|
||||
return input_ids
|
||||
|
||||
# ========================================================================
|
||||
# Pattern 4: Exception-safe Profiling
|
||||
# ========================================================================
|
||||
# Profiling is exception-safe: duration is logged even if method raises
|
||||
|
||||
@swanlab_profile
|
||||
def potentially_failing_method(self):
|
||||
"""This method may raise an exception.
|
||||
|
||||
SwanLab profiling will still log the duration before re-raising.
|
||||
Profiling metric: profiling/Time taken: CustomTrainerWithProfiling.potentially_failing_method
|
||||
"""
|
||||
# Do some work
|
||||
result = self._do_risky_computation()
|
||||
|
||||
# If this raises, profiling duration is still logged
|
||||
if result < 0:
|
||||
raise ValueError("Invalid result")
|
||||
|
||||
return result
|
||||
|
||||
def _do_risky_computation(self):
|
||||
"""Placeholder for risky computation."""
|
||||
return 42
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Advanced Example: Custom ProfilingConfig Per Method
|
||||
# ============================================================================
|
||||
|
||||
|
||||
class AdvancedProfilingTrainer(AxolotlTrainer):
|
||||
"""Trainer with method-specific profiling configurations."""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
# Different profiling configs for different method types
|
||||
self.critical_path_config = ProfilingConfig(
|
||||
enabled=True,
|
||||
min_duration_ms=0.0, # Log everything on critical path
|
||||
log_interval=1, # Log every call
|
||||
)
|
||||
|
||||
self.fast_path_config = ProfilingConfig(
|
||||
enabled=True,
|
||||
min_duration_ms=1.0, # Only log if > 1ms
|
||||
log_interval=100, # Log every 100th call
|
||||
)
|
||||
|
||||
self.debug_config = ProfilingConfig(
|
||||
enabled=True,
|
||||
min_duration_ms=0.0, # Log everything
|
||||
log_interval=1, # Log every call
|
||||
)
|
||||
|
||||
def training_step(self, model, inputs):
|
||||
"""Critical path - log everything."""
|
||||
with swanlab_profiling_context_advanced(
|
||||
self, "training_step", config=self.critical_path_config
|
||||
):
|
||||
return super().training_step(model, inputs)
|
||||
|
||||
def _prepare_inputs(self, inputs):
|
||||
"""Fast path - throttle logging."""
|
||||
with swanlab_profiling_context_advanced(
|
||||
self, "prepare_inputs", config=self.fast_path_config
|
||||
):
|
||||
return super()._prepare_inputs(inputs)
|
||||
|
||||
def _debug_method(self, data):
|
||||
"""Debug-only method - verbose logging."""
|
||||
with swanlab_profiling_context_advanced(
|
||||
self, "debug_method", config=self.debug_config
|
||||
):
|
||||
# Your debug logic
|
||||
pass
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# How to Use This Custom Trainer
|
||||
# ============================================================================
|
||||
|
||||
"""
|
||||
To use this custom trainer:
|
||||
|
||||
1. Save this file to your project (e.g., my_custom_trainer.py)
|
||||
|
||||
2. Create a config file that uses your custom trainer:
|
||||
|
||||
# config.yml
|
||||
base_model: NousResearch/Llama-3.2-1B
|
||||
|
||||
# ... other config ...
|
||||
|
||||
plugins:
|
||||
- axolotl.integrations.swanlab.SwanLabPlugin
|
||||
|
||||
use_swanlab: true
|
||||
swanlab_project: my-profiling-experiment
|
||||
|
||||
# Optional: Specify custom trainer
|
||||
# (Or modify axolotl to use your custom trainer class)
|
||||
|
||||
3. Run training:
|
||||
|
||||
export SWANLAB_API_KEY=your-api-key
|
||||
accelerate launch -m axolotl.cli.train config.yml
|
||||
|
||||
4. View profiling metrics in SwanLab dashboard:
|
||||
- profiling/Time taken: CustomTrainerWithProfiling.training_step
|
||||
- profiling/Time taken: CustomTrainerWithProfiling.forward_pass
|
||||
- profiling/Time taken: CustomTrainerWithProfiling.backward_pass
|
||||
- etc.
|
||||
|
||||
5. Compare profiling metrics across runs:
|
||||
- Run baseline without optimizations
|
||||
- Run with flash_attention enabled
|
||||
- Run with gradient_checkpointing enabled
|
||||
- Compare profiling metrics to see performance impact
|
||||
"""
|
||||
|
||||
# ============================================================================
|
||||
# Tips for Effective Profiling
|
||||
# ============================================================================
|
||||
|
||||
"""
|
||||
1. Profile the critical path first:
|
||||
- training_step, compute_loss, prediction_step
|
||||
- These methods are called most frequently and have the biggest impact
|
||||
|
||||
2. Use throttling for high-frequency operations:
|
||||
- Methods called 100+ times per step
|
||||
- Use log_interval=50 or log_interval=100
|
||||
- Reduces profiling overhead and dashboard clutter
|
||||
|
||||
3. Filter noise with min_duration_ms:
|
||||
- Set min_duration_ms=1.0 to skip very fast operations
|
||||
- Focus on operations that actually take time
|
||||
|
||||
4. Compare across runs:
|
||||
- Run same config multiple times to check consistency
|
||||
- Compare different optimization strategies
|
||||
- Track profiling trends over time
|
||||
|
||||
5. Monitor distributed training:
|
||||
- Check for per-rank timing differences
|
||||
- Look for stragglers (slower ranks)
|
||||
- Identify synchronization bottlenecks
|
||||
|
||||
6. Disable profiling in production:
|
||||
- from axolotl.integrations.swanlab.profiling import DEFAULT_PROFILING_CONFIG
|
||||
- DEFAULT_PROFILING_CONFIG.enabled = False
|
||||
|
||||
7. Exception handling:
|
||||
- Profiling is exception-safe
|
||||
- Duration logged even if method raises
|
||||
- Useful for debugging methods that fail intermittently
|
||||
"""
|
||||
168
examples/swanlab/dpo-swanlab-completions.yml
Normal file
@@ -0,0 +1,168 @@
|
||||
# SwanLab DPO Training Example with Completion Logging
|
||||
#
|
||||
# This example demonstrates DPO (Direct Preference Optimization) training
|
||||
# with SwanLab integration for experiment tracking and completion table logging.
|
||||
#
|
||||
# Features enabled:
|
||||
# - SwanLab experiment tracking
|
||||
# - RLHF completion table logging (prompts, chosen/rejected responses, rewards)
|
||||
# - Lark (Feishu) team notifications (optional)
|
||||
#
|
||||
# To run:
|
||||
# export SWANLAB_API_KEY=your-api-key
|
||||
# accelerate launch -m axolotl.cli.train examples/swanlab/dpo-swanlab-completions.yml
|
||||
|
||||
# Model Configuration
|
||||
base_model: meta-llama/Meta-Llama-3-8B-Instruct
|
||||
model_type: LlamaForCausalLM
|
||||
tokenizer_type: AutoTokenizer
|
||||
|
||||
special_tokens:
|
||||
pad_token: <|finetune_right_pad_id|>
|
||||
eos_token: <|eot_id|>
|
||||
|
||||
# Quantization
|
||||
load_in_8bit: true
|
||||
load_in_4bit: false
|
||||
|
||||
# LoRA Configuration
|
||||
adapter: lora
|
||||
lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_linear: true
|
||||
|
||||
# DPO Configuration
|
||||
chat_template: llama3
|
||||
rl: dpo
|
||||
|
||||
datasets:
|
||||
- path: fozziethebeat/alpaca_messages_2k_dpo_test
|
||||
type: chat_template.default
|
||||
field_messages: conversation
|
||||
field_chosen: chosen
|
||||
field_rejected: rejected
|
||||
message_property_mappings:
|
||||
role: role
|
||||
content: content
|
||||
roles:
|
||||
system:
|
||||
- system
|
||||
user:
|
||||
- user
|
||||
assistant:
|
||||
- assistant
|
||||
|
||||
# Dataset and Output
|
||||
dataset_prepared_path:
|
||||
val_set_size: 0.05
|
||||
output_dir: ./outputs/dpo-swanlab-out
|
||||
|
||||
# Training Configuration
|
||||
sequence_len: 4096
|
||||
sample_packing: false
|
||||
micro_batch_size: 2
|
||||
gradient_accumulation_steps: 4
|
||||
num_epochs: 4
|
||||
|
||||
# Optimization
|
||||
optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
warmup_ratio: 0.1
|
||||
weight_decay: 0.0
|
||||
|
||||
# Precision
|
||||
bf16: auto
|
||||
tf32: false
|
||||
|
||||
# Performance
|
||||
gradient_checkpointing: true
|
||||
flash_attention: true
|
||||
|
||||
# Checkpointing and Logging
|
||||
logging_steps: 1
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
|
||||
# ============================================================================
|
||||
# SwanLab Integration
|
||||
# ============================================================================
|
||||
|
||||
plugins:
|
||||
- axolotl.integrations.swanlab.SwanLabPlugin
|
||||
|
||||
# Basic SwanLab Configuration
|
||||
use_swanlab: true
|
||||
swanlab_project: dpo-training
|
||||
swanlab_experiment_name: llama-3-dpo-completions-demo
|
||||
swanlab_description: "DPO training with completion table logging"
|
||||
swanlab_mode: cloud # Options: cloud, local, offline, disabled
|
||||
|
||||
# SwanLab Authentication
|
||||
# Recommended: Set via environment variable
|
||||
# export SWANLAB_API_KEY=your-api-key
|
||||
# Or set in config (less secure):
|
||||
# swanlab_api_key: your-api-key
|
||||
|
||||
# Optional: Team workspace
|
||||
# swanlab_workspace: my-research-team
|
||||
|
||||
# ============================================================================
|
||||
# RLHF Completion Table Logging
|
||||
# ============================================================================
|
||||
#
|
||||
# Automatically logs model completions to SwanLab for qualitative analysis:
|
||||
# - Prompts from your DPO dataset
|
||||
# - Chosen responses (preferred)
|
||||
# - Rejected responses (non-preferred)
|
||||
# - Reward differences
|
||||
#
|
||||
# View the table in SwanLab dashboard under "rlhf_completions"
|
||||
|
||||
swanlab_log_completions: true
|
||||
swanlab_completion_log_interval: 100 # Log every 100 training steps
|
||||
swanlab_completion_max_buffer: 128 # Keep last 128 completions in memory
|
||||
|
||||
# Memory Usage Notes:
|
||||
# - Buffer size 128: ~64 KB (default, recommended)
|
||||
# - Buffer size 512: ~256 KB (for more historical completions)
|
||||
# - Buffer size 1024: ~512 KB (maximum for very long training runs)
|
||||
|
||||
# Performance Notes:
|
||||
# - Completion logging overhead: < 0.5% per training step
|
||||
# - Only logs every N steps to minimize impact
|
||||
# - Memory-bounded buffer prevents memory leaks
|
||||
|
||||
# ============================================================================
|
||||
# Optional: Lark (Feishu) Team Notifications
|
||||
# ============================================================================
|
||||
#
|
||||
# Get real-time training notifications in your team chat
|
||||
# Uncomment to enable:
|
||||
|
||||
# swanlab_lark_webhook_url: https://open.feishu.cn/open-apis/bot/v2/hook/xxxxxxxxxx
|
||||
# swanlab_lark_secret: your-webhook-secret # Recommended for production
|
||||
|
||||
# Notifications sent for:
|
||||
# - Training start
|
||||
# - Training completion
|
||||
# - Training errors
|
||||
# - Metric milestones (if configured)
|
||||
|
||||
# ============================================================================
|
||||
# Optional: Private SwanLab Deployment
|
||||
# ============================================================================
|
||||
#
|
||||
# For enterprise users with private SwanLab deployment:
|
||||
|
||||
# swanlab_web_host: https://swanlab.yourcompany.com
|
||||
# swanlab_api_host: https://api.swanlab.yourcompany.com
|
||||
|
||||
# ============================================================================
|
||||
# Disable WandB if you're migrating from it
|
||||
# ============================================================================
|
||||
|
||||
# wandb_project:
|
||||
# wandb_entity:
|
||||
# use_wandb: false
|
||||
329
examples/swanlab/dpo-swanlab-full-featured.yml
Normal file
@@ -0,0 +1,329 @@
|
||||
# SwanLab Full-Featured DPO Training Example
|
||||
#
|
||||
# This example demonstrates ALL SwanLab integration features:
|
||||
# - Experiment tracking with cloud sync
|
||||
# - RLHF completion table logging
|
||||
# - Performance profiling
|
||||
# - Lark (Feishu) team notifications
|
||||
# - Team workspace collaboration
|
||||
#
|
||||
# Use this as a reference for production RLHF training setups.
|
||||
#
|
||||
# To run:
|
||||
# export SWANLAB_API_KEY=your-api-key
|
||||
# export SWANLAB_LARK_WEBHOOK_URL=https://open.feishu.cn/...
|
||||
# export SWANLAB_LARK_SECRET=your-webhook-secret
|
||||
# accelerate launch -m axolotl.cli.train examples/swanlab/dpo-swanlab-full-featured.yml
|
||||
|
||||
# ============================================================================
|
||||
# Model Configuration
|
||||
# ============================================================================
|
||||
|
||||
base_model: meta-llama/Meta-Llama-3-8B-Instruct
|
||||
model_type: LlamaForCausalLM
|
||||
tokenizer_type: AutoTokenizer
|
||||
|
||||
special_tokens:
|
||||
pad_token: <|finetune_right_pad_id|>
|
||||
eos_token: <|eot_id|>
|
||||
|
||||
# Quantization for efficient training
|
||||
load_in_8bit: true
|
||||
load_in_4bit: false
|
||||
|
||||
# ============================================================================
|
||||
# LoRA Configuration
|
||||
# ============================================================================
|
||||
|
||||
adapter: lora
|
||||
lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_linear: true # Target all linear layers
|
||||
|
||||
# ============================================================================
|
||||
# DPO (Direct Preference Optimization) Configuration
|
||||
# ============================================================================
|
||||
|
||||
chat_template: llama3
|
||||
rl: dpo # Enable DPO trainer
|
||||
|
||||
datasets:
|
||||
- path: fozziethebeat/alpaca_messages_2k_dpo_test
|
||||
type: chat_template.default
|
||||
field_messages: conversation
|
||||
field_chosen: chosen
|
||||
field_rejected: rejected
|
||||
message_property_mappings:
|
||||
role: role
|
||||
content: content
|
||||
roles:
|
||||
system:
|
||||
- system
|
||||
user:
|
||||
- user
|
||||
assistant:
|
||||
- assistant
|
||||
|
||||
# ============================================================================
|
||||
# Dataset and Output Configuration
|
||||
# ============================================================================
|
||||
|
||||
dataset_prepared_path:
|
||||
val_set_size: 0.05
|
||||
output_dir: ./outputs/dpo-swanlab-full-featured-out
|
||||
|
||||
# ============================================================================
|
||||
# Training Configuration
|
||||
# ============================================================================
|
||||
|
||||
sequence_len: 4096
|
||||
sample_packing: false
|
||||
|
||||
micro_batch_size: 2
|
||||
gradient_accumulation_steps: 4
|
||||
num_epochs: 4
|
||||
|
||||
# ============================================================================
|
||||
# Optimization
|
||||
# ============================================================================
|
||||
|
||||
optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
warmup_ratio: 0.1
|
||||
weight_decay: 0.0
|
||||
|
||||
# ============================================================================
|
||||
# Precision and Performance
|
||||
# ============================================================================
|
||||
|
||||
bf16: auto
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
flash_attention: true
|
||||
|
||||
# ============================================================================
|
||||
# Checkpointing and Logging
|
||||
# ============================================================================
|
||||
|
||||
logging_steps: 1
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
|
||||
# ============================================================================
|
||||
# SwanLab Integration - Full Configuration
|
||||
# ============================================================================
|
||||
|
||||
plugins:
|
||||
- axolotl.integrations.swanlab.SwanLabPlugin
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
# Basic SwanLab Configuration
|
||||
# ------------------------------------------------------------------------------
|
||||
|
||||
use_swanlab: true
|
||||
swanlab_project: dpo-production
|
||||
swanlab_experiment_name: llama-3-dpo-full-featured-v1
|
||||
swanlab_description: |
|
||||
Production DPO training with all SwanLab features enabled:
|
||||
- Completion table logging for qualitative analysis
|
||||
- Performance profiling for optimization
|
||||
- Lark notifications for team collaboration
|
||||
|
||||
swanlab_mode: cloud # Options: cloud, local, offline, disabled
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
# Team Collaboration
|
||||
# ------------------------------------------------------------------------------
|
||||
|
||||
# Workspace for team collaboration (shared experiments)
|
||||
swanlab_workspace: ml-research-team
|
||||
|
||||
# Authentication (recommended: use environment variable)
|
||||
# export SWANLAB_API_KEY=your-api-key
|
||||
# Or set in config (less secure):
|
||||
# swanlab_api_key: your-api-key
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
# RLHF Completion Table Logging
|
||||
# ------------------------------------------------------------------------------
|
||||
# Automatically logs model completions for qualitative analysis:
|
||||
# - Prompts from your DPO dataset
|
||||
# - Chosen responses (preferred)
|
||||
# - Rejected responses (non-preferred)
|
||||
# - Reward differences
|
||||
#
|
||||
# View in SwanLab dashboard under "rlhf_completions" table
|
||||
|
||||
swanlab_log_completions: true
|
||||
swanlab_completion_log_interval: 100 # Log every 100 steps
|
||||
swanlab_completion_max_buffer: 256 # Larger buffer for long training runs
|
||||
|
||||
# Buffer size recommendations:
|
||||
# - 128: Default, ~64 KB memory (recommended for most cases)
|
||||
# - 256: ~128 KB memory (this config, good for longer training)
|
||||
# - 512: ~256 KB memory (maximum for very long runs)
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
# Lark (Feishu) Team Notifications
|
||||
# ------------------------------------------------------------------------------
|
||||
# Get real-time training notifications in your team chat
|
||||
#
|
||||
# Notifications sent for:
|
||||
# - Training start
|
||||
# - Training completion
|
||||
# - Training errors
|
||||
# - Metric milestones (if configured)
|
||||
|
||||
# Recommended: Set via environment variables
|
||||
# export SWANLAB_LARK_WEBHOOK_URL=https://open.feishu.cn/...
|
||||
# export SWANLAB_LARK_SECRET=your-webhook-secret
|
||||
|
||||
# Or set in config (less secure):
|
||||
# swanlab_lark_webhook_url: https://open.feishu.cn/open-apis/bot/v2/hook/xxxxxxxxxx
|
||||
# swanlab_lark_secret: your-webhook-secret # REQUIRED for production
|
||||
|
||||
# Security note: ALWAYS use swanlab_lark_secret in production to prevent
|
||||
# unauthorized parties from sending fake notifications to your team chat.
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
# Performance Profiling
|
||||
# ------------------------------------------------------------------------------
|
||||
# Profiling is automatically enabled when SwanLab is enabled.
|
||||
# Metrics logged to SwanLab under "profiling/" namespace:
|
||||
# profiling/Time taken: AxolotlTrainer.training_step
|
||||
# profiling/Time taken: AxolotlTrainer.compute_loss
|
||||
# profiling/Time taken: AxolotlTrainer.prediction_step
|
||||
#
|
||||
# Use these metrics to:
|
||||
# - Identify bottlenecks in training loop
|
||||
# - Compare performance across different configurations
|
||||
# - Monitor performance regressions over time
|
||||
# - Debug unexpected slowdowns
|
||||
|
||||
# For custom profiling in your own trainer, see:
|
||||
# examples/swanlab/custom_trainer_profiling.py
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
# Optional: Private SwanLab Deployment
|
||||
# ------------------------------------------------------------------------------
|
||||
# For enterprise users with private SwanLab deployment:
|
||||
|
||||
# swanlab_web_host: https://swanlab.yourcompany.com
|
||||
# swanlab_api_host: https://api.swanlab.yourcompany.com
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
# Optional: Model Checkpointing to SwanLab
|
||||
# ------------------------------------------------------------------------------
|
||||
# Log model checkpoints to SwanLab (coming soon)
|
||||
|
||||
swanlab_log_model: false
|
||||
|
||||
# ============================================================================
|
||||
# Disable Other Logging Tools (Recommended)
|
||||
# ============================================================================
|
||||
# Using multiple logging tools simultaneously can impact performance:
|
||||
# - Expected overhead: ~1-2% per logger
|
||||
# - Potential config/callback conflicts
|
||||
#
|
||||
# For production training, use ONLY SwanLab:
|
||||
|
||||
# wandb_project:
|
||||
# use_wandb: false
|
||||
#
|
||||
# use_mlflow: false
|
||||
#
|
||||
# use_comet: false
|
||||
|
||||
# ============================================================================
|
||||
# Expected Training Behavior
|
||||
# ============================================================================
|
||||
|
||||
# With this configuration, you should see:
|
||||
#
|
||||
# 1. SwanLab Initialization (rank 0 only):
|
||||
# INFO: SwanLab initialized for project: dpo-production
|
||||
# INFO: SwanLab experiment: llama-3-dpo-full-featured-v1
|
||||
# INFO: SwanLab mode: cloud
|
||||
# INFO: SwanLab workspace: ml-research-team
|
||||
#
|
||||
# 2. Completion Logging (rank 0 only):
|
||||
# INFO: Registered SwanLab RLHF completion logging callback for DPOTrainer
|
||||
# (log_interval=100, max_buffer=256)
|
||||
#
|
||||
# 3. Lark Notifications (rank 0 only):
|
||||
# INFO: Registered Lark notification callback with HMAC authentication
|
||||
#
|
||||
# 4. Distributed Training Detection (if multi-GPU):
|
||||
# INFO: Distributed training detected (world_size=N)
|
||||
# INFO: Only rank 0 will initialize SwanLab
|
||||
# INFO: Other ranks will skip SwanLab to avoid conflicts
|
||||
#
|
||||
# 5. Training Start Notification (Lark):
|
||||
# Your team chat receives: "Training started: llama-3-dpo-full-featured-v1"
|
||||
#
|
||||
# 6. Periodic Completion Logging:
|
||||
# Every 100 steps, completion table is updated in SwanLab dashboard
|
||||
#
|
||||
# 7. Training Complete Notification (Lark):
|
||||
# Your team chat receives: "Training completed: llama-3-dpo-full-featured-v1"
|
||||
# With link to SwanLab dashboard and final metrics
|
||||
#
|
||||
# 8. SwanLab Dashboard Shows:
|
||||
# - Training metrics (loss, learning rate, etc.)
|
||||
# - Completion table (rlhf_completions)
|
||||
# - Profiling metrics (profiling/Time taken: ...)
|
||||
# - Hyperparameters and configuration
|
||||
# - System resource usage
|
||||
|
||||
# ============================================================================
|
||||
# Production Checklist
|
||||
# ============================================================================
|
||||
|
||||
# Before deploying to production, verify:
|
||||
# ✅ SwanLab API key is set via environment variable (not in config)
|
||||
# ✅ Lark webhook secret is set (required for HMAC authentication)
|
||||
# ✅ Workspace is set to your team's workspace
|
||||
# ✅ Experiment name is descriptive and unique
|
||||
# ✅ Only SwanLab is enabled (other loggers disabled)
|
||||
# ✅ Completion logging buffer size is appropriate for your training duration
|
||||
# ✅ Private deployment hosts are set (if using enterprise SwanLab)
|
||||
# ✅ Test run completes successfully and shows up in SwanLab dashboard
|
||||
# ✅ Lark notifications are received in team chat
|
||||
# ✅ Profiling metrics are logged correctly
|
||||
|
||||
# ============================================================================
|
||||
# Troubleshooting
|
||||
# ============================================================================
|
||||
|
||||
# If SwanLab initialization fails:
|
||||
# 1. Check SWANLAB_API_KEY environment variable is set
|
||||
# 2. Verify swanlab_project is set in config
|
||||
# 3. Check swanlab_mode is valid (cloud/local/offline/disabled)
|
||||
# 4. Verify internet connectivity (for cloud mode)
|
||||
|
||||
# If Lark notifications not received:
|
||||
# 1. Check SWANLAB_LARK_WEBHOOK_URL is set correctly
|
||||
# 2. Verify SWANLAB_LARK_SECRET matches your Lark bot settings
|
||||
# 3. Test webhook manually: curl -X POST "$SWANLAB_LARK_WEBHOOK_URL" ...
|
||||
# 4. Check training logs for "Registered Lark notification callback"
|
||||
# 5. Verify bot is added to the target Lark group chat
|
||||
|
||||
# If completions not appearing in SwanLab:
|
||||
# 1. Verify you're using an RLHF trainer (DPO/KTO/ORPO/GRPO)
|
||||
# 2. Check swanlab_log_completions is true
|
||||
# 3. Wait for log_interval steps (default: 100)
|
||||
# 4. Check training logs for "Registered SwanLab RLHF completion logging"
|
||||
|
||||
# If profiling metrics not appearing:
|
||||
# 1. Verify use_swanlab is true
|
||||
# 2. Check SwanLab is initialized (check logs)
|
||||
# 3. Look under "profiling/" namespace in dashboard
|
||||
# 4. Profiling may be disabled if DEFAULT_PROFILING_CONFIG.enabled = False
|
||||
|
||||
# For more help:
|
||||
# - SwanLab docs: https://docs.swanlab.cn
|
||||
# - Axolotl SwanLab integration: src/axolotl/integrations/swanlab/README.md
|
||||
# - GitHub issues: https://github.com/axolotl-ai-cloud/axolotl/issues
|
||||
178
examples/swanlab/lora-swanlab-profiling.yml
Normal file
@@ -0,0 +1,178 @@
|
||||
# SwanLab LoRA Training Example with Performance Profiling
|
||||
#
|
||||
# This example demonstrates standard LoRA fine-tuning with SwanLab integration
|
||||
# for performance profiling and optimization.
|
||||
#
|
||||
# Features enabled:
|
||||
# - SwanLab experiment tracking
|
||||
# - Performance profiling (training step, forward/backward pass timing)
|
||||
# - Real-time metrics visualization
|
||||
#
|
||||
# To run:
|
||||
# export SWANLAB_API_KEY=your-api-key
|
||||
# accelerate launch -m axolotl.cli.train examples/swanlab/lora-swanlab-profiling.yml
|
||||
|
||||
# Model Configuration
|
||||
base_model: NousResearch/Llama-3.2-1B
|
||||
|
||||
# Dataset Configuration
|
||||
datasets:
|
||||
- path: teknium/GPT4-LLM-Cleaned
|
||||
type: alpaca
|
||||
|
||||
val_set_size: 0.1
|
||||
output_dir: ./outputs/lora-swanlab-profiling-out
|
||||
|
||||
# LoRA Configuration
|
||||
adapter: lora
|
||||
lora_r: 16
|
||||
lora_alpha: 32
|
||||
lora_dropout: 0.05
|
||||
lora_target_modules:
|
||||
- gate_proj
|
||||
- down_proj
|
||||
- up_proj
|
||||
- q_proj
|
||||
- v_proj
|
||||
- k_proj
|
||||
- o_proj
|
||||
|
||||
# Training Configuration
|
||||
sequence_len: 2048
|
||||
sample_packing: true
|
||||
eval_sample_packing: true
|
||||
|
||||
micro_batch_size: 2
|
||||
gradient_accumulation_steps: 2
|
||||
num_epochs: 1
|
||||
|
||||
# Optimization
|
||||
optimizer: adamw_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
warmup_ratio: 0.1
|
||||
weight_decay: 0.0
|
||||
|
||||
# Precision
|
||||
bf16: auto
|
||||
tf32: false
|
||||
|
||||
# Performance
|
||||
gradient_checkpointing: true
|
||||
flash_attention: true
|
||||
|
||||
# Checkpointing and Logging
|
||||
logging_steps: 1
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
|
||||
# Loss Monitoring
|
||||
loss_watchdog_threshold: 5.0
|
||||
loss_watchdog_patience: 3
|
||||
|
||||
special_tokens:
|
||||
pad_token: "<|end_of_text|>"
|
||||
|
||||
# ============================================================================
|
||||
# SwanLab Integration
|
||||
# ============================================================================
|
||||
|
||||
plugins:
|
||||
- axolotl.integrations.swanlab.SwanLabPlugin
|
||||
|
||||
# Basic SwanLab Configuration
|
||||
use_swanlab: true
|
||||
swanlab_project: lora-profiling
|
||||
swanlab_experiment_name: llama-3.2-1b-profiling-demo
|
||||
swanlab_description: "LoRA fine-tuning with performance profiling"
|
||||
swanlab_mode: cloud # Options: cloud, local, offline, disabled
|
||||
|
||||
# SwanLab Authentication
|
||||
# Recommended: Set via environment variable
|
||||
# export SWANLAB_API_KEY=your-api-key
|
||||
# Or set in config (less secure):
|
||||
# swanlab_api_key: your-api-key
|
||||
|
||||
# Optional: Team workspace
|
||||
# swanlab_workspace: my-ml-team
|
||||
|
||||
# ============================================================================
|
||||
# Performance Profiling
|
||||
# ============================================================================
|
||||
#
|
||||
# SwanLab automatically profiles trainer methods when enabled.
|
||||
# Profiling metrics appear in SwanLab dashboard under "profiling/" namespace.
|
||||
#
|
||||
# Built-in profiling:
|
||||
# - Minimal overhead (< 0.1% per step)
|
||||
# - High-precision timing (microsecond accuracy)
|
||||
# - Exception-safe (logs duration even if method fails)
|
||||
#
|
||||
# View profiling metrics in SwanLab dashboard:
|
||||
# profiling/Time taken: AxolotlTrainer.training_step
|
||||
# profiling/Time taken: AxolotlTrainer.compute_loss
|
||||
# profiling/Time taken: AxolotlTrainer.prediction_step
|
||||
#
|
||||
# For custom profiling in your own trainer, see:
|
||||
# examples/swanlab/custom_trainer_profiling.py
|
||||
|
||||
# Completion logging is disabled for non-RLHF trainers
|
||||
swanlab_log_completions: false # Only works with DPO/KTO/ORPO/GRPO
|
||||
|
||||
# ============================================================================
|
||||
# Optional: Compare with Multiple Runs
|
||||
# ============================================================================
|
||||
#
|
||||
# To compare profiling metrics across different configurations:
|
||||
#
|
||||
# 1. Run baseline without flash attention:
|
||||
# swanlab_experiment_name: llama-3.2-1b-no-flash-attn
|
||||
# flash_attention: false
|
||||
#
|
||||
# 2. Run with gradient checkpointing:
|
||||
# swanlab_experiment_name: llama-3.2-1b-grad-checkpoint
|
||||
# gradient_checkpointing: true
|
||||
#
|
||||
# 3. Run with both:
|
||||
# swanlab_experiment_name: llama-3.2-1b-optimized
|
||||
# flash_attention: true
|
||||
# gradient_checkpointing: true
|
||||
#
|
||||
# Then compare profiling metrics in SwanLab dashboard to see performance impact
|
||||
|
||||
# ============================================================================
|
||||
# Optional: Lark (Feishu) Team Notifications
|
||||
# ============================================================================
|
||||
#
|
||||
# Get notified when profiling experiments complete:
|
||||
|
||||
# swanlab_lark_webhook_url: https://open.feishu.cn/open-apis/bot/v2/hook/xxxxxxxxxx
|
||||
# swanlab_lark_secret: your-webhook-secret
|
||||
|
||||
# ============================================================================
|
||||
# Profiling Best Practices
|
||||
# ============================================================================
|
||||
#
|
||||
# 1. Run multiple epochs to see profiling trends over time
|
||||
# 2. Ignore first ~10 steps (warmup period, slower)
|
||||
# 3. Look for outliers (steps that take significantly longer)
|
||||
# 4. Compare profiling metrics before/after optimization changes
|
||||
# 5. Monitor per-rank profiling in distributed training
|
||||
#
|
||||
# Common bottlenecks to profile:
|
||||
# - training_step: Overall step time (should be consistent)
|
||||
# - compute_loss: Loss computation (scales with sequence length)
|
||||
# - prediction_step: Evaluation time (can be slow for large val sets)
|
||||
#
|
||||
# If you see inconsistent timing:
|
||||
# - Check for data loading bottlenecks
|
||||
# - Monitor GPU utilization (may be CPU-bound)
|
||||
# - Check for gradient accumulation effects
|
||||
# - Verify CUDA kernel synchronization
|
||||
|
||||
# ============================================================================
|
||||
# Disable WandB if you're migrating from it
|
||||
# ============================================================================
|
||||
|
||||
# wandb_project:
|
||||
# use_wandb: false
|
||||
42
examples/trinity/README.md
Normal file
@@ -0,0 +1,42 @@
|
||||
# Finetune ArceeAI's Trinity with Axolotl
|
||||
|
||||
[Trinity](https://huggingface.co/collections/arcee-ai/trinity) is a family of open-weight MoE models trained by Arcee.ai.
|
||||
|
||||
This guide shows how to fine-tune it with Axolotl on multi-turn conversations with proper masking.
|
||||
|
||||
## Getting started
|
||||
|
||||
1. Install Axolotl (main/edge build) following the [installation guide](https://docs.axolotl.ai/docs/installation.html#sec-edge-build).
|
||||
|
||||
2. Run the finetuning example:
|
||||
|
||||
```bash
|
||||
axolotl train examples/trinity/trinity-nano-preview-qlora.yaml
|
||||
```
|
||||
|
||||
This config uses about 24.9 GiB VRAM.
|
||||
|
||||
Let us know how it goes. Happy finetuning! 🚀
|
||||
|
||||
### TIPS
|
||||
|
||||
- For inference, the official Arcee.ai team recommends `top_p: 0.75`, `temperature: 0.15`, `top_k: 50`, and `min_p: 0.06`.
|
||||
- You can run a full fine-tune by removing `adapter: qlora` and `load_in_4bit: true` from the config.
|
||||
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
|
||||
- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template); a minimal example is sketched below.
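
A minimal sketch of one training record in that format (field names follow the `chat_template` loader defaults and are illustrative; adjust them to your dataset):

```python
# One record in the OpenAI Messages format; typically stored as one JSON
# object per line in a .jsonl dataset file.
example = {
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Summarize the Trinity model family in one sentence."},
        {"role": "assistant", "content": "Trinity is Arcee.ai's family of open-weight MoE models."},
    ]
}
```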
|
||||
|
||||
## Optimization Guides
|
||||
|
||||
Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html).
|
||||
|
||||
## Limitations
|
||||
|
||||
**Cut Cross Entropy (CCE)**: Currently not supported. We plan to include CCE support for Trinity in the near future.
|
||||
|
||||
## Related Resources
|
||||
|
||||
- [Trinity Blog](https://www.arcee.ai/blog/the-trinity-manifesto)
|
||||
- [Axolotl Docs](https://docs.axolotl.ai)
|
||||
- [Axolotl Website](https://axolotl.ai)
|
||||
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
|
||||
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
|
||||
68
examples/trinity/trinity-nano-preview-qlora.yaml
Normal file
@@ -0,0 +1,68 @@
|
||||
base_model: arcee-ai/Trinity-Nano-Preview
|
||||
trust_remote_code: true
|
||||
revision_of_model: 2ee94b0
|
||||
|
||||
# Automatically upload checkpoint and final model to HF
|
||||
# hub_model_id: username/custom_model_name
|
||||
|
||||
# CCE - N/A as of now
|
||||
# plugins:
|
||||
# - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
|
||||
|
||||
load_in_8bit: false
|
||||
load_in_4bit: true
|
||||
|
||||
datasets:
|
||||
- path: fozziethebeat/alpaca_messages_2k_test
|
||||
type: chat_template
|
||||
|
||||
dataset_prepared_path: last_run_prepared
|
||||
val_set_size: 0.1
|
||||
output_dir: ./outputs/lora-out
|
||||
|
||||
adapter: qlora
|
||||
lora_model_dir:
|
||||
|
||||
sequence_len: 2048
|
||||
sample_packing: true
|
||||
|
||||
lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_linear: true
|
||||
lora_target_modules:
|
||||
- gate_proj
|
||||
- down_proj
|
||||
- up_proj
|
||||
- q_proj
|
||||
- v_proj
|
||||
- k_proj
|
||||
- o_proj
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
gradient_accumulation_steps: 4
|
||||
micro_batch_size: 2
|
||||
num_epochs: 1
|
||||
optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
bf16: auto
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
# flash_attention: true # Not supported
|
||||
sdp_attention: true
|
||||
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 1
|
||||
saves_per_epoch: 1
|
||||
|
||||
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||
@@ -1,5 +1,5 @@
|
||||
base_model: mistralai/Voxtral-Mini-3B-2507
|
||||
processor_type: AutoProcessor
|
||||
processor_type: VoxtralProcessor
|
||||
|
||||
# Automatically upload checkpoint and final model to HF
|
||||
# hub_model_id: username/custom_model_name
|
||||
|
||||
@@ -1,33 +1,34 @@
|
||||
--extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
|
||||
|
||||
# START section of dependencies that don't install on Darwin/MacOS
|
||||
bitsandbytes==0.47.0
|
||||
bitsandbytes==0.49.1
|
||||
triton>=3.0.0
|
||||
mamba-ssm==1.2.0.post1
|
||||
xformers>=0.0.23.post1
|
||||
liger-kernel==0.6.3
|
||||
liger-kernel==0.6.4
|
||||
# END section
|
||||
|
||||
packaging==23.2
|
||||
|
||||
huggingface_hub>=0.36.0
|
||||
peft>=0.17.1
|
||||
tokenizers>=0.21.1
|
||||
peft>=0.18.0
|
||||
tokenizers>=0.22.1
|
||||
transformers==4.57.1
|
||||
accelerate==1.10.1
|
||||
datasets==4.3.0
|
||||
deepspeed>=0.17.0
|
||||
trl==0.24.0
|
||||
accelerate==1.12.0
|
||||
datasets==4.4.2
|
||||
deepspeed>=0.18.3
|
||||
trl==0.25.1
|
||||
hf_xet==1.2.0
|
||||
kernels>=0.9.0
|
||||
trackio
|
||||
kernels==0.11.5
|
||||
trackio>=0.13.0
|
||||
typing-extensions>=4.15.0
|
||||
|
||||
optimum==1.16.2
|
||||
hf_transfer
|
||||
sentencepiece
|
||||
gradio==5.49.1
|
||||
gradio>=6.2.0,<7.0
|
||||
|
||||
modal==1.0.2
|
||||
modal==1.3.0.post1
|
||||
pydantic>=2.10.6
|
||||
addict
|
||||
fire
|
||||
@@ -42,7 +43,6 @@ numpy>=2.2.6
|
||||
# qlora things
|
||||
evaluate==0.4.1
|
||||
scipy
|
||||
scikit-learn==1.4.2
|
||||
nvidia-ml-py==12.560.30
|
||||
art
|
||||
tensorboard
|
||||
@@ -64,9 +64,12 @@ immutabledict==4.2.0
|
||||
antlr4-python3-runtime==4.13.2
|
||||
|
||||
torchao==0.13.0
|
||||
openenv-core==0.1.0
|
||||
schedulefree==1.4.1
|
||||
|
||||
axolotl-contribs-lgpl==0.0.7
|
||||
axolotl-contribs-mit==0.0.5
|
||||
axolotl-contribs-mit==0.0.6
|
||||
# telemetry
|
||||
posthog==6.7.11
|
||||
|
||||
mistral-common==1.8.5
|
||||
mistral-common==1.8.6
|
||||
|
||||
@@ -29,5 +29,5 @@ UV_PREFIX = "uv " if USE_UV else ""
|
||||
|
||||
print(
|
||||
UNINSTALL_PREFIX
|
||||
+ f'{UV_PREFIX}pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@8a1a0ec"'
|
||||
+ f'{UV_PREFIX}pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@318b7e2"'
|
||||
)
|
||||
|
||||
5
setup.py
@@ -66,7 +66,6 @@ def parse_requirements(extras_require_map):
|
||||
extras_require_map.pop("fbgemm-gpu")
|
||||
extras_require_map["fbgemm-gpu"] = ["fbgemm-gpu-genai==1.4.1"]
|
||||
extras_require_map["vllm"] = ["vllm==0.11.1"]
|
||||
_install_requires.pop(_install_requires.index(xformers_version))
|
||||
elif (major, minor) >= (2, 8):
|
||||
extras_require_map.pop("fbgemm-gpu")
|
||||
extras_require_map["fbgemm-gpu"] = ["fbgemm-gpu-genai==1.3.0"]
|
||||
@@ -130,7 +129,7 @@ extras_require = {
|
||||
"ring-flash-attn>=0.1.7",
|
||||
],
|
||||
"deepspeed": [
|
||||
"deepspeed==0.17.5",
|
||||
"deepspeed==0.18.2",
|
||||
"deepspeed-kernels",
|
||||
],
|
||||
"mamba-ssm": [
|
||||
@@ -157,7 +156,7 @@ extras_require = {
|
||||
"came_pytorch==0.1.3",
|
||||
],
|
||||
"ray": [
|
||||
"ray[train]",
|
||||
"ray[train]>=2.52.1",
|
||||
],
|
||||
"vllm": [
|
||||
"vllm==0.10.0",
|
||||
|
||||
@@ -24,8 +24,7 @@ if launcher_args:
|
||||
launcher_args_str = "-- " + " ".join(launcher_args)
|
||||
|
||||
# 1. Define a base image for your training job
|
||||
# must use torch 2.7.0 for vllm
|
||||
BASE_IMAGE = "axolotlai/axolotl:main-py3.11-cu126-2.7.1"
|
||||
BASE_IMAGE = "axolotlai/axolotl:main-py3.11-cu128-2.9.1"
|
||||
|
||||
# 2. Define the Runtime Environment for the Training Job
|
||||
# This includes start commands and environment variables.
|
||||
|
||||
@@ -82,7 +82,7 @@ class ModalCloud(Cloud):
|
||||
return res
|
||||
|
||||
def get_image(self):
|
||||
docker_tag = "main-py3.11-cu126-2.7.1"
|
||||
docker_tag = "main-py3.11-cu128-2.9.1"
|
||||
if self.config.docker_tag:
|
||||
docker_tag = self.config.docker_tag
|
||||
docker_image = f"axolotlai/axolotl:{docker_tag}"
|
||||
|
||||
@@ -14,6 +14,8 @@ import yaml
|
||||
from transformers.utils import is_torch_bf16_gpu_available
|
||||
|
||||
from axolotl.integrations.base import PluginManager
|
||||
from axolotl.telemetry.errors import send_errors
|
||||
from axolotl.telemetry.manager import TelemetryManager
|
||||
from axolotl.utils.comet_ import setup_comet_env_vars
|
||||
from axolotl.utils.config import (
|
||||
normalize_cfg_datasets,
|
||||
@@ -24,6 +26,7 @@ from axolotl.utils.dict import DictDefault
|
||||
from axolotl.utils.logging import get_logger
|
||||
from axolotl.utils.mlflow_ import setup_mlflow_env_vars
|
||||
from axolotl.utils.tee import prepare_debug_log
|
||||
from axolotl.utils.trackio_ import setup_trackio_env_vars
|
||||
from axolotl.utils.trainer import prepare_optim_env
|
||||
from axolotl.utils.wandb_ import setup_wandb_env_vars
|
||||
|
||||
@@ -31,6 +34,8 @@ LOG = get_logger(__name__)
|
||||
|
||||
API_KEY_FIELDS = {"comet_api_key"}
|
||||
|
||||
TELEMETRY_MANAGER = TelemetryManager.get_instance()
|
||||
|
||||
|
||||
def check_remote_config(config: Union[str, Path]) -> Union[str, Path]:
|
||||
"""
|
||||
@@ -164,6 +169,7 @@ def plugin_set_cfg(cfg: DictDefault):
|
||||
plugin_manager.cfg = cfg
|
||||
|
||||
|
||||
@send_errors
|
||||
def load_cfg(
|
||||
config: str | Path | DictDefault = Path("examples/"), **kwargs
|
||||
) -> DictDefault:
|
||||
@@ -197,6 +203,8 @@ def load_cfg(
|
||||
temp_file.close()
|
||||
cfg.axolotl_config_path = temp_file.name
|
||||
|
||||
TELEMETRY_MANAGER.send_event(event_type="config-loaded", properties=cfg)
|
||||
|
||||
# If there are any options passed in the cli, if it is something that seems valid
|
||||
# from the yaml, then overwrite the value
|
||||
cfg_keys = cfg.keys()
|
||||
@@ -220,6 +228,7 @@ def load_cfg(
|
||||
cfg,
|
||||
capabilities={
|
||||
"bf16": is_torch_bf16_gpu_available(),
|
||||
"fp8": compute_supports_fp8(),
|
||||
"n_gpu": int(os.environ.get("WORLD_SIZE", 1)),
|
||||
"compute_capability": gpu_version,
|
||||
},
|
||||
@@ -238,8 +247,10 @@ def load_cfg(
|
||||
setup_wandb_env_vars(cfg)
|
||||
setup_mlflow_env_vars(cfg)
|
||||
setup_comet_env_vars(cfg)
|
||||
setup_trackio_env_vars(cfg)
|
||||
plugin_set_cfg(cfg)
|
||||
|
||||
TELEMETRY_MANAGER.send_event(event_type="config-processed", properties=cfg)
|
||||
cfg_to_log = {
|
||||
k: "[REDACTED]" if k in API_KEY_FIELDS else v
|
||||
for k, v in cfg.items()
|
||||
@@ -251,3 +262,11 @@ def load_cfg(
|
||||
)
|
||||
|
||||
return cfg
|
||||
|
||||
|
||||
def compute_supports_fp8() -> bool:
|
||||
try:
|
||||
compute_capability = torch.cuda.get_device_capability()
|
||||
return compute_capability >= (9, 0)
|
||||
except RuntimeError:
|
||||
return False
|
||||
|
||||
@@ -19,7 +19,10 @@ from axolotl.cli.utils.diffusion import (
|
||||
launch_diffusion_gradio_ui,
|
||||
)
|
||||
from axolotl.integrations.base import PluginManager
|
||||
from axolotl.utils.chat_templates import get_chat_template_from_config
|
||||
from axolotl.telemetry.errors import send_errors
|
||||
from axolotl.utils.chat_templates import (
|
||||
get_chat_template_from_config,
|
||||
)
|
||||
from axolotl.utils.dict import DictDefault
|
||||
from axolotl.utils.logging import get_logger
|
||||
|
||||
@@ -43,6 +46,7 @@ def get_multi_line_input() -> str:
|
||||
return instruction
|
||||
|
||||
|
||||
@send_errors
|
||||
def do_inference(
|
||||
*,
|
||||
cfg: DictDefault,
|
||||
@@ -160,6 +164,7 @@ def do_inference(
|
||||
print(tokenizer.decode(generated["sequences"].cpu().tolist()[0]))
|
||||
|
||||
|
||||
@send_errors
|
||||
def do_inference_gradio(
|
||||
*,
|
||||
cfg: DictDefault,
|
||||
@@ -283,8 +288,8 @@ def do_inference_gradio(
|
||||
title=cfg.get("gradio_title", "Axolotl Gradio Interface"),
|
||||
)
|
||||
|
||||
demo.queue().launch(
|
||||
show_api=False,
|
||||
demo.launch(
|
||||
footer_links=["gradio", "settings"],
|
||||
share=cfg.get("gradio_share", True),
|
||||
server_name=cfg.get("gradio_server_name", "127.0.0.1"),
|
||||
server_port=cfg.get("gradio_server_port", None),
|
||||
|
||||
@@ -26,7 +26,7 @@ from axolotl.cli.utils import (
|
||||
launch_training,
|
||||
)
|
||||
from axolotl.integrations.lm_eval.cli import lm_eval
|
||||
from axolotl.utils import set_pytorch_cuda_alloc_conf
|
||||
from axolotl.utils import set_misc_env, set_pytorch_cuda_alloc_conf
|
||||
from axolotl.utils.logging import get_logger
|
||||
from axolotl.utils.schemas.config import AxolotlInputConfig
|
||||
|
||||
@@ -45,6 +45,7 @@ def cli():
|
||||
print_axolotl_text_art()
|
||||
load_dotenv()
|
||||
set_pytorch_cuda_alloc_conf()
|
||||
set_misc_env()
|
||||
|
||||
|
||||
@cli.command()
|
||||
|
||||
@@ -7,12 +7,14 @@ import fire
|
||||
|
||||
from axolotl.cli.config import load_cfg
|
||||
from axolotl.cli.utils import load_model_and_tokenizer
|
||||
from axolotl.telemetry.errors import send_errors
|
||||
from axolotl.utils.dict import DictDefault
|
||||
from axolotl.utils.logging import get_logger
|
||||
|
||||
LOG = get_logger(__name__)
|
||||
|
||||
|
||||
@send_errors
|
||||
def do_merge_lora(*, cfg: DictDefault) -> None:
|
||||
"""
|
||||
Calls `transformers`' `merge_and_unload` on the model given in the `axolotl` config
|
||||
|
||||
Some files were not shown because too many files have changed in this diff.