Compare commits
58 Commits
fix/hpc-ro
...
upgrade-to
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
7a08e4117a | ||
|
|
b26ba3a5cb | ||
|
|
afe18ace35 | ||
|
|
2b199f9915 | ||
|
|
e73dab6df9 | ||
|
|
f45a97a9ff | ||
|
|
11c0b5b256 | ||
|
|
66a3de3629 | ||
|
|
a6080df73c | ||
|
|
4f5e8a328a | ||
|
|
418933f0d1 | ||
|
|
372f664c63 | ||
|
|
97f1b1758d | ||
|
|
f2155eaf79 | ||
|
|
92ee4256f7 | ||
|
|
efeb5a4e41 | ||
|
|
faaff6c792 | ||
|
|
43cef27458 | ||
|
|
07c41a6c2a | ||
|
|
bbd3486f57 | ||
|
|
3750d7dd64 | ||
|
|
2197b0bf89 | ||
|
|
3e51a680c2 | ||
|
|
2cf254b4af | ||
|
|
83d4d97dcc | ||
|
|
a1d07f42e4 | ||
|
|
2a664dc8ad | ||
|
|
4ac78aa562 | ||
|
|
b3f4aa149f | ||
|
|
75b20fb66f | ||
|
|
5992e607a2 | ||
|
|
2b66ee189c | ||
|
|
86d8cca149 | ||
|
|
4a0f98e612 | ||
|
|
c6ddcdd06a | ||
|
|
7fb6a947d9 | ||
|
|
b234532d9f | ||
|
|
8990ca3205 | ||
|
|
006f226270 | ||
|
|
0b635e69c5 | ||
|
|
0d27e14e45 | ||
|
|
f5f21fb216 | ||
|
|
4e55871112 | ||
|
|
a6bafb55cb | ||
|
|
0fbde69e9c | ||
|
|
301e22849f | ||
|
|
dcf24fd24e | ||
|
|
49b8107989 | ||
|
|
9901ee5602 | ||
|
|
dd78f2e0cc | ||
|
|
b54f9c942b | ||
|
|
11eb36585a | ||
|
|
d0c846fc5e | ||
|
|
b5fcc2f14b | ||
|
|
b62eed8809 | ||
|
|
ed2e8cacd6 | ||
|
|
80270a92fa | ||
|
|
bfdc9a8249 |
6
.github/FUNDING.yml
vendored
6
.github/FUNDING.yml
vendored
@@ -1,13 +1,13 @@
|
||||
# These are supported funding model platforms
|
||||
|
||||
github: [winglian, OpenAccess-AI-Collective] # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
|
||||
github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
|
||||
patreon: # Replace with a single Patreon username
|
||||
open_collective: # Replace with a single Open Collective username
|
||||
ko_fi: axolotl_ai # Replace with a single Ko-fi username
|
||||
ko_fi: # Replace with a single Ko-fi username
|
||||
tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
|
||||
community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
|
||||
liberapay: # Replace with a single Liberapay username
|
||||
issuehunt: # Replace with a single IssueHunt username
|
||||
otechie: # Replace with a single Otechie username
|
||||
lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry
|
||||
custom: ['https://quickchart.io/qr?text=bitcoin%3Abc1qxlgwlqwfea5s2cxm42xqsfmwjct0rj8w8ea5np&size=480&centerImageUrl=https%3A%2F%2Fupload.wikimedia.org%2Fwikipedia%2Fcommons%2Fthumb%2F4%2F46%2FBitcoin.svg%2F64px-Bitcoin.svg.png'] # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
|
||||
custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
|
||||
|
||||
54
.github/workflows/base.yml
vendored
54
.github/workflows/base.yml
vendored
@@ -25,27 +25,6 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- cuda: "126"
|
||||
cuda_version: 12.6.3
|
||||
cudnn_version: ""
|
||||
python_version: "3.11"
|
||||
pytorch: 2.7.0
|
||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||
dockerfile: "Dockerfile-base"
|
||||
- cuda: "126"
|
||||
cuda_version: 12.6.3
|
||||
cudnn_version: ""
|
||||
python_version: "3.11"
|
||||
pytorch: 2.7.1
|
||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||
dockerfile: "Dockerfile-base"
|
||||
- cuda: "128"
|
||||
cuda_version: 12.8.1
|
||||
cudnn_version: ""
|
||||
python_version: "3.11"
|
||||
pytorch: 2.7.1
|
||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||
dockerfile: "Dockerfile-base"
|
||||
- cuda: "128"
|
||||
cuda_version: 12.8.1
|
||||
cudnn_version: ""
|
||||
@@ -60,11 +39,18 @@ jobs:
|
||||
pytorch: 2.9.0
|
||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||
dockerfile: "Dockerfile-base"
|
||||
- cuda: "128"
|
||||
cuda_version: 12.8.1
|
||||
cudnn_version: ""
|
||||
python_version: "3.11"
|
||||
pytorch: 2.9.1
|
||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||
dockerfile: "Dockerfile-base"
|
||||
- cuda: "130"
|
||||
cuda_version: 13.0.0
|
||||
cudnn_version: ""
|
||||
python_version: "3.11"
|
||||
pytorch: 2.9.0
|
||||
pytorch: 2.9.1
|
||||
torch_cuda_arch_list: "9.0+PTX"
|
||||
dockerfile: "Dockerfile-base"
|
||||
# - cuda: "128"
|
||||
@@ -90,7 +76,6 @@ jobs:
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: |
|
||||
winglian/axolotl-base
|
||||
axolotlai/axolotl-base
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@v2
|
||||
@@ -122,20 +107,6 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- cuda: "126"
|
||||
cuda_version: 12.6.3
|
||||
cudnn_version: ""
|
||||
python_version: "3.11"
|
||||
pytorch: 2.7.1
|
||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||
dockerfile: "Dockerfile-uv-base"
|
||||
- cuda: "128"
|
||||
cuda_version: 12.8.1
|
||||
cudnn_version: ""
|
||||
python_version: "3.11"
|
||||
pytorch: 2.7.1
|
||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||
dockerfile: "Dockerfile-uv-base"
|
||||
- cuda: "128"
|
||||
cuda_version: 12.8.1
|
||||
cudnn_version: ""
|
||||
@@ -143,6 +114,13 @@ jobs:
|
||||
pytorch: 2.8.0
|
||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||
dockerfile: "Dockerfile-uv-base"
|
||||
- cuda: "128"
|
||||
cuda_version: 12.8.1
|
||||
cudnn_version: ""
|
||||
python_version: "3.11"
|
||||
pytorch: 2.9.1
|
||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||
dockerfile: "Dockerfile-uv-base"
|
||||
- cuda: "128"
|
||||
cuda_version: 12.8.1
|
||||
cudnn_version: ""
|
||||
@@ -154,7 +132,7 @@ jobs:
|
||||
cuda_version: 13.0.0
|
||||
cudnn_version: ""
|
||||
python_version: "3.11"
|
||||
pytorch: 2.9.0
|
||||
pytorch: 2.9.1
|
||||
torch_cuda_arch_list: "9.0+PTX"
|
||||
dockerfile: "Dockerfile-uv-base"
|
||||
steps:
|
||||
|
||||
3
.github/workflows/docs.yml
vendored
3
.github/workflows/docs.yml
vendored
@@ -12,6 +12,9 @@ jobs:
|
||||
build-deploy:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: cleanup node
|
||||
run: |
|
||||
sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /opt/hostedtoolcache/CodeQL
|
||||
- name: Check out repository
|
||||
uses: actions/checkout@v4
|
||||
- name: Set up Quarto
|
||||
|
||||
91
.github/workflows/main.yml
vendored
91
.github/workflows/main.yml
vendored
@@ -15,27 +15,27 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- cuda: 126
|
||||
cuda_version: 12.6.3
|
||||
python_version: "3.11"
|
||||
pytorch: 2.7.0
|
||||
axolotl_extras:
|
||||
- cuda: 126
|
||||
cuda_version: 12.6.3
|
||||
python_version: "3.11"
|
||||
pytorch: 2.7.1
|
||||
axolotl_extras: vllm
|
||||
is_latest: true
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.7.1
|
||||
axolotl_extras:
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.8.0
|
||||
axolotl_extras:
|
||||
is_latest: true
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.9.0
|
||||
axolotl_extras:
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.9.1
|
||||
axolotl_extras:
|
||||
# - cuda: 130
|
||||
# cuda_version: 13.0.0
|
||||
# python_version: "3.11"
|
||||
# pytorch: 2.9.1
|
||||
# axolotl_extras:
|
||||
runs-on: axolotl-gpu-runner
|
||||
steps:
|
||||
- name: Checkout
|
||||
@@ -45,7 +45,6 @@ jobs:
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: |
|
||||
winglian/axolotl
|
||||
axolotlai/axolotl
|
||||
tags: |
|
||||
type=ref,event=branch
|
||||
@@ -83,33 +82,27 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- cuda: 126
|
||||
cuda_version: 12.6.3
|
||||
python_version: "3.11"
|
||||
pytorch: 2.7.0
|
||||
axolotl_extras:
|
||||
- cuda: 126
|
||||
cuda_version: 12.6.3
|
||||
python_version: "3.11"
|
||||
pytorch: 2.7.1
|
||||
axolotl_extras:
|
||||
is_latest:
|
||||
- cuda: 126
|
||||
cuda_version: 12.6.3
|
||||
python_version: "3.11"
|
||||
pytorch: 2.7.1
|
||||
axolotl_extras: vllm
|
||||
is_latest: true
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.7.1
|
||||
axolotl_extras:
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.8.0
|
||||
axolotl_extras:
|
||||
is_latest: true
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.9.0
|
||||
axolotl_extras:
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.9.1
|
||||
axolotl_extras:
|
||||
# - cuda: 130
|
||||
# cuda_version: 13.0.0
|
||||
# python_version: "3.11"
|
||||
# pytorch: 2.9.1
|
||||
# axolotl_extras:
|
||||
runs-on: axolotl-gpu-runner
|
||||
steps:
|
||||
- name: Checkout
|
||||
@@ -119,7 +112,6 @@ jobs:
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: |
|
||||
winglian/axolotl-cloud
|
||||
axolotlai/axolotl-cloud
|
||||
tags: |
|
||||
type=ref,event=branch
|
||||
@@ -152,24 +144,18 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- cuda: 126
|
||||
cuda_version: 12.6.3
|
||||
python_version: "3.11"
|
||||
pytorch: 2.7.1
|
||||
axolotl_extras:
|
||||
is_latest:
|
||||
- cuda: 126
|
||||
cuda_version: 12.6.3
|
||||
python_version: "3.11"
|
||||
pytorch: 2.7.1
|
||||
axolotl_extras: vllm
|
||||
is_latest: true
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.8.0
|
||||
axolotl_extras:
|
||||
is_latest:
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.9.1
|
||||
axolotl_extras:
|
||||
is_latest:
|
||||
runs-on: axolotl-gpu-runner
|
||||
steps:
|
||||
- name: Checkout
|
||||
@@ -179,7 +165,6 @@ jobs:
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: |
|
||||
winglian/axolotl-cloud-term
|
||||
axolotlai/axolotl-cloud-term
|
||||
tags: |
|
||||
type=ref,event=branch
|
||||
|
||||
16
.github/workflows/multi-gpu-e2e.yml
vendored
16
.github/workflows/multi-gpu-e2e.yml
vendored
@@ -19,6 +19,9 @@ concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}
|
||||
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
|
||||
|
||||
env:
|
||||
MODAL_IMAGE_BUILDER_VERSION: "2025.06"
|
||||
|
||||
jobs:
|
||||
test-axolotl-multigpu:
|
||||
if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' && (github.event_name != 'pull_request' || !github.event.pull_request.draft) }}
|
||||
@@ -26,13 +29,6 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- cuda: 126
|
||||
cuda_version: 12.6.3
|
||||
python_version: "3.11"
|
||||
pytorch: 2.7.1
|
||||
axolotl_extras: vllm
|
||||
num_gpus: 2
|
||||
nightly_build: "true"
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
@@ -43,7 +39,7 @@ jobs:
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.9.0
|
||||
pytorch: 2.9.1
|
||||
axolotl_extras: fbgemm-gpu
|
||||
num_gpus: 2
|
||||
nightly_build: "true"
|
||||
@@ -59,7 +55,7 @@ jobs:
|
||||
- name: Install Modal
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install modal==1.0.2 jinja2
|
||||
pip install modal==1.3.0.post1 jinja2
|
||||
- name: Update env vars
|
||||
run: |
|
||||
echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
|
||||
@@ -72,4 +68,4 @@ jobs:
|
||||
echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
|
||||
- name: Run tests job on Modal
|
||||
run: |
|
||||
modal run cicd.multigpu
|
||||
modal run -m cicd.multigpu
|
||||
|
||||
22
.github/workflows/nightlies.yml
vendored
22
.github/workflows/nightlies.yml
vendored
@@ -12,16 +12,16 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- cuda: 126
|
||||
cuda_version: 12.6.3
|
||||
python_version: "3.11"
|
||||
pytorch: 2.7.1
|
||||
axolotl_extras:
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.8.0
|
||||
axolotl_extras:
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.9.1
|
||||
axolotl_extras:
|
||||
runs-on: axolotl-gpu-runner
|
||||
steps:
|
||||
- name: Checkout
|
||||
@@ -31,7 +31,6 @@ jobs:
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: |
|
||||
winglian/axolotl
|
||||
axolotlai/axolotl
|
||||
tags: |
|
||||
type=raw,value={{ branch }}-{{ date 'YYYYMMDD' }}
|
||||
@@ -65,16 +64,16 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- cuda: 126
|
||||
cuda_version: 12.6.3
|
||||
python_version: "3.11"
|
||||
pytorch: 2.7.1
|
||||
axolotl_extras:
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.8.0
|
||||
axolotl_extras:
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.9.1
|
||||
axolotl_extras:
|
||||
runs-on: axolotl-gpu-runner
|
||||
steps:
|
||||
- name: Checkout
|
||||
@@ -84,7 +83,6 @@ jobs:
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: |
|
||||
winglian/axolotl-cloud
|
||||
axolotlai/axolotl-cloud
|
||||
tags: |
|
||||
type=raw,value={{ branch }}-{{ date 'YYYYMMDD' }}
|
||||
|
||||
5
.github/workflows/preview-docs.yml
vendored
5
.github/workflows/preview-docs.yml
vendored
@@ -11,6 +11,7 @@ on:
|
||||
- '_quarto.yml'
|
||||
- docs/scripts/generate_config_docs.py
|
||||
- src/axolotl/utils/schemas/**.py
|
||||
- .github/workflows/preview-docs.yml
|
||||
|
||||
permissions:
|
||||
checks: write
|
||||
@@ -27,6 +28,10 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ !github.event.pull_request.draft }}
|
||||
steps:
|
||||
- name: cleanup node
|
||||
run: |
|
||||
sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /opt/hostedtoolcache/CodeQL
|
||||
|
||||
- name: Check out repository
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
|
||||
20
.github/workflows/tests-nightly.yml
vendored
20
.github/workflows/tests-nightly.yml
vendored
@@ -26,7 +26,7 @@ jobs:
|
||||
max-parallel: 2
|
||||
matrix:
|
||||
python_version: ["3.11"]
|
||||
pytorch_version: ["2.7.1", "2.8.0"]
|
||||
pytorch_version: ["2.8.0", "2.9.0", "2.9.1"]
|
||||
timeout-minutes: 20
|
||||
|
||||
steps:
|
||||
@@ -99,17 +99,17 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- cuda: 126
|
||||
cuda_version: 12.6.3
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.7.1
|
||||
pytorch: 2.8.0
|
||||
num_gpus: 1
|
||||
axolotl_extras:
|
||||
nightly_build: "true"
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.8.0
|
||||
pytorch: 2.9.1
|
||||
num_gpus: 1
|
||||
axolotl_extras:
|
||||
nightly_build: "true"
|
||||
@@ -123,7 +123,7 @@ jobs:
|
||||
- name: Install Modal
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install modal==1.0.2 jinja2
|
||||
pip install modal==1.3.0.post1 jinja2
|
||||
- name: Update env vars
|
||||
run: |
|
||||
echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
|
||||
@@ -148,10 +148,10 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- cuda: 126
|
||||
cuda_version: 12.6.3
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.7.1
|
||||
pytorch: 2.9.1
|
||||
num_gpus: 2
|
||||
axolotl_extras:
|
||||
nightly_build: "true"
|
||||
@@ -165,7 +165,7 @@ jobs:
|
||||
- name: Install Modal
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install modal==1.0.2 jinja2
|
||||
pip install modal==1.3.0.post1 jinja2
|
||||
- name: Update env vars
|
||||
run: |
|
||||
echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
|
||||
|
||||
81
.github/workflows/tests.yml
vendored
81
.github/workflows/tests.yml
vendored
@@ -55,18 +55,23 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
python_version: ["3.11"]
|
||||
pytorch_version: ["2.7.1", "2.8.0", "2.9.0"]
|
||||
pytorch_version: ["2.8.0", "2.9.0", "2.9.1"]
|
||||
timeout-minutes: 20
|
||||
|
||||
steps:
|
||||
- name: cleanup node
|
||||
run: |
|
||||
sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /opt/hostedtoolcache/CodeQL
|
||||
|
||||
- name: Check out repository code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Restore Cache from S3
|
||||
id: hf-cache-restore-s3
|
||||
run: |
|
||||
mkdir -p /home/runner/.cache/huggingface/hub
|
||||
curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xf - -C /home/runner/.cache/huggingface/hub/ --use-compress-program unzstd
|
||||
mkdir -p ~/.cache/huggingface/hub
|
||||
curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xpf - -C ~/.cache/huggingface/hub/ --use-compress-program unzstd --strip-components=1
|
||||
ls -ltr ~/.cache/huggingface/hub/
|
||||
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v5
|
||||
@@ -91,6 +96,10 @@ jobs:
|
||||
python scripts/cutcrossentropy_install.py | sh
|
||||
pip3 install -r requirements-dev.txt -r requirements-tests.txt
|
||||
|
||||
- name: cleanup pip cache
|
||||
run: |
|
||||
find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
|
||||
|
||||
- name: Make sure PyTorch version wasn't clobbered
|
||||
run: |
|
||||
python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
|
||||
@@ -103,13 +112,23 @@ jobs:
|
||||
run: |
|
||||
huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
|
||||
|
||||
- name: Show HF cache
|
||||
run: hf cache scan
|
||||
|
||||
- name: Run tests
|
||||
run: |
|
||||
pytest -v --durations=10 -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ --ignore=tests/monkeypatch/ tests/ --cov=axolotl --cov-report=xml
|
||||
df -h
|
||||
pytest -v --durations=10 -n4 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ --ignore=tests/monkeypatch/ tests/ --cov=axolotl --cov-report=xml
|
||||
df -h
|
||||
pytest -v --durations=10 tests/monkeypatch/ --cov=axolotl --cov-append --cov-report=xml
|
||||
df -h
|
||||
pytest -v --durations=10 tests/patched/ --cov=axolotl --cov-append --cov-report=xml
|
||||
df -h
|
||||
pytest -v --durations=10 tests/cli/ --cov=axolotl --cov-append --cov-report=xml
|
||||
|
||||
- name: Show HF cache
|
||||
run: hf cache scan
|
||||
|
||||
- name: Upload coverage to Codecov
|
||||
uses: codecov/codecov-action@v5
|
||||
with:
|
||||
@@ -118,10 +137,6 @@ jobs:
|
||||
flags: unittests,pytorch-${{ matrix.pytorch_version }}
|
||||
fail_ci_if_error: false
|
||||
|
||||
- name: cleanup pip cache
|
||||
run: |
|
||||
find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
|
||||
|
||||
pytest-sdist:
|
||||
name: PyTest from Source Dist
|
||||
runs-on: ubuntu-latest
|
||||
@@ -130,18 +145,23 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
python_version: ["3.11"]
|
||||
pytorch_version: ["2.7.1", "2.8.0", "2.9.0"]
|
||||
pytorch_version: ["2.8.0", "2.9.0", "2.9.1"]
|
||||
timeout-minutes: 20
|
||||
|
||||
steps:
|
||||
- name: cleanup node
|
||||
run: |
|
||||
sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /opt/hostedtoolcache/CodeQL
|
||||
|
||||
- name: Check out repository code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Restore Cache from S3
|
||||
id: hf-cache-restore-s3
|
||||
run: |
|
||||
mkdir -p /home/runner/.cache/huggingface/hub
|
||||
curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xf - -C /home/runner/.cache/huggingface/hub/ --use-compress-program unzstd
|
||||
mkdir -p ~/.cache/huggingface/hub
|
||||
curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xpf - -C ~/.cache/huggingface/hub/ --use-compress-program unzstd --strip-components=1
|
||||
ls -ltr ~/.cache/huggingface/hub/
|
||||
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v5
|
||||
@@ -167,6 +187,10 @@ jobs:
|
||||
python scripts/cutcrossentropy_install.py | sh
|
||||
pip3 install -r requirements-dev.txt -r requirements-tests.txt
|
||||
|
||||
- name: cleanup pip cache
|
||||
run: |
|
||||
find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
|
||||
|
||||
- name: Make sure PyTorch version wasn't clobbered
|
||||
run: |
|
||||
python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
|
||||
@@ -176,17 +200,16 @@ jobs:
|
||||
axolotl --help
|
||||
|
||||
- name: Show HF cache
|
||||
run: huggingface-cli scan-cache
|
||||
run: hf cache scan
|
||||
|
||||
- name: Run tests
|
||||
run: |
|
||||
pytest -v --durations=10 -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ --ignore=tests/monkeypatch/ tests/ --cov=axolotl --cov-report=xml
|
||||
pytest -v --durations=10 -n4 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ --ignore=tests/monkeypatch/ tests/ --cov=axolotl --cov-report=xml
|
||||
pytest -v --durations=10 tests/monkeypatch/ --cov=axolotl --cov-append --cov-report=xml
|
||||
pytest -v --durations=10 tests/cli/
|
||||
|
||||
- name: cleanup pip cache
|
||||
run: |
|
||||
find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
|
||||
- name: Show HF cache
|
||||
run: hf cache scan
|
||||
|
||||
gate-skip-e2e:
|
||||
needs: [pre-commit, pytest, pytest-sdist]
|
||||
@@ -248,7 +271,7 @@ jobs:
|
||||
- name: Install Modal
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install modal==1.0.2 jinja2
|
||||
pip install modal==1.3.0.post1 jinja2
|
||||
- name: Update env vars
|
||||
run: |
|
||||
echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
|
||||
@@ -280,18 +303,6 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- cuda: 126
|
||||
cuda_version: 12.6.3
|
||||
python_version: "3.11"
|
||||
pytorch: 2.7.1
|
||||
num_gpus: 1
|
||||
axolotl_extras:
|
||||
# - cuda: 128
|
||||
# cuda_version: 12.8.1
|
||||
# python_version: "3.11"
|
||||
# pytorch: 2.7.1
|
||||
# num_gpus: 1
|
||||
# axolotl_extras:
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
@@ -302,7 +313,7 @@ jobs:
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.9.0
|
||||
pytorch: 2.9.1
|
||||
num_gpus: 1
|
||||
axolotl_extras:
|
||||
steps:
|
||||
@@ -315,7 +326,7 @@ jobs:
|
||||
- name: Install Modal
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install modal==1.0.2 jinja2
|
||||
pip install modal==1.3.0.post1 jinja2
|
||||
- name: Update env vars
|
||||
run: |
|
||||
echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
|
||||
@@ -342,10 +353,10 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- cuda: 126
|
||||
cuda_version: 12.6.3
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.7.1
|
||||
pytorch: 2.9.1
|
||||
num_gpus: 1
|
||||
axolotl_extras:
|
||||
steps:
|
||||
@@ -358,7 +369,7 @@ jobs:
|
||||
- name: Install Modal
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install modal==1.0.2 jinja2
|
||||
pip install modal==1.3.0.post1 jinja2
|
||||
- name: Update env vars
|
||||
run: |
|
||||
echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
|
||||
|
||||
@@ -11,13 +11,13 @@ repos:
|
||||
- id: no-commit-to-branch
|
||||
args: ['--branch', 'main']
|
||||
- repo: https://github.com/astral-sh/ruff-pre-commit
|
||||
rev: v0.14.3
|
||||
rev: v0.14.10
|
||||
hooks:
|
||||
- id: ruff
|
||||
args: [--fix]
|
||||
- id: ruff-format
|
||||
- repo: https://github.com/pre-commit/mirrors-mypy
|
||||
rev: v1.18.2
|
||||
rev: v1.19.1
|
||||
hooks:
|
||||
- id: mypy
|
||||
additional_dependencies:
|
||||
@@ -26,7 +26,7 @@ repos:
|
||||
'pydantic>=2.5.3',
|
||||
]
|
||||
- repo: https://github.com/PyCQA/bandit
|
||||
rev: 1.8.6
|
||||
rev: 1.9.2
|
||||
hooks:
|
||||
- id: bandit
|
||||
args: [
|
||||
|
||||
@@ -10,6 +10,7 @@ ARG BASE_VOLUME="/runpod-volume"
|
||||
ENV BASE_VOLUME=$BASE_VOLUME
|
||||
ENV HF_DATASETS_CACHE="${BASE_VOLUME}/huggingface-cache/datasets"
|
||||
ENV HUGGINGFACE_HUB_CACHE="${BASE_VOLUME}/huggingface-cache/hub"
|
||||
ENV HF_HUB_CACHE="${BASE_VOLUME}/huggingface-cache/hub"
|
||||
ENV TRANSFORMERS_CACHE="${BASE_VOLUME}/huggingface-cache/hub"
|
||||
|
||||
COPY .runpod/src /src
|
||||
|
||||
23
README.md
23
README.md
@@ -29,21 +29,25 @@
|
||||
|
||||
## 🎉 Latest Updates
|
||||
|
||||
- 2025/12: Axolotl now includes support for [Kimi-Linear](https://docs.axolotl.ai/docs/models/kimi-linear.html), [Plano-Orchestrator](https://docs.axolotl.ai/docs/models/plano.html), [MiMo](https://docs.axolotl.ai/docs/models/mimo.html), [InternVL 3.5](https://docs.axolotl.ai/docs/models/internvl3_5.html), [Olmo3](https://docs.axolotl.ai/docs/models/olmo3.html), [Trinity](https://docs.axolotl.ai/docs/models/trinity.html), and [Ministral3](https://docs.axolotl.ai/docs/models/ministral3.html).
|
||||
- 2025/10: New model support has been added in Axolotl for: [Qwen3 Next](https://docs.axolotl.ai/docs/models/qwen3-next.html), [Qwen2.5-vl, Qwen3-vl](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/qwen2_5-vl), [Qwen3, Qwen3MoE](https://docs.axolotl.ai/docs/models/qwen3.html), [Granite 4](https://docs.axolotl.ai/docs/models/granite4.html), [HunYuan](https://docs.axolotl.ai/docs/models/hunyuan.html), [Magistral 2509](https://docs.axolotl.ai/docs/models/magistral/vision.html), [Apertus](https://docs.axolotl.ai/docs/models/apertus.html), and [Seed-OSS](https://docs.axolotl.ai/docs/models/seed-oss.html).
|
||||
- 2025/09: Axolotl now has text diffusion training. Read more [here](https://github.com/axolotl-ai-cloud/axolotl/tree/main/src/axolotl/integrations/diffusion).
|
||||
- 2025/08: QAT has been updated to include NVFP4 support. See [PR](https://github.com/axolotl-ai-cloud/axolotl/pull/3107).
|
||||
- 2025/07:
|
||||
- ND Parallelism support has been added into Axolotl. Compose Context Parallelism (CP), Tensor Parallelism (TP), and Fully Sharded Data Parallelism (FSDP) within a single node and across multiple nodes. Check out the [blog post](https://huggingface.co/blog/accelerate-nd-parallel) for more info.
|
||||
- Axolotl adds more models: [GPT-OSS](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/gpt-oss), [Gemma 3n](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/gemma3n), [Liquid Foundation Model 2 (LFM2)](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/lfm2), and [Arcee Foundation Models (AFM)](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/afm).
|
||||
- Axolotl adds more models: [GPT-OSS](https://docs.axolotl.ai/docs/models/gpt-oss.html), [Gemma 3n](https://docs.axolotl.ai/docs/models/gemma3n.html), [Liquid Foundation Model 2 (LFM2)](https://docs.axolotl.ai/docs/models/LiquidAI.html), and [Arcee Foundation Models (AFM)](https://docs.axolotl.ai/docs/models/arcee.html).
|
||||
- FP8 finetuning with fp8 gather op is now possible in Axolotl via `torchao`. Get started [here](https://docs.axolotl.ai/docs/mixed_precision.html#sec-fp8)!
|
||||
- [Voxtral](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/voxtral), [Magistral 1.1](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/magistral), and [Devstral](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/devstral) with mistral-common tokenizer support have been integrated in Axolotl!
|
||||
- [Voxtral](https://docs.axolotl.ai/docs/models/voxtral.html), [Magistral 1.1](https://docs.axolotl.ai/docs/models/magistral.html), and [Devstral](https://docs.axolotl.ai/docs/models/devstral.html) with mistral-common tokenizer support have been integrated in Axolotl!
|
||||
- TiledMLP support for single-GPU to multi-GPU training with DDP, DeepSpeed and FSDP support has been added to support Arctic Long Sequence Training (ALST). See [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/alst) for using ALST with Axolotl!
|
||||
- 2025/05: Quantization Aware Training (QAT) support has been added to Axolotl. Explore the [docs](https://docs.axolotl.ai/docs/qat.html) to learn more!
|
||||
- 2025/03: Axolotl has implemented Sequence Parallelism (SP) support. Read the [blog](https://huggingface.co/blog/axolotl-ai-co/long-context-with-sequence-parallelism-in-axolotl) and [docs](https://docs.axolotl.ai/docs/sequence_parallelism.html) to learn how to scale your context length when fine-tuning.
|
||||
|
||||
<details>
|
||||
|
||||
<summary>Expand older updates</summary>
|
||||
|
||||
- 2025/06: Magistral with mistral-common tokenizer support has been added to Axolotl. See [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/magistral) to start training your own Magistral models with Axolotl!
|
||||
- 2025/04: Llama 4 support has been added in Axolotl. See [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/llama-4) to start training your own Llama 4 models with Axolotl's linearized version!
|
||||
- 2025/03: Axolotl has implemented Sequence Parallelism (SP) support. Read the [blog](https://huggingface.co/blog/axolotl-ai-co/long-context-with-sequence-parallelism-in-axolotl) and [docs](https://docs.axolotl.ai/docs/sequence_parallelism.html) to learn how to scale your context length when fine-tuning.
|
||||
- 2025/06: Magistral with mistral-common tokenizer support has been added to Axolotl. See [docs](https://docs.axolotl.ai/docs/models/magistral.html) to start training your own Magistral models with Axolotl!
|
||||
- 2025/04: Llama 4 support has been added in Axolotl. See [docs](https://docs.axolotl.ai/docs/models/llama-4.html) to start training your own Llama 4 models with Axolotl's linearized version!
|
||||
- 2025/03: (Beta) Fine-tuning Multimodal models is now supported in Axolotl. Check out the [docs](https://docs.axolotl.ai/docs/multimodal.html) to fine-tune your own!
|
||||
- 2025/02: Axolotl has added LoRA optimizations to reduce memory usage and improve training speed for LoRA and QLoRA in single GPU and multi-GPU training (DDP and DeepSpeed). Jump into the [docs](https://docs.axolotl.ai/docs/lora_optims.html) to give it a try.
|
||||
- 2025/02: Axolotl has added GRPO support. Dive into our [blog](https://huggingface.co/blog/axolotl-ai-co/training-llms-w-interpreter-feedback-wasm) and [GRPO example](https://github.com/axolotl-ai-cloud/grpo_code) and have some fun!
|
||||
@@ -73,7 +77,7 @@ Features:
|
||||
|
||||
- NVIDIA GPU (Ampere or newer for `bf16` and Flash Attention) or AMD GPU
|
||||
- Python 3.11
|
||||
- PyTorch ≥2.7.1
|
||||
- PyTorch ≥2.8.0
|
||||
|
||||
### Google Colab
|
||||
|
||||
@@ -154,6 +158,13 @@ That's it! Check out our [Getting Started Guide](https://docs.axolotl.ai/docs/ge
|
||||
|
||||
Contributions are welcome! Please see our [Contributing Guide](https://github.com/axolotl-ai-cloud/axolotl/blob/main/.github/CONTRIBUTING.md) for details.
|
||||
|
||||
## 📈 Telemetry
|
||||
|
||||
Axolotl has opt-out telemetry that helps us understand how the project is being used
|
||||
and prioritize improvements. We collect basic system information, model types, and
|
||||
error rates—never personal data or file paths. Telemetry is enabled by default. To
|
||||
disable it, set AXOLOTL_DO_NOT_TRACK=1. For more details, see our [telemetry documentation](https://docs.axolotl.ai/docs/telemetry.html).
|
||||
|
||||
## ❤️ Sponsors
|
||||
|
||||
Interested in sponsoring? Contact us at [wing@axolotl.ai](mailto:wing@axolotl.ai)
|
||||
|
||||
45
_quarto.yml
45
_quarto.yml
@@ -1,6 +1,8 @@
|
||||
project:
|
||||
type: website
|
||||
pre-render: docs/scripts/generate_config_docs.py
|
||||
pre-render:
|
||||
- docs/scripts/generate_config_docs.py
|
||||
- docs/scripts/generate_examples_docs.py
|
||||
|
||||
quartodoc:
|
||||
dir: docs/api
|
||||
@@ -240,7 +242,48 @@ website:
|
||||
- docs/getting-started.qmd
|
||||
- docs/installation.qmd
|
||||
- docs/inference.qmd
|
||||
- section: "Model Guides"
|
||||
contents:
|
||||
- docs/models/kimi-linear.qmd
|
||||
- docs/models/plano.qmd
|
||||
- docs/models/mimo.qmd
|
||||
- docs/models/internvl3_5.qmd
|
||||
- docs/models/olmo3.qmd
|
||||
- docs/models/trinity.qmd
|
||||
- docs/models/arcee.qmd
|
||||
- docs/models/mistral.qmd
|
||||
- section: "Ministral3"
|
||||
contents:
|
||||
- docs/models/ministral3.qmd
|
||||
- docs/models/ministral3/think.qmd
|
||||
- docs/models/ministral3/vision.qmd
|
||||
- section: "Magistral"
|
||||
contents:
|
||||
- docs/models/magistral.qmd
|
||||
- docs/models/magistral/think.qmd
|
||||
- docs/models/magistral/vision.qmd
|
||||
- docs/models/ministral.qmd
|
||||
- docs/models/mistral-small.qmd
|
||||
- docs/models/voxtral.qmd
|
||||
- docs/models/devstral.qmd
|
||||
- docs/models/llama-4.qmd
|
||||
- docs/models/llama-2.qmd
|
||||
- docs/models/qwen3-next.qmd
|
||||
- docs/models/qwen3.qmd
|
||||
- docs/models/gemma3n.qmd
|
||||
- docs/models/apertus.qmd
|
||||
- docs/models/gpt-oss.qmd
|
||||
- docs/models/seed-oss.qmd
|
||||
- docs/models/phi.qmd
|
||||
- docs/models/smolvlm2.qmd
|
||||
- docs/models/granite4.qmd
|
||||
- docs/models/LiquidAI.qmd
|
||||
- docs/models/hunyuan.qmd
|
||||
- docs/models/jamba.qmd
|
||||
- docs/models/orpheus.qmd
|
||||
|
||||
- docs/cli.qmd
|
||||
- docs/telemetry.qmd
|
||||
- docs/config-reference.qmd
|
||||
- text: "API Reference"
|
||||
href: docs/api
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
set -e
|
||||
|
||||
# Only run two tests at a time to avoid OOM on GPU (with coverage collection)
|
||||
pytest -v --durations=10 -n2 \
|
||||
pytest -v --durations=10 -n2 --maxfail=4 \
|
||||
--ignore=/workspace/axolotl/tests/e2e/multigpu/solo/ \
|
||||
--ignore=/workspace/axolotl/tests/e2e/multigpu/patched/ \
|
||||
/workspace/axolotl/tests/e2e/multigpu/ \
|
||||
|
||||
@@ -51,7 +51,7 @@ RUN git lfs install --skip-repo && \
|
||||
pip3 install -U --no-cache-dir pydantic==1.10.10 && \
|
||||
pip3 cache purge
|
||||
|
||||
RUN if [ "$PYTORCH_VERSION" = "2.9.0" ] && [ "$CUDA" = "128" ] ; then \
|
||||
# `=~` is not a POSIX `[ ]` operator, so match the 2.9.x patch series with grep instead.
RUN if printf '%s' "$PYTORCH_VERSION" | grep -Eq '^2\.9\.[0-9]+$' && [ "$CUDA" = "128" ] ; then \
|
||||
wget https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.4.17/flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \
|
||||
pip3 install --no-cache-dir flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \
|
||||
rm flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \
|
||||
|
||||
2
docs/.gitignore
vendored
2
docs/.gitignore
vendored
@@ -3,3 +3,5 @@ _site/
|
||||
/api/*.qmd
|
||||
/api/*.html
|
||||
config-reference.qmd
|
||||
models/**/*.qmd
|
||||
models/**/*.html
|
||||
|
||||
86
docs/checkpoint_saving.qmd
Normal file
86
docs/checkpoint_saving.qmd
Normal file
@@ -0,0 +1,86 @@
|
||||
---
|
||||
title: "Checkpoint Saving"
|
||||
format:
|
||||
html:
|
||||
toc: true
|
||||
toc-depth: 2
|
||||
number-sections: true
|
||||
execute:
|
||||
enabled: false
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
Axolotl supports on-demand checkpoint saving during training. You can trigger checkpoints via file-based triggers (for programmatic control) or Control+C (for interactive use).
|
||||
|
||||
## File-Based Checkpoint Trigger
|
||||
|
||||
### Configuration
|
||||
|
||||
Enable in your config:
|
||||
|
||||
```yaml
|
||||
dynamic_checkpoint:
|
||||
enabled: true
|
||||
check_interval: 100 # Optional: check every N steps (default: 100)
|
||||
trigger_file_path: "axolotl_checkpoint.save" # Optional: custom filename
|
||||
```
|
||||
|
||||
**Options:**
|
||||
- `enabled`: `true` to enable (required)
|
||||
- `check_interval`: Steps between file checks. Default: 100. Lower = faster response, higher I/O overhead.
|
||||
- `trigger_file_path`: Custom trigger filename. Default: `axolotl_checkpoint.save`
|
||||
|
||||
### How It Works
|
||||
|
||||
1. Rank 0 checks for trigger file every `check_interval` steps in `output_dir`
|
||||
2. When detected, file is deleted and checkpoint is saved
|
||||
3. In distributed training, rank 0 broadcasts to synchronize all ranks
|
||||
|
||||
### Usage
|
||||
|
||||
**Command line:**
|
||||
```bash
|
||||
touch /path/to/output_dir/axolotl_checkpoint.save
|
||||
```
|
||||
|
||||
**Programmatic:**
|
||||
```python
|
||||
from pathlib import Path
|
||||
Path("/path/to/output_dir/axolotl_checkpoint.save").touch()
|
||||
```
|
||||
|
||||
The checkpoint is saved within the next `check_interval` steps. The trigger file is deleted automatically once it is detected, so you can create it again whenever you want another checkpoint.
|
||||
|
||||
**Custom filename:**
|
||||
```yaml
|
||||
dynamic_checkpoint:
|
||||
enabled: true
|
||||
trigger_file_path: "my_trigger.save"
|
||||
```
|
||||
```bash
|
||||
touch /path/to/output_dir/my_trigger.save
|
||||
```
|
||||
|
||||
## Control+C (SIGINT) Checkpoint
|
||||
|
||||
Pressing `Ctrl+C` during training saves the model state and exits gracefully. **Note:** This saves only the model weights, not optimizer state. For resumable checkpoints, use the file-based trigger.
|
||||
|
||||
## Best Practices
|
||||
|
||||
- **Check interval**: Lower values (10-50) for fast training, default 100 for slower training
|
||||
- **Distributed training**: Create trigger file once; rank 0 handles synchronization
|
||||
- **Resume**: Dynamic checkpoints can be resumed like regular checkpoints via `resume_from_checkpoint`
|
||||
|
||||
## Example
|
||||
|
||||
```yaml
|
||||
output_dir: ./outputs/lora-out
|
||||
save_steps: 500 # Scheduled checkpoints
|
||||
|
||||
dynamic_checkpoint:
|
||||
enabled: true
|
||||
check_interval: 50
|
||||
```
|
||||
|
||||
This enables scheduled checkpoints every 500 steps plus on-demand saves via file trigger (checked every 50 steps).
|
||||
@@ -218,6 +218,13 @@ If you have tool arguments with same name but different dtypes (like `"time": st
|
||||
```
|
||||
"arguments": "{\"...\": \"...\"}"
|
||||
```
|
||||
|
||||
The same is applicable for tool parameters.
|
||||
|
||||
```
|
||||
"parameters": "{\"...\": \"...\"}"
|
||||
```
|
||||
|
||||
:::
|
||||
|
||||
Example config for Llama4:
|
||||
|
||||
@@ -32,11 +32,8 @@ main-base-py{python_version}-cu{cuda_version}-{pytorch_version}
|
||||
|
||||
Tags examples:
|
||||
|
||||
- `main-base-py3.11-cu128-2.7.1`
|
||||
- `main-base-py3.11-cu126-2.7.1`
|
||||
- `main-base-py3.11-cu126-2.7.0`
|
||||
- `main-base-py3.11-cu126-2.6.0`
|
||||
- `main-base-py3.11-cu124-2.6.0`
|
||||
- `main-base-py3.11-cu128-2.8.0`
|
||||
- `main-base-py3.11-cu128-2.9.1`
|
||||
|
||||
## Main
|
||||
|
||||
@@ -74,15 +71,12 @@ There may be some extra tags appended to the image, like `-vllm` which installs
|
||||
|
||||
Tags examples:
|
||||
|
||||
- `main-py3.11-cu128-2.7.1`
|
||||
- `main-py3.11-cu126-2.7.1`
|
||||
- `main-py3.11-cu126-2.7.0`
|
||||
- `main-py3.11-cu126-2.6.0`
|
||||
- `main-py3.11-cu124-2.6.0`
|
||||
- `main-py3.11-cu128-2.8.0`
|
||||
- `main-py3.11-cu128-2.9.1`
|
||||
- `main-latest`
|
||||
- `main-20250303-py3.11-cu124-2.6.0`
|
||||
- `main-20250303-py3.11-cu126-2.6.0`
|
||||
- `0.10.1`
|
||||
- `0.12.0`
|
||||
|
||||
## Cloud
|
||||
|
||||
|
||||
@@ -26,7 +26,7 @@ Follow the instructions at: [https://pytorch.org/get-started/locally/](https://p
|
||||
:::
|
||||
|
||||
::: {.callout-important}
|
||||
For Blackwell GPUs, please use Pytorch 2.7.0 and CUDA 12.8.
|
||||
For Blackwell GPUs, please use PyTorch 2.9.1 and CUDA 12.8.
|
||||
:::
|
||||
|
||||
### PyPI Installation (Recommended) {#sec-pypi}
|
||||
@@ -111,7 +111,7 @@ docker run --privileged --gpus '"all"' --shm-size 10g --rm -it \
|
||||
:::
|
||||
|
||||
::: {.callout-important}
|
||||
For Blackwell GPUs, please use `axolotlai/axolotl:main-py3.11-cu128-2.7.0` or the cloud variant `axolotlai/axolotl-cloud:main-py3.11-cu128-2.7.0`.
|
||||
For Blackwell GPUs, please use `axolotlai/axolotl:main-py3.11-cu128-2.9.1` or the cloud variant `axolotlai/axolotl-cloud:main-py3.11-cu128-2.9.1`.
|
||||
:::
|
||||
|
||||
Please refer to the [Docker documentation](docker.qmd) for more information on the different Docker images that are available.
|
||||
|
||||
@@ -4,7 +4,7 @@ format:
|
||||
html:
|
||||
toc: true
|
||||
toc-depth: 3
|
||||
number-sections: true
|
||||
# number-sections: true
|
||||
code-tools: true
|
||||
execute:
|
||||
enabled: false
|
||||
@@ -14,12 +14,18 @@ This guide covers advanced training configurations for multi-GPU setups using Ax
|
||||
|
||||
## Overview {#sec-overview}
|
||||
|
||||
Axolotl supports several methods for multi-GPU training:
|
||||
When training on multiple GPUs, Axolotl supports 3 sharding/parallelism strategies. Additionally, you can layer specific optimization features on top of that strategy.
|
||||
|
||||
- DeepSpeed (recommended)
|
||||
- FSDP (Fully Sharded Data Parallel)
|
||||
- Sequence parallelism
|
||||
- FSDP + QLoRA
|
||||
You generally cannot combine these strategies; they are mutually exclusive.
|
||||
|
||||
1. **DeepSpeed**: Powerful optimization library, supports ZeRO stages 1-3.
|
||||
2. **FSDP (Fully Sharded Data Parallel)**: PyTorch's native sharding implementation (Recommended).
|
||||
3. **DDP (Distributed Data Parallel)**: PyTorch's native parallelism implementation (default if neither of the above is selected).
|
||||
|
||||
These features can often be combined with the strategies above:
|
||||
|
||||
* **Sequence Parallelism**: Splits long sequences across GPUs (Compatible with DDP, DeepSpeed, and FSDP).
|
||||
* **FSDP + QLoRA**: Combines 4-bit quantization with FSDP (Specific to FSDP).
|
||||
|
||||
## DeepSpeed {#sec-deepspeed}
|
||||
|
||||
@@ -65,12 +71,18 @@ Start from Stage 1 -> Stage 2 -> Stage 3.
|
||||
|
||||
## Fully Sharded Data Parallel (FSDP) {#sec-fsdp}
|
||||
|
||||
FSDP allows you to shard model parameters, gradients, and optimizer states across data parallel workers.
|
||||
|
||||
::: {.callout-note}
|
||||
|
||||
FSDP2 is recommended for new users. FSDP1 is deprecated and will be removed in an upcoming release of Axolotl.
|
||||
|
||||
:::
|
||||
|
||||
### FSDP + QLoRA {#sec-fsdp-qlora}
|
||||
|
||||
For combining FSDP with QLoRA, see our [dedicated guide](fsdp_qlora.qmd).
|
||||
|
||||
### Migrating from FSDP1 to FSDP2 {#sec-migrate-fsdp1-fsdp2}
|
||||
|
||||
To migrate your config from FSDP1 to FSDP2, you must use the `fsdp_version` top-level config field to specify the FSDP version, and
|
||||
@@ -145,10 +157,6 @@ single sequence causes OOM errors during model training.
|
||||
|
||||
See our [dedicated guide](sequence_parallelism.qmd) for more information.
|
||||
|
||||
### FSDP + QLoRA {#sec-fsdp-qlora}
|
||||
|
||||
For combining FSDP with QLoRA, see our [dedicated guide](fsdp_qlora.qmd).
|
||||
|
||||
## Performance Optimization {#sec-performance}
|
||||
|
||||
### Liger Kernel Integration {#sec-liger}
|
||||
|
||||
@@ -21,6 +21,7 @@ format:
|
||||
- [Qwen2.5-VL](#sec-qwen25-vl)
|
||||
- [SmolVLM2](#sec-smolvlm2)
|
||||
- [LFM2-VL](#sec-lfm2-vl)
|
||||
- [Intern-VL](#sec-intern-vl)
|
||||
|
||||
## Usage
|
||||
|
||||
@@ -124,6 +125,8 @@ Please make sure to install audio lib via `pip3 install librosa==0.11.0 'mistral
|
||||
|
||||
```yaml
|
||||
base_model: mistralai/Voxtral-Mini-3B-2507
|
||||
|
||||
processor_type: VoxtralProcessor
|
||||
```
|
||||
|
||||
### Gemma-3 {#sec-gemma-3}
|
||||
@@ -200,6 +203,16 @@ Please uninstall `causal-conv1d` via `pip3 uninstall -y causal-conv1d`
|
||||
base_model: LiquidAI/LFM2-VL-450M
|
||||
```
|
||||
|
||||
### Intern-VL {#sec-intern-vl}
|
||||
|
||||
::: {.callout-tip}
|
||||
Please make sure to install `timm` via `pip3 install timm==1.0.19`
|
||||
:::
|
||||
|
||||
```yaml
|
||||
base_model: OpenGVLab/InternVL3_5-8B
|
||||
```
|
||||
|
||||
## Dataset Format
|
||||
|
||||
For multi-modal datasets, we adopt an extended `chat_template` format similar to OpenAI's Message format.
|
||||
|
||||
110
docs/rlhf.qmd
110
docs/rlhf.qmd
@@ -597,6 +597,116 @@ To see other examples of custom reward functions, please see [TRL GRPO Docs](htt
|
||||
|
||||
To see all configs, please see [TRLConfig](https://github.com/axolotl-ai-cloud/axolotl/blob/v0.9.2/src/axolotl/utils/schemas/trl.py).
|
||||
|
||||
#### OpenEnv Rollout Functions
|
||||
|
||||
GRPO supports custom rollout functions for OpenEnv-style environments, enabling interactive tasks like web browsing, code execution, or tool use. This allows you to implement custom generation logic that interacts with external environments.
|
||||
|
||||
For example, to implement a simple math-solving environment with step-by-step verification:
|
||||
|
||||
```python
|
||||
# math_env.py
|
||||
import re
|
||||
|
||||
def math_solver_rollout(model, processing_class, prompts, generation_config=None):
    """
    Custom rollout function that generates step-by-step math solutions.

    Args:
        model: The language model
        processing_class: The tokenizer/processing_class
        prompts: List of prompt dicts (with 'messages' key for chat format)
        generation_config: Optional generation configuration

    Returns:
        List of completion strings
    """
    completions = []

    for prompt in prompts:
        # Apply chat template to prompt. `tokenize=False` makes the template
        # return the rendered prompt as a string, which is required because
        # it is concatenated with the generated text below.
        messages = prompt.get("messages", [])
        formatted_prompt = processing_class.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )

        # Generate step-by-step solution
        full_response = ""
        for step in range(5):  # Max 5 reasoning steps
            current_input = formatted_prompt + full_response + "\nNext step:"
            inputs = processing_class(current_input, return_tensors="pt").to(model.device)

            outputs = model.generate(
                **inputs,
                max_new_tokens=100,
                generation_config=generation_config,
            )
            # Decode only the newly generated tokens (skip the prompt prefix)
            step_text = processing_class.decode(
                outputs[0][inputs.input_ids.shape[1]:],
                skip_special_tokens=True
            )

            # Check if solution is complete
            if "FINAL ANSWER:" in step_text:
                full_response += step_text
                break
            full_response += step_text + "\n"

        completions.append(full_response)

    return completions
|
||||
|
||||
def math_reward(prompts, completions, answers, **kwargs):
    """Score each completion 1.0 when its ``FINAL ANSWER:`` matches the reference, else 0.0"""

    def score(completion, correct_answer):
        # Extract the text following "FINAL ANSWER:" up to the end of that line
        found = re.search(r"FINAL ANSWER:\s*(.+)", completion)
        predicted = found.group(1).strip() if found else ""
        # Exact string comparison against the reference answer
        return 1.0 if predicted == str(correct_answer) else 0.0

    return [score(text, expected) for text, expected in zip(completions, answers)]
|
||||
|
||||
def math_transform(cfg, *args, **kwargs):
    """Build the per-example transform that maps question/answer rows to GRPO format"""

    def transform_fn(example, processing_class=None):
        # The question becomes a single user turn; the answer is kept as text
        user_turn = {"role": "user", "content": example["question"]}
        answer_text = str(example["answer"])
        return {"prompt": [user_turn], "answer": answer_text}

    # Second element: dataset kwargs asking for the raw question column to be dropped
    dataset_kwargs = {"remove_columns": ["question"]}
    return transform_fn, dataset_kwargs
|
||||
```
|
||||
|
||||
```yaml
|
||||
rl: grpo
|
||||
|
||||
trl:
|
||||
beta: 0.001
|
||||
max_completion_length: 512
|
||||
num_generations: 4
|
||||
rollout_func: "math_env.math_solver_rollout" # Custom rollout function
|
||||
reward_funcs: ["math_env.math_reward"]
|
||||
reward_weights: [1.0]
|
||||
|
||||
datasets:
|
||||
- path: openai/gsm8k
|
||||
name: main
|
||||
type: math_env.math_transform
|
||||
```
|
||||
|
||||
The `rollout_func` parameter accepts a fully qualified name (e.g., `module_name.function_name`) that points to a callable function in your local directory. The function receives:
|
||||
|
||||
- `model`: The language model
|
||||
- `processing_class`: The tokenizer/processing class
|
||||
- `prompts`: List of prompt dictionaries
|
||||
- `generation_config` (optional): Generation configuration
|
||||
|
||||
And should return a list of completion strings.
|
||||
|
||||
For more OpenEnv examples, see [TRL OpenEnv Documentation](https://huggingface.co/docs/trl/main/en/openenv).
|
||||
|
||||
#### GRPO with DAPO/Dr. GRPO loss
|
||||
|
||||
The DAPO paper, and subsequently the Dr. GRPO paper, proposed an alternative loss function for GRPO to mitigate the penalty on longer responses.
|
||||
|
||||
90
docs/scripts/examples-allowlist.yml
Normal file
90
docs/scripts/examples-allowlist.yml
Normal file
@@ -0,0 +1,90 @@
|
||||
examples:
|
||||
# December 2025
|
||||
- name: kimi-linear
|
||||
title: Kimi Linear
|
||||
- name: plano
|
||||
title: Plano Orchestrator
|
||||
- name: mimo
|
||||
title: MiMo
|
||||
- name: internvl3_5
|
||||
title: InternVL 3.5
|
||||
|
||||
# AllenAI
|
||||
- name: olmo3
|
||||
title: OLMo 3
|
||||
|
||||
# ArceeAI
|
||||
- name: trinity
|
||||
title: Trinity
|
||||
- name: arcee
|
||||
title: Arcee AFM
|
||||
|
||||
# MistralAI
|
||||
- name: ministral3/think
|
||||
title: Ministral 3 Thinking
|
||||
- name: ministral3/vision
|
||||
title: Ministral 3 Vision
|
||||
- name: magistral/think
|
||||
title: Magistral Thinking
|
||||
- name: magistral/vision
|
||||
title: Magistral Vision
|
||||
- name: ministral
|
||||
title: Ministral
|
||||
- name: mistral-small
|
||||
title: Mistral Small 3.1/3.2
|
||||
- name: voxtral
|
||||
title: Voxtral
|
||||
- name: devstral
|
||||
title: Devstral
|
||||
- name: mistral
|
||||
title: Mistral 7B
|
||||
|
||||
# Meta
|
||||
- name: llama-4
|
||||
title: Llama 4
|
||||
- name: llama-2
|
||||
title: Llama 2
|
||||
|
||||
# Alibaba
|
||||
- name: qwen3-next
|
||||
title: Qwen 3 Next
|
||||
- name: qwen3
|
||||
title: Qwen 3
|
||||
|
||||
# Google
|
||||
- name: gemma3n
|
||||
title: Gemma 3n
|
||||
|
||||
# Swiss AI
|
||||
- name: apertus
|
||||
title: Apertus
|
||||
|
||||
# GPT-OSS
|
||||
- name: gpt-oss
|
||||
title: GPT-OSS
|
||||
- name: seed-oss
|
||||
title: Seed-OSS
|
||||
|
||||
# Microsoft
|
||||
- name: phi
|
||||
title: Phi
|
||||
|
||||
# SmolVLM
|
||||
- name: smolvlm2
|
||||
title: SmolVLM 2
|
||||
|
||||
# IBM
|
||||
- name: granite4
|
||||
title: Granite 4
|
||||
|
||||
# LiquidAI
|
||||
- name: LiquidAI
|
||||
title: Liquid Foundation Models 2
|
||||
|
||||
# Other
|
||||
- name: hunyuan
|
||||
title: Hunyuan
|
||||
- name: jamba
|
||||
title: Jamba
|
||||
- name: orpheus
|
||||
title: Orpheus
|
||||
424
docs/scripts/generate_examples_docs.py
Executable file
424
docs/scripts/generate_examples_docs.py
Executable file
@@ -0,0 +1,424 @@
|
||||
"""
|
||||
auto generate example docs from allowlist
|
||||
"""
|
||||
|
||||
import re
|
||||
import shutil
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import yaml
|
||||
|
||||
# Paths
# Absolute location of this script; every other path is derived from it.
THIS = Path(__file__).resolve()
ROOT = THIS.parents[2]  # repo root (docs/scripts -> docs -> ROOT)
# Source directory holding the example folders (one README per example).
EXAMPLES_DIR = ROOT / "examples"
# Destination directory for the generated .qmd pages and copied assets.
OUTPUT_DIR = ROOT / "docs" / "models"
# Allowlist file that sits next to this script.
ALLOWLIST_YML = THIS.parent / "examples-allowlist.yml"
|
||||
|
||||
|
||||
def slugify(name: str) -> str:
    """Turn ``name`` into a lowercase, hyphen-separated slug.

    Characters other than ASCII letters, digits, whitespace and hyphens are
    dropped, whitespace runs become a single hyphen, and leading/trailing
    hyphens are trimmed. Returns ``"example"`` when nothing is left.
    """
    cleaned = re.sub(r"[^a-zA-Z0-9\s\-]+", "", name.strip())
    hyphenated = re.sub(r"\s+", "-", cleaned)
    slug = hyphenated.strip("-").lower()
    if not slug:
        return "example"
    return slug
|
||||
|
||||
|
||||
def read_allowlist():
    """Load and return the ``examples`` list from the allowlist YAML file.

    An empty file yields an empty list.

    Raises:
        ValueError: if the ``examples`` key is present but is not a list.
    """
    with open(ALLOWLIST_YML, "r", encoding="utf-8") as f:
        loaded = yaml.safe_load(f)
    if not loaded:
        # Empty document (or any other falsy value) is treated as "no entries"
        loaded = {}
    entries = loaded.get("examples", [])
    if isinstance(entries, list):
        return entries
    raise ValueError("`examples` must be a list in examples-allowlist.yml")
|
||||
|
||||
|
||||
def find_readme(folder: Path) -> Path | None:
|
||||
for name in ("README.md", "Readme.md", "readme.md"):
|
||||
p = folder / name
|
||||
if p.exists():
|
||||
return p
|
||||
return None
|
||||
|
||||
|
||||
def remove_first_h1(md: str) -> tuple[str, str | None]:
|
||||
"""
|
||||
Remove the first H1 from markdown and return (modified_md, h1_title).
|
||||
The H1 is removed since we use the frontmatter title instead.
|
||||
"""
|
||||
lines = md.splitlines()
|
||||
result = []
|
||||
h1_title = None
|
||||
skipped_first = False
|
||||
|
||||
for line in lines:
|
||||
if not skipped_first and line.startswith("# "):
|
||||
h1_title = line[2:].strip()
|
||||
skipped_first = True
|
||||
continue
|
||||
result.append(line)
|
||||
|
||||
return "\n".join(result), h1_title
|
||||
|
||||
|
||||
# Markdown image: ![alt](url) -- group 1 is the URL
IMG_RE = re.compile(r"!\[[^\]]*\]\(([^)]+)\)")
# Markdown link: [text](url) -- group 1 is the text, group 2 the URL
LINK_RE = re.compile(r"\[([^\]]+)\]\(([^)]+)\)")


def rewrite_and_copy_assets(md: str, src_dir: Path, dest_assets_root: Path) -> str:
    """
    Copy local image assets referenced in markdown to
    ``<dest_assets_root>/assets/<src_dir.name>/...`` and rewrite the image
    links to ``assets/<src_dir.name>/...``.

    Image references are left untouched when they are remote URLs, when the
    target is not an existing file, or when the target lies outside
    ``src_dir`` (previously this last case raised ``ValueError``).

    Args:
        md: Markdown source text.
        src_dir: Directory the markdown file lives in; relative image paths
            are resolved against it.
        dest_assets_root: Directory under which the ``assets`` folder is created.

    Returns:
        The markdown with rewritten image links.
    """
    dest_assets = dest_assets_root / "assets"
    src_root = src_dir.resolve()
    # NOTE(review): only the last path component is used, so two example
    # folders sharing a leaf name (e.g. "a/think" and "b/think") share one
    # asset directory.
    asset_name = src_dir.name

    def repl(m):
        url = m.group(1).strip()
        if re.match(r"^(https?:)?//", url):
            return m.group(0)  # leave remote URLs
        src_path = (src_root / url).resolve()
        if not src_path.is_file():
            return m.group(0)  # leave as-is if not found (or not a file)
        try:
            rel = src_path.relative_to(src_root)
        except ValueError:
            return m.group(0)  # asset lives outside src_dir; do not copy it
        dest_path = dest_assets / asset_name / rel
        dest_path.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(src_path, dest_path)
        new_rel = f"assets/{asset_name}/{rel.as_posix()}"
        # Replace only the URL group so identical alt text is not rewritten
        whole = m.group(0)
        start = m.start(1) - m.start(0)
        end = m.end(1) - m.start(0)
        return whole[:start] + new_rel + whole[end:]

    return IMG_RE.sub(repl, md)
|
||||
|
||||
|
||||
def rewrite_readme_links(
    md: str,
    src_dir: Path,
    examples_dir: Path,
    parent_index_only: set,
    current_src_path: str,
    allowlist_entries: set,
    current_output_path: str,
) -> str:
    """
    Rewrite links between README.md files to point to the correct .qmd files.

    Only relative links ending in ``.md`` whose target resolves inside
    ``examples_dir`` are rewritten; remote URLs, anchors, non-markdown links
    and links leaving ``examples_dir`` are returned unchanged.

    Target mapping (relative to the generated docs root):
      - ``README.md`` at the examples root      -> ``index.qmd``
      - ``<parent>/README.md``                  -> ``<parent>.qmd`` (parent
        pages are generated as flat files, whether or not the parent has its
        own allowlist entry)
      - ``<parent>/<sub...>/README.md`` that is an allowlist entry
                                                -> ``<parent>/<sub-joined-by-hyphen>.qmd``
      - any other nested README                 -> ``<path>/index.qmd``
      - any other ``.md`` file                  -> same path with ``.qmd``

    Args:
        md: Markdown source text.
        src_dir: Directory of the README being processed.
        examples_dir: Root of the examples tree.
        parent_index_only: Unused; kept for backward compatibility.
        current_src_path: Unused; kept for backward compatibility.
        allowlist_entries: Names of all allowlist entries (e.g. ``"magistral/think"``).
        current_output_path: Output path of the page being written, without
            extension and relative to the docs root (e.g. ``"magistral/think"``).

    Returns:
        The markdown with rewritten links.
    """

    def target_for(parts: list) -> str:
        """Map a path (parts relative to examples_dir) to its output .qmd path."""
        if not (parts and parts[-1].lower() in ("readme.md", "readme")):
            # Regular .md file -> convert to .qmd, keep path structure
            return "/".join(parts)[:-2] + "qmd"
        if len(parts) == 1:
            # Link to root README -> index.qmd
            return "index.qmd"
        if len(parts) == 2:
            # <parent>/README.md: parent pages are always written as <parent>.qmd
            return f"{parts[0]}.qmd"
        full_path = "/".join(parts[:-1])  # Remove README.md
        if full_path in allowlist_entries:
            # Sub-entry with its own page: nested levels are joined with "-"
            return f"{parts[0]}/{'-'.join(parts[1:-1])}.qmd"
        return f"{full_path}/index.qmd"

    def relative_url(target_output: str) -> str:
        """Express target_output relative to the directory of the current page."""
        base = current_output_path.split("/")[:-1]
        target_parts = target_output.split("/")
        # Length of the common directory prefix
        i = 0
        while i < min(len(base), len(target_parts)) and base[i] == target_parts[i]:
            i += 1
        # Go up (../) out of the non-shared directories, then down to the target
        rel_parts = [".."] * (len(base) - i) + target_parts[i:]
        return "/".join(rel_parts) if rel_parts else "."

    def repl(m):
        text = m.group(1)
        url = m.group(2).strip()

        # Skip remote URLs and anchor links
        if re.match(r"^(https?:)?//", url) or url.startswith("#"):
            return m.group(0)

        # Skip non-markdown files
        if not url.lower().endswith(".md"):
            return m.group(0)

        try:
            # resolve() normalises "." and "..", so parts never contain them
            target_path = (src_dir / url).resolve()
            try:
                rel_path = target_path.relative_to(examples_dir)
            except ValueError:
                # Target is outside examples_dir, leave as-is
                return m.group(0)

            new_url = relative_url(target_for(list(rel_path.parts)))
            return f"[{text}]({new_url})"
        except (ValueError, IndexError):
            return m.group(0)

    return LINK_RE.sub(repl, md)
|
||||
|
||||
|
||||
def write_qmd(out_path: Path, title: str, body_md: str):
    """Write a .qmd page consisting of YAML front matter followed by ``body_md``.

    Parent directories of ``out_path`` are created if missing. The file is
    written as UTF-8 and overwrites any existing file.

    Args:
        out_path: Destination file path.
        title: Page title placed in the front matter.
        body_md: Markdown body appended after the front matter.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    # NOTE(review): the title is emitted via Python repr(), which is not YAML
    # quoting; titles containing backslashes or both quote kinds may need
    # proper escaping -- confirm.
    fm = f"---\ntitle: {title!r}\nexecute:\n eval: false\nformat:\n html:\n toc: true\n---\n\n"
    out_path.write_text(fm + body_md, encoding="utf-8")
|
||||
|
||||
|
||||
def update_quarto_yml(generated: list[tuple[str, str, str]]):
    """
    Update _quarto.yml with the generated example files in the correct order.
    This keeps the sidebar in sync with the allowlist.

    Model Guides is now nested under "Getting Started" section.
    Creates nested sections for models with sub-entries (e.g., magistral, ministral3).
    Parent pages are now flat files (e.g., ministral3.qmd) with sub-pages in subdirs.

    Args:
        generated: ``(sidebar_path, name, title)`` tuples in the order the
            pages should be listed; ``sidebar_path`` has no extension and is
            relative to ``docs/models``.

    The file is edited textually with a regex and is only rewritten when the
    substitution changes its content. If ``_quarto.yml`` does not exist, a
    warning is printed to stderr and nothing is done.
    """
    quarto_yml = ROOT / "_quarto.yml"
    if not quarto_yml.exists():
        print(f"[WARN] {quarto_yml} not found, skipping update", file=sys.stderr)
        return

    content = quarto_yml.read_text(encoding="utf-8")

    # First pass: find all parents that have sub-entries
    parents_with_subs = set()
    for path, _name, _title in generated:
        if "/" in path:
            parent = path.split("/")[0]
            parents_with_subs.add(parent)

    # Build the YAML contents while preserving allowlist order
    # NOTE(review): the leading whitespace inside the string literals below
    # must match the nesting depth of the "Model Guides" entries in
    # _quarto.yml -- verify against the real file.
    lines = []
    processed_sections = set()

    for path, _name, title in generated:
        # Check if this is a parent page that has sub-pages
        if path in parents_with_subs:
            # This is a parent page with sub-pages - create a nested section
            if path not in processed_sections:
                processed_sections.add(path)
                # Fall back to a title derived from the path when none is given
                section_title = (
                    title or path.replace("-", " ").replace("_", " ").title()
                )
                lines.append(f' - section: "{section_title}"')
                lines.append(" contents:")
                # Add the parent page first
                lines.append(f" - docs/models/{path}.qmd")
                # Then add all sub-pages
                for sub_path, _sub_name, _sub_title in generated:
                    if "/" in sub_path and sub_path.split("/")[0] == path:
                        lines.append(
                            f" - docs/models/{sub_path}.qmd"
                        )
        elif "/" not in path:
            # This is a flat item with no sub-pages
            # Skip if it was already included as part of a parent section
            if path not in processed_sections:
                lines.append(f" - docs/models/{path}.qmd")
        # Entries containing "/" are emitted only from within their parent's
        # section above; a sub-entry whose parent is absent from `generated`
        # is therefore not listed.

    yaml_content = "\n".join(lines) + "\n"

    # Pattern to match only the Model Guides contents, stopping at the next item
    # in Getting Started (lines starting with 12 spaces: same level as the section)
    pattern = r'( - section: "Model Guides"\n contents:)([^\n]*|.*?)(?=\n - |\n - section:|\n\nformat:)'

    def replacement(match):
        # Keep the section header (group 1) and swap in the regenerated list
        prefix = match.group(1)
        return prefix + "\n" + yaml_content

    new_content = re.sub(pattern, replacement, content, flags=re.DOTALL)

    if new_content != content:
        quarto_yml.write_text(new_content, encoding="utf-8")
        print(f"Updated {quarto_yml}")
    else:
        print(f"No changes needed for {quarto_yml}")
|
||||
|
||||
|
||||
def main():
    """Generate the model-guide .qmd pages from the allowlisted example READMEs.

    For every allowlist entry with a README under ``EXAMPLES_DIR`` this strips
    the first H1, rewrites inter-README links and image links, and writes a
    ``.qmd`` page under ``OUTPUT_DIR``. Parents that only appear through
    sub-entries get a page generated from their own README (if present).
    Finally an ``index.qmd`` listing is written and ``_quarto.yml`` is updated.

    Entries without a name, without a directory, or without a README are
    skipped with a warning on stderr.
    """
    allow = read_allowlist()
    if not EXAMPLES_DIR.exists():
        print(f"[WARN] {EXAMPLES_DIR} not found", file=sys.stderr)
        return

    (OUTPUT_DIR / "assets").mkdir(parents=True, exist_ok=True)

    # First pass: identify which parents have their own entry vs only sub-entries
    parent_entries = set()  # Parents that have their own entry
    parent_with_subs = set()  # Parents that have sub-entries
    allowlist_entries = set()  # All entries in allowlist

    for item in allow:
        name = item if isinstance(item, str) else item.get("name")
        if not name:
            # Nameless items are reported (and skipped) in the generation loop
            # below; skipping here avoids a TypeError on `"/" in None`.
            continue

        allowlist_entries.add(name)

        if "/" in name:
            parent = name.split("/")[0]
            parent_with_subs.add(parent)
        else:
            parent_entries.add(name)

    # Parents with subs that DON'T have their own entry -> use index.qmd
    parent_index_only = parent_with_subs - parent_entries

    generated = []
    seen_dirs = set()  # Track which parent directories we've created index for

    for item in allow:
        if isinstance(item, str):
            name = item
            title = None
        else:
            name = item.get("name")
            title = item.get("title")

        if not name:
            print(f"[WARN] Skipping item without name: {item}", file=sys.stderr)
            continue

        src_dir = EXAMPLES_DIR / name
        if not src_dir.exists() or not src_dir.is_dir():
            print(f"[WARN] Skipping {name} (not a directory)", file=sys.stderr)
            continue

        readme = find_readme(src_dir)
        if not readme:
            print(f"[WARN] Skipping {name} (no README.md)", file=sys.stderr)
            continue

        md = readme.read_text(encoding="utf-8")

        # Determine output path first (needed for link rewriting)
        parts = name.split("/")
        if len(parts) == 1:
            # Simple case: no subdirectory
            out_path = OUTPUT_DIR / f"{parts[0]}.qmd"
            sidebar_path = parts[0]
        else:
            # Has subdirectory: e.g., magistral/think
            parent = parts[0]
            child = "-".join(parts[1:])  # handle nested subdirs
            out_path = OUTPUT_DIR / parent / f"{child}.qmd"
            sidebar_path = f"{parent}/{child}"

        # Remove the first H1 (we use frontmatter title instead)
        md, _ = remove_first_h1(md)
        # Rewrite links between README files
        md = rewrite_readme_links(
            md,
            src_dir,
            EXAMPLES_DIR,
            parent_index_only,
            name,
            allowlist_entries,
            sidebar_path,
        )
        md = rewrite_and_copy_assets(md, src_dir, OUTPUT_DIR)

        # Handle parent page generation for sub-entries
        if len(parts) > 1:
            # Has subdirectory: e.g., magistral/think
            parent = parts[0]

            # Create parent.qmd if not already done and parent doesn't have own entry
            if parent not in seen_dirs and parent in parent_index_only:
                parent_readme = find_readme(EXAMPLES_DIR / parent)
                if parent_readme:
                    parent_md = parent_readme.read_text(encoding="utf-8")
                    parent_md, _ = remove_first_h1(parent_md)
                    parent_md = rewrite_readme_links(
                        parent_md,
                        EXAMPLES_DIR / parent,
                        EXAMPLES_DIR,
                        parent_index_only,
                        parent,
                        allowlist_entries,
                        parent,
                    )
                    parent_md = rewrite_and_copy_assets(
                        parent_md, EXAMPLES_DIR / parent, OUTPUT_DIR
                    )
                    parent_title = parent.replace("-", " ").replace("_", " ").title()
                    write_qmd(OUTPUT_DIR / f"{parent}.qmd", parent_title, parent_md)
                    # Listed before its first sub-page to keep sidebar order
                    generated.append((parent, parent, parent_title))
                seen_dirs.add(parent)

        if not title:
            title = name.replace("/", " ").replace("-", " ").title()

        write_qmd(out_path, title, md)
        generated.append((sidebar_path, name, title))

    # Index page - preserve allowlist order
    if generated:
        listing = "\n".join(
            [f"- [{title}]({path}.qmd)" for path, name, title in generated]
        )
        index_md = (
            "# Model Guides\n\nBelow are the curated examples for training various model architectures:\n\n"
            + listing
            + "\n"
        )
        index_fm = (
            "---\nexecute:\n eval: false\nformat:\n html:\n toc: true\n---\n\n"
        )
        (OUTPUT_DIR / "index.qmd").write_text(index_fm + index_md, encoding="utf-8")

    # Auto-update _quarto.yml to keep sidebar in sync
    update_quarto_yml(generated)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
61
docs/telemetry.qmd
Normal file
61
docs/telemetry.qmd
Normal file
@@ -0,0 +1,61 @@
|
||||
---
|
||||
title: Telemetry
|
||||
description: A description of the telemetry implementation in Axolotl.
|
||||
---
|
||||
|
||||
# Telemetry in Axolotl
|
||||
|
||||
Axolotl implements anonymous telemetry to help maintainers understand how the library
|
||||
is used and where users encounter issues. This data helps prioritize features, optimize
|
||||
performance, and fix bugs.
|
||||
|
||||
## Data Collection
|
||||
|
||||
We collect:
|
||||
|
||||
- System info: OS, Python version, Axolotl version, PyTorch version, Transformers
|
||||
version, etc.
|
||||
- Hardware info: CPU count, memory, GPU count and models
|
||||
- Runtime metrics: Training progress, memory usage, timing information
|
||||
- Usage patterns: Models (from a whitelist) and configurations used
|
||||
- Error tracking: Stack traces and error messages (sanitized to remove personal
|
||||
information)
|
||||
|
||||
Personally identifiable information (PII) is not collected.
|
||||
|
||||
## Implementation
|
||||
|
||||
Telemetry is implemented using PostHog and consists of:
|
||||
|
||||
- `axolotl.telemetry.TelemetryManager`: A singleton class that initializes the
|
||||
telemetry system and provides methods for tracking events.
|
||||
- `axolotl.telemetry.errors.send_errors`: A decorator that captures exceptions and
|
||||
sends sanitized stack traces.
|
||||
- `axolotl.telemetry.runtime_metrics.RuntimeMetricsTracker`: A class that tracks
|
||||
runtime metrics during training.
|
||||
- `axolotl.telemetry.callbacks.TelemetryCallback`: A Trainer callback that sends
|
||||
runtime metrics telemetry.
|
||||
|
||||
The telemetry system will block training startup for 10 seconds to ensure users are
|
||||
aware of data collection, unless telemetry is explicitly enabled or disabled.
|
||||
|
||||
## Opt-Out Mechanism
|
||||
|
||||
Telemetry is **enabled by default** on an opt-out basis. To disable it, set
|
||||
`AXOLOTL_DO_NOT_TRACK=1` or `DO_NOT_TRACK=1`.
|
||||
|
||||
A warning message will be logged on start to clearly inform users about telemetry.
|
||||
We will remove this after some period.
|
||||
|
||||
To hide the telemetry warning message that is displayed at startup of train and other commands,
|
||||
explicitly set: `AXOLOTL_DO_NOT_TRACK=0` (enable telemetry) or `AXOLOTL_DO_NOT_TRACK=1`
|
||||
(explicitly disable telemetry).
|
||||
|
||||
## Privacy
|
||||
|
||||
- All path-like config information is automatically redacted from telemetry data
|
||||
- Model information is only collected for whitelisted organizations
|
||||
- See `axolotl/telemetry/whitelist.yaml` for the set of whitelisted organizations
|
||||
- Each run generates a unique anonymous ID
|
||||
- This allows us to link different telemetry events in the same training run
|
||||
- Telemetry is only sent from the main process to avoid duplicate events
|
||||
@@ -40,7 +40,7 @@
|
||||
"%%capture\n",
|
||||
"# This step can take ~5-10 minutes to install dependencies\n",
|
||||
"!pip install --no-build-isolation axolotl[flash-attn]>=0.9.1\n",
|
||||
"!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@8a1a0ec\""
|
||||
"!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@318b7e2\""
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -253,7 +253,6 @@
|
||||
"source": [
|
||||
"from axolotl.utils import set_pytorch_cuda_alloc_conf\n",
|
||||
"\n",
|
||||
"# Set \"PYTORCH_CUDA_ALLOC_CONF\" env to save memory\n",
|
||||
"set_pytorch_cuda_alloc_conf()"
|
||||
]
|
||||
},
|
||||
|
||||
@@ -32,6 +32,10 @@ wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
trackio_project_name:
|
||||
trackio_run_name:
|
||||
trackio_space_id:
|
||||
|
||||
gradient_accumulation_steps: 2
|
||||
micro_batch_size: 1
|
||||
num_epochs: 1
|
||||
|
||||
@@ -28,6 +28,10 @@ wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
trackio_project_name:
|
||||
trackio_run_name:
|
||||
trackio_space_id:
|
||||
|
||||
gradient_accumulation_steps: 2
|
||||
micro_batch_size: 1
|
||||
num_epochs: 1
|
||||
|
||||
@@ -29,6 +29,10 @@ wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
trackio_project_name:
|
||||
trackio_run_name:
|
||||
trackio_space_id:
|
||||
|
||||
gradient_accumulation_steps: 2
|
||||
micro_batch_size: 1
|
||||
num_epochs: 1
|
||||
|
||||
@@ -28,6 +28,10 @@ wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
trackio_project_name:
|
||||
trackio_run_name:
|
||||
trackio_space_id:
|
||||
|
||||
gradient_accumulation_steps: 2
|
||||
micro_batch_size: 1
|
||||
num_epochs: 1
|
||||
|
||||
@@ -41,6 +41,10 @@ wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
trackio_project_name:
|
||||
trackio_run_name:
|
||||
trackio_space_id:
|
||||
|
||||
gradient_accumulation_steps: 8
|
||||
micro_batch_size: 1
|
||||
num_epochs: 1
|
||||
|
||||
@@ -41,6 +41,10 @@ wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
trackio_project_name:
|
||||
trackio_run_name:
|
||||
trackio_space_id:
|
||||
|
||||
gradient_accumulation_steps: 8
|
||||
micro_batch_size: 1
|
||||
num_epochs: 1
|
||||
|
||||
65
examples/granite4/README.md
Normal file
65
examples/granite4/README.md
Normal file
@@ -0,0 +1,65 @@
|
||||
# Finetune IBM's Granite 4.0 with Axolotl
|
||||
|
||||
[Granite 4.0](https://huggingface.co/collections/ibm-granite/granite-40-language-models) is a family of open-source models trained by IBM Research.
|
||||
|
||||
This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking.
|
||||
|
||||
## Getting started
|
||||
|
||||
1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). Granite 4.0 support is currently only available on nightly, so you need to either install from main or use our latest [Docker images](https://docs.axolotl.ai/docs/docker.html).
|
||||
|
||||
Here is an example of how to install from main for pip:
|
||||
|
||||
```bash
|
||||
# Ensure you have Pytorch installed (Pytorch 2.7.1 min)
|
||||
git clone https://github.com/axolotl-ai-cloud/axolotl.git
|
||||
cd axolotl
|
||||
|
||||
pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
|
||||
pip3 install --no-build-isolation -e '.[flash-attn]'
|
||||
|
||||
# Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
|
||||
python scripts/cutcrossentropy_install.py | sh
|
||||
```
|
||||
|
||||
2. Run the finetuning example:
|
||||
|
||||
```bash
|
||||
axolotl train examples/granite4/granite-4.0-tiny-fft.yaml
|
||||
```
|
||||
|
||||
This config uses about 40.8GiB VRAM.
|
||||
|
||||
Let us know how it goes. Happy finetuning! 🚀
|
||||
|
||||
### TIPS
|
||||
|
||||
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
|
||||
- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
|
||||
|
||||
### Limitation
|
||||
|
||||
Adapter finetuning does not work at the moment. It fails with the following error:
|
||||
|
||||
```bash
|
||||
RuntimeError: mat1 and mat2 shapes cannot be multiplied (4096x3072 and 1x1179648)
|
||||
```
|
||||
|
||||
In addition, even if adapter training worked, `lora_target_linear: true` would not work due to:
|
||||
```bash
|
||||
ValueError: Target module GraniteMoeHybridParallelExperts() is not supported.
|
||||
```
|
||||
|
||||
## Optimization Guides
|
||||
|
||||
- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
|
||||
- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
|
||||
- [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html)
|
||||
|
||||
## Related Resources
|
||||
|
||||
- [Granite Docs](https://www.ibm.com/granite/docs/models/granite)
|
||||
- [Axolotl Docs](https://docs.axolotl.ai)
|
||||
- [Axolotl Website](https://axolotl.ai)
|
||||
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
|
||||
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
|
||||
45
examples/granite4/granite-4.0-tiny-fft.yaml
Normal file
45
examples/granite4/granite-4.0-tiny-fft.yaml
Normal file
@@ -0,0 +1,45 @@
|
||||
base_model: ibm-granite/granite-4.0-tiny-preview
|
||||
|
||||
# Automatically upload checkpoint and final model to HF
|
||||
# hub_model_id: username/custom_model_name
|
||||
|
||||
plugins:
|
||||
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
|
||||
|
||||
datasets:
|
||||
- path: fozziethebeat/alpaca_messages_2k_test
|
||||
type: chat_template
|
||||
|
||||
dataset_prepared_path: last_run_prepared
|
||||
val_set_size: 0.1
|
||||
output_dir: ./outputs/model-out
|
||||
|
||||
sequence_len: 2048
|
||||
sample_packing: true
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
gradient_accumulation_steps: 4
|
||||
micro_batch_size: 2
|
||||
num_epochs: 1
|
||||
optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
bf16: auto
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 1
|
||||
saves_per_epoch: 1
|
||||
|
||||
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||
43
examples/internvl3_5/README.md
Normal file
43
examples/internvl3_5/README.md
Normal file
@@ -0,0 +1,43 @@
|
||||
# Finetune OpenGVLab's InternVL with Axolotl
|
||||
|
||||
[InternVL 3.5](https://huggingface.co/OpenGVLab/InternVL3_5-8B-HF) is a family of powerful vision-language models by OpenGVLab supporting dynamic resolution and multi-image understanding. It features a ViT-style vision encoder and a strong language model backbone for tasks like visual question answering, OCR, and scene text understanding.
|
||||
|
||||
This guide shows how to fine-tune it with Axolotl.
|
||||
|
||||
## Getting started
|
||||
|
||||
1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).
|
||||
|
||||
2. Install `timm` for vision model support:
|
||||
|
||||
```bash
|
||||
pip install timm==1.0.19
|
||||
```
|
||||
|
||||
3. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage.
|
||||
|
||||
4. Run the finetuning example:
|
||||
|
||||
```bash
|
||||
axolotl train examples/internvl3_5/internvl3_5-8b-qlora.yml
|
||||
```
|
||||
|
||||
This config uses about 8.21 GiB VRAM. Let us know how it goes. Happy finetuning! 🚀
|
||||
|
||||
### Tips
|
||||
|
||||
- You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
|
||||
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
|
||||
- The dataset format follows the multi-modal format as seen [here](https://docs.axolotl.ai/docs/multimodal.html#dataset-format).
|
||||
|
||||
## Optimization Guides
|
||||
|
||||
Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html).
|
||||
|
||||
## Related Resources
|
||||
|
||||
- [InternVL Paper](https://huggingface.co/papers/2508.18265)
|
||||
- [Axolotl Docs](https://docs.axolotl.ai)
|
||||
- [Axolotl Website](https://axolotl.ai)
|
||||
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
|
||||
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
|
||||
61
examples/internvl3_5/internvl3_5-8b-qlora.yml
Normal file
61
examples/internvl3_5/internvl3_5-8b-qlora.yml
Normal file
@@ -0,0 +1,61 @@
|
||||
base_model: OpenGVLab/InternVL3_5-8B-HF
|
||||
processor_type: AutoProcessor
|
||||
|
||||
plugins:
|
||||
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
|
||||
|
||||
load_in_4bit: true
|
||||
|
||||
# these 3 lines are needed for now to handle vision chat templates w images
|
||||
skip_prepare_dataset: true
|
||||
remove_unused_columns: false
|
||||
sample_packing: false
|
||||
|
||||
datasets:
|
||||
- path: HuggingFaceH4/llava-instruct-mix-vsft
|
||||
type: chat_template
|
||||
split: train[:1%]
|
||||
field_messages: messages
|
||||
|
||||
dataset_prepared_path: last_run_prepared
|
||||
val_set_size: 0.01
|
||||
output_dir: ./outputs/out
|
||||
|
||||
adapter: qlora
|
||||
lora_model_dir:
|
||||
|
||||
sequence_len: 2048
|
||||
|
||||
lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
gradient_accumulation_steps: 4
|
||||
micro_batch_size: 2
|
||||
num_epochs: 1
|
||||
optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
bf16: true
|
||||
fp16:
|
||||
tf32: true
|
||||
|
||||
gradient_checkpointing: true
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
eager_attention:
|
||||
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 1
|
||||
saves_per_epoch: 1
|
||||
weight_decay: 0.0
|
||||
|
||||
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||
47
examples/kimi-linear/README.md
Normal file
47
examples/kimi-linear/README.md
Normal file
@@ -0,0 +1,47 @@
|
||||
# Finetune MoonshotAI's Kimi Linear with Axolotl
|
||||
|
||||
[Kimi Linear](https://huggingface.co/collections/moonshotai/kimi-linear-a3b) is a MoE model (48B total, 3B active) by MoonshotAI using a hybrid linear attention architecture to achieve a 1M token context length. It uses Kimi Delta Attention (KDA), a refined version of Gated DeltaNet that reduces KV cache size by up to 75% and boosts decoding throughput by up to 6x for long contexts.
|
||||
|
||||
This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking.
|
||||
|
||||
**Note:** Axolotl uses experimental training code for Kimi Linear as their original modeling code is inference-only.
|
||||
|
||||
## Getting started
|
||||
|
||||
1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).
|
||||
|
||||
2. Install CCE via [docs](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy)
|
||||
|
||||
3. Run the finetuning example:
|
||||
|
||||
```bash
|
||||
axolotl train examples/kimi-linear/kimi-48b-lora.yaml
|
||||
```
|
||||
|
||||
This config uses about 98.7GiB VRAM.
|
||||
|
||||
Let us know how it goes. Happy finetuning!
|
||||
|
||||
### TIPS
|
||||
|
||||
- Kimi Linear requires `trust_remote_code: true`.
|
||||
- You can run a full finetuning by removing the `adapter: lora` and `load_in_8bit: true`.
|
||||
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html)
|
||||
- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template)
|
||||
|
||||
## Optimization Guides
|
||||
|
||||
See 👉 [docs](https://docs.axolotl.ai/docs/optimizations.html).
|
||||
|
||||
## Limitations
|
||||
|
||||
This is not yet compatible with MoE kernels from transformers v5.
|
||||
|
||||
## Related Resources
|
||||
|
||||
- [Kimi Linear Paper](https://huggingface.co/papers/2510.26692)
|
||||
- [Kimi Linear GitHub](https://github.com/MoonshotAI/Kimi-Linear)
|
||||
- [Axolotl Docs](https://docs.axolotl.ai)
|
||||
- [Axolotl Website](https://axolotl.ai)
|
||||
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
|
||||
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
|
||||
81
examples/kimi-linear/kimi-48b-lora.yaml
Normal file
81
examples/kimi-linear/kimi-48b-lora.yaml
Normal file
@@ -0,0 +1,81 @@
|
||||
base_model: moonshotai/Kimi-Linear-48B-A3B-Instruct
|
||||
|
||||
# Automatically upload checkpoint and final model to HF
|
||||
# hub_model_id: username/custom_model_name
|
||||
|
||||
trust_remote_code: true
|
||||
|
||||
plugins:
|
||||
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
|
||||
|
||||
load_in_8bit: true
|
||||
load_in_4bit: false
|
||||
strict: false
|
||||
|
||||
datasets:
|
||||
- path: fozziethebeat/alpaca_messages_2k_test
|
||||
type: chat_template
|
||||
split: train
|
||||
|
||||
dataset_prepared_path: last_run_prepared
|
||||
val_set_size: 0.2
|
||||
output_dir: ./outputs/lora-out
|
||||
|
||||
adapter: lora
|
||||
lora_model_dir:
|
||||
|
||||
sequence_len: 2048
|
||||
sample_packing: true
|
||||
pad_to_sequence_len: true
|
||||
|
||||
lora_r: 16
|
||||
lora_alpha: 32
|
||||
lora_dropout: 0.05
|
||||
lora_fan_in_fan_out:
|
||||
lora_target_modules:
|
||||
- gate_proj
|
||||
- down_proj
|
||||
- up_proj
|
||||
- q_proj
|
||||
- v_proj
|
||||
- k_proj
|
||||
- o_proj
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
gradient_accumulation_steps: 2
|
||||
micro_batch_size: 2
|
||||
num_epochs: 1
|
||||
optimizer: adamw_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
loss_watchdog_threshold: 5.0
|
||||
loss_watchdog_patience: 3
|
||||
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 2
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
@@ -29,7 +29,6 @@ flex_attention: true
|
||||
flex_attn_compile_kwargs:
|
||||
dynamic: false
|
||||
mode: max-autotune-no-cudagraphs
|
||||
save_strategy: no
|
||||
torch_compile: true
|
||||
|
||||
wandb_project:
|
||||
|
||||
@@ -13,7 +13,7 @@ Thanks to the team at MistralAI for giving us early access to prepare for these
|
||||
Here is an example of how to install from pip:
|
||||
|
||||
```bash
|
||||
# Ensure you have Pytorch installed (Pytorch 2.6.0 min)
|
||||
# Ensure you have Pytorch installed (Pytorch 2.7.0 min)
|
||||
pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
|
||||
pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
|
||||
```
|
||||
|
||||
@@ -5,6 +5,7 @@ This guide covers fine-tuning [Magistral Small 2507](https://huggingface.co/mist
|
||||
## Prerequisites
|
||||
|
||||
Before starting, ensure you have:
|
||||
|
||||
- Installed Axolotl (see [main README](../README.md))
|
||||
|
||||
## Getting Started
|
||||
|
||||
@@ -5,7 +5,8 @@ This guide covers fine-tuning [Magistral Small 2509](https://huggingface.co/mist
|
||||
## Prerequisites
|
||||
|
||||
Before starting, ensure you have:
|
||||
- Installed Axolotl from source (see [main README](../README.md#getting-started))
|
||||
|
||||
- Installed Axolotl from source (see [main README](../README.md))
|
||||
|
||||
## Getting started
|
||||
|
||||
|
||||
39
examples/mimo/README.md
Normal file
39
examples/mimo/README.md
Normal file
@@ -0,0 +1,39 @@
|
||||
# Finetune Xiaomi's MiMo with Axolotl
|
||||
|
||||
[MiMo](https://huggingface.co/XiaomiMiMo/MiMo-7B-RL) is a family of models trained from scratch for reasoning tasks, incorporating **Multiple-Token Prediction (MTP)** as an additional training objective for enhanced performance and faster inference. It is pre-trained on ~25T tokens with a three-stage data mixture strategy and optimized reasoning pattern density.
|
||||
|
||||
This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking.
|
||||
|
||||
## Getting started
|
||||
|
||||
1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).
|
||||
|
||||
2. Run the finetuning example:
|
||||
|
||||
```bash
|
||||
axolotl train examples/mimo/mimo-7b-qlora.yaml
|
||||
```
|
||||
|
||||
This config uses about 17.2 GiB VRAM. Let us know how it goes. Happy finetuning! 🚀
|
||||
|
||||
### Tips
|
||||
|
||||
- You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
|
||||
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
|
||||
- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
|
||||
|
||||
## Optimization Guides
|
||||
|
||||
Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html).
|
||||
|
||||
## Limitations
|
||||
|
||||
**Cut Cross Entropy (CCE)**: Currently not supported. We plan to include CCE support for MiMo in the near future.
|
||||
|
||||
## Related Resources
|
||||
|
||||
- [MiMo Paper](https://arxiv.org/abs/2505.07608)
|
||||
- [Axolotl Docs](https://docs.axolotl.ai)
|
||||
- [Axolotl Website](https://axolotl.ai)
|
||||
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
|
||||
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
|
||||
67
examples/mimo/mimo-7b-qlora.yaml
Normal file
67
examples/mimo/mimo-7b-qlora.yaml
Normal file
@@ -0,0 +1,67 @@
|
||||
base_model: XiaomiMiMo/MiMo-7B-RL
|
||||
trust_remote_code: true
|
||||
revision_of_model: 6299b5a
|
||||
|
||||
# Automatically upload checkpoint and final model to HF
|
||||
# hub_model_id: username/custom_model_name
|
||||
|
||||
# CCE - N/A as of now
|
||||
# plugins:
|
||||
# - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
|
||||
|
||||
load_in_8bit: false
|
||||
load_in_4bit: true
|
||||
|
||||
datasets:
|
||||
- path: fozziethebeat/alpaca_messages_2k_test
|
||||
type: chat_template
|
||||
|
||||
dataset_prepared_path: last_run_prepared
|
||||
val_set_size: 0.1
|
||||
output_dir: ./outputs/lora-out
|
||||
|
||||
adapter: qlora
|
||||
lora_model_dir:
|
||||
|
||||
sequence_len: 2048
|
||||
sample_packing: true
|
||||
|
||||
lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_linear: true
|
||||
lora_target_modules:
|
||||
- gate_proj
|
||||
- down_proj
|
||||
- up_proj
|
||||
- q_proj
|
||||
- v_proj
|
||||
- k_proj
|
||||
- o_proj
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
gradient_accumulation_steps: 4
|
||||
micro_batch_size: 2
|
||||
num_epochs: 1
|
||||
optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
bf16: auto
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 1
|
||||
saves_per_epoch: 1
|
||||
|
||||
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||
50
examples/ministral/README.md
Normal file
50
examples/ministral/README.md
Normal file
@@ -0,0 +1,50 @@
|
||||
# Finetune Ministral with Axolotl
|
||||
|
||||
Ministral is a family of open-weight models from MistralAI found on [HuggingFace](https://huggingface.co/mistralai/Ministral-8B-Instruct-2410). This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking.
|
||||
|
||||
## Getting started
|
||||
|
||||
1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).
|
||||
|
||||
2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage.
|
||||
|
||||
3. Run the finetuning example:
|
||||
|
||||
```bash
|
||||
axolotl train examples/ministral/ministral-small-qlora.yaml
|
||||
```
|
||||
|
||||
This config uses about 8.76 GiB VRAM.
|
||||
|
||||
Let us know how it goes. Happy finetuning! 🚀
|
||||
|
||||
### Tips
|
||||
|
||||
- We recommend adding the same/similar SystemPrompt that the model is tuned for. You can find this within the repo's files titled `SYSTEM_PROMPT.txt`.
|
||||
- You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
|
||||
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
|
||||
- The text dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
|
||||
|
||||
## Optimization Guides
|
||||
|
||||
Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html).
|
||||
|
||||
## Limitations
|
||||
|
||||
We only support the `mistral-common` tokenizer for Supervised Fine-tuning at the moment and for `type: chat_template` only.
|
||||
|
||||
In addition, we do not support overriding tokens yet.
|
||||
|
||||
## Related Resources
|
||||
|
||||
- [MistralAI Ministral Blog](https://mistral.ai/news/ministraux)
|
||||
- [Axolotl Docs](https://docs.axolotl.ai)
|
||||
- [Axolotl Website](https://axolotl.ai)
|
||||
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
|
||||
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
|
||||
|
||||
|
||||
## Future Work
|
||||
|
||||
- Add parity to Preference Tuning, RL, etc.
|
||||
- Add parity to other tokenizer configs like overriding tokens.
|
||||
67
examples/ministral/ministral-small-qlora.yaml
Normal file
67
examples/ministral/ministral-small-qlora.yaml
Normal file
@@ -0,0 +1,67 @@
|
||||
base_model: mistralai/Ministral-8B-Instruct-2410
|
||||
|
||||
# Enable to use mistral-common tokenizer
|
||||
tokenizer_use_mistral_common: true
|
||||
|
||||
# Automatically upload checkpoint and final model to HF
|
||||
# hub_model_id: username/custom_model_name
|
||||
|
||||
plugins:
|
||||
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
|
||||
|
||||
load_in_8bit: false
|
||||
load_in_4bit: true
|
||||
|
||||
datasets:
|
||||
- path: fozziethebeat/alpaca_messages_2k_test
|
||||
type: chat_template
|
||||
|
||||
dataset_prepared_path: last_run_prepared
|
||||
val_set_size: 0.1
|
||||
output_dir: ./outputs/lora-out
|
||||
|
||||
adapter: qlora
|
||||
lora_model_dir:
|
||||
|
||||
sequence_len: 2048
|
||||
sample_packing: true
|
||||
|
||||
lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_linear: true
|
||||
lora_target_modules:
|
||||
- gate_proj
|
||||
- down_proj
|
||||
- up_proj
|
||||
- q_proj
|
||||
- v_proj
|
||||
- k_proj
|
||||
- o_proj
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
gradient_accumulation_steps: 4
|
||||
micro_batch_size: 2
|
||||
num_epochs: 1
|
||||
optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
bf16: auto
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 1
|
||||
saves_per_epoch: 1
|
||||
|
||||
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||
79
examples/ministral3/README.md
Normal file
79
examples/ministral3/README.md
Normal file
@@ -0,0 +1,79 @@
|
||||
# Finetune Ministral3 with Axolotl
|
||||
|
||||
Ministral3 is a family of open-weight models from MistralAI found on [HuggingFace](https://huggingface.co/collections/mistralai/ministral-3). This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking.
|
||||
|
||||
Please see [Thinking](#thinking) and [Vision](#vision) for their respective fine-tuning.
|
||||
|
||||
Thanks to the team at MistralAI for giving us early access to prepare for these releases.
|
||||
|
||||
Note: This is still experimental given it is based on transformers v5 RC.
|
||||
|
||||
## Getting started
|
||||
|
||||
1. Install Axolotl from source following the [installation guide](https://docs.axolotl.ai/docs/installation.html#sec-edge-build).
|
||||
|
||||
2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage.
|
||||
|
||||
3. Swap to the Axolotl transformers v5 branch
|
||||
|
||||
```bash
|
||||
cp examples/ministral3/ministral3-3b-qlora.yaml ministral3-3b-qlora.yaml
|
||||
|
||||
git fetch
|
||||
git checkout transformers-v5
|
||||
|
||||
# Install packages for transformers v5
|
||||
pip install -e .
|
||||
```
|
||||
|
||||
4. Run the fine-tuning:
|
||||
|
||||
```bash
|
||||
axolotl train ministral3-3b-qlora.yaml
|
||||
```
|
||||
|
||||
Let us know how it goes. Happy finetuning! 🚀
|
||||
|
||||
|
||||
### Tips
|
||||
|
||||
- We recommend adding the same/similar SystemPrompt that the model is tuned for. You can find this within the repo's files titled `SYSTEM_PROMPT.txt`.
|
||||
- You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
|
||||
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
|
||||
- The text dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
|
||||
|
||||
### Thinking
|
||||
|
||||
The Ministral3 2512 model supports thinking capabilities, enabling Chain-of-Thought reasoning with explicit thinking steps.
|
||||
|
||||
📚 **[See the Thinking fine-tuning guide →](./think/README.md)**
|
||||
|
||||
### Vision
|
||||
|
||||
The Ministral3 2512 model also supports vision capabilities.
|
||||
|
||||
📚 **[See the Vision fine-tuning guide →](./vision/README.md)**
|
||||
|
||||
## Optimization Guides
|
||||
|
||||
Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html).
|
||||
|
||||
## Limitations
|
||||
|
||||
We only support the `mistral-common` tokenizer for Supervised Fine-tuning at the moment and for `type: chat_template` only.
|
||||
|
||||
In addition, we do not support overriding tokens yet.
|
||||
|
||||
## Related Resources
|
||||
|
||||
- [MistralAI Mistral3 Blog](https://mistral.ai/news/mistral-3)
|
||||
- [Axolotl Docs](https://docs.axolotl.ai)
|
||||
- [Axolotl Website](https://axolotl.ai)
|
||||
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
|
||||
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
|
||||
|
||||
|
||||
## Future Work
|
||||
|
||||
- Add parity to Preference Tuning, RL, etc.
|
||||
- Add parity to other tokenizer configs like overriding tokens.
|
||||
67
examples/ministral3/ministral3-3b-qlora.yaml
Normal file
67
examples/ministral3/ministral3-3b-qlora.yaml
Normal file
@@ -0,0 +1,67 @@
|
||||
base_model: mistralai/Ministral-3-3B-Reasoning-2512
|
||||
|
||||
# Enable to use mistral-common tokenizer
|
||||
tokenizer_use_mistral_common: true
|
||||
|
||||
# Automatically upload checkpoint and final model to HF
|
||||
# hub_model_id: username/custom_model_name
|
||||
|
||||
plugins:
|
||||
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
|
||||
|
||||
load_in_8bit: false
|
||||
load_in_4bit: true
|
||||
|
||||
datasets:
|
||||
- path: fozziethebeat/alpaca_messages_2k_test
|
||||
type: chat_template
|
||||
|
||||
dataset_prepared_path: last_run_prepared
|
||||
val_set_size: 0.1
|
||||
output_dir: ./outputs/lora-out
|
||||
|
||||
adapter: qlora
|
||||
lora_model_dir:
|
||||
|
||||
sequence_len: 2048
|
||||
sample_packing: true
|
||||
|
||||
lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_linear: true
|
||||
lora_target_modules:
|
||||
- gate_proj
|
||||
- down_proj
|
||||
- up_proj
|
||||
- q_proj
|
||||
- v_proj
|
||||
- k_proj
|
||||
- o_proj
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
gradient_accumulation_steps: 4
|
||||
micro_batch_size: 2
|
||||
num_epochs: 1
|
||||
optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
bf16: auto
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 1
|
||||
saves_per_epoch: 1
|
||||
|
||||
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||
74
examples/ministral3/think/README.md
Normal file
74
examples/ministral3/think/README.md
Normal file
@@ -0,0 +1,74 @@
|
||||
# Ministral3 2512 Thinking Fine-tuning
|
||||
|
||||
This guide covers fine-tuning [Ministral3 2512](https://huggingface.co/collections/mistralai/ministral-3) with thinking capabilities using Axolotl. The thinking model enables explicit Chain-of-Thought reasoning with separate thinking and response sections.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
Before starting, ensure you have:
|
||||
|
||||
- Installed Axolotl (see [main README](../README.md))
|
||||
|
||||
## Getting Started
|
||||
|
||||
Run the thinking model fine-tuning:
|
||||
|
||||
```bash
|
||||
axolotl train examples/ministral3/think/ministral3-3b-think-qlora.yaml
|
||||
```
|
||||
|
||||
This config uses about 4.76 GiB VRAM.
|
||||
|
||||
### Tips
|
||||
|
||||
- Dataset uses multi-content format with `type: thinking` support. See [Dataset Format](#dataset-format) below.
|
||||
- Do not mix `content: str` and `content: list[dict]`; otherwise, dataset loading will fail. Keep the format consistent.
|
||||
|
||||
## Dataset Format
|
||||
|
||||
The thinking model requires the multi-content dataset format with support for an extra `type: thinking` content item within system and assistant messages.
|
||||
|
||||
Example format:
|
||||
|
||||
```json
|
||||
{
|
||||
"messages": [
|
||||
{
|
||||
"role": "system",
|
||||
"content": [
|
||||
{ "type": "text", "text": "{SYSTEM_PROMPT}"}
|
||||
]
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{ "type": "text", "text": "Solve this step by step: What is 15% of 240?"}
|
||||
]
|
||||
},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": [
|
||||
{
|
||||
"type": "thinking",
|
||||
"thinking": "I need to calculate 15% of 240. First, I'll convert 15% to decimal: 0.15. Then multiply: 0.15 × 240 = 36."
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "To find 15% of 240, I'll multiply 240 by 0.15:\n\n240 × 0.15 = 36\n\nTherefore, 15% of 240 is 36."
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
### Advanced Options
|
||||
|
||||
The `thinking` section supports an optional `closed` parameter:
|
||||
|
||||
```json
|
||||
{
|
||||
"type": "thinking",
|
||||
"thinking": "Internal reasoning here...",
|
||||
"closed": true // Default: true, controls adding the closing [/THINK] tag
|
||||
}
|
||||
```
|
||||
67
examples/ministral3/think/ministral3-3b-think-qlora.yaml
Normal file
67
examples/ministral3/think/ministral3-3b-think-qlora.yaml
Normal file
@@ -0,0 +1,67 @@
|
||||
base_model: mistralai/Ministral-3-3B-Reasoning-2512
|
||||
|
||||
# Enable to use mistral-common tokenizer
|
||||
tokenizer_use_mistral_common: true
|
||||
|
||||
# Automatically upload checkpoint and final model to HF
|
||||
# hub_model_id: username/custom_model_name
|
||||
|
||||
plugins:
|
||||
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
|
||||
|
||||
load_in_8bit: false
|
||||
load_in_4bit: true
|
||||
|
||||
datasets:
|
||||
- path: Nanobit/text-think-2k-test
|
||||
type: chat_template
|
||||
|
||||
dataset_prepared_path: last_run_prepared
|
||||
val_set_size: 0
|
||||
output_dir: ./outputs/lora-out
|
||||
|
||||
adapter: qlora
|
||||
lora_model_dir:
|
||||
|
||||
sequence_len: 2048
|
||||
sample_packing: true
|
||||
|
||||
lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_linear: true
|
||||
lora_target_modules:
|
||||
- gate_proj
|
||||
- down_proj
|
||||
- up_proj
|
||||
- q_proj
|
||||
- v_proj
|
||||
- k_proj
|
||||
- o_proj
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
gradient_accumulation_steps: 4
|
||||
micro_batch_size: 2
|
||||
num_epochs: 1
|
||||
optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
bf16: auto
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 1
|
||||
saves_per_epoch: 1
|
||||
|
||||
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||
58
examples/ministral3/vision/README.md
Normal file
58
examples/ministral3/vision/README.md
Normal file
@@ -0,0 +1,58 @@
|
||||
# Ministral3 2512 Vision Fine-tuning
|
||||
|
||||
This guide covers fine-tuning [Ministral3 2512](https://huggingface.co/collections/mistralai/ministral-3) with vision capabilities using Axolotl.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
Before starting, ensure you have:
|
||||
|
||||
- Installed Axolotl from source (see [main README](../README.md))
|
||||
|
||||
## Getting started
|
||||
|
||||
1. Install the required vision lib:
|
||||
```bash
|
||||
pip install 'mistral-common[opencv]==1.8.6'
|
||||
```
|
||||
|
||||
2. Download the example dataset image:
|
||||
```bash
|
||||
wget https://huggingface.co/datasets/Nanobit/text-vision-2k-test/resolve/main/African_elephant.jpg
|
||||
```
|
||||
|
||||
3. Run the fine-tuning:
|
||||
```bash
|
||||
axolotl train examples/ministral3/vision/ministral3-3b-vision-qlora.yml
|
||||
```
|
||||
|
||||
WARNING: The loss and grad norm will be much higher than normal at first. We currently suspect this is inherent to the model. If anyone would like to submit a fix for this, we are happy to take a look.
|
||||
|
||||
### Tips
|
||||
|
||||
Key differences from text-only model:
|
||||
- Multi-modal dataset format required
|
||||
- Sample packing not supported
|
||||
|
||||
## Dataset Format
|
||||
|
||||
The vision model requires multi-modal dataset format as documented [here](https://docs.axolotl.ai/docs/multimodal.html#dataset-format).
|
||||
|
||||
One exception: passing `"image": PIL.Image` is not supported. MistralTokenizer only supports `path`, `url`, and `base64` for now.
|
||||
|
||||
Example:
|
||||
```json
|
||||
{
|
||||
"messages": [
|
||||
{"role": "system", "content": [{ "type": "text", "text": "{SYSTEM_PROMPT}"}]},
|
||||
{"role": "user", "content": [
|
||||
{ "type": "text", "text": "What's in this image?"},
|
||||
{"type": "image", "path": "path/to/image.jpg" }
|
||||
]},
|
||||
{"role": "assistant", "content": [{ "type": "text", "text": "..." }]}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
## Limitations
|
||||
|
||||
- Sample packing is not currently supported for multi-modal training.
|
||||
64
examples/ministral3/vision/ministral3-3b-vision-qlora.yml
Normal file
64
examples/ministral3/vision/ministral3-3b-vision-qlora.yml
Normal file
@@ -0,0 +1,64 @@
|
||||
base_model: mistralai/Ministral-3-3B-Reasoning-2512
|
||||
processor_type: AutoProcessor
|
||||
|
||||
# Enable to use mistral-common tokenizer
|
||||
tokenizer_use_mistral_common: true
|
||||
|
||||
plugins:
|
||||
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
|
||||
|
||||
load_in_4bit: true
|
||||
|
||||
# these 3 lines are needed for now to handle vision chat templates with images
|
||||
skip_prepare_dataset: true
|
||||
remove_unused_columns: false
|
||||
sample_packing: false
|
||||
|
||||
# sample dataset below requires downloading image in advance
|
||||
# wget https://huggingface.co/datasets/Nanobit/text-vision-2k-test/resolve/main/African_elephant.jpg
|
||||
datasets:
|
||||
- path: Nanobit/text-vision-2k-test
|
||||
type: chat_template
|
||||
|
||||
dataset_prepared_path: last_run_prepared
|
||||
val_set_size: 0.01
|
||||
output_dir: ./outputs/out
|
||||
|
||||
adapter: qlora
|
||||
lora_model_dir:
|
||||
|
||||
sequence_len: 2048
|
||||
|
||||
lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
gradient_accumulation_steps: 1
|
||||
micro_batch_size: 1
|
||||
num_epochs: 1
|
||||
optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
bf16: true
|
||||
fp16:
|
||||
tf32: true
|
||||
|
||||
gradient_checkpointing: true
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 1
|
||||
saves_per_epoch: 1
|
||||
weight_decay: 0.0
|
||||
special_tokens:
|
||||
|
||||
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||
@@ -5,6 +5,7 @@ This guide covers fine-tuning [Mistral Small 3.1](mistralai/Mistral-Small-3.1-24
|
||||
## Prerequisites
|
||||
|
||||
Before starting, ensure you have:
|
||||
|
||||
- Installed Axolotl (see [Installation docs](https://docs.axolotl.ai/docs/installation.html))
|
||||
|
||||
## Getting Started
|
||||
38
examples/olmo3/README.md
Normal file
38
examples/olmo3/README.md
Normal file
@@ -0,0 +1,38 @@
|
||||
# Finetune Allenai's Olmo 3 with Axolotl
|
||||
|
||||
[Olmo 3](https://huggingface.co/collections/allenai/olmo-3) is a family of 7B and 32B open source models trained by The Allen Institute for Artificial Intelligence.
|
||||
|
||||
This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking.
|
||||
|
||||
## Getting started
|
||||
|
||||
1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).
|
||||
|
||||
2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage.
|
||||
|
||||
3. Run the finetuning example:
|
||||
|
||||
```bash
|
||||
axolotl train examples/olmo3/olmo3-7b-qlora.yaml
|
||||
```
|
||||
|
||||
This uses about 11.3 GiB VRAM. Let us know how it goes. Happy finetuning! 🚀
|
||||
|
||||
### TIPS
|
||||
|
||||
- The example config can be re-used for Olmo and Olmo 2.
|
||||
- You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
|
||||
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
|
||||
- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
|
||||
|
||||
## Optimization Guides
|
||||
|
||||
Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html).
|
||||
|
||||
## Related Resources
|
||||
|
||||
- [Olmo 3 Blog](https://allenai.org/blog/olmo3)
|
||||
- [Axolotl Docs](https://docs.axolotl.ai)
|
||||
- [Axolotl Website](https://axolotl.ai)
|
||||
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
|
||||
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
|
||||
64
examples/olmo3/olmo3-7b-qlora.yaml
Normal file
64
examples/olmo3/olmo3-7b-qlora.yaml
Normal file
@@ -0,0 +1,64 @@
|
||||
base_model: allenai/Olmo-3-7B-Instruct-SFT
|
||||
|
||||
# Automatically upload checkpoint and final model to HF
|
||||
# hub_model_id: username/custom_model_name
|
||||
|
||||
plugins:
|
||||
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
|
||||
|
||||
load_in_8bit: false
|
||||
load_in_4bit: true
|
||||
|
||||
datasets:
|
||||
- path: fozziethebeat/alpaca_messages_2k_test
|
||||
type: chat_template
|
||||
|
||||
dataset_prepared_path: last_run_prepared
|
||||
val_set_size: 0.1
|
||||
output_dir: ./outputs/lora-out
|
||||
|
||||
adapter: qlora
|
||||
lora_model_dir:
|
||||
|
||||
sequence_len: 2048
|
||||
sample_packing: true
|
||||
|
||||
lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_linear: true
|
||||
lora_target_modules:
|
||||
- gate_proj
|
||||
- down_proj
|
||||
- up_proj
|
||||
- q_proj
|
||||
- v_proj
|
||||
- k_proj
|
||||
- o_proj
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
gradient_accumulation_steps: 2
|
||||
micro_batch_size: 2
|
||||
num_epochs: 1
|
||||
optimizer: adamw_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
bf16: auto
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 1
|
||||
saves_per_epoch: 1
|
||||
|
||||
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||
42
examples/plano/README.md
Normal file
42
examples/plano/README.md
Normal file
@@ -0,0 +1,42 @@
|
||||
# Finetune Katanemo's Plano-Orchestrator with Axolotl
|
||||
|
||||
[Plano-Orchestrator](https://huggingface.co/collections/katanemo/plano-orchestrator) is a family of 4B and 30B-A3B routing and orchestration models designed for multi-agent systems. It analyzes user intent and conversation context to make precise routing decisions, excelling at multi-turn context understanding, multi-intent detection, and context-dependent routing.
|
||||
|
||||
This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking.
|
||||
|
||||
## Getting started
|
||||
|
||||
1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).
|
||||
|
||||
2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage.
|
||||
|
||||
3. Run the finetuning example:
|
||||
|
||||
```bash
|
||||
axolotl train examples/plano/plano-4b-qlora.yaml
|
||||
```
|
||||
|
||||
This config uses about 5.1 GiB VRAM. Let us know how it goes. Happy finetuning! 🚀
|
||||
|
||||
### Orchestration Prompt
|
||||
|
||||
Plano-Orchestrator uses a specific orchestration prompt format for routing/agent decisions. Please check the [official model card](https://huggingface.co/katanemo/Plano-Orchestrator-4B) for proper prompt formatting and the `ORCHESTRATION_PROMPT` template.
|
||||
|
||||
### Tips
|
||||
|
||||
- To use the larger [Plano-Orchestrator-30B-A3B](https://huggingface.co/katanemo/Plano-Orchestrator-30B-A3B) MoE model, simply set `base_model: katanemo/Plano-Orchestrator-30B-A3B` in the config and enable multi-GPU training if needed.
|
||||
- You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
|
||||
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
|
||||
- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
|
||||
|
||||
## Optimization Guides
|
||||
|
||||
Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html).
|
||||
|
||||
## Related Resources
|
||||
|
||||
- [Plano GitHub](https://github.com/katanemo/plano)
|
||||
- [Axolotl Docs](https://docs.axolotl.ai)
|
||||
- [Axolotl Website](https://axolotl.ai)
|
||||
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
|
||||
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
|
||||
65
examples/plano/plano-4b-qlora.yaml
Normal file
65
examples/plano/plano-4b-qlora.yaml
Normal file
@@ -0,0 +1,65 @@
|
||||
base_model: katanemo/Plano-Orchestrator-4B
|
||||
|
||||
# Automatically upload checkpoint and final model to HF
|
||||
# hub_model_id: username/custom_model_name
|
||||
|
||||
plugins:
|
||||
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
|
||||
|
||||
load_in_8bit: false
|
||||
load_in_4bit: true
|
||||
|
||||
chat_template: qwen3
|
||||
datasets:
|
||||
- path: fozziethebeat/alpaca_messages_2k_test
|
||||
type: chat_template
|
||||
|
||||
dataset_prepared_path: last_run_prepared
|
||||
val_set_size: 0.1
|
||||
output_dir: ./outputs/lora-out
|
||||
|
||||
adapter: qlora
|
||||
lora_model_dir:
|
||||
|
||||
sequence_len: 2048
|
||||
sample_packing: true
|
||||
|
||||
lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_linear: true
|
||||
lora_target_modules:
|
||||
- gate_proj
|
||||
- down_proj
|
||||
- up_proj
|
||||
- q_proj
|
||||
- v_proj
|
||||
- k_proj
|
||||
- o_proj
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
gradient_accumulation_steps: 4
|
||||
micro_batch_size: 2
|
||||
num_epochs: 1
|
||||
optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
bf16: auto
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 1
|
||||
saves_per_epoch: 1
|
||||
|
||||
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||
67
examples/qat_nvfp4/Gemma3-12B_baseline.yml
Normal file
67
examples/qat_nvfp4/Gemma3-12B_baseline.yml
Normal file
@@ -0,0 +1,67 @@
|
||||
base_model: google/gemma-3-12b-it
|
||||
# Automatically upload checkpoint and final model to HF
|
||||
# hub_model_id: username/custom_model_name
|
||||
|
||||
load_in_8bit: false
|
||||
load_in_4bit: false
|
||||
strict: false
|
||||
|
||||
plugins:
|
||||
- axolotl.integrations.liger.LigerPlugin
|
||||
|
||||
liger_rope: true
|
||||
liger_rms_norm: true
|
||||
liger_glu_activation: true
|
||||
liger_layer_norm: true
|
||||
liger_fused_linear_cross_entropy: true
|
||||
seed: 42
|
||||
chat_template: gemma3
|
||||
datasets:
|
||||
- path: tatsu-lab/alpaca
|
||||
type: alpaca
|
||||
|
||||
output_dir: ./outputs/out_gemma/
|
||||
|
||||
sequence_len: 8096
|
||||
sample_packing: true
|
||||
flash_attention: true
|
||||
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
gradient_accumulation_steps: 1
|
||||
micro_batch_size: 16
|
||||
|
||||
num_epochs: 1
|
||||
optimizer: adamw_torch_fused
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 4e-5
|
||||
|
||||
bf16: true
|
||||
tf32: true
|
||||
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
|
||||
# evals_per_epoch: 1
|
||||
saves_per_epoch: 1
|
||||
|
||||
warmup_ratio: 0.1
|
||||
weight_decay: 0.0
|
||||
fsdp_version: 2
|
||||
|
||||
fsdp_config:
|
||||
offload_params: false
|
||||
cpu_ram_efficient_loading: true
|
||||
auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||
transformer_layer_cls_to_wrap: Gemma3DecoderLayer
|
||||
state_dict_type: FULL_STATE_DICT
|
||||
sharding_strategy: FULL_SHARD
|
||||
reshard_after_forward: true
|
||||
activation_checkpointing: true
|
||||
|
||||
special_tokens:
|
||||
|
||||
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||
72
examples/qat_nvfp4/Gemma3-12B_qat.yml
Normal file
72
examples/qat_nvfp4/Gemma3-12B_qat.yml
Normal file
@@ -0,0 +1,72 @@
|
||||
base_model: google/gemma-3-12b-it
|
||||
# Automatically upload checkpoint and final model to HF
|
||||
# hub_model_id: username/custom_model_name
|
||||
|
||||
load_in_8bit: false
|
||||
load_in_4bit: false
|
||||
strict: false
|
||||
|
||||
plugins:
|
||||
- axolotl.integrations.liger.LigerPlugin
|
||||
|
||||
liger_rope: true
|
||||
liger_rms_norm: true
|
||||
liger_glu_activation: true
|
||||
liger_layer_norm: true
|
||||
liger_fused_linear_cross_entropy: true
|
||||
seed: 42
|
||||
chat_template: gemma3
|
||||
datasets:
|
||||
- path: tatsu-lab/alpaca
|
||||
type: alpaca
|
||||
|
||||
output_dir: ./outputs/qat_out_gemma/
|
||||
|
||||
sequence_len: 8096
|
||||
sample_packing: true
|
||||
flash_attention: true
|
||||
|
||||
qat:
|
||||
activation_dtype: nvfp4
|
||||
weight_dtype: nvfp4
|
||||
group_size: 16 # only group_size of 16 is supported with nvfp4
|
||||
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
gradient_accumulation_steps: 1
|
||||
micro_batch_size: 16
|
||||
|
||||
num_epochs: 1
|
||||
optimizer: adamw_torch_fused
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 4e-5
|
||||
|
||||
bf16: true
|
||||
tf32: true
|
||||
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
|
||||
evals_per_epoch: 1
|
||||
saves_per_epoch: 1
|
||||
|
||||
warmup_ratio: 0.1
|
||||
weight_decay: 0.0
|
||||
fsdp_version: 2
|
||||
|
||||
fsdp_config:
|
||||
offload_params: false
|
||||
cpu_ram_efficient_loading: true
|
||||
auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||
transformer_layer_cls_to_wrap: Gemma3DecoderLayer
|
||||
state_dict_type: FULL_STATE_DICT
|
||||
sharding_strategy: FULL_SHARD
|
||||
reshard_after_forward: true
|
||||
activation_checkpointing: true
|
||||
|
||||
special_tokens:
|
||||
|
||||
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||
67
examples/qat_nvfp4/Math-Gemma3-12B_baseline.yml
Normal file
67
examples/qat_nvfp4/Math-Gemma3-12B_baseline.yml
Normal file
@@ -0,0 +1,67 @@
|
||||
base_model: google/gemma-3-12b-it
|
||||
# Math finetuning configuration for Gemma3-12B
|
||||
# hub_model_id: username/custom_model_name
|
||||
|
||||
load_in_8bit: false
|
||||
load_in_4bit: false
|
||||
strict: false
|
||||
|
||||
plugins:
|
||||
- axolotl.integrations.liger.LigerPlugin
|
||||
|
||||
liger_rope: true
|
||||
liger_rms_norm: true
|
||||
liger_glu_activation: true
|
||||
liger_layer_norm: true
|
||||
liger_fused_linear_cross_entropy: true
|
||||
seed: 42
|
||||
chat_template: gemma3
|
||||
datasets:
|
||||
- path: AI-MO/NuminaMath-CoT
|
||||
type: chat_template
|
||||
|
||||
output_dir: ./outputs/out_math_gemma/
|
||||
|
||||
sequence_len: 4096
|
||||
sample_packing: true
|
||||
flash_attention: true
|
||||
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
gradient_accumulation_steps: 1
|
||||
micro_batch_size: 8
|
||||
|
||||
num_epochs: 1
|
||||
optimizer: adamw_torch_fused
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 3e-5
|
||||
|
||||
bf16: true
|
||||
tf32: true
|
||||
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
|
||||
# evals_per_epoch: 1
|
||||
saves_per_epoch: 1
|
||||
|
||||
warmup_ratio: 0.1
|
||||
weight_decay: 0.0
|
||||
fsdp_version: 2
|
||||
|
||||
fsdp_config:
|
||||
offload_params: false
|
||||
cpu_ram_efficient_loading: true
|
||||
auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||
transformer_layer_cls_to_wrap: Gemma3DecoderLayer
|
||||
state_dict_type: FULL_STATE_DICT
|
||||
sharding_strategy: FULL_SHARD
|
||||
reshard_after_forward: true
|
||||
activation_checkpointing: true
|
||||
|
||||
special_tokens:
|
||||
|
||||
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||
72
examples/qat_nvfp4/Math-Gemma3-12B_qat.yml
Normal file
72
examples/qat_nvfp4/Math-Gemma3-12B_qat.yml
Normal file
@@ -0,0 +1,72 @@
|
||||
base_model: google/gemma-3-12b-it
|
||||
# Math finetuning configuration for Gemma3-12B
|
||||
# hub_model_id: username/custom_model_name
|
||||
|
||||
load_in_8bit: false
|
||||
load_in_4bit: false
|
||||
strict: false
|
||||
|
||||
plugins:
|
||||
- axolotl.integrations.liger.LigerPlugin
|
||||
|
||||
liger_rope: true
|
||||
liger_rms_norm: true
|
||||
liger_glu_activation: true
|
||||
liger_layer_norm: true
|
||||
liger_fused_linear_cross_entropy: true
|
||||
seed: 42
|
||||
chat_template: gemma3
|
||||
datasets:
|
||||
- path: AI-MO/NuminaMath-CoT
|
||||
type: chat_template
|
||||
|
||||
output_dir: ./outputs/qat_out_math_gemma/
|
||||
|
||||
sequence_len: 4096
|
||||
sample_packing: true
|
||||
flash_attention: true
|
||||
|
||||
qat:
|
||||
activation_dtype: nvfp4
|
||||
weight_dtype: nvfp4
|
||||
group_size: 16 # only group_size of 16 is supported with nvfp4
|
||||
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
gradient_accumulation_steps: 1
|
||||
micro_batch_size: 8
|
||||
|
||||
num_epochs: 1
|
||||
optimizer: adamw_torch_fused
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 3e-5
|
||||
|
||||
bf16: true
|
||||
tf32: true
|
||||
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
|
||||
# evals_per_epoch: 1
|
||||
saves_per_epoch: 1
|
||||
|
||||
warmup_ratio: 0.1
|
||||
weight_decay: 0.0
|
||||
fsdp_version: 2
|
||||
|
||||
fsdp_config:
|
||||
offload_params: false
|
||||
cpu_ram_efficient_loading: true
|
||||
auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||
transformer_layer_cls_to_wrap: Gemma3DecoderLayer
|
||||
state_dict_type: FULL_STATE_DICT
|
||||
sharding_strategy: FULL_SHARD
|
||||
reshard_after_forward: true
|
||||
activation_checkpointing: true
|
||||
|
||||
special_tokens:
|
||||
|
||||
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||
68
examples/qat_nvfp4/Math-Gemma3-27B_baseline.yml
Normal file
68
examples/qat_nvfp4/Math-Gemma3-27B_baseline.yml
Normal file
@@ -0,0 +1,68 @@
|
||||
base_model: google/gemma-3-27b-it
|
||||
# Math finetuning configuration for Gemma3-27B
|
||||
# hub_model_id: username/custom_model_name
|
||||
|
||||
load_in_8bit: false
|
||||
load_in_4bit: false
|
||||
strict: false
|
||||
|
||||
plugins:
|
||||
- axolotl.integrations.liger.LigerPlugin
|
||||
|
||||
liger_rope: true
|
||||
liger_rms_norm: true
|
||||
liger_glu_activation: true
|
||||
liger_layer_norm: true
|
||||
liger_fused_linear_cross_entropy: true
|
||||
seed: 42
|
||||
chat_template: gemma3
|
||||
datasets:
|
||||
- path: AI-MO/NuminaMath-CoT
|
||||
type: chat_template
|
||||
|
||||
output_dir: ./outputs/out_math_gemma27/
|
||||
|
||||
sequence_len: 4096
|
||||
sample_packing: true
|
||||
flash_attention: true
|
||||
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
gradient_accumulation_steps: 1
|
||||
micro_batch_size: 16
|
||||
|
||||
num_epochs: 1
|
||||
optimizer: adamw_torch_fused
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 5e-6
|
||||
eta_min: 7e-7
|
||||
|
||||
bf16: true
|
||||
tf32: true
|
||||
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
|
||||
# evals_per_epoch: 1
|
||||
saves_per_epoch: 1
|
||||
|
||||
warmup_ratio: 0.1
|
||||
weight_decay: 0.0
|
||||
fsdp_version: 2
|
||||
|
||||
fsdp_config:
|
||||
offload_params: false
|
||||
cpu_ram_efficient_loading: true
|
||||
auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||
transformer_layer_cls_to_wrap: Gemma3DecoderLayer
|
||||
state_dict_type: FULL_STATE_DICT
|
||||
sharding_strategy: FULL_SHARD
|
||||
reshard_after_forward: true
|
||||
activation_checkpointing: true
|
||||
|
||||
special_tokens:
|
||||
|
||||
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||
73
examples/qat_nvfp4/Math-Gemma3-27B_qat.yml
Normal file
73
examples/qat_nvfp4/Math-Gemma3-27B_qat.yml
Normal file
@@ -0,0 +1,73 @@
|
||||
base_model: google/gemma-3-27b-it
|
||||
# Math finetuning configuration for Gemma3-27B
|
||||
# hub_model_id: username/custom_model_name
|
||||
|
||||
load_in_8bit: false
|
||||
load_in_4bit: false
|
||||
strict: false
|
||||
|
||||
plugins:
|
||||
- axolotl.integrations.liger.LigerPlugin
|
||||
|
||||
liger_rope: true
|
||||
liger_rms_norm: true
|
||||
liger_glu_activation: true
|
||||
liger_layer_norm: true
|
||||
liger_fused_linear_cross_entropy: true
|
||||
seed: 42
|
||||
chat_template: gemma3
|
||||
datasets:
|
||||
- path: AI-MO/NuminaMath-CoT
|
||||
type: chat_template
|
||||
|
||||
output_dir: ./outputs/qat_out_math_gemma27/
|
||||
|
||||
sequence_len: 4096
|
||||
sample_packing: true
|
||||
flash_attention: true
|
||||
|
||||
qat:
|
||||
activation_dtype: nvfp4
|
||||
weight_dtype: nvfp4
|
||||
group_size: 16 # only group_size of 16 is supported with nvfp4
|
||||
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
gradient_accumulation_steps: 1
|
||||
micro_batch_size: 16
|
||||
|
||||
num_epochs: 1
|
||||
optimizer: adamw_torch_fused
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 5e-6
|
||||
eta_min: 7e-7
|
||||
|
||||
bf16: true
|
||||
tf32: true
|
||||
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
|
||||
# evals_per_epoch: 1
|
||||
saves_per_epoch: 1
|
||||
|
||||
warmup_ratio: 0.1
|
||||
weight_decay: 0.0
|
||||
fsdp_version: 2
|
||||
|
||||
fsdp_config:
|
||||
offload_params: false
|
||||
cpu_ram_efficient_loading: true
|
||||
auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||
transformer_layer_cls_to_wrap: Gemma3DecoderLayer
|
||||
state_dict_type: FULL_STATE_DICT
|
||||
sharding_strategy: FULL_SHARD
|
||||
reshard_after_forward: true
|
||||
activation_checkpointing: true
|
||||
|
||||
special_tokens:
|
||||
|
||||
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||
67
examples/qat_nvfp4/Math-Qwen2.5-72B_baseline.yml
Normal file
67
examples/qat_nvfp4/Math-Qwen2.5-72B_baseline.yml
Normal file
@@ -0,0 +1,67 @@
|
||||
base_model: Qwen/Qwen2.5-72B
|
||||
# Math finetuning configuration for Qwen2.5-72B (non-instruct)
|
||||
# hub_model_id: username/custom_model_name
|
||||
|
||||
load_in_8bit: false
|
||||
load_in_4bit: false
|
||||
strict: false
|
||||
|
||||
plugins:
|
||||
- axolotl.integrations.liger.LigerPlugin
|
||||
|
||||
liger_rope: true
|
||||
liger_rms_norm: true
|
||||
liger_glu_activation: true
|
||||
liger_layer_norm: true
|
||||
liger_fused_linear_cross_entropy: true
|
||||
seed: 42
|
||||
chat_template: qwen_25
|
||||
datasets:
|
||||
- path: AI-MO/NuminaMath-CoT
|
||||
type: chat_template
|
||||
|
||||
output_dir: ./outputs/out_math_72b/
|
||||
|
||||
sequence_len: 4096
|
||||
sample_packing: true
|
||||
flash_attention: true
|
||||
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
gradient_accumulation_steps: 1
|
||||
micro_batch_size: 8
|
||||
num_epochs: 1
|
||||
optimizer: adamw_torch_fused
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 5e-6
|
||||
eta_min: 7e-7
|
||||
|
||||
bf16: true
|
||||
tf32: true
|
||||
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
|
||||
# evals_per_epoch: 1
|
||||
saves_per_epoch: 1
|
||||
|
||||
warmup_ratio: 0.1
|
||||
weight_decay: 0.0
|
||||
fsdp_version: 2
|
||||
|
||||
fsdp_config:
|
||||
offload_params: false
|
||||
cpu_ram_efficient_loading: true
|
||||
auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||
transformer_layer_cls_to_wrap: Qwen2DecoderLayer
|
||||
state_dict_type: FULL_STATE_DICT
|
||||
sharding_strategy: FULL_SHARD
|
||||
reshard_after_forward: true
|
||||
activation_checkpointing: true
|
||||
|
||||
special_tokens:
|
||||
|
||||
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||
72
examples/qat_nvfp4/Math-Qwen2.5-72B_qat.yml
Normal file
72
examples/qat_nvfp4/Math-Qwen2.5-72B_qat.yml
Normal file
@@ -0,0 +1,72 @@
|
||||
base_model: Qwen/Qwen2.5-72B
|
||||
# Math finetuning configuration for Qwen2.5-72B (non-instruct)
|
||||
# hub_model_id: username/custom_model_name
|
||||
|
||||
load_in_8bit: false
|
||||
load_in_4bit: false
|
||||
strict: false
|
||||
|
||||
plugins:
|
||||
- axolotl.integrations.liger.LigerPlugin
|
||||
|
||||
liger_rope: true
|
||||
liger_rms_norm: true
|
||||
liger_glu_activation: true
|
||||
liger_layer_norm: true
|
||||
liger_fused_linear_cross_entropy: true
|
||||
seed: 42
|
||||
chat_template: qwen_25
|
||||
datasets:
|
||||
- path: AI-MO/NuminaMath-CoT
|
||||
type: chat_template
|
||||
|
||||
output_dir: ./outputs/qat_out_math_72b/
|
||||
|
||||
sequence_len: 4096
|
||||
sample_packing: true
|
||||
flash_attention: true
|
||||
|
||||
qat:
|
||||
activation_dtype: nvfp4
|
||||
weight_dtype: nvfp4
|
||||
group_size: 16 # only group_size of 16 is supported with nvfp4
|
||||
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
gradient_accumulation_steps: 1
|
||||
micro_batch_size: 8
|
||||
num_epochs: 1
|
||||
optimizer: adamw_torch_fused
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 5e-6
|
||||
eta_min: 7e-7
|
||||
|
||||
bf16: true
|
||||
tf32: true
|
||||
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
|
||||
# evals_per_epoch: 1
|
||||
saves_per_epoch: 1
|
||||
|
||||
warmup_ratio: 0.1
|
||||
weight_decay: 0.0
|
||||
fsdp_version: 2
|
||||
|
||||
fsdp_config:
|
||||
offload_params: false
|
||||
cpu_ram_efficient_loading: true
|
||||
auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||
transformer_layer_cls_to_wrap: Qwen2DecoderLayer
|
||||
state_dict_type: FULL_STATE_DICT
|
||||
sharding_strategy: FULL_SHARD
|
||||
reshard_after_forward: true
|
||||
activation_checkpointing: true
|
||||
|
||||
special_tokens:
|
||||
|
||||
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||
67
examples/qat_nvfp4/Qwen2.5-72B_baseline.yml
Normal file
67
examples/qat_nvfp4/Qwen2.5-72B_baseline.yml
Normal file
@@ -0,0 +1,67 @@
|
||||
base_model: Qwen/Qwen2.5-72B
|
||||
# Alpaca finetuning configuration for Qwen2.5-72B
|
||||
# hub_model_id: username/custom_model_name
|
||||
|
||||
load_in_8bit: false
|
||||
load_in_4bit: false
|
||||
strict: false
|
||||
|
||||
plugins:
|
||||
- axolotl.integrations.liger.LigerPlugin
|
||||
|
||||
liger_rope: true
|
||||
liger_rms_norm: true
|
||||
liger_glu_activation: true
|
||||
liger_layer_norm: true
|
||||
liger_fused_linear_cross_entropy: true
|
||||
seed: 42
|
||||
chat_template: qwen_25
|
||||
datasets:
|
||||
- path: tatsu-lab/alpaca
|
||||
type: alpaca
|
||||
|
||||
output_dir: ./outputs/out_qwen72b/
|
||||
|
||||
sequence_len: 8096
|
||||
sample_packing: true
|
||||
flash_attention: true
|
||||
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
gradient_accumulation_steps: 1
|
||||
micro_batch_size: 16
|
||||
|
||||
num_epochs: 1
|
||||
optimizer: adamw_torch_fused
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 2e-5
|
||||
|
||||
bf16: true
|
||||
tf32: true
|
||||
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
|
||||
# evals_per_epoch: 1
|
||||
saves_per_epoch: 1
|
||||
|
||||
warmup_ratio: 0.1
|
||||
weight_decay: 0.0
|
||||
fsdp_version: 2
|
||||
|
||||
fsdp_config:
|
||||
offload_params: false
|
||||
cpu_ram_efficient_loading: true
|
||||
auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||
transformer_layer_cls_to_wrap: Qwen2DecoderLayer
|
||||
state_dict_type: FULL_STATE_DICT
|
||||
sharding_strategy: FULL_SHARD
|
||||
reshard_after_forward: true
|
||||
activation_checkpointing: true
|
||||
|
||||
special_tokens:
|
||||
|
||||
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||
72
examples/qat_nvfp4/Qwen2.5-72B_qat.yml
Normal file
72
examples/qat_nvfp4/Qwen2.5-72B_qat.yml
Normal file
@@ -0,0 +1,72 @@
|
||||
base_model: Qwen/Qwen2.5-72B
|
||||
# Alpaca finetuning configuration for Qwen2.5-72B
|
||||
# hub_model_id: username/custom_model_name
|
||||
|
||||
load_in_8bit: false
|
||||
load_in_4bit: false
|
||||
strict: false
|
||||
|
||||
plugins:
|
||||
- axolotl.integrations.liger.LigerPlugin
|
||||
|
||||
liger_rope: true
|
||||
liger_rms_norm: true
|
||||
liger_glu_activation: true
|
||||
liger_layer_norm: true
|
||||
liger_fused_linear_cross_entropy: true
|
||||
seed: 42
|
||||
chat_template: qwen_25
|
||||
datasets:
|
||||
- path: tatsu-lab/alpaca
|
||||
type: alpaca
|
||||
|
||||
output_dir: ./outputs/qat_out_qwen72b/
|
||||
|
||||
sequence_len: 8096
|
||||
sample_packing: true
|
||||
flash_attention: true
|
||||
|
||||
qat:
|
||||
activation_dtype: nvfp4
|
||||
weight_dtype: nvfp4
|
||||
group_size: 16 # only group_size of 16 is supported with nvfp4
|
||||
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
gradient_accumulation_steps: 1
|
||||
micro_batch_size: 16
|
||||
|
||||
num_epochs: 1
|
||||
optimizer: adamw_torch_fused
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 2e-5
|
||||
|
||||
bf16: true
|
||||
tf32: true
|
||||
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
|
||||
# evals_per_epoch: 1
|
||||
saves_per_epoch: 1
|
||||
|
||||
warmup_ratio: 0.1
|
||||
weight_decay: 0.0
|
||||
fsdp_version: 2
|
||||
|
||||
fsdp_config:
|
||||
offload_params: false
|
||||
cpu_ram_efficient_loading: true
|
||||
auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||
transformer_layer_cls_to_wrap: Qwen2DecoderLayer
|
||||
state_dict_type: FULL_STATE_DICT
|
||||
sharding_strategy: FULL_SHARD
|
||||
reshard_after_forward: true
|
||||
activation_checkpointing: true
|
||||
|
||||
special_tokens:
|
||||
|
||||
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||
70
examples/qwen2/adamw-pretrain-fsdp2.yaml
Normal file
70
examples/qwen2/adamw-pretrain-fsdp2.yaml
Normal file
@@ -0,0 +1,70 @@
|
||||
base_model: Qwen/Qwen2.5-0.5B
|
||||
model_type: AutoModelForCausalLM
|
||||
tokenizer_type: AutoTokenizer
|
||||
|
||||
# Use random initialization for fair comparison
|
||||
reinit_weights: true
|
||||
|
||||
load_in_8bit: false
|
||||
load_in_4bit: false
|
||||
strict: false
|
||||
|
||||
# Pretraining dataset
|
||||
pretraining_dataset:
|
||||
- path: allenai/c4
|
||||
name: en
|
||||
type: pretrain
|
||||
split: train
|
||||
|
||||
dataset_prepared_path:
|
||||
val_set_size: 0.0
|
||||
output_dir: ./outputs/compare-adamw-pretrain
|
||||
|
||||
sequence_len: 2048
|
||||
sample_packing: true
|
||||
pad_to_sequence_len: true
|
||||
|
||||
wandb_project: dist_muon
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
wandb_name: adamw
|
||||
wandb_log_model:
|
||||
|
||||
gradient_accumulation_steps: 1
|
||||
micro_batch_size: 4
|
||||
num_epochs: 1
|
||||
max_steps: 305
|
||||
|
||||
# AdamW optimizer settings (standard LR for AdamW)
|
||||
optimizer: adamw_torch_fused
|
||||
learning_rate: 0.0002
|
||||
weight_decay: 0.01
|
||||
lr_scheduler: cosine
|
||||
|
||||
train_on_inputs: true
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16: false
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: false
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 10
|
||||
evals_per_epoch: 0
|
||||
saves_per_epoch: 1
|
||||
|
||||
# Reproducibility
|
||||
seed: 42
|
||||
|
||||
fsdp_config:
|
||||
fsdp_version: 2
|
||||
fsdp_offload_params: false
|
||||
fsdp_state_dict_type: FULL_STATE_DICT
|
||||
fsdp_transformer_layer_cls_to_wrap: Qwen2DecoderLayer
|
||||
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||
fsdp_cpu_ram_efficient_loading: false
|
||||
fsdp_reshard_after_forward: true
|
||||
|
||||
special_tokens:
|
||||
70
examples/qwen2/muon-pretrain-fsdp2.yaml
Normal file
70
examples/qwen2/muon-pretrain-fsdp2.yaml
Normal file
@@ -0,0 +1,70 @@
|
||||
base_model: Qwen/Qwen2.5-0.5B
|
||||
model_type: AutoModelForCausalLM
|
||||
tokenizer_type: AutoTokenizer
|
||||
|
||||
# Use random initialization for fair comparison
|
||||
reinit_weights: true
|
||||
|
||||
load_in_8bit: false
|
||||
load_in_4bit: false
|
||||
strict: false
|
||||
|
||||
# Pretraining dataset
|
||||
pretraining_dataset:
|
||||
- path: allenai/c4
|
||||
name: en
|
||||
type: pretrain
|
||||
split: train
|
||||
|
||||
dataset_prepared_path:
|
||||
val_set_size: 0.0
|
||||
output_dir: ./outputs/compare-muon-pretrain
|
||||
|
||||
sequence_len: 2048
|
||||
sample_packing: true
|
||||
pad_to_sequence_len: true
|
||||
|
||||
wandb_project: dist_muon
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
wandb_name: muon
|
||||
wandb_log_model:
|
||||
|
||||
gradient_accumulation_steps: 1
|
||||
micro_batch_size: 4
|
||||
num_epochs: 1
|
||||
max_steps: 305
|
||||
|
||||
# Muon optimizer settings
|
||||
optimizer: muon
|
||||
learning_rate: 0.02
|
||||
weight_decay: 0.01
|
||||
lr_scheduler: cosine
|
||||
|
||||
train_on_inputs: true
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16: false
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: false
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 10
|
||||
evals_per_epoch: 0
|
||||
saves_per_epoch: 1
|
||||
|
||||
# Reproducibility
|
||||
seed: 42
|
||||
|
||||
fsdp_config:
|
||||
fsdp_version: 2
|
||||
fsdp_offload_params: false
|
||||
fsdp_state_dict_type: FULL_STATE_DICT
|
||||
fsdp_transformer_layer_cls_to_wrap: Qwen2DecoderLayer
|
||||
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||
fsdp_cpu_ram_efficient_loading: false
|
||||
fsdp_reshard_after_forward: true
|
||||
|
||||
special_tokens:
|
||||
46
examples/qwen3/README.md
Normal file
46
examples/qwen3/README.md
Normal file
@@ -0,0 +1,46 @@
|
||||
# Finetune Qwen3 with Axolotl
|
||||
|
||||
[Qwen3](https://huggingface.co/collections/Qwen/qwen3) is a family of open source models trained by Alibaba.
|
||||
|
||||
This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking.
|
||||
|
||||
## Getting started
|
||||
|
||||
1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).
|
||||
|
||||
2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage.
|
||||
|
||||
3. Run the finetuning example:
|
||||
|
||||
```bash
|
||||
axolotl train examples/qwen3/32b-qlora.yaml
|
||||
```
|
||||
|
||||
Let us know how it goes. Happy finetuning! 🚀
|
||||
|
||||
### Chat template masking a few tokens off
|
||||
|
||||
If you notice that the `chat_template` masking for assistant prompts is off by a few tokens, please ensure that you add the following to your YAML config.
|
||||
|
||||
```yaml
|
||||
chat_template: qwen3
|
||||
```
|
||||
|
||||
### TIPS
|
||||
|
||||
- For inference, please check the official model card as it depends on your reasoning mode.
|
||||
- You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
|
||||
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
|
||||
- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
|
||||
|
||||
## Optimization Guides
|
||||
|
||||
Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html).
|
||||
|
||||
## Related Resources
|
||||
|
||||
- [Qwen3 Blog](https://qwenlm.github.io/blog/qwen3/)
|
||||
- [Axolotl Docs](https://docs.axolotl.ai)
|
||||
- [Axolotl Website](https://axolotl.ai)
|
||||
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
|
||||
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
|
||||
@@ -6,21 +6,17 @@ This guide shows how to fine-tune it with Axolotl with multi-turn conversations
|
||||
|
||||
## Getting started
|
||||
|
||||
1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). You need to install from main as Seed-OSS is only on nightly or use our latest [Docker images](https://docs.axolotl.ai/docs/docker.html).
|
||||
1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).
|
||||
|
||||
Here is an example of how to install from main for pip:
|
||||
Here is an example of how to install from pip:
|
||||
```bash
|
||||
# Ensure you have a compatible version of Pytorch installed
|
||||
pip3 install packaging setuptools wheel ninja
|
||||
pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
|
||||
|
||||
```bash
|
||||
# Ensure you have Pytorch installed (Pytorch 2.6.0 min)
|
||||
git clone https://github.com/axolotl-ai-cloud/axolotl.git
|
||||
cd axolotl
|
||||
|
||||
pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
|
||||
pip3 install --no-build-isolation -e '.[flash-attn]'
|
||||
|
||||
# Install Cut Cross Entropy
|
||||
python scripts/cutcrossentropy_install.py | sh
|
||||
```
|
||||
# Install Cut Cross Entropy
|
||||
python scripts/cutcrossentropy_install.py | sh
|
||||
```
|
||||
|
||||
2. Run the finetuning example:
|
||||
|
||||
@@ -41,9 +37,7 @@ Let us know how it goes. Happy finetuning! 🚀
|
||||
|
||||
## Optimization Guides
|
||||
|
||||
- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
|
||||
- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
|
||||
- [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html)
|
||||
Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html).
|
||||
|
||||
## Related Resources
|
||||
|
||||
|
||||
@@ -37,9 +37,7 @@ This guide shows how to fine-tune SmolVLM2 models with Axolotl.
|
||||
|
||||
## Optimization Guides
|
||||
|
||||
- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
|
||||
- [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html)
|
||||
- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
|
||||
Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html).
|
||||
|
||||
## Related Resources
|
||||
|
||||
|
||||
42
examples/trinity/README.md
Normal file
42
examples/trinity/README.md
Normal file
@@ -0,0 +1,42 @@
|
||||
# Finetune ArceeAI's Trinity with Axolotl
|
||||
|
||||
[Trinity](https://huggingface.co/collections/arcee-ai/trinity) is a family of open weight MoE models trained by Arcee.ai.
|
||||
|
||||
This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking.
|
||||
|
||||
## Getting started
|
||||
|
||||
1. Install Axolotl from main by following the [installation guide](https://docs.axolotl.ai/docs/installation.html#sec-edge-build).
|
||||
|
||||
2. Run the finetuning example:
|
||||
|
||||
```bash
|
||||
axolotl train examples/trinity/trinity-nano-preview-qlora.yaml
|
||||
```
|
||||
|
||||
This config uses about 24.9 GiB VRAM.
|
||||
|
||||
Let us know how it goes. Happy finetuning! 🚀
|
||||
|
||||
### TIPS
|
||||
|
||||
- For inference, the official Arcee.ai team recommends `top_p: 0.75`, `temperature: 0.15`, `top_k: 50`, and `min_p: 0.06`.
|
||||
- You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
|
||||
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
|
||||
- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
|
||||
|
||||
## Optimization Guides
|
||||
|
||||
Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html).
|
||||
|
||||
## Limitations
|
||||
|
||||
**Cut Cross Entropy (CCE)**: Currently not supported. We plan to include CCE support for Trinity in the near future.
|
||||
|
||||
## Related Resources
|
||||
|
||||
- [Trinity Blog](https://www.arcee.ai/blog/the-trinity-manifesto)
|
||||
- [Axolotl Docs](https://docs.axolotl.ai)
|
||||
- [Axolotl Website](https://axolotl.ai)
|
||||
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
|
||||
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
|
||||
68
examples/trinity/trinity-nano-preview-qlora.yaml
Normal file
68
examples/trinity/trinity-nano-preview-qlora.yaml
Normal file
@@ -0,0 +1,68 @@
|
||||
base_model: arcee-ai/Trinity-Nano-Preview
|
||||
trust_remote_code: true
|
||||
revision_of_model: 2ee94b0
|
||||
|
||||
# Automatically upload checkpoint and final model to HF
|
||||
# hub_model_id: username/custom_model_name
|
||||
|
||||
# CCE - N/A as of now
|
||||
# plugins:
|
||||
# - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
|
||||
|
||||
load_in_8bit: false
|
||||
load_in_4bit: true
|
||||
|
||||
datasets:
|
||||
- path: fozziethebeat/alpaca_messages_2k_test
|
||||
type: chat_template
|
||||
|
||||
dataset_prepared_path: last_run_prepared
|
||||
val_set_size: 0.1
|
||||
output_dir: ./outputs/lora-out
|
||||
|
||||
adapter: qlora
|
||||
lora_model_dir:
|
||||
|
||||
sequence_len: 2048
|
||||
sample_packing: true
|
||||
|
||||
lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_linear: true
|
||||
lora_target_modules:
|
||||
- gate_proj
|
||||
- down_proj
|
||||
- up_proj
|
||||
- q_proj
|
||||
- v_proj
|
||||
- k_proj
|
||||
- o_proj
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
gradient_accumulation_steps: 4
|
||||
micro_batch_size: 2
|
||||
num_epochs: 1
|
||||
optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
bf16: auto
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
# flash_attention: true # Not supported
|
||||
sdp_attention: true
|
||||
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 1
|
||||
saves_per_epoch: 1
|
||||
|
||||
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||
@@ -1,5 +1,5 @@
|
||||
base_model: mistralai/Voxtral-Mini-3B-2507
|
||||
processor_type: AutoProcessor
|
||||
processor_type: VoxtralProcessor
|
||||
|
||||
# Automatically upload checkpoint and final model to HF
|
||||
# hub_model_id: username/custom_model_name
|
||||
|
||||
@@ -1,33 +1,34 @@
|
||||
--extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
|
||||
|
||||
# START section of dependencies that don't install on Darwin/MacOS
|
||||
bitsandbytes==0.47.0
|
||||
bitsandbytes==0.48.2
|
||||
triton>=3.0.0
|
||||
mamba-ssm==1.2.0.post1
|
||||
xformers>=0.0.23.post1
|
||||
liger-kernel==0.6.3
|
||||
liger-kernel==0.6.4
|
||||
# END section
|
||||
|
||||
packaging==23.2
|
||||
|
||||
huggingface_hub>=0.36.0
|
||||
peft>=0.17.1
|
||||
tokenizers>=0.21.1
|
||||
peft>=0.18.0
|
||||
tokenizers>=0.22.1
|
||||
transformers==4.57.1
|
||||
accelerate==1.10.1
|
||||
datasets==4.3.0
|
||||
deepspeed>=0.17.0
|
||||
trl==0.24.0
|
||||
accelerate==1.12.0
|
||||
datasets==4.4.2
|
||||
deepspeed>=0.18.3
|
||||
trl==0.25.1
|
||||
hf_xet==1.2.0
|
||||
kernels>=0.9.0
|
||||
trackio
|
||||
kernels==0.11.5
|
||||
trackio>=0.13.0
|
||||
typing-extensions>=4.15.0
|
||||
|
||||
optimum==1.16.2
|
||||
hf_transfer
|
||||
sentencepiece
|
||||
gradio==5.49.1
|
||||
gradio>=6.2.0,<7.0
|
||||
|
||||
modal==1.0.2
|
||||
modal==1.3.0.post1
|
||||
pydantic>=2.10.6
|
||||
addict
|
||||
fire
|
||||
@@ -42,7 +43,6 @@ numpy>=2.2.6
|
||||
# qlora things
|
||||
evaluate==0.4.1
|
||||
scipy
|
||||
scikit-learn==1.4.2
|
||||
nvidia-ml-py==12.560.30
|
||||
art
|
||||
tensorboard
|
||||
@@ -63,10 +63,13 @@ langdetect==1.0.9
|
||||
immutabledict==4.2.0
|
||||
antlr4-python3-runtime==4.13.2
|
||||
|
||||
torchao==0.13.0
|
||||
torchao==0.15.0
|
||||
openenv-core==0.1.0
|
||||
schedulefree==1.4.1
|
||||
|
||||
axolotl-contribs-lgpl==0.0.7
|
||||
axolotl-contribs-mit==0.0.5
|
||||
axolotl-contribs-mit==0.0.6
|
||||
# telemetry
|
||||
posthog==6.7.11
|
||||
|
||||
mistral-common==1.8.5
|
||||
mistral-common==1.8.6
|
||||
|
||||
@@ -29,5 +29,5 @@ UV_PREFIX = "uv " if USE_UV else ""
|
||||
|
||||
print(
|
||||
UNINSTALL_PREFIX
|
||||
+ f'{UV_PREFIX}pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@8a1a0ec"'
|
||||
+ f'{UV_PREFIX}pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@318b7e2"'
|
||||
)
|
||||
|
||||
5
setup.py
5
setup.py
@@ -66,7 +66,6 @@ def parse_requirements(extras_require_map):
|
||||
extras_require_map.pop("fbgemm-gpu")
|
||||
extras_require_map["fbgemm-gpu"] = ["fbgemm-gpu-genai==1.4.1"]
|
||||
extras_require_map["vllm"] = ["vllm==0.11.1"]
|
||||
_install_requires.pop(_install_requires.index(xformers_version))
|
||||
elif (major, minor) >= (2, 8):
|
||||
extras_require_map.pop("fbgemm-gpu")
|
||||
extras_require_map["fbgemm-gpu"] = ["fbgemm-gpu-genai==1.3.0"]
|
||||
@@ -130,7 +129,7 @@ extras_require = {
|
||||
"ring-flash-attn>=0.1.7",
|
||||
],
|
||||
"deepspeed": [
|
||||
"deepspeed==0.17.5",
|
||||
"deepspeed==0.18.2",
|
||||
"deepspeed-kernels",
|
||||
],
|
||||
"mamba-ssm": [
|
||||
@@ -157,7 +156,7 @@ extras_require = {
|
||||
"came_pytorch==0.1.3",
|
||||
],
|
||||
"ray": [
|
||||
"ray[train]",
|
||||
"ray[train]>=2.52.1",
|
||||
],
|
||||
"vllm": [
|
||||
"vllm==0.10.0",
|
||||
|
||||
@@ -24,8 +24,7 @@ if launcher_args:
|
||||
launcher_args_str = "-- " + " ".join(launcher_args)
|
||||
|
||||
# 1. Define a base image for your training job
|
||||
# must use torch 2.7.0 for vllm
|
||||
BASE_IMAGE = "axolotlai/axolotl:main-py3.11-cu126-2.7.1"
|
||||
BASE_IMAGE = "axolotlai/axolotl:main-py3.11-cu128-2.9.1"
|
||||
|
||||
# 2. Define the Runtime Environment for the Training Job
|
||||
# This includes start commands and environment variables.
|
||||
|
||||
@@ -82,7 +82,7 @@ class ModalCloud(Cloud):
|
||||
return res
|
||||
|
||||
def get_image(self):
|
||||
docker_tag = "main-py3.11-cu126-2.7.1"
|
||||
docker_tag = "main-py3.11-cu128-2.9.1"
|
||||
if self.config.docker_tag:
|
||||
docker_tag = self.config.docker_tag
|
||||
docker_image = f"axolotlai/axolotl:{docker_tag}"
|
||||
|
||||
@@ -14,6 +14,8 @@ import yaml
|
||||
from transformers.utils import is_torch_bf16_gpu_available
|
||||
|
||||
from axolotl.integrations.base import PluginManager
|
||||
from axolotl.telemetry.errors import send_errors
|
||||
from axolotl.telemetry.manager import TelemetryManager
|
||||
from axolotl.utils.comet_ import setup_comet_env_vars
|
||||
from axolotl.utils.config import (
|
||||
normalize_cfg_datasets,
|
||||
@@ -24,6 +26,7 @@ from axolotl.utils.dict import DictDefault
|
||||
from axolotl.utils.logging import get_logger
|
||||
from axolotl.utils.mlflow_ import setup_mlflow_env_vars
|
||||
from axolotl.utils.tee import prepare_debug_log
|
||||
from axolotl.utils.trackio_ import setup_trackio_env_vars
|
||||
from axolotl.utils.trainer import prepare_optim_env
|
||||
from axolotl.utils.wandb_ import setup_wandb_env_vars
|
||||
|
||||
@@ -31,6 +34,8 @@ LOG = get_logger(__name__)
|
||||
|
||||
API_KEY_FIELDS = {"comet_api_key"}
|
||||
|
||||
TELEMETRY_MANAGER = TelemetryManager.get_instance()
|
||||
|
||||
|
||||
def check_remote_config(config: Union[str, Path]) -> Union[str, Path]:
|
||||
"""
|
||||
@@ -164,6 +169,7 @@ def plugin_set_cfg(cfg: DictDefault):
|
||||
plugin_manager.cfg = cfg
|
||||
|
||||
|
||||
@send_errors
|
||||
def load_cfg(
|
||||
config: str | Path | DictDefault = Path("examples/"), **kwargs
|
||||
) -> DictDefault:
|
||||
@@ -197,6 +203,8 @@ def load_cfg(
|
||||
temp_file.close()
|
||||
cfg.axolotl_config_path = temp_file.name
|
||||
|
||||
TELEMETRY_MANAGER.send_event(event_type="config-loaded", properties=cfg)
|
||||
|
||||
# If there are any options passed in the cli, if it is something that seems valid
|
||||
# from the yaml, then overwrite the value
|
||||
cfg_keys = cfg.keys()
|
||||
@@ -220,6 +228,7 @@ def load_cfg(
|
||||
cfg,
|
||||
capabilities={
|
||||
"bf16": is_torch_bf16_gpu_available(),
|
||||
"fp8": compute_supports_fp8(),
|
||||
"n_gpu": int(os.environ.get("WORLD_SIZE", 1)),
|
||||
"compute_capability": gpu_version,
|
||||
},
|
||||
@@ -238,8 +247,10 @@ def load_cfg(
|
||||
setup_wandb_env_vars(cfg)
|
||||
setup_mlflow_env_vars(cfg)
|
||||
setup_comet_env_vars(cfg)
|
||||
setup_trackio_env_vars(cfg)
|
||||
plugin_set_cfg(cfg)
|
||||
|
||||
TELEMETRY_MANAGER.send_event(event_type="config-processed", properties=cfg)
|
||||
cfg_to_log = {
|
||||
k: "[REDACTED]" if k in API_KEY_FIELDS else v
|
||||
for k, v in cfg.items()
|
||||
@@ -251,3 +262,11 @@ def load_cfg(
|
||||
)
|
||||
|
||||
return cfg
|
||||
|
||||
|
||||
def compute_supports_fp8() -> bool:
|
||||
try:
|
||||
compute_capability = torch.cuda.get_device_capability()
|
||||
return compute_capability >= (9, 0)
|
||||
except RuntimeError:
|
||||
return False
|
||||
|
||||
@@ -19,7 +19,10 @@ from axolotl.cli.utils.diffusion import (
|
||||
launch_diffusion_gradio_ui,
|
||||
)
|
||||
from axolotl.integrations.base import PluginManager
|
||||
from axolotl.utils.chat_templates import get_chat_template_from_config
|
||||
from axolotl.telemetry.errors import send_errors
|
||||
from axolotl.utils.chat_templates import (
|
||||
get_chat_template_from_config,
|
||||
)
|
||||
from axolotl.utils.dict import DictDefault
|
||||
from axolotl.utils.logging import get_logger
|
||||
|
||||
@@ -43,6 +46,7 @@ def get_multi_line_input() -> str:
|
||||
return instruction
|
||||
|
||||
|
||||
@send_errors
|
||||
def do_inference(
|
||||
*,
|
||||
cfg: DictDefault,
|
||||
@@ -160,6 +164,7 @@ def do_inference(
|
||||
print(tokenizer.decode(generated["sequences"].cpu().tolist()[0]))
|
||||
|
||||
|
||||
@send_errors
|
||||
def do_inference_gradio(
|
||||
*,
|
||||
cfg: DictDefault,
|
||||
@@ -283,8 +288,8 @@ def do_inference_gradio(
|
||||
title=cfg.get("gradio_title", "Axolotl Gradio Interface"),
|
||||
)
|
||||
|
||||
demo.queue().launch(
|
||||
show_api=False,
|
||||
demo.launch(
|
||||
footer_links=["gradio", "settings"],
|
||||
share=cfg.get("gradio_share", True),
|
||||
server_name=cfg.get("gradio_server_name", "127.0.0.1"),
|
||||
server_port=cfg.get("gradio_server_port", None),
|
||||
|
||||
@@ -26,7 +26,7 @@ from axolotl.cli.utils import (
|
||||
launch_training,
|
||||
)
|
||||
from axolotl.integrations.lm_eval.cli import lm_eval
|
||||
from axolotl.utils import set_pytorch_cuda_alloc_conf
|
||||
from axolotl.utils import set_misc_env, set_pytorch_cuda_alloc_conf
|
||||
from axolotl.utils.logging import get_logger
|
||||
from axolotl.utils.schemas.config import AxolotlInputConfig
|
||||
|
||||
@@ -45,6 +45,7 @@ def cli():
|
||||
print_axolotl_text_art()
|
||||
load_dotenv()
|
||||
set_pytorch_cuda_alloc_conf()
|
||||
set_misc_env()
|
||||
|
||||
|
||||
@cli.command()
|
||||
|
||||
@@ -7,12 +7,14 @@ import fire
|
||||
|
||||
from axolotl.cli.config import load_cfg
|
||||
from axolotl.cli.utils import load_model_and_tokenizer
|
||||
from axolotl.telemetry.errors import send_errors
|
||||
from axolotl.utils.dict import DictDefault
|
||||
from axolotl.utils.logging import get_logger
|
||||
|
||||
LOG = get_logger(__name__)
|
||||
|
||||
|
||||
@send_errors
|
||||
def do_merge_lora(*, cfg: DictDefault) -> None:
|
||||
"""
|
||||
Calls `peft`'s `merge_and_unload` on the model given in the `axolotl` config
|
||||
|
||||
@@ -23,6 +23,7 @@ from safetensors.torch import save_file as safe_save_file
|
||||
from torch.distributed.checkpoint.format_utils import _EmptyStateDictLoadPlanner
|
||||
|
||||
from axolotl.cli.config import load_cfg
|
||||
from axolotl.telemetry.errors import send_errors
|
||||
from axolotl.utils.logging import get_logger
|
||||
from axolotl.utils.train import determine_last_checkpoint
|
||||
|
||||
@@ -118,6 +119,7 @@ def _distributed_checkpoint_to_merged_weights(
|
||||
return save_path_
|
||||
|
||||
|
||||
@send_errors
|
||||
def merge_fsdp_weights(
|
||||
checkpoint_dir: str,
|
||||
output_path: str,
|
||||
|
||||
@@ -17,6 +17,7 @@ from axolotl.cli.config import load_cfg
|
||||
from axolotl.common.const import DEFAULT_DATASET_PREPARED_PATH
|
||||
from axolotl.common.datasets import load_datasets, load_preference_datasets
|
||||
from axolotl.integrations.base import PluginManager
|
||||
from axolotl.telemetry.errors import send_errors
|
||||
from axolotl.utils.dict import DictDefault
|
||||
from axolotl.utils.logging import get_logger
|
||||
from axolotl.utils.trainer import disable_datasets_caching
|
||||
@@ -24,6 +25,7 @@ from axolotl.utils.trainer import disable_datasets_caching
|
||||
LOG = get_logger(__name__)
|
||||
|
||||
|
||||
@send_errors
|
||||
def do_preprocess(cfg: DictDefault, cli_args: PreprocessCliArgs) -> None:
|
||||
"""
|
||||
Preprocesses dataset specified in axolotl config.
|
||||
|
||||
@@ -8,7 +8,7 @@ from typing import Union
|
||||
from transformers import AutoConfig, AutoModelForCausalLM, TorchAoConfig
|
||||
|
||||
from axolotl.cli.config import load_cfg
|
||||
from axolotl.loaders import load_tokenizer
|
||||
from axolotl.loaders import load_processor, load_tokenizer
|
||||
from axolotl.utils.logging import get_logger
|
||||
from axolotl.utils.quantization import (
|
||||
TorchAOQuantDType,
|
||||
@@ -66,6 +66,11 @@ def do_quantize(
|
||||
|
||||
LOG.info(f"Loading model from {model_path}.")
|
||||
tokenizer = load_tokenizer(cfg)
|
||||
|
||||
processor = None
|
||||
if cfg.is_multimodal:
|
||||
processor = load_processor(cfg, tokenizer)
|
||||
|
||||
config = AutoConfig.from_pretrained(model_path)
|
||||
torch_dtype = config.torch_dtype if hasattr(config, "torch_dtype") else None
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
@@ -107,6 +112,10 @@ def do_quantize(
|
||||
save_jinja_files=cfg.tokenizer_save_jinja_files,
|
||||
)
|
||||
|
||||
if processor:
|
||||
LOG.info(f"Saving processor to: {str(Path(output_dir) / 'quantized')}.")
|
||||
processor.save_pretrained(str(Path(output_dir) / "quantized"))
|
||||
|
||||
if hub_model_id:
|
||||
hub_model_id = (
|
||||
hub_model_id.rstrip("-")
|
||||
@@ -114,6 +123,8 @@ def do_quantize(
|
||||
)
|
||||
model.push_to_hub(hub_model_id, safe_serialization=False)
|
||||
tokenizer.push_to_hub(hub_model_id)
|
||||
if processor:
|
||||
processor.push_to_hub(hub_model_id)
|
||||
LOG.info(f"Quantized model pushed to: {hub_model_id}.")
|
||||
|
||||
LOG.info(f"Quantized model saved to: {str(Path(output_dir) / 'quantized')}.")
|
||||
|
||||
@@ -366,8 +366,8 @@ def launch_diffusion_gradio_ui(
|
||||
outputs=[masked_preview, html_out],
|
||||
)
|
||||
|
||||
demo.queue().launch(
|
||||
show_api=False,
|
||||
demo.launch(
|
||||
footer_links=["gradio", "settings"],
|
||||
share=cfg.get("gradio_share", True),
|
||||
server_name=cfg.get("gradio_server_name", "127.0.0.1"),
|
||||
server_port=cfg.get("gradio_server_port", None),
|
||||
|
||||
@@ -17,4 +17,5 @@ MOE_ARCH_BLOCK = {
|
||||
"deepseek_v3": "DeepseekV3MoE",
|
||||
"gpt_oss": "GptOssDecoderLayer",
|
||||
"lfm2_moe": "Lfm2MoeSparseMoeBlock",
|
||||
"afmoe": "AfmoeMoE",
|
||||
}
|
||||
|
||||
@@ -9,6 +9,7 @@ from datasets import Dataset
|
||||
import axolotl.monkeypatch.data.batch_dataset_fetcher # noqa: F401
|
||||
from axolotl.cli.args import PreprocessCliArgs, TrainerCliArgs
|
||||
from axolotl.loaders import load_processor, load_tokenizer
|
||||
from axolotl.telemetry.errors import send_errors
|
||||
from axolotl.utils.data import prepare_datasets, prepare_preference_datasets
|
||||
from axolotl.utils.dict import DictDefault
|
||||
from axolotl.utils.logging import get_logger
|
||||
@@ -34,6 +35,7 @@ def sample_dataset(dataset: Dataset, num_samples: int) -> Dataset:
|
||||
)
|
||||
|
||||
|
||||
@send_errors
|
||||
def load_datasets(
|
||||
*,
|
||||
cfg: DictDefault,
|
||||
@@ -96,6 +98,7 @@ def load_datasets(
|
||||
)
|
||||
|
||||
|
||||
@send_errors
|
||||
def load_preference_datasets(
|
||||
*, cfg: DictDefault, cli_args: PreprocessCliArgs | TrainerCliArgs | None = None
|
||||
) -> TrainDatasetMeta:
|
||||
|
||||
@@ -1,158 +0,0 @@
|
||||
"""
|
||||
monkeypatch for flex + packing
|
||||
"""
|
||||
|
||||
import sys
|
||||
from typing import Callable, Optional, Union
|
||||
|
||||
import torch
|
||||
from torch.nn.attention.flex_attention import BlockMask
|
||||
from transformers import Cache, PretrainedConfig
|
||||
from transformers.masking_utils import (
|
||||
ALL_MASK_ATTENTION_FUNCTIONS,
|
||||
_preprocess_mask_arguments,
|
||||
and_masks,
|
||||
causal_mask_function,
|
||||
or_masks,
|
||||
)
|
||||
from transformers.utils import is_torch_greater_or_equal
|
||||
|
||||
_is_torch_greater_or_equal_than_2_6 = is_torch_greater_or_equal("2.6", accept_dev=True)
|
||||
|
||||
|
||||
def create_causal_mask(
    config: PretrainedConfig,
    input_embeds: torch.Tensor,
    attention_mask: torch.Tensor,
    cache_position: torch.Tensor,
    past_key_values: Optional[Cache],
    or_mask_function: Optional[Callable] = None,
    and_mask_function: Optional[Callable] = None,
) -> Optional[Union[torch.Tensor, BlockMask]]:
    """
    Create a causal mask for the attention implementation stored in the config,
    restricted to per-document blocks when an attention mask is supplied.

    When `attention_mask` is given, its values are used as document ids: a query
    position may only attend to earlier-or-equal key positions whose mask value
    equals its own. When no attention mask is given, the stock
    `causal_mask_function` is used instead.

    If `past_key_values` exposes an `is_sliding` list containing `False`, the mask is
    built for the first layer whose entry is `False`; otherwise layer 0 is used.

    Args:
        config (`PretrainedConfig`):
            The model config. `config._attn_implementation` selects the mask interface.
        input_embeds (`torch.Tensor`):
            The input embeddings. Only `shape[0]` (batch size) and `dtype` are read here.
        attention_mask (`torch.Tensor`, optional):
            The attention mask. It is cloned before preprocessing and the clone is used
            as the per-position document id table.
        cache_position (`torch.Tensor`):
            Indices of the current input sequence elements. Only its device and the size
            of its last dimension are read directly in this function.
        past_key_values (`Cache`, optional):
            The past key values, if a cache is used.
        or_mask_function (`Callable`, optional):
            Extra mask function combined with the base mask by union. Requires torch>=2.6.
        and_mask_function (`Callable`, optional):
            Extra mask function combined with the base mask by intersection. Requires
            torch>=2.6.

    Returns:
        Whatever `_preprocess_mask_arguments` returns as the mask on early exit,
        otherwise the result of the selected mask interface.

    Raises:
        ValueError: if `or_mask_function` / `and_mask_function` is passed with torch<2.6.
    """
    # If we have an HybridCache structure, here we want to create the mask for the full layers
    if (
        past_key_values
        and hasattr(past_key_values, "is_sliding")
        and False in past_key_values.is_sliding
    ):
        layer_idx = past_key_values.is_sliding.index(False)
    else:
        layer_idx = 0

    # Keep an untouched copy: `_preprocess_mask_arguments` returns its own version of the
    # mask, while the document ids must come from the caller's original values.
    original_attention_mask = (
        None
        if attention_mask is None
        else attention_mask.clone().to(cache_position.device)
    )
    early_exit, attention_mask, kv_length, kv_offset = _preprocess_mask_arguments(
        config, input_embeds, attention_mask, cache_position, past_key_values, layer_idx
    )
    if early_exit:
        return attention_mask

    batch_size, dtype = input_embeds.shape[0], input_embeds.dtype

    if attention_mask is not None and original_attention_mask is not None:
        # Only the trailing dimension is needed, so this works whether `cache_position`
        # is 1-D (as documented) or carries a leading batch dimension.
        key_length = cache_position.shape[-1]
        # Right-pad the ids with zeros.
        # NOTE(review): presumably this keeps `kv_idx` lookups past the mask length in
        # bounds -- confirm against the mask interface's index ranges.
        document_ids = torch.nn.functional.pad(
            original_attention_mask, value=0, pad=(0, key_length)
        )

        def causal_doc_mask_mod(batch_idx, head_idx, q_idx, kv_idx):
            """
            Defines the logic of a block causal mask by combining both a standard causal mask
            and a block diagonal document mask.
            See :func:`~torchtune.modules.attention_utils.create_block_causal_mask`
            for an illustration.
            """
            causal_mask_ = q_idx >= kv_idx  # not valid when decoding
            document_mask = (
                document_ids[batch_idx, q_idx] == document_ids[batch_idx, kv_idx]
            )
            return causal_mask_ & document_mask

        mask_factory_function = causal_doc_mask_mod
    else:
        # No document ids available: fall back to the plain causal mask.
        mask_factory_function = causal_mask_function
    mask_interface = ALL_MASK_ATTENTION_FUNCTIONS[config._attn_implementation]

    # Do not allow skip if we are compiling (this is to match BC)
    allow_is_causal_skip = (
        not past_key_values.is_compileable if past_key_values is not None else True
    )

    # Allow slight deviations from causal mask
    if or_mask_function is not None:
        if not _is_torch_greater_or_equal_than_2_6:
            raise ValueError(
                "Using `or_mask_function` or `and_mask_function` arguments require torch>=2.6"
            )
        mask_factory_function = or_masks(mask_factory_function, or_mask_function)
        allow_is_causal_skip = False
    if and_mask_function is not None:
        if not _is_torch_greater_or_equal_than_2_6:
            raise ValueError(
                "Using `or_mask_function` or `and_mask_function` arguments require torch>=2.6"
            )
        mask_factory_function = and_masks(mask_factory_function, and_mask_function)
        allow_is_causal_skip = False

    # We now create the mask
    causal_mask = mask_interface(
        batch_size=batch_size,
        cache_position=cache_position,
        kv_length=kv_length,
        kv_offset=kv_offset,
        mask_function=mask_factory_function,
        attention_mask=attention_mask,
        allow_is_causal_skip=allow_is_causal_skip,  # additional kwarg for sdpa
        dtype=dtype,  # Additional kwarg for eager
        config=config,  # Pass the config as well, in case someone wants to easily have their own mask_interface
    )
    return causal_mask
|
||||
|
||||
|
||||
def patch_create_causal_mask(model_type):
    """
    Install this module's `create_causal_mask` into transformers.

    The function always replaces `transformers.masking_utils.create_causal_mask`.
    When `model_type` is truthy it additionally patches the attribute on
    `transformers.models.<model_type>.modeling_<model_type>` and then evicts that
    module from `sys.modules`.

    Args:
        model_type: transformers model type used to build the modeling module path;
            a falsy value skips the per-model patch.

    Raises:
        ValueError: if the modeling module cannot be imported or patched.
    """
    import importlib

    import transformers.masking_utils

    transformers.masking_utils.create_causal_mask = create_causal_mask

    if not model_type:
        return

    module_path = f"transformers.models.{model_type}.modeling_{model_type}"
    try:
        # `__import__("a.b.c")` returns the top-level package `a`, which would put the
        # attribute on `transformers` itself; `import_module` returns the leaf module.
        module = importlib.import_module(module_path)
        module.create_causal_mask = create_causal_mask
    except (ImportError, AttributeError) as e:
        raise ValueError(
            f"Could not import attention class for model_type: {model_type}. "
            f"Error: {str(e)}"
        ) from e

    # Drop the cached module entry.
    # NOTE(review): presumably so a later import re-executes the module and binds the
    # patched function from `transformers.masking_utils` -- confirm with callers.
    sys.modules.pop(module_path, None)
|
||||
@@ -29,10 +29,13 @@ from transformers.trainer_pt_utils import AcceleratorConfig
|
||||
|
||||
from axolotl.integrations.base import PluginManager
|
||||
from axolotl.monkeypatch.trainer.lr import patch_trainer_get_lr
|
||||
from axolotl.telemetry.callbacks import TelemetryCallback
|
||||
from axolotl.telemetry.manager import TelemetryManager
|
||||
from axolotl.utils import (
|
||||
is_comet_available,
|
||||
is_mlflow_available,
|
||||
is_opentelemetry_available,
|
||||
is_trackio_available,
|
||||
)
|
||||
from axolotl.utils.callbacks import (
|
||||
GCCallback,
|
||||
@@ -118,6 +121,13 @@ class TrainerBuilderBase(abc.ABC):
|
||||
if self.cfg.gc_steps:
|
||||
callbacks.append(GCCallback(gc_steps=self.cfg.gc_steps))
|
||||
|
||||
if self.cfg.dynamic_checkpoint and self.cfg.dynamic_checkpoint.enabled:
|
||||
from axolotl.utils.callbacks.dynamic_checkpoint import (
|
||||
DynamicCheckpointCallback,
|
||||
)
|
||||
|
||||
callbacks.append(DynamicCheckpointCallback(self.cfg))
|
||||
|
||||
if self.cfg.use_wandb:
|
||||
callbacks.append(
|
||||
SaveAxolotlConfigtoWandBCallback(self.cfg.axolotl_config_path)
|
||||
@@ -138,6 +148,14 @@ class TrainerBuilderBase(abc.ABC):
|
||||
callbacks.append(
|
||||
SaveAxolotlConfigtoCometCallback(self.cfg.axolotl_config_path)
|
||||
)
|
||||
if self.cfg.use_trackio and is_trackio_available():
|
||||
from axolotl.utils.callbacks.trackio_ import (
|
||||
SaveAxolotlConfigtoTrackioCallback,
|
||||
)
|
||||
|
||||
callbacks.append(
|
||||
SaveAxolotlConfigtoTrackioCallback(self.cfg.axolotl_config_path)
|
||||
)
|
||||
if self.cfg.use_otel_metrics and is_opentelemetry_available():
|
||||
from axolotl.utils.callbacks.opentelemetry import (
|
||||
OpenTelemetryMetricsCallback,
|
||||
@@ -155,6 +173,10 @@ class TrainerBuilderBase(abc.ABC):
|
||||
)
|
||||
)
|
||||
|
||||
telemetry_manager = TelemetryManager.get_instance()
|
||||
if telemetry_manager.enabled:
|
||||
callbacks.append(TelemetryCallback())
|
||||
|
||||
return callbacks
|
||||
|
||||
def get_post_trainer_create_callbacks(self, trainer):
|
||||
@@ -196,9 +218,9 @@ class TrainerBuilderBase(abc.ABC):
|
||||
):
|
||||
warmup_steps = 0
|
||||
warmup_ratio = 0.0
|
||||
if self.cfg.warmup_steps:
|
||||
if self.cfg.warmup_steps is not None:
|
||||
warmup_steps = self.cfg.warmup_steps
|
||||
elif self.cfg.warmup_ratio:
|
||||
elif self.cfg.warmup_ratio is not None:
|
||||
if total_num_steps:
|
||||
warmup_steps = max(int(self.cfg.warmup_ratio * total_num_steps), 0)
|
||||
else:
|
||||
@@ -268,11 +290,22 @@ class TrainerBuilderBase(abc.ABC):
|
||||
adam_kwargs["eps"] = training_args_kwargs.get("adam_epsilon")
|
||||
|
||||
if self.cfg.optimizer == "muon":
|
||||
from axolotl.contribs.mit.muon import (
|
||||
MuonOptimizerFactory,
|
||||
)
|
||||
_, device_mesh = build_parallelism_config(self.cfg)
|
||||
|
||||
if device_mesh is not None:
|
||||
from axolotl.contribs.mit.muon.dist_muon import (
|
||||
DistMuonOptimizerFactory,
|
||||
)
|
||||
|
||||
optimizer_cls = DistMuonOptimizerFactory
|
||||
optimizer_kwargs["device_mesh"] = device_mesh
|
||||
else:
|
||||
from axolotl.contribs.mit.muon import (
|
||||
MuonOptimizerFactory,
|
||||
)
|
||||
|
||||
optimizer_cls = MuonOptimizerFactory
|
||||
|
||||
optimizer_cls = MuonOptimizerFactory
|
||||
optimizer_kwargs.update(adam_kwargs)
|
||||
elif self.cfg.optimizer == "dion":
|
||||
from axolotl.contribs.mit.dion import (
|
||||
@@ -410,6 +443,8 @@ class TrainerBuilderBase(abc.ABC):
|
||||
report_to.append("tensorboard")
|
||||
if self.cfg.use_comet:
|
||||
report_to.append("comet_ml")
|
||||
if self.cfg.use_trackio:
|
||||
report_to.append("trackio")
|
||||
|
||||
training_args_kwargs["report_to"] = report_to
|
||||
|
||||
@@ -417,6 +452,8 @@ class TrainerBuilderBase(abc.ABC):
|
||||
training_args_kwargs["run_name"] = self.cfg.wandb_name
|
||||
elif self.cfg.use_mlflow:
|
||||
training_args_kwargs["run_name"] = self.cfg.mlflow_run_name
|
||||
elif self.cfg.use_trackio:
|
||||
training_args_kwargs["run_name"] = self.cfg.trackio_run_name
|
||||
else:
|
||||
training_args_kwargs["run_name"] = None
|
||||
|
||||
|
||||
@@ -72,7 +72,9 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
|
||||
if self.cfg.include_tkps:
|
||||
callbacks.append(
|
||||
TokensPerSecondCallback(
|
||||
self.cfg.tensor_parallel_size, self.cfg.context_parallel_size
|
||||
self.cfg.tensor_parallel_size,
|
||||
self.cfg.context_parallel_size,
|
||||
resume_from_checkpoint=self.cfg.resume_from_checkpoint,
|
||||
)
|
||||
)
|
||||
return callbacks
|
||||
|
||||
@@ -2,6 +2,8 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import math
|
||||
import os
|
||||
from collections import defaultdict
|
||||
from functools import partial, wraps
|
||||
@@ -43,12 +45,14 @@ from axolotl.core.trainers.utils import (
|
||||
from axolotl.utils import get_not_null
|
||||
from axolotl.utils.bench import get_gpu_memory_usage
|
||||
from axolotl.utils.dict import DictDefault
|
||||
from axolotl.utils.distributed import is_main_process
|
||||
from axolotl.utils.distributed import is_distributed, is_main_process
|
||||
from axolotl.utils.logging import get_logger
|
||||
from axolotl.utils.samplers import MultipackBatchSampler, get_dataset_lengths
|
||||
|
||||
LOG = get_logger(__name__)
|
||||
|
||||
TOKENS_STATE_FILE = "tokens_state."
|
||||
|
||||
REDUCTION_FNS = {
|
||||
"mean": torch.mean,
|
||||
"min": torch.min,
|
||||
@@ -348,14 +352,33 @@ class AxolotlTrainer(
|
||||
# return (loss, outputs) if return_outputs else loss
|
||||
|
||||
# track number of tokens for tokens per second calculation
|
||||
if self.args.include_tkps:
|
||||
if self.args.include_tkps and model.training:
|
||||
inputs_key = "labels" if "labels" in inputs else "input_ids"
|
||||
if hasattr(self.state, "num_tokens"):
|
||||
self.state.num_tokens = (
|
||||
self.state.num_tokens + (inputs[inputs_key] != -100).sum().cpu()
|
||||
trainable_tokens = (inputs[inputs_key] != -100).sum()
|
||||
total_tokens = inputs[inputs_key].numel()
|
||||
total_tokens = torch.tensor(total_tokens, device=inputs[inputs_key].device)
|
||||
|
||||
if is_distributed():
|
||||
torch.distributed.all_reduce(
|
||||
trainable_tokens, op=torch.distributed.ReduceOp.SUM
|
||||
)
|
||||
else:
|
||||
self.state.num_tokens = (inputs[inputs_key] != -100).sum().cpu()
|
||||
torch.distributed.all_reduce(
|
||||
total_tokens, op=torch.distributed.ReduceOp.SUM
|
||||
)
|
||||
|
||||
if not hasattr(self.state, "tokens"):
|
||||
self.state.tokens = {
|
||||
"trainable": torch.zeros(1),
|
||||
"total": torch.zeros(1),
|
||||
}
|
||||
|
||||
# trainable tokens for throughput and total token slots for summaries
|
||||
self.state.tokens["trainable"] = (
|
||||
self.state.tokens["trainable"] + trainable_tokens.detach().cpu()
|
||||
)
|
||||
self.state.tokens["total"] = self.state.tokens["total"] + total_tokens.cpu()
|
||||
# Store per-step trainable tokens for throughput calculation
|
||||
self.state.tokens["trainable_tokens"] = trainable_tokens.detach().cpu()
|
||||
|
||||
if self.args.orpo_alpha:
|
||||
return self.orpo_compute_loss(
|
||||
@@ -593,6 +616,7 @@ class AxolotlTrainer(
|
||||
"""
|
||||
# logs either has 'loss' or 'eval_loss'
|
||||
train_eval = "train" if "loss" in logs else "eval"
|
||||
metric_ndigits = int(os.getenv("AXOLOTL_METRIC_NDIGITS", "5"))
|
||||
|
||||
for key, metric_data in self._stored_metrics[train_eval].items():
|
||||
values = torch.tensor(metric_data["values"]) # type: ignore[arg-type]
|
||||
@@ -603,7 +627,18 @@ class AxolotlTrainer(
|
||||
raise NotImplementedError(
|
||||
"Metric reduction must be one of [mean, min, max, sum]"
|
||||
)
|
||||
logs[key] = round(fn(values).item(), 4)
|
||||
logs[key] = round(fn(values).item(), metric_ndigits)
|
||||
|
||||
if "loss" in logs:
|
||||
try:
|
||||
logs["ppl"] = round(math.exp(logs["loss"]), metric_ndigits)
|
||||
except OverflowError:
|
||||
logs["ppl"] = float("inf")
|
||||
if "eval_loss" in logs:
|
||||
try:
|
||||
logs["eval_ppl"] = round(math.exp(logs["eval_loss"]), metric_ndigits)
|
||||
except OverflowError:
|
||||
logs["eval_ppl"] = float("inf")
|
||||
|
||||
if is_main_process():
|
||||
# Add memory usage
|
||||
@@ -615,12 +650,21 @@ class AxolotlTrainer(
|
||||
except (ValueError, TypeError, FileNotFoundError):
|
||||
pass
|
||||
|
||||
if self.args.include_tkps and train_eval == "train":
|
||||
if (
|
||||
self.args.include_tkps
|
||||
and train_eval == "train"
|
||||
and hasattr(self.state, "tokens")
|
||||
):
|
||||
# each rank will log its own tokens per second
|
||||
# for logging_steps > 1 we obtain a moving average of this metric
|
||||
logs["tokens_per_second_per_gpu"] = round(
|
||||
logs["tokens/train_per_sec_per_gpu"] = round(
|
||||
self.state.last_tokens_per_second.item() / self.args.logging_steps, 2
|
||||
)
|
||||
if (
|
||||
hasattr(self.state, "total_tokens")
|
||||
and self.state.total_tokens is not None
|
||||
):
|
||||
logs["total_tokens"] = int(self.state.total_tokens.item())
|
||||
|
||||
del self._stored_metrics[train_eval]
|
||||
|
||||
@@ -655,6 +699,19 @@ class AxolotlTrainer(
|
||||
run_dir = self._get_output_dir(trial=trial)
|
||||
output_dir = os.path.join(run_dir, checkpoint_folder)
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
|
||||
# Save total_tokens state if tracking is enabled
|
||||
if self.args.include_tkps and hasattr(self.state, "tokens"):
|
||||
tokens_state = {
|
||||
"total": int(torch.as_tensor(self.state.tokens.get("total", 0)).item()),
|
||||
"trainable": int(
|
||||
torch.as_tensor(self.state.tokens.get("trainable", 0)).item()
|
||||
),
|
||||
}
|
||||
tokens_state_path = os.path.join(output_dir, TOKENS_STATE_FILE)
|
||||
with open(tokens_state_path, "w", encoding="utf-8") as f:
|
||||
json.dump(tokens_state, f)
|
||||
|
||||
return super()._save_checkpoint(model, trial, **kwargs)
|
||||
|
||||
# TODO(wing): remove once https://github.com/huggingface/transformers/pull/39866/files is merged
|
||||
|
||||
@@ -36,4 +36,6 @@ class DPOStrategy:
|
||||
training_args_kwargs["dpo_norm_loss"] = cfg.dpo_norm_loss
|
||||
if cfg.dpo_use_logits_to_keep is not None:
|
||||
training_args_kwargs["use_logits_to_keep"] = cfg.dpo_use_logits_to_keep
|
||||
if cfg.dpo_use_liger_kernel is not None:
|
||||
training_args_kwargs["use_liger_kernel"] = cfg.dpo_use_liger_kernel
|
||||
return training_args_kwargs
|
||||
|
||||
@@ -126,6 +126,9 @@ class GRPOStrategy:
|
||||
if trl.use_liger_loss is not None:
|
||||
grpo_args_kwargs["use_liger_loss"] = trl.use_liger_loss
|
||||
|
||||
if trl.rollout_func:
|
||||
grpo_args_kwargs["rollout_func"] = cls.get_rollout_func(trl.rollout_func)
|
||||
|
||||
return grpo_args_kwargs
|
||||
|
||||
@classmethod
|
||||
@@ -201,3 +204,32 @@ class GRPOStrategy:
|
||||
raise ValueError(
|
||||
f"Reward function {reward_func_fqn} not found."
|
||||
) from exc
|
||||
|
||||
@classmethod
def get_rollout_func(cls, rollout_func_fqn: str):
    """
    Returns the rollout function from the given fully qualified name.

    Args:
        rollout_func_fqn (str): Fully qualified name of the rollout function
            (e.g. my_module.my_rollout_func)

    Returns:
        Callable rollout function

    Raises:
        ValueError: If the name has no module part, the module or attribute
            cannot be found, or the resolved object is not callable.
    """
    # Split on the last dot: everything before it is the module path, the
    # remainder is the attribute to look up on that module.
    module_name, _, func_name = rollout_func_fqn.rpartition(".")
    if not module_name or not func_name:
        raise ValueError(
            f"Rollout function {rollout_func_fqn} must be a fully qualified name "
            "(e.g. my_module.my_rollout_func)"
        )

    try:
        rollout_func_module = importlib.import_module(module_name)
        rollout_func = getattr(rollout_func_module, func_name)
    except (ModuleNotFoundError, AttributeError) as exc:
        # A missing attribute is reported the same way as a missing module so
        # callers only have to handle ValueError.
        raise ValueError(f"Rollout function {rollout_func_fqn} not found.") from exc

    if not callable(rollout_func):
        raise ValueError(f"Rollout function {rollout_func_fqn} must be callable")

    return rollout_func
|
||||
|
||||
@@ -10,6 +10,7 @@ import torch
|
||||
from datasets import Dataset
|
||||
from transformers.trainer import Trainer
|
||||
|
||||
from axolotl.telemetry.errors import send_errors
|
||||
from axolotl.train import (
|
||||
TrainDatasetMeta,
|
||||
setup_model_and_tokenizer,
|
||||
@@ -63,6 +64,7 @@ def evaluate_dataset(
|
||||
return metrics
|
||||
|
||||
|
||||
@send_errors
|
||||
def evaluate(*, cfg: DictDefault, dataset_meta: TrainDatasetMeta) -> Dict[str, float]:
|
||||
"""
|
||||
Evaluate a model on training and validation datasets.
|
||||
|
||||
@@ -19,7 +19,7 @@ python scripts/cutcrossentropy_install.py | sh
|
||||
|
||||
- If you are installing from pip
|
||||
```bash
|
||||
pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@8a1a0ec"
|
||||
pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@318b7e2"
|
||||
```
|
||||
|
||||
## Usage
|
||||
@@ -54,6 +54,8 @@ plugins:
|
||||
- granitemoehybrid
|
||||
- hunyuan_v1_dense
|
||||
- hunyuan_v1_moe
|
||||
- internvl
|
||||
- kimi_linear
|
||||
- lfm2
|
||||
- lfm2_moe
|
||||
- lfm2_vl
|
||||
@@ -61,10 +63,15 @@ plugins:
|
||||
- llama4
|
||||
- llama4_text
|
||||
- llava
|
||||
- ministral
|
||||
- ministral3
|
||||
- mistral
|
||||
- mistral3
|
||||
- mixtral
|
||||
- mllama
|
||||
- olmo
|
||||
- olmo2
|
||||
- olmo3
|
||||
- phi
|
||||
- phi3
|
||||
- phi4_multimodal
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user