Compare commits

92 Commits

| SHA1 |
|---|
| ffb307a8a7 |
| 915c258c6e |
| 1e58235c38 |
| 5753c5b89c |
| 18d78f02cf |
| 923181aaed |
| 786f1a3ff9 |
| 26418e6f9a |
| 19fe84ef46 |
| 98730868e7 |
| 5771a65b88 |
| f912d1bb97 |
| 0250e5f87c |
| 274c579d81 |
| ccd2f12335 |
| 00e0238501 |
| f782957002 |
| f2f66f2bb9 |
| 013474eb70 |
| 6dc9816722 |
| 74715125b6 |
| f0f3bfbdf0 |
| 022ef7ab4e |
| 04533b79d4 |
| 19de29be19 |
| ec75aa5889 |
| cf4e3fac64 |
| 69df309cbb |
| b436ecf61f |
| f137ce50ec |
| 4131bcf769 |
| 64fea39978 |
| 4966496b98 |
| 66a9e4fced |
| 15d35b76bb |
| 0d53e0fe8f |
| 9344fa5e8c |
| c702edae5f |
| dfaf76659f |
| 26a58bb8af |
| cec2490903 |
| dfa5224908 |
| ddafc6ef80 |
| ad56e600e3 |
| 18d9456297 |
| da5ede6372 |
| 6cbca1ffb2 |
| 2e082d47cc |
| b4c6675cd2 |
| 828131332a |
| 273a03f85c |
| 9bbe2cfe0f |
| 64da8f0044 |
| 1fa0a98e38 |
| 8d542d9d63 |
| a4565476e0 |
| 02dc263338 |
| 2acd3e1242 |
| 0437c1a4ba |
| ef150fd973 |
| 47ad92c6b9 |
| f0fee9c56c |
| 37d07bd7f7 |
| 4c81172917 |
| cd8c769e84 |
| 0d60046d08 |
| c110e3eb48 |
| 95c259b3fb |
| d1fd505813 |
| 1334281d50 |
| 98f230d864 |
| 02f308351c |
| 3b91e8174d |
| 40d906fb33 |
| 89d5323c13 |
| df870f6a8f |
| f500aaa490 |
| 9ec33f52e3 |
| b453562c01 |
| 367f7eb3a6 |
| e888e38ce7 |
| 400120af2d |
| 459e5f9b16 |
| 43f6f84269 |
| 36c4ab11f9 |
| 2f4e4ef604 |
| aee03fc636 |
| 255b818fbc |
| 332ee74f32 |
| 3b0d2ac5c0 |
| 9462a1bf79 |
| 8e9386c799 |
@@ -2,7 +2,6 @@
 source = axolotl
 omit =
     */tests/*
-    setup.py

 [report]
 exclude_lines =
.github/CONTRIBUTING.md (26 changes)

@@ -29,13 +29,18 @@ PRs are **greatly welcome**!
 2. Set up the development environment by following the instructions in the [README.md](https://github.com/axolotl-ai-cloud/axolotl/tree/main/README.md) file.
 3. Explore the codebase, run tests, and verify that everything works as expected.

-Please run below to setup env
-```bash
-pip3 install -r requirements-dev.txt -r requirements-tests.txt
-pre-commit install
-
-# test
-pytest tests/
+Please run the below to setup:
+
+```bash
+git clone https://github.com/axolotl-ai-cloud/axolotl.git
+cd axolotl
+
+uv sync --dev && uv pip install flash-attn --no-build-isolation
+source .venv/bin/activate
+
+pre-commit install # install pre-commit hooks
+
+pytest tests/ # optional; run test suite
 ```

 ## How to Contribute

@@ -68,12 +73,7 @@ You can skip certain CI checks by including specific keywords in your commit mes

 ### Code Style

-axolotl uses [Ruff](https://docs.astral.sh/ruff/) as its code style guide. Please ensure that your code follows these guidelines.
-
-Use the pre-commit linter to ensure that your code is formatted consistently.
-```bash
-pre-commit run --all-files
-```
+axolotl uses [{codestyle}]({URLofCodestyle}) as its code style guide. Please ensure that your code follows these guidelines.

 ### Commit Messages

@@ -83,6 +83,6 @@ Write clear and concise commit messages that briefly describe the changes made i

 - [GitHub Help](https://help.github.com/)
 - [GitHub Pull Request Documentation](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests)
-- [Ruff](https://docs.astral.sh/ruff/)
+- [{codestyle}]({URLofCodestyle})

 Thank you once again for your interest in contributing to axolotl. We look forward to collaborating with you and creating an even better project together!
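For quick reference, the uv-based setup commands added in the first CONTRIBUTING.md hunk above can be run as one sequence. This is only the hunk's own commands collected into a sketch, assuming a POSIX shell and that flash-attn builds on your machine:

```bash
# Collected from the added side of the CONTRIBUTING.md hunk (illustrative).
git clone https://github.com/axolotl-ai-cloud/axolotl.git
cd axolotl

# Create the dev environment, then install flash-attn without build isolation.
uv sync --dev && uv pip install flash-attn --no-build-isolation
source .venv/bin/activate

pre-commit install   # install pre-commit hooks
pytest tests/        # optional; run the test suite
```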
.github/FUNDING.yml (6 changes)

@@ -1,13 +1,13 @@
 # These are supported funding model platforms

-github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
+github: [winglian, OpenAccess-AI-Collective] # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
 patreon: # Replace with a single Patreon username
 open_collective: # Replace with a single Open Collective username
-ko_fi: # Replace with a single Ko-fi username
+ko_fi: axolotl_ai # Replace with a single Ko-fi username
 tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
 community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
 liberapay: # Replace with a single Liberapay username
 issuehunt: # Replace with a single IssueHunt username
 otechie: # Replace with a single Otechie username
 lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry
-custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
+custom: ['https://quickchart.io/qr?text=bitcoin%3Abc1qxlgwlqwfea5s2cxm42xqsfmwjct0rj8w8ea5np&size=480&centerImageUrl=https%3A%2F%2Fupload.wikimedia.org%2Fwikipedia%2Fcommons%2Fthumb%2F4%2F46%2FBitcoin.svg%2F64px-Bitcoin.svg.png'] # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
.github/PULL_REQUEST_TEMPLATE.md (5 changes)

@@ -15,11 +15,6 @@
 <!--- Include details of your testing environment, tests ran to see how -->
 <!--- your change affects other areas of the code, etc. -->

-## AI Usage Disclaimer
-
-<!--- Was AI (e.g., ChatGPT, Claude, Copilot) used to generate or assist with this PR? -->
-<!--- Please indicate: No / Yes (specify which tool and to what extent) -->
-
 ## Screenshots (if appropriate)

 ## Types of changes
.github/workflows/base.yml (211 changes)

@@ -15,21 +15,44 @@ on:
       - '.github/workflows/base.yml'
   workflow_dispatch:

-permissions:
-  contents: read
-
 jobs:
   build-base:
     if: ${{ github.repository_owner == 'axolotl-ai-cloud' && (github.event_name != 'pull_request' || !github.event.pull_request.draft) }}
     timeout-minutes: 480
     # this job needs to be run on self-hosted GPU runners...
     runs-on: ubuntu-latest-m
-    env:
-      HAS_DOCKERHUB_CREDS: ${{ secrets.DOCKERHUB_USERNAME != '' && secrets.DOCKERHUB_TOKEN != '' }}
     strategy:
       fail-fast: false
       matrix:
         include:
+          - cuda: "124"
+            cuda_version: 12.4.1
+            cudnn_version: ""
+            python_version: "3.11"
+            pytorch: 2.6.0
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
+            dockerfile: "Dockerfile-base"
+          - cuda: "126"
+            cuda_version: 12.6.3
+            cudnn_version: ""
+            python_version: "3.11"
+            pytorch: 2.6.0
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
+            dockerfile: "Dockerfile-base"
+          - cuda: "126"
+            cuda_version: 12.6.3
+            cudnn_version: ""
+            python_version: "3.11"
+            pytorch: 2.7.1
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
+            dockerfile: "Dockerfile-base"
+          - cuda: "128"
+            cuda_version: 12.8.1
+            cudnn_version: ""
+            python_version: "3.11"
+            pytorch: 2.7.1
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
+            dockerfile: "Dockerfile-base"
           - cuda: "128"
             cuda_version: 12.8.1
             cudnn_version: ""

@@ -37,71 +60,6 @@ jobs:
             pytorch: 2.8.0
             torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
             dockerfile: "Dockerfile-base"
-            platforms: "linux/amd64"
-          - cuda: "128"
-            cuda_version: 12.8.1
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: 2.9.0
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-base"
-            platforms: "linux/amd64,linux/arm64"
-          - cuda: "128"
-            cuda_version: 12.8.1
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: 2.9.1
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-base"
-            platforms: "linux/amd64,linux/arm64"
-          - cuda: "128"
-            cuda_version: 12.8.1
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: 2.10.0
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-base"
-            platforms: "linux/amd64,linux/arm64"
-          - cuda: "128"
-            cuda_version: 12.8.1
-            cudnn_version: ""
-            python_version: "3.12"
-            pytorch: 2.10.0
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-base"
-            platforms: "linux/amd64,linux/arm64"
-          # - cuda: "129"
-          #   cuda_version: 12.9.1
-          #   cudnn_version: ""
-          #   python_version: "3.12"
-          #   pytorch: 2.9.1
-          #   torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-          #   dockerfile: "Dockerfile-base"
-          #   platforms: "linux/amd64,linux/arm64"
-          - cuda: "130"
-            cuda_version: 13.0.0
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: 2.9.1
-            torch_cuda_arch_list: "9.0+PTX"
-            dockerfile: "Dockerfile-base"
-            platforms: "linux/amd64,linux/arm64"
-          - cuda: "130"
-            cuda_version: 13.0.0
-            cudnn_version: ""
-            python_version: "3.12"
-            pytorch: 2.9.1
-            torch_cuda_arch_list: "9.0+PTX"
-            dockerfile: "Dockerfile-base"
-            platforms: "linux/amd64,linux/arm64"
-          - cuda: "130"
-            cuda_version: 13.0.0
-            cudnn_version: ""
-            python_version: "3.12"
-            pytorch: 2.10.0
-            torch_cuda_arch_list: "9.0+PTX"
-            dockerfile: "Dockerfile-base"
-            platforms: "linux/amd64,linux/arm64"
           # - cuda: "128"
           #   cuda_version: 12.8.1
           #   cudnn_version: ""

@@ -125,23 +83,24 @@ jobs:
         uses: docker/metadata-action@v5
         with:
           images: |
+            winglian/axolotl-base
             axolotlai/axolotl-base
       - name: Login to Docker Hub
-        uses: docker/login-action@v3
-        if: ${{ github.event_name != 'pull_request' && env.HAS_DOCKERHUB_CREDS == 'true' }}
+        uses: docker/login-action@v2
         with:
           username: ${{ secrets.DOCKERHUB_USERNAME }}
           password: ${{ secrets.DOCKERHUB_TOKEN }}
       - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v3
       - name: Build
-        uses: docker/build-push-action@v5
+        uses: docker/build-push-action@v4
         with:
           context: .
           file: ./docker/${{ matrix.dockerfile }}
-          platforms: ${{ matrix.platforms }}
           push: ${{ github.event_name != 'pull_request' }}
-          tags: ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
+          tags: |
+            ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
+            ${{ steps.metadata.outputs.tags }}-base-uv-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
           labels: ${{ steps.metadata.outputs.labels }}
           build-args: |
             CUDA_VERSION=${{ matrix.cuda_version }}

@@ -154,12 +113,31 @@ jobs:
     if: ${{ github.repository_owner == 'axolotl-ai-cloud' && (github.event_name != 'pull_request' || !github.event.pull_request.draft) }}
     timeout-minutes: 480
     runs-on: ubuntu-latest-m
-    env:
-      HAS_DOCKERHUB_CREDS: ${{ secrets.DOCKERHUB_USERNAME != '' && secrets.DOCKERHUB_TOKEN != '' }}
     strategy:
       fail-fast: false
       matrix:
         include:
+          - cuda: "126"
+            cuda_version: 12.6.3
+            cudnn_version: ""
+            python_version: "3.11"
+            pytorch: 2.6.0
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
+            dockerfile: "Dockerfile-uv-base"
+          - cuda: "126"
+            cuda_version: 12.6.3
+            cudnn_version: ""
+            python_version: "3.11"
+            pytorch: 2.7.1
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
+            dockerfile: "Dockerfile-uv-base"
+          - cuda: "128"
+            cuda_version: 12.8.1
+            cudnn_version: ""
+            python_version: "3.11"
+            pytorch: 2.7.1
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
+            dockerfile: "Dockerfile-uv-base"
           - cuda: "128"
             cuda_version: 12.8.1
             cudnn_version: ""

@@ -167,79 +145,6 @@ jobs:
             pytorch: 2.8.0
             torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
             dockerfile: "Dockerfile-uv-base"
-            platforms: "linux/amd64"
-          - cuda: "128"
-            cuda_version: 12.8.1
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: 2.9.1
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-uv-base"
-            platforms: "linux/amd64,linux/arm64"
-          - cuda: "128"
-            cuda_version: 12.8.1
-            cudnn_version: ""
-            python_version: "3.12"
-            pytorch: 2.9.1
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-uv-base"
-            platforms: "linux/amd64,linux/arm64"
-          - cuda: "128"
-            cuda_version: 12.8.1
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: 2.9.0
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-uv-base"
-            platforms: "linux/amd64,linux/arm64"
-          - cuda: "128"
-            cuda_version: 12.8.1
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: 2.10.0
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-uv-base"
-            platforms: "linux/amd64,linux/arm64"
-          - cuda: "128"
-            cuda_version: 12.8.1
-            cudnn_version: ""
-            python_version: "3.12"
-            pytorch: 2.10.0
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-uv-base"
-            platforms: "linux/amd64,linux/arm64"
-          # - cuda: "129"
-          #   cuda_version: 12.9.1
-          #   cudnn_version: ""
-          #   python_version: "3.12"
-          #   pytorch: 2.9.1
-          #   torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-          #   dockerfile: "Dockerfile-uv-base"
-          #   platforms: "linux/amd64,linux/arm64"
-          - cuda: "130"
-            cuda_version: 13.0.0
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: 2.9.1
-            torch_cuda_arch_list: "9.0+PTX"
-            dockerfile: "Dockerfile-uv-base"
-            platforms: "linux/amd64,linux/arm64"
-          - cuda: "130"
-            cuda_version: 13.0.0
-            cudnn_version: ""
-            python_version: "3.12"
-            pytorch: 2.9.1
-            torch_cuda_arch_list: "9.0+PTX"
-            dockerfile: "Dockerfile-uv-base"
-            platforms: "linux/amd64,linux/arm64"
-          - cuda: "130"
-            cuda_version: 13.0.0
-            cudnn_version: ""
-            python_version: "3.12"
-            pytorch: 2.10.0
-            torch_cuda_arch_list: "9.0+PTX"
-            dockerfile: "Dockerfile-uv-base"
-            platforms: "linux/amd64,linux/arm64"
     steps:
       - name: Checkout
         uses: actions/checkout@v4

@@ -250,19 +155,17 @@ jobs:
           images: |
             axolotlai/axolotl-base-uv
       - name: Login to Docker Hub
-        uses: docker/login-action@v3
-        if: ${{ github.event_name != 'pull_request' && env.HAS_DOCKERHUB_CREDS == 'true' }}
+        uses: docker/login-action@v2
         with:
           username: ${{ secrets.DOCKERHUB_USERNAME }}
           password: ${{ secrets.DOCKERHUB_TOKEN }}
       - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v3
       - name: Build
-        uses: docker/build-push-action@v5
+        uses: docker/build-push-action@v4
         with:
           context: .
           file: ./docker/${{ matrix.dockerfile }}
-          platforms: ${{ matrix.platforms }}
           push: ${{ github.event_name != 'pull_request' }}
           tags: ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
           labels: ${{ steps.metadata.outputs.labels }}
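As a reading aid for the tag templates above: the metadata action emits a branch- or version-based tag, and the workflow appends the Python, CUDA, and PyTorch versions from the matrix entry. A hypothetical rendering for the cu128 / PyTorch 2.8.0 entry on the main branch might look like the following; the concrete tag value is an illustrative assumption, not copied from a registry listing:

```bash
# Hypothetical expansion of the tag template for one matrix entry (illustrative).
# ${{ steps.metadata.outputs.tags }}            -> axolotlai/axolotl-base:main
# -base-py<python>-cu<cuda>-<pytorch>           -> -base-py3.11-cu128-2.8.0
docker pull axolotlai/axolotl-base:main-base-py3.11-cu128-2.8.0
```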
.github/workflows/docs.yml (11 changes)

@@ -12,9 +12,6 @@ jobs:
   build-deploy:
     runs-on: ubuntu-latest
     steps:
-      - name: cleanup node
-        run: |
-          sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /opt/hostedtoolcache/CodeQL
       - name: Check out repository
         uses: actions/checkout@v4
       - name: Set up Quarto

@@ -23,10 +20,14 @@ jobs:
         uses: actions/setup-python@v5
         with:
           python-version: '3.11'
+      - name: Install uv
+        uses: astral-sh/setup-uv@v4
+        with:
+          version: "latest"
       - name: Install dependencies
         run: |
-          python3 -m pip install jupyter quartodoc
-          python3 -m pip install -e .
+          uv pip install --system jupyter quartodoc
+          uv pip install --system -e .
       - name: Build autodoc
         run: quartodoc build
       - name: Publish to GitHub Pages (and render)
.github/workflows/lint.yml (6 changes)

@@ -6,16 +6,13 @@ on:
     types: [opened, synchronize, reopened, ready_for_review]
     paths:
       - '**.py'
-      - 'requirements.txt'
+      - 'pyproject.toml'
      - '.github/workflows/*.yml'
      - "*.[q]md"
      - "examples/**/*.y[a]?ml"
      - ".pre-commit-config.yaml"
   workflow_dispatch:

-permissions:
-  contents: read
-
 jobs:
   pre-commit:
     name: pre-commit

@@ -26,5 +23,4 @@ jobs:
       - uses: actions/setup-python@v5
         with:
           python-version: "3.11"
-          cache: 'pip' # caching pip dependencies
       - uses: pre-commit/action@v3.0.1
.github/workflows/main.yml (307 changes)

@@ -8,9 +8,6 @@ on:
       - "v*"
   workflow_dispatch:

-permissions:
-  contents: read
-
 jobs:
   build-axolotl:
     if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}

@@ -18,49 +15,27 @@ jobs:
       fail-fast: false
       matrix:
         include:
+          - cuda: 126
+            cuda_version: 12.6.3
+            python_version: "3.11"
+            pytorch: 2.6.0
+            axolotl_extras:
+          - cuda: 126
+            cuda_version: 12.6.3
+            python_version: "3.11"
+            pytorch: 2.7.1
+            axolotl_extras: vllm
+            is_latest: true
+          - cuda: 128
+            cuda_version: 12.8.1
+            python_version: "3.11"
+            pytorch: 2.7.1
+            axolotl_extras:
           - cuda: 128
             cuda_version: 12.8.1
             python_version: "3.11"
             pytorch: 2.8.0
             axolotl_extras:
-            platforms: "linux/amd64"
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.9.0
-            axolotl_extras:
-            platforms: "linux/amd64,linux/arm64"
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.9.1
-            axolotl_extras:
-            platforms: "linux/amd64,linux/arm64"
-            is_latest: true
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.12"
-            pytorch: 2.10.0
-            axolotl_extras:
-            platforms: "linux/amd64,linux/arm64"
-          # - cuda: 129
-          #   cuda_version: 12.9.1
-          #   python_version: "3.12"
-          #   pytorch: 2.9.1
-          #   axolotl_extras:
-          #   platforms: "linux/amd64,linux/arm64"
-          - cuda: 130
-            cuda_version: 13.0.0
-            python_version: "3.11"
-            pytorch: 2.9.1
-            axolotl_extras:
-            platforms: "linux/amd64,linux/arm64"
-          - cuda: 130
-            cuda_version: 13.0.0
-            python_version: "3.12"
-            pytorch: 2.10.0
-            axolotl_extras:
-            platforms: "linux/amd64,linux/arm64"
     runs-on: axolotl-gpu-runner
     steps:
       - name: Checkout

@@ -70,6 +45,7 @@ jobs:
         uses: docker/metadata-action@v5
         with:
           images: |
+            winglian/axolotl
             axolotlai/axolotl
           tags: |
             type=ref,event=branch

@@ -86,13 +62,14 @@ jobs:
         uses: docker/build-push-action@v5
         with:
           context: .
-          platforms: ${{ matrix.platforms }}
           build-args: |
             BASE_TAG=${{ github.ref_type == 'tag' && 'main' || github.ref_name }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
             CUDA=${{ matrix.cuda }}
             PYTORCH_VERSION=${{ matrix.pytorch }}
             AXOLOTL_ARGS=${{ matrix.axolotl_args }}
             AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}
+            GIT_REF=${{ github.ref }}
+            GIT_SHA=${{ github.sha }}
           file: ./docker/Dockerfile
           push: ${{ github.event_name != 'pull_request' }}
           tags: |

@@ -101,134 +78,40 @@ jobs:
             ${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }}
           labels: ${{ steps.metadata.outputs.labels }}

-  build-axolotl-uv:
-    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.9.1
-            axolotl_extras:
-            platforms: "linux/amd64,linux/arm64"
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.12"
-            pytorch: 2.9.1
-            axolotl_extras:
-            platforms: "linux/amd64,linux/arm64"
-            is_latest: true
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.12"
-            pytorch: 2.10.0
-            axolotl_extras:
-            platforms: "linux/amd64,linux/arm64"
-          - cuda: 130
-            cuda_version: 13.0.0
-            python_version: "3.11"
-            pytorch: 2.9.1
-            axolotl_extras:
-            platforms: "linux/amd64,linux/arm64"
-          - cuda: 130
-            cuda_version: 13.0.0
-            python_version: "3.12"
-            pytorch: 2.10.0
-            axolotl_extras:
-            platforms: "linux/amd64,linux/arm64"
-    runs-on: axolotl-gpu-runner
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      - name: Docker metadata
-        id: metadata
-        uses: docker/metadata-action@v5
-        with:
-          images: |
-            axolotlai/axolotl-uv
-          tags: |
-            type=ref,event=branch
-            type=pep440,pattern={{version}}
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-      - name: Login to Docker Hub
-        uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_TOKEN }}
-      # guidance for testing before pushing: https://docs.docker.com/build/ci/github-actions/test-before-push/
-      - name: Build and export to Docker
-        uses: docker/build-push-action@v5
-        with:
-          context: .
-          platforms: ${{ matrix.platforms }}
-          build-args: |
-            BASE_TAG=${{ github.ref_type == 'tag' && 'main' || github.ref_name }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
-            CUDA=${{ matrix.cuda }}
-            PYTORCH_VERSION=${{ matrix.pytorch }}
-            AXOLOTL_ARGS=${{ matrix.axolotl_args }}
-            AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}
-          file: ./docker/Dockerfile-uv
-          push: ${{ github.event_name != 'pull_request' }}
-          tags: |
-            ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
-            ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
-            ${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }}
-          labels: ${{ steps.metadata.outputs.labels }}
-
   build-axolotl-cloud:
     needs: build-axolotl
     if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
     # this job needs to be run on self-hosted GPU runners...
     strategy:
-      fail-fast: false
       matrix:
         include:
+          - cuda: 126
+            cuda_version: 12.6.3
+            python_version: "3.11"
+            pytorch: 2.6.0
+            axolotl_extras:
+          - cuda: 126
+            cuda_version: 12.6.3
+            python_version: "3.11"
+            pytorch: 2.7.1
+            axolotl_extras:
+            is_latest:
+          - cuda: 126
+            cuda_version: 12.6.3
+            python_version: "3.11"
+            pytorch: 2.7.1
+            axolotl_extras: vllm
+            is_latest: true
+          - cuda: 128
+            cuda_version: 12.8.1
+            python_version: "3.11"
+            pytorch: 2.7.1
+            axolotl_extras:
           - cuda: 128
             cuda_version: 12.8.1
             python_version: "3.11"
             pytorch: 2.8.0
             axolotl_extras:
-            platforms: "linux/amd64"
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.9.0
-            axolotl_extras:
-            platforms: "linux/amd64,linux/arm64"
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.9.1
-            axolotl_extras:
-            is_latest: true
-            platforms: "linux/amd64,linux/arm64"
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.12"
-            pytorch: 2.10.0
-            axolotl_extras:
-            platforms: "linux/amd64,linux/arm64"
-          # - cuda: 129
-          #   cuda_version: 12.9.1
-          #   python_version: "3.12"
-          #   pytorch: 2.9.1
-          #   axolotl_extras:
-          #   platforms: "linux/amd64,linux/arm64"
-          - cuda: 130
-            cuda_version: 13.0.0
-            python_version: "3.11"
-            pytorch: 2.9.1
-            axolotl_extras:
-            platforms: "linux/amd64,linux/arm64"
-          - cuda: 130
-            cuda_version: 13.0.0
-            python_version: "3.12"
-            pytorch: 2.10.0
-            axolotl_extras:
-            platforms: "linux/amd64,linux/arm64"
     runs-on: axolotl-gpu-runner
     steps:
       - name: Checkout

@@ -238,6 +121,7 @@ jobs:
         uses: docker/metadata-action@v5
         with:
           images: |
+            winglian/axolotl-cloud
             axolotlai/axolotl-cloud
           tags: |
             type=ref,event=branch

@@ -253,10 +137,11 @@ jobs:
         uses: docker/build-push-action@v5
         with:
           context: .
-          platforms: ${{ matrix.platforms }}
           build-args: |
             BASE_TAG=${{ github.ref_type == 'tag' && 'main' || github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
             CUDA=${{ matrix.cuda }}
+            GIT_REF=${{ github.ref }}
+            GIT_SHA=${{ github.sha }}
           file: ./docker/Dockerfile-cloud
           push: ${{ github.event_name != 'pull_request' }}
           tags: |

@@ -264,98 +149,34 @@ jobs:
             ${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }}
           labels: ${{ steps.metadata.outputs.labels }}

-  build-axolotl-cloud-uv:
-    needs: build-axolotl-uv
-    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
-    # this job needs to be run on self-hosted GPU runners...
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.9.1
-            axolotl_extras:
-            platforms: "linux/amd64,linux/arm64"
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.12"
-            pytorch: 2.9.1
-            axolotl_extras:
-            is_latest: true
-            platforms: "linux/amd64,linux/arm64"
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.12"
-            pytorch: 2.10.0
-            axolotl_extras:
-            platforms: "linux/amd64,linux/arm64"
-          - cuda: 130
-            cuda_version: 13.0.0
-            python_version: "3.11"
-            pytorch: 2.9.1
-            axolotl_extras:
-            platforms: "linux/amd64,linux/arm64"
-          - cuda: 130
-            cuda_version: 13.0.0
-            python_version: "3.12"
-            pytorch: 2.10.0
-            axolotl_extras:
-            platforms: "linux/amd64,linux/arm64"
-    runs-on: axolotl-gpu-runner
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      - name: Docker metadata
-        id: metadata
-        uses: docker/metadata-action@v5
-        with:
-          images: |
-            axolotlai/axolotl-cloud-uv
-          tags: |
-            type=ref,event=branch
-            type=pep440,pattern={{version}}
-      - name: Login to Docker Hub
-        uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_TOKEN }}
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-      - name: Build
-        uses: docker/build-push-action@v5
-        with:
-          context: .
-          platforms: ${{ matrix.platforms }}
-          build-args: |
-            BASE_TAG=${{ github.ref_type == 'tag' && 'main' || github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
-            CUDA=${{ matrix.cuda }}
-          file: ./docker/Dockerfile-cloud-uv
-          push: ${{ github.event_name != 'pull_request' }}
-          tags: |
-            ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
-            ${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }}
-          labels: ${{ steps.metadata.outputs.labels }}
-
   build-axolotl-cloud-no-tmux:
     needs: build-axolotl
     if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
     # this job needs to be run on self-hosted GPU runners...
     strategy:
-      fail-fast: false
       matrix:
         include:
+          - cuda: 126
+            cuda_version: 12.6.3
+            python_version: "3.11"
+            pytorch: 2.6.0
+            axolotl_extras:
+          - cuda: 126
+            cuda_version: 12.6.3
+            python_version: "3.11"
+            pytorch: 2.7.1
+            axolotl_extras:
+            is_latest:
+          - cuda: 126
+            cuda_version: 12.6.3
+            python_version: "3.11"
+            pytorch: 2.7.1
+            axolotl_extras: vllm
+            is_latest: true
           - cuda: 128
             cuda_version: 12.8.1
             python_version: "3.11"
-            pytorch: 2.9.1
-            axolotl_extras:
-            is_latest: true
-          - cuda: 130
-            cuda_version: 13.0.0
-            python_version: "3.11"
-            pytorch: 2.9.1
+            pytorch: 2.8.0
             axolotl_extras:
             is_latest:
     runs-on: axolotl-gpu-runner

@@ -367,6 +188,7 @@ jobs:
         uses: docker/metadata-action@v5
         with:
           images: |
+            winglian/axolotl-cloud-term
             axolotlai/axolotl-cloud-term
           tags: |
             type=ref,event=branch

@@ -382,10 +204,11 @@ jobs:
         uses: docker/build-push-action@v5
         with:
           context: .
-          platforms: linux/amd64,linux/arm64
           build-args: |
             BASE_TAG=${{ github.ref_type == 'tag' && 'main' || github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
             CUDA=${{ matrix.cuda }}
+            GIT_REF=${{ github.ref }}
+            GIT_SHA=${{ github.sha }}
           file: ./docker/Dockerfile-cloud-no-tmux
           push: ${{ github.event_name != 'pull_request' }}
           tags: |
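The build-args in these jobs chain the image stages together: BASE_TAG points the application Dockerfile at a previously built base image, and the remaining args select the CUDA and PyTorch combination. A rough local equivalent for one matrix entry might look like the sketch below; the tag value and local image name are illustrative assumptions, not output copied from CI:

```bash
# Sketch of a local build using the same build-args the workflow passes
# (values mirror the cu128 / PyTorch 2.8.0 matrix entry; adjust as needed).
docker build \
  -f docker/Dockerfile \
  --build-arg BASE_TAG=main-base-py3.11-cu128-2.8.0 \
  --build-arg CUDA=128 \
  --build-arg PYTORCH_VERSION=2.8.0 \
  --build-arg AXOLOTL_EXTRAS= \
  -t axolotl:local .
```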
.github/workflows/multi-gpu-e2e.yml (58 changes)

@@ -4,11 +4,8 @@ on:
   pull_request:
     paths:
       - 'tests/e2e/multigpu/**.py'
-      - 'requirements.txt'
-      - 'setup.py'
       - 'pyproject.toml'
       - '.github/workflows/multi-gpu-e2e.yml'
-      - 'scripts/cutcrossentropy_install.py'
       - 'src/axolotl/core/trainers/mixins/sequence_parallel.py'
       - 'src/axolotl/utils/distributed.py'
   workflow_dispatch:

@@ -20,12 +17,6 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
   cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

-permissions:
-  contents: read
-
-env:
-  MODAL_IMAGE_BUILDER_VERSION: "2025.06"
-
 jobs:
   test-axolotl-multigpu:
     if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' && (github.event_name != 'pull_request' || !github.event.pull_request.draft) }}

@@ -33,33 +24,27 @@ jobs:
       fail-fast: false
       matrix:
         include:
+          - cuda: 126
+            cuda_version: 12.6.3
+            python_version: "3.11"
+            pytorch: 2.6.0
+            axolotl_extras:
+            num_gpus: 2
+            nightly_build: "true"
+          - cuda: 126
+            cuda_version: 12.6.3
+            python_version: "3.11"
+            pytorch: 2.7.1
+            axolotl_extras: vllm
+            num_gpus: 2
+            nightly_build: "true"
           - cuda: 128
             cuda_version: 12.8.1
             python_version: "3.11"
             pytorch: 2.8.0
             axolotl_extras: fbgemm-gpu
             num_gpus: 2
-          # - cuda: 129
-          #   cuda_version: 12.9.1
-          #   python_version: "3.12"
-          #   pytorch: 2.9.1
-          #   axolotl_extras: "fbgemm-gpu"
-          #   num_gpus: 2
-          #   dockerfile: "Dockerfile-uv.jinja"
-          - cuda: 130
-            cuda_version: 13.0.0
-            python_version: "3.11"
-            pytorch: 2.9.1
-            axolotl_extras:
-            # axolotl_extras: fbgemm-gpu
-            num_gpus: 2
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.10.0
-            axolotl_extras: "fbgemm-gpu"
-            num_gpus: 2
-            dockerfile: "Dockerfile-uv.jinja"
+            nightly_build: "true"
     runs-on: [self-hosted, modal]
     timeout-minutes: 120
     steps:

@@ -69,21 +54,24 @@ jobs:
         uses: actions/setup-python@v5
         with:
           python-version: "3.11"
+      - name: Install uv
+        uses: astral-sh/setup-uv@v4
+        with:
+          version: "latest"
       - name: Install Modal
         run: |
           python -m pip install --upgrade pip
-          pip install modal==1.3.0.post1 jinja2
+          pip install modal==1.0.2 jinja2 protobuf
       - name: Update env vars
         run: |
-          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
+          echo "BASE_TAG=${{ github.ref_name }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
           echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
           echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
           echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
           echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
           echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
-          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
+          echo "NIGHTLY_BUILD=${{ matrix.nightly_build }}" >> $GITHUB_ENV
+          echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
       - name: Run tests job on Modal
-        env:
-          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
         run: |
           modal run -m cicd.multigpu
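The "Update env vars" step above simply stages configuration for the Modal entrypoint via $GITHUB_ENV. If you wanted to reproduce the test launch outside of Actions, a rough sketch would be the following, assuming Modal credentials are already configured and the checkout contains the cicd.multigpu module; the exported values are illustrative, mirroring one matrix entry:

```bash
# Rough local equivalent of the workflow's env-var staging and Modal launch (illustrative).
export BASE_TAG=main-base-py3.11-cu128-2.8.0   # assumed tag, built from the workflow's template
export PYTORCH_VERSION=2.8.0
export AXOLOTL_EXTRAS=fbgemm-gpu
export CUDA=128
export N_GPUS=2

modal run -m cicd.multigpu
```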
.github/workflows/nightlies.yml (33 changes)

@@ -5,9 +5,6 @@ on:
   schedule:
     - cron: '0 0 * * *' # Runs at 00:00 UTC every day

-permissions:
-  contents: read
-
 jobs:
   build-axolotl:
     if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}

@@ -15,15 +12,15 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - cuda: 128
-            cuda_version: 12.8.1
+          - cuda: 126
+            cuda_version: 12.6.3
             python_version: "3.11"
-            pytorch: 2.8.0
+            pytorch: 2.6.0
             axolotl_extras:
-          - cuda: 128
-            cuda_version: 12.8.1
+          - cuda: 126
+            cuda_version: 12.6.3
             python_version: "3.11"
-            pytorch: 2.9.1
+            pytorch: 2.7.1
             axolotl_extras:
     runs-on: axolotl-gpu-runner
     steps:

@@ -34,6 +31,7 @@ jobs:
         uses: docker/metadata-action@v5
         with:
           images: |
+            winglian/axolotl
             axolotlai/axolotl
           tags: |
             type=raw,value={{ branch }}-{{ date 'YYYYMMDD' }}

@@ -54,6 +52,8 @@ jobs:
             CUDA=${{ matrix.cuda }}
             PYTORCH_VERSION=${{ matrix.pytorch }}
             AXOLOTL_ARGS=${{ matrix.axolotl_args }}
+            GIT_REF=${{ github.ref }}
+            GIT_SHA=${{ github.sha }}
           file: ./docker/Dockerfile
           push: ${{ github.event_name != 'pull_request' }}
           tags: |

@@ -67,15 +67,15 @@ jobs:
     strategy:
       matrix:
         include:
-          - cuda: 128
-            cuda_version: 12.8.1
+          - cuda: 126
+            cuda_version: 12.6.3
             python_version: "3.11"
-            pytorch: 2.8.0
+            pytorch: 2.6.0
             axolotl_extras:
-          - cuda: 128
-            cuda_version: 12.8.1
+          - cuda: 126
+            cuda_version: 12.6.3
             python_version: "3.11"
-            pytorch: 2.9.1
+            pytorch: 2.7.1
             axolotl_extras:
     runs-on: axolotl-gpu-runner
     steps:

@@ -86,6 +86,7 @@ jobs:
         uses: docker/metadata-action@v5
         with:
           images: |
+            winglian/axolotl-cloud
             axolotlai/axolotl-cloud
           tags: |
             type=raw,value={{ branch }}-{{ date 'YYYYMMDD' }}

@@ -103,6 +104,8 @@ jobs:
           build-args: |
             BASE_TAG=${{ github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
             CUDA=${{ matrix.cuda }}
+            GIT_REF=${{ github.ref }}
+            GIT_SHA=${{ github.sha }}
           file: ./docker/Dockerfile-cloud
           push: ${{ github.event_name != 'pull_request' }}
           tags: |
.github/workflows/precommit-autoupdate.yml (11 changes)

@@ -2,11 +2,9 @@ name: Pre-commit auto-update

 on:
   schedule:
-    - cron: '0 0 1 * *' # Run monthly
+    - cron: '0 0 * * 0' # Run weekly
   workflow_dispatch: # Manual kickoff

-permissions: {}
-
 jobs:
   auto-update:
     runs-on: ubuntu-latest

@@ -20,10 +18,15 @@ jobs:
         with:
           python-version: '3.11'

+      - name: Install uv
+        uses: astral-sh/setup-uv@v4
+        with:
+          version: "latest"
+
       - name: Update pre-commit hooks
         id: update
         run: |
-          pip install pre-commit
+          uv pip install --system pre-commit
           pre-commit autoupdate
           if [[ -n $(git status --porcelain) ]]; then
             echo "changes=true" >> $GITHUB_OUTPUT
.github/workflows/preview-docs.yml (22 changes)

@@ -11,21 +11,22 @@ on:
       - '_quarto.yml'
       - docs/scripts/generate_config_docs.py
       - src/axolotl/utils/schemas/**.py
-      - .github/workflows/preview-docs.yml

 permissions:
-  contents: read
+  checks: write
+  contents: write
+  deployments: write
+  issues: write
+  discussions: write
+  pages: write
   pull-requests: write
+  statuses: write

 jobs:
   preview:
     runs-on: ubuntu-latest
     if: ${{ !github.event.pull_request.draft }}
     steps:
-      - name: cleanup node
-        run: |
-          sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /opt/hostedtoolcache/CodeQL
-
       - name: Check out repository
         uses: actions/checkout@v4
         with:

@@ -39,10 +40,15 @@ jobs:
         with:
           python-version: '3.11'

+      - name: Install uv
+        uses: astral-sh/setup-uv@v4
+        with:
+          version: "latest"
+
       - name: Install dependencies
         run: |
-          python3 -m pip install jupyter quartodoc
-          python3 -m pip install -e .
+          uv pip install --system jupyter quartodoc
+          uv pip install --system -e .

       - name: Build autodoc
         run: quartodoc build
.github/workflows/pypi.yml (26 changed lines)

@@ -3,11 +3,9 @@ name: publish pypi
  on:
    push:
      tags:
-       - "v*"
+       - 'v*'
    workflow_dispatch:

- permissions: {}

  jobs:
    setup_release:
      name: Create Release

@@ -30,7 +28,6 @@ jobs:
      name: pypi
      url: https://pypi.org/p/axolotl
      permissions:
-       contents: read
        id-token: write  # IMPORTANT: this permission is mandatory for trusted publishing
      steps:
        - name: Check out repository code

@@ -41,23 +38,24 @@ jobs:
        with:
          python-version: "3.11"

+       - name: Install uv
+         uses: astral-sh/setup-uv@v4
+         with:
+           version: "latest"
+
        - name: Install dependencies
          run: |
-           pip3 install wheel packaging==26.0
-           pip3 install --no-build-isolation -e .
-           pip3 install -r requirements-dev.txt -r requirements-tests.txt
+           uv pip install --system wheel packaging==23.2
+           uv pip install --system --no-build-isolation -e ".[dev]"

        - name: Extract tag name
          id: tag
-         run: echo "TAG_NAME=$(echo $GITHUB_REF | cut -d / -f 3)" >> "$GITHUB_OUTPUT"
+         run: echo "TAG_NAME=$(echo "$GITHUB_REF" | cut -d / -f 3)" >> "$GITHUB_OUTPUT"

-       - name: Update version in VERSION file
+       - name: Build package
          run: |
-           echo "${{ steps.tag.outputs.TAG_NAME }}" | sed 's/^v//' > VERSION
-       - name: Build a source dist
-         run: |
-           python setup.py sdist
+           uv pip install --system build
+           python -m build

        - name: Publish package distributions to PyPI
          uses: pypa/gh-action-pypi-publish@release/v1
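To sanity-check the new packaging flow before tagging a release, a rough local equivalent of the build step (a sketch; artifact names will vary) is:

```bash
# Sketch: local dry run of the build step used in the updated pypi workflow.
uv pip install --system wheel build packaging==23.2
python -m build        # writes sdist and wheel into dist/
ls dist/               # inspect what would be uploaded to PyPI
```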
.github/workflows/tests-nightly.yml (110 changed lines)

@@ -3,13 +3,6 @@ on:
  workflow_dispatch:
  schedule:
    - cron: '0 0 * * *'  # Runs at 00:00 UTC every day
- pull_request:
-   types: [opened, synchronize, reopened, ready_for_review]
-   paths:
-     - '.github/workflows/tests-nightly.yml'
-
- permissions:
-   contents: read

  jobs:
    pre-commit:

@@ -20,31 +13,19 @@ jobs:
      - uses: actions/setup-python@v5
        with:
          python-version: "3.11"
-         cache: 'pip'  # caching pip dependencies
      - uses: pre-commit/action@v3.0.1
        env:
          SKIP: no-commit-to-branch

-   prime-cdn-s3-cache:
-     name: Prefetch S3 once to prime the CDN cache
-     runs-on: ubuntu-latest
-     if: ${{ !github.event.pull_request.draft }}
-     timeout-minutes: 10
-     steps:
-       - name: Restore Cache from S3
-         id: hf-cache-restore-s3
-         run: |
-           curl -v -H "Range: bytes=0-1023" -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst > /dev/null

    pytest:
      name: PyTest
      runs-on: ubuntu-latest
-     needs: [prime-cdn-s3-cache]
      strategy:
        fail-fast: false
+       max-parallel: 2
        matrix:
-         python_version: ["3.12"]  # TODO include py3.14 once https://github.com/mistralai/mistral-common/pull/194 is merged
-         pytorch_version: ["2.8.0", "2.9.1", "2.10.0"]
+         python_version: ["3.11"]
+         pytorch_version: ["2.6.0", "2.7.0"]
      timeout-minutes: 20

      steps:

@@ -55,38 +36,36 @@ jobs:
        id: hf-cache-restore-s3
        run: |
          mkdir -p /home/runner/.cache/huggingface/hub
-         curl -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst | tar -xf - -C /home/runner/.cache/huggingface/hub/ --use-compress-program unzstd
+         curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xf - -C /home/runner/.cache/huggingface/hub/ --use-compress-program unzstd

      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python_version }}
-         cache: 'pip'  # caching pip dependencies

-     - name: upgrade pip
-       run: |
-         pip3 install --upgrade pip
-         pip3 install --upgrade packaging==26.0 setuptools==78.1.1 wheel
+     - name: Install uv
+       uses: astral-sh/setup-uv@v4
+       with:
+         version: "latest"

      - name: Install PyTorch
        run: |
-         pip3 install torch==${{ matrix.pytorch_version }} torchvision
+         uv pip install --system torch==${{ matrix.pytorch_version }} torchvision

-     - name: Update requirements.txt
+     - name: Update pyproject.toml for nightly builds
        run: |
-         sed -i 's#^transformers.*#transformers @ git+https://github.com/huggingface/transformers.git@main#' requirements.txt
-         sed -i 's#^peft.*#peft @ git+https://github.com/huggingface/peft.git@main#' requirements.txt
-         sed -i 's#^accelerate.*#accelerate @ git+https://github.com/huggingface/accelerate.git@main#' requirements.txt
-         sed -i 's#^trl.*#trl @ git+https://github.com/huggingface/trl.git@main#' requirements.txt
-         sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt
+         sed -i 's#"transformers==.*"#"transformers @ git+https://github.com/huggingface/transformers.git@main"#' pyproject.toml
+         sed -i 's#"peft==.*"#"peft @ git+https://github.com/huggingface/peft.git@main"#' pyproject.toml
+         sed -i 's#"accelerate==.*"#"accelerate @ git+https://github.com/huggingface/accelerate.git@main"#' pyproject.toml
+         sed -i 's#"trl==.*"#"trl @ git+https://github.com/huggingface/trl.git@main"#' pyproject.toml
+         sed -i 's#"datasets==.*"#"datasets @ git+https://github.com/huggingface/datasets.git@main"#' pyproject.toml

      - name: Install dependencies
        run: |
-         pip3 show torch
-         pip3 install --no-build-isolation -U -e .
+         uv pip show --system torch
+         uv pip install --system --no-build-isolation -e ".[dev]"
          python scripts/unsloth_install.py | sh
          python scripts/cutcrossentropy_install.py | sh
-         pip3 install -r requirements-dev.txt -r requirements-tests.txt

      - name: Make sure PyTorch version wasn't clobbered
        run: |

@@ -102,9 +81,6 @@ jobs:
          pytest -v --durations=10 tests/patched/
          pytest -v --durations=10 tests/cli/

-     - name: cleanup pip cache
-       run: |
-         find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;

    docker-e2e-tests:
      if: github.repository_owner == 'axolotl-ai-cloud'

@@ -117,26 +93,19 @@ jobs:
      fail-fast: false
      matrix:
        include:
-         - cuda: 128
-           cuda_version: 12.8.1
+         - cuda: 126
+           cuda_version: 12.6.3
            python_version: "3.11"
-           pytorch: 2.9.1
+           pytorch: 2.6.0
            num_gpus: 1
            axolotl_extras:
            nightly_build: "true"
-         - cuda: 128
-           cuda_version: 12.8.1
+         - cuda: 126
+           cuda_version: 12.6.3
            python_version: "3.11"
-           pytorch: 2.10.0
+           pytorch: 2.7.1
            num_gpus: 1
            axolotl_extras:
-         - cuda: 130
-           cuda_version: 13.0.0
-           python_version: "3.12"
-           pytorch: 2.9.1
-           num_gpus: 1
-           axolotl_extras:
-           dockerfile: "Dockerfile-uv.jinja"
            nightly_build: "true"
      steps:
        - name: Checkout

@@ -145,25 +114,26 @@ jobs:
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
+     - name: Install uv
+       uses: astral-sh/setup-uv@v4
+       with:
+         version: "latest"
      - name: Install Modal
        run: |
-         python -m pip install --upgrade pip
-         pip install modal==1.3.0.post1 jinja2
+         uv pip install --system modal==1.0.2 jinja2
      - name: Update env vars
        run: |
-         echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
+         echo "BASE_TAG=main-base-uv-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
          echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
          echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
          echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
-         echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
          echo "NIGHTLY_BUILD=${{ matrix.nightly_build }}" >> $GITHUB_ENV
+         echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
      - name: Run tests job on Modal
-       env:
-         CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
        run: |
-         modal run cicd.e2e_tests
+         modal run -m cicd.e2e_tests

    docker-e2e-multigpu-tests:
      if: github.repository_owner == 'axolotl-ai-cloud'
      # this job needs to be run on self-hosted GPU runners...

@@ -175,10 +145,10 @@ jobs:
      fail-fast: false
      matrix:
        include:
-         - cuda: 128
-           cuda_version: 12.8.1
+         - cuda: 126
+           cuda_version: 12.6.3
            python_version: "3.11"
-           pytorch: 2.9.1
+           pytorch: 2.7.1
            num_gpus: 2
            axolotl_extras:
            nightly_build: "true"

@@ -189,21 +159,23 @@ jobs:
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
+     - name: Install uv
+       uses: astral-sh/setup-uv@v4
+       with:
+         version: "latest"
      - name: Install Modal
        run: |
-         python -m pip install --upgrade pip
-         pip install modal==1.3.0.post1 jinja2
+         uv pip install --system modal==1.0.2 jinja2
      - name: Update env vars
        run: |
-         echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
+         echo "BASE_TAG=main-base-uv-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
          echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
          echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
          echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
          echo "NIGHTLY_BUILD=${{ matrix.nightly_build }}" >> $GITHUB_ENV
+         echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
      - name: Run tests job on Modal
-       env:
-         CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
        run: |
          modal run cicd.multigpu
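The nightly job's sed rewrites simply repoint the pinned Hugging Face libraries at their main branches before installing. A sketch of previewing one of those rewrites locally (it edits pyproject.toml in place, so use a scratch checkout):

```bash
# Sketch: apply the same nightly rewrite for transformers, then confirm the result.
sed -i 's#"transformers==.*"#"transformers @ git+https://github.com/huggingface/transformers.git@main"#' pyproject.toml
grep -n "transformers" pyproject.toml
```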
.github/workflows/tests.yml (233 changed lines)

@@ -7,18 +7,16 @@ on:
      - "main"
    paths:
      - '**.py'
-     - 'requirements.txt'
+     - 'pyproject.toml'
      - '.github/workflows/*.yml'
-     - 'requirements-tests.txt'
      - 'cicd/cicd.sh'
      - 'cicd/Dockerfile.jinja'
  pull_request:
    types: [opened, synchronize, reopened, ready_for_review]
    paths:
      - '**.py'
-     - 'requirements.txt'
+     - 'pyproject.toml'
      - '.github/workflows/*.yml'
-     - 'requirements-tests.txt'
      - 'cicd/cicd.sh'
      - 'cicd/Dockerfile.jinja'
  workflow_dispatch:

@@ -28,9 +26,6 @@ concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

- permissions:
-   contents: read

  env:
    TRANSFORMERS_IS_CI: "yes"

@@ -44,80 +39,55 @@ jobs:
      - uses: actions/setup-python@v5
        with:
          python-version: "3.11"
-         cache: 'pip'  # caching pip dependencies
      - uses: pre-commit/action@v3.0.1
        env:
          SKIP: no-commit-to-branch

-   prime-cdn-s3-cache:
-     name: Prefetch S3 once to prime the CDN cache
-     runs-on: ubuntu-latest
-     if: ${{ !github.event.pull_request.draft }}
-     timeout-minutes: 10
-     steps:
-       - name: Restore Cache from S3
-         id: hf-cache-restore-s3
-         run: |
-           curl -v -H "Range: bytes=0-1023" -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst > /dev/null

    pytest:
      name: PyTest
      runs-on: ubuntu-latest
      if: ${{ !github.event.pull_request.draft }}
-     needs: [prime-cdn-s3-cache]
+     # needs: [preload-cache]
      strategy:
        fail-fast: false
        matrix:
-         python_version: ["3.12"]  # TODO include py3.14 once https://github.com/mistralai/mistral-common/pull/194 is merged
-         pytorch_version: ["2.8.0", "2.9.1", "2.10.0"]
-       # exclude:
-       #   - python_version: "3.14"
-       #     pytorch_version: "2.8.0"
-       #   - python_version: "3.14"
-       #     pytorch_version: "2.9.1"
+         python_version: ["3.11"]
+         pytorch_version: ["2.6.0", "2.7.1", "2.8.0"]
      timeout-minutes: 20

      steps:
-       - name: cleanup node
-         run: |
-           sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /opt/hostedtoolcache/CodeQL
        - name: Check out repository code
          uses: actions/checkout@v4

        - name: Restore Cache from S3
          id: hf-cache-restore-s3
          run: |
-           mkdir -p ~/.cache/huggingface/hub
-           curl -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst | tar -xpf - -C ~/.cache/huggingface/hub/ --use-compress-program unzstd --strip-components=1
-           ls -ltr ~/.cache/huggingface/hub/
+           mkdir -p /home/runner/.cache/huggingface/hub
+           curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xf - -C /home/runner/.cache/huggingface/hub/ --use-compress-program unzstd

        - name: Setup Python
          uses: actions/setup-python@v5
          with:
            python-version: ${{ matrix.python_version }}
-           cache: 'pip'  # caching pip dependencies

-       - name: upgrade pip
-         run: |
-           pip3 install --upgrade pip
-           pip3 install --upgrade packaging==26.0 setuptools==75.8.0 wheel
+       - name: Install uv
+         uses: astral-sh/setup-uv@v4
+         with:
+           version: "latest"

        - name: Install PyTorch
          run: |
-           pip3 install --no-cache-dir torch==${{ matrix.pytorch_version }} torchvision
+           uv pip install --system torch==${{ matrix.pytorch_version }} torchvision

        - name: Install dependencies
          run: |
-           pip3 show torch
-           pip3 install --no-cache-dir --no-build-isolation -U -e .
-           python scripts/unsloth_install.py | sh
-           python scripts/cutcrossentropy_install.py | sh
-           pip3 install -r requirements-dev.txt -r requirements-tests.txt
+           uv pip show --system torch
+           uv pip install --system wheel
+           printf "torch==${{ matrix.pytorch_version }}\n" > torch-constraints.txt
+           uv pip install --system --no-cache-dir --no-build-isolation -e ".[dev]" --constraints torch-constraints.txt
+           set -o pipefail
+           python scripts/unsloth_install.py | bash
+           python scripts/cutcrossentropy_install.py | bash

-       - name: cleanup pip cache
-         run: |
-           find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;

        - name: Make sure PyTorch version wasn't clobbered
          run: |

@@ -129,24 +99,14 @@ jobs:
        - name: Pre-Download dataset fixture
          run: |
-           hf download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
+           huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures

-       - name: Show HF cache
-         run: hf cache ls

        - name: Run tests
          run: |
-           df -h
-           pytest -v --durations=10 -n4 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ --ignore=tests/monkeypatch/ tests/ --cov=axolotl --cov-report=xml
-           df -h
-           pytest -v --durations=10 tests/monkeypatch/ --cov=axolotl --cov-append --cov-report=xml
-           df -h
-           pytest -v --durations=10 tests/patched/ --cov=axolotl --cov-append --cov-report=xml
-           df -h
-           pytest -v --durations=10 tests/cli/ --cov=axolotl --cov-append --cov-report=xml
+           python -m pytest -v --durations=10 -n 8 --dist loadfile --cov=axolotl --cov-report=xml --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ --ignore=tests/monkeypatch/ tests/
+           python -m pytest -v --durations=10 -n 8 --cov=axolotl --cov-append --cov-report=xml tests/monkeypatch/
+           python -m pytest -v --durations=10 -n 8 --cov=axolotl --cov-append --cov-report=xml tests/patched/
+           python -m pytest -v --durations=10 -n 8 --cov=axolotl --cov-append --cov-report=xml tests/cli/

-       - name: Show HF cache
-         run: hf cache ls

        - name: Upload coverage to Codecov
          uses: codecov/codecov-action@v5

@@ -156,65 +116,52 @@ jobs:
          flags: unittests,pytorch-${{ matrix.pytorch_version }}
          fail_ci_if_error: false

    pytest-sdist:
      name: PyTest from Source Dist
      runs-on: ubuntu-latest
      if: ${{ !github.event.pull_request.draft }}
-     needs: [prime-cdn-s3-cache]
      strategy:
        fail-fast: false
        matrix:
-         python_version: ["3.12"]  # TODO include py3.14 once https://github.com/mistralai/mistral-common/pull/194 is merged
-         pytorch_version: ["2.8.0", "2.9.1", "2.10.0"]
-       # exclude:
-       #   - python_version: "3.14"
-       #     pytorch_version: "2.8.0"
-       #   - python_version: "3.14"
-       #     pytorch_version: "2.9.1"
-     timeout-minutes: 30
+         python_version: ["3.11"]
+         pytorch_version: ["2.6.0", "2.7.1", "2.8.0"]
+     timeout-minutes: 20

      steps:
-       - name: cleanup node
-         run: |
-           sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /opt/hostedtoolcache/CodeQL
        - name: Check out repository code
          uses: actions/checkout@v4

        - name: Restore Cache from S3
          id: hf-cache-restore-s3
          run: |
-           mkdir -p ~/.cache/huggingface/hub
-           curl -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst | tar -xpf - -C ~/.cache/huggingface/hub/ --use-compress-program unzstd --strip-components=1
-           ls -ltr ~/.cache/huggingface/hub/
+           mkdir -p /home/runner/.cache/huggingface/hub
+           curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xf - -C /home/runner/.cache/huggingface/hub/ --use-compress-program unzstd

        - name: Setup Python
          uses: actions/setup-python@v5
          with:
            python-version: ${{ matrix.python_version }}
-           cache: 'pip'  # caching pip dependencies

-       - name: upgrade pip
-         run: |
-           pip3 install --upgrade pip
-           pip3 install --upgrade packaging==26.0 setuptools==75.8.0 setuptools_scm build wheel psutil
+       - name: Install uv
+         uses: astral-sh/setup-uv@v4
+         with:
+           version: "latest"

        - name: Install PyTorch
          run: |
-           pip3 install --no-cache-dir torch==${{ matrix.pytorch_version }} torchvision
+           uv pip install --system torch==${{ matrix.pytorch_version }} torchvision

        - name: Install dependencies
          run: |
-           pip3 show torch
-           python -m build --no-isolation --sdist
-           pip3 install --no-cache-dir --no-build-isolation dist/axolotl*.tar.gz
+           uv pip show --system torch
+           uv pip install --system wheel build setuptools_scm
+           python -m build --sdist
+           printf "torch==${{ matrix.pytorch_version }}\n" > torch-constraints.txt
+           tarball_path=$(echo dist/axolotl*.tar.gz)
+           uv pip install --no-cache-dir --no-build-isolation --system "${tarball_path}[dev]" --constraints torch-constraints.txt
            python scripts/unsloth_install.py | sh
            python scripts/cutcrossentropy_install.py | sh
-           pip3 install -r requirements-dev.txt -r requirements-tests.txt

-       - name: cleanup pip cache
-         run: |
-           find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;

        - name: Make sure PyTorch version wasn't clobbered
          run: |

@@ -225,19 +172,16 @@ jobs:
          axolotl --help

        - name: Show HF cache
-         run: hf cache ls
+         run: huggingface-cli scan-cache

        - name: Run tests
          run: |
-           pytest -v --durations=10 -n4 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ --ignore=tests/monkeypatch/ tests/ --cov=axolotl --cov-report=xml
-           pytest -v --durations=10 tests/monkeypatch/ --cov=axolotl --cov-append --cov-report=xml
-           pytest -v --durations=10 tests/cli/
+           python -m pytest -v --durations=10 -n 8 --dist loadfile --cov=axolotl --cov-report=xml --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ --ignore=tests/monkeypatch/ tests/
+           python -m pytest -v --durations=10 -n 8 --cov=axolotl --cov-append --cov-report=xml tests/monkeypatch/
+           python -m pytest -v --durations=10 -n 8 tests/cli/

-       - name: Show HF cache
-         run: hf cache ls

    gate-skip-e2e:
-     needs: [pre-commit]
+     needs: [pre-commit, pytest, pytest-sdist]
      runs-on: ubuntu-latest
      outputs:
        skip: ${{ steps.compute.outputs.skip }}

@@ -273,19 +217,25 @@ jobs:
      # this job needs to be run on self-hosted GPU runners...
      runs-on: [self-hosted, modal]
      timeout-minutes: 120
-     needs: [pre-commit, pytest]
+     needs: [pre-commit, pytest, pytest-sdist, gate-skip-e2e]

      strategy:
        fail-fast: false
        matrix:
          include:
-           - cuda: 130
-             cuda_version: 13.0.0
-             python_version: "3.12"
-             pytorch: 2.9.1
+           - cuda: 126
+             cuda_version: 12.6.3
+             python_version: "3.11"
+             pytorch: 2.7.1
              num_gpus: 1
              axolotl_extras:
-             dockerfile: "Dockerfile-uv.jinja"
+           - cuda: 126
+             cuda_version: 12.6.3
+             python_version: "3.11"
+             pytorch: 2.7.1
+             num_gpus: 1
+             axolotl_extras:
+             dockerfile: "Dockerfile.jinja"
      steps:
        - name: Checkout
          uses: actions/checkout@v4

@@ -293,23 +243,26 @@ jobs:
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
+     - name: Install uv
+       uses: astral-sh/setup-uv@v4
+       with:
+         version: "latest"
      - name: Install Modal
        run: |
          python -m pip install --upgrade pip
-         pip install modal==1.3.0.post1 jinja2
+         pip install modal==1.0.2 jinja2 protobuf
      - name: Update env vars
        run: |
-         echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
+         echo "BASE_TAG=${{ github.ref_name }}-base-uv-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
          echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
          echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
          echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
          echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
+         echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
      - name: Run tests job on Modal
-       env:
-         CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
        run: |
          modal run cicd.e2e_tests

@@ -329,6 +282,18 @@ jobs:
      fail-fast: false
      matrix:
        include:
+         - cuda: 126
+           cuda_version: 12.6.3
+           python_version: "3.11"
+           pytorch: 2.6.0
+           num_gpus: 1
+           axolotl_extras:
+         - cuda: 128
+           cuda_version: 12.8.1
+           python_version: "3.11"
+           pytorch: 2.7.1
+           num_gpus: 1
+           axolotl_extras:
          - cuda: 128
            cuda_version: 12.8.1
            python_version: "3.11"

@@ -336,24 +301,6 @@ jobs:
            num_gpus: 1
            gpu_type: "B200"
            axolotl_extras: fbgemm-gpu
-         - cuda: 128
-           cuda_version: 12.8.1
-           python_version: "3.11"
-           pytorch: 2.9.1
-           num_gpus: 1
-           axolotl_extras:
-         - cuda: 128
-           cuda_version: 12.8.1
-           python_version: "3.11"
-           pytorch: 2.10.0
-           num_gpus: 1
-           axolotl_extras:
-         - cuda: 130
-           cuda_version: 13.0.0
-           python_version: "3.11"
-           pytorch: 2.9.1
-           num_gpus: 1
-           axolotl_extras:
      steps:
        - name: Checkout
          uses: actions/checkout@v4

@@ -361,13 +308,17 @@ jobs:
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
+     - name: Install uv
+       uses: astral-sh/setup-uv@v4
+       with:
+         version: "latest"
      - name: Install Modal
        run: |
          python -m pip install --upgrade pip
-         pip install modal==1.3.0.post1 jinja2
+         pip install modal==1.0.2 jinja2 protobuf
      - name: Update env vars
        run: |
-         echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
+         echo "BASE_TAG=${{ github.ref_name }}-base-uv-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
          echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
          echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
          echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV

@@ -375,10 +326,9 @@ jobs:
          echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
          echo "GPU_TYPE=${{ matrix.gpu_type || 'L40S'}}" >> $GITHUB_ENV
+         echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
      - name: Run tests job on Modal
-       env:
-         CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
        run: |
          modal run cicd.e2e_tests

@@ -392,10 +342,10 @@ jobs:
      fail-fast: false
      matrix:
        include:
-         - cuda: 128
-           cuda_version: 12.8.1
+         - cuda: 126
+           cuda_version: 12.6.3
            python_version: "3.11"
-           pytorch: 2.9.1
+           pytorch: 2.7.1
            num_gpus: 1
            axolotl_extras:
      steps:

@@ -405,19 +355,24 @@ jobs:
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
+     - name: Install uv
+       uses: astral-sh/setup-uv@v4
+       with:
+         version: "latest"
      - name: Install Modal
        run: |
          python -m pip install --upgrade pip
-         pip install modal==1.3.0.post1 jinja2
+         pip install modal==1.0.2 jinja2 protobuf
      - name: Update env vars
        run: |
-         echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
+         echo "BASE_TAG=${{ github.ref_name }}-base-uv-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
          echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
          echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
          echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
          echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
+         echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
      - name: Run tests job on Modal
        run: |
          modal run cicd.cleanup
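The constraints-file pattern in the new install step is what keeps the matrix's pinned torch from being upgraded by axolotl's own dependency resolution. In isolation it looks roughly like this (a sketch, with the version hard-coded instead of taken from the matrix):

```bash
# Sketch: pin torch via a constraints file so the editable install cannot clobber it.
printf "torch==2.6.0\n" > torch-constraints.txt
uv pip install --system torch==2.6.0 torchvision
uv pip install --system --no-build-isolation -e ".[dev]" --constraints torch-constraints.txt
uv pip show --system torch    # verify the pinned version survived
```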
.gitignore (2 changed lines)

@@ -191,5 +191,5 @@ out/
  # vim
  *.swp

- # scm auto-versioning
+ # setuptools-scm generated version file
  src/axolotl/_version.py
@@ -11,13 +11,13 @@ repos:
      - id: no-commit-to-branch
        args: ['--branch', 'main']
  - repo: https://github.com/astral-sh/ruff-pre-commit
-   rev: v0.15.4
+   rev: v0.12.12
    hooks:
      - id: ruff
        args: [--fix]
      - id: ruff-format
  - repo: https://github.com/pre-commit/mirrors-mypy
-   rev: v1.19.1
+   rev: v1.17.1
    hooks:
      - id: mypy
        additional_dependencies:

@@ -26,7 +26,7 @@ repos:
          'pydantic>=2.5.3',
        ]
  - repo: https://github.com/PyCQA/bandit
-   rev: 1.9.4
+   rev: 1.8.6
    hooks:
      - id: bandit
        args: [
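When bumping these hook revisions, the hooks can be exercised locally without committing; standard pre-commit usage (not specific to this change) is:

```bash
# Sketch: run the pinned ruff/mypy/bandit hooks across the whole tree.
pre-commit install          # register the git hook once
pre-commit run --all-files  # run every configured hook against all files
```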
@@ -1,16 +1,14 @@
  FROM axolotlai/axolotl-cloud:main-py3.11-cu124-2.6.0

  COPY .runpod/requirements.txt /requirements.txt
- RUN --mount=type=cache,target=/root/.cache/pip \
-     python3 -m pip install --upgrade pip && \
-     python3 -m pip install --upgrade -r /requirements.txt
+ RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \
+     /root/.local/bin/uv pip install --system -r /requirements.txt

  # Environment settings
  ARG BASE_VOLUME="/runpod-volume"
  ENV BASE_VOLUME=$BASE_VOLUME
  ENV HF_DATASETS_CACHE="${BASE_VOLUME}/huggingface-cache/datasets"
  ENV HUGGINGFACE_HUB_CACHE="${BASE_VOLUME}/huggingface-cache/hub"
- ENV HF_HUB_CACHE="${BASE_VOLUME}/huggingface-cache/hub"
  ENV TRANSFORMERS_CACHE="${BASE_VOLUME}/huggingface-cache/hub"

  COPY .runpod/src /src
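The revised RUN line bootstraps uv inside the image instead of upgrading pip. Outside Docker, the same two commands can be run in any Debian/Ubuntu-based environment (a sketch; paths assume a root user, as in the image):

```bash
# Sketch: the uv bootstrap the Dockerfile now performs.
curl -LsSf https://astral.sh/uv/install.sh | sh                  # installs uv under ~/.local/bin
/root/.local/bin/uv pip install --system -r /requirements.txt    # install into the system interpreter, no venv
```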
@@ -123,7 +123,7 @@ datasets:
  | --------------------------------- | -------------------------- | ----------------------------------- |
  | `dataset_prepared_path`           | `"data/last_run_prepared"` | Path for prepared dataset           |
  | `push_dataset_to_hub`             | `""`                       | Push dataset to HF hub              |
- | `dataset_num_proc`                | `4`                        | Number of preprocessing processes   |
+ | `dataset_processes`               | `4`                        | Number of preprocessing processes   |
  | `dataset_keep_in_memory`          | `false`                    | Keep dataset in memory              |
  | `shuffle_merged_datasets`         | `true`                     | Shuffle merged datasets             |
  | `shuffle_before_merging_datasets` | `false`                    | Shuffle each dataset before merging |
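Taken together, the options in this table land in the training YAML. A sketch of a config fragment using them (note the option is spelled `dataset_num_proc` on one side of this diff and `dataset_processes` on the other; use whichever your installed version expects):

```bash
# Sketch: append the dataset-handling options from the table above to a config file.
cat >> config.yml <<'EOF'
dataset_prepared_path: data/last_run_prepared
dataset_num_proc: 4          # or dataset_processes, depending on version
dataset_keep_in_memory: false
shuffle_merged_datasets: true
EOF
```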
@@ -39,6 +39,7 @@
  # type: # linear | dynamic
  # factor: # float

+
  # # Whether you are training a 4-bit GPTQ quantized model
  # gptq: true
  # gptq_groupsize: 128 # group size

@@ -106,7 +107,7 @@
  # push_dataset_to_hub: # repo path
  # # The maximum number of processes to use while preprocessing your input dataset. This defaults to `os.cpu_count()`
  # # if not set.
- # dataset_num_proc: # defaults to os.cpu_count() if not set
+ # dataset_processes: # defaults to os.cpu_count() if not set
  # # push checkpoints to hub
  # hub_model_id: # repo path to push finetuned model
  # # how to push checkpoints to hub

@@ -223,6 +224,9 @@
  # eval_table_size: # Approximate number of predictions sent to wandb depending on batch size. Enabled above 0. Default is 0
  # eval_table_max_new_tokens: # Total number of tokens generated for predictions sent to wandb. Default is 128

+ # # Save model as safetensors (require safetensors package)
+ # save_safetensors:
+
  # # Whether to mask out or include the human's prompt from the training labels
  # train_on_inputs: false
  # # Group similarly sized data to minimize padding.

@@ -348,6 +352,8 @@
  # # Allow overwrite yml config using from cli
  # strict:

+
+
  base_model: ${BASE_MODEL}
  base_model_ignore_patterns: ${BASE_MODEL_IGNORE_PATTERNS}
  base_model_config: ${BASE_MODEL_CONFIG}

@@ -406,7 +412,7 @@ chat_template_jinja: ${CHAT_TEMPLATE_JINJA}
  default_system_message: ${DEFAULT_SYSTEM_MESSAGE}
  dataset_prepared_path: ${DATASET_PREPARED_PATH}
  push_dataset_to_hub: ${PUSH_DATASET_TO_HUB}
- dataset_num_proc: ${DATASET_NUM_PROC}
+ dataset_processes: ${DATASET_PROCESSES}
  dataset_keep_in_memory: ${DATASET_KEEP_IN_MEMORY}
  hub_model_id: ${HUB_MODEL_ID}
  hub_strategy: ${HUB_STRATEGY}

@@ -506,6 +512,7 @@ profiler_steps: ${PROFILER_STEPS}
  loss_watchdog_threshold: ${LOSS_WATCHDOG_THRESHOLD}
  loss_watchdog_patience: ${LOSS_WATCHDOG_PATIENCE}

+ save_safetensors: ${SAVE_SAFETENSORS}
  train_on_inputs: ${TRAIN_ON_INPUTS}
  group_by_length: ${GROUP_BY_LENGTH}
  gradient_checkpointing: ${GRADIENT_CHECKPOINTING}
@@ -1,6 +1,5 @@
- include requirements.txt
+ include pyproject.toml
  include README.md
  include LICENSE
- include src/setuptools_axolotl_dynamic_dependencies.py
  include src/axolotl/utils/chat_templates/templates/*.jinja
- recursive-include axolotl *.py
+ recursive-include src/axolotl *.py
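Since the manifest now tracks pyproject.toml and src/axolotl, a quick way to confirm the sdist actually picks those files up (a sketch; the exact archive name depends on the version being built):

```bash
# Sketch: build the sdist and list the files the manifest pulled in.
python -m build --sdist
tar -tzf dist/axolotl*.tar.gz | grep -E 'pyproject\.toml|\.jinja$' | head
```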
83
README.md
83
README.md
@@ -29,35 +29,21 @@
|
|||||||
|
|
||||||
## 🎉 Latest Updates
|
## 🎉 Latest Updates
|
||||||
|
|
||||||
- 2026/03:
|
- 2025/07:
|
||||||
- New model support has been added in Axolotl for [Mistral Small 4](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/mistral4), [Qwen3.5, Qwen3.5 MoE](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/qwen3.5), [GLM-4.7-Flash](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/glm47-flash), [GLM-4.6V](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/glm46v), and [GLM-4.5-Air](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/glm45).
|
- ND Parallelism support has been added into Axolotl. Compose Context Parallelism (CP), Tensor Parallelism (TP), and Fully Sharded Data Parallelism (FSDP) within a single node and across multiple nodes. Check out the [blog post](https://huggingface.co/blog/accelerate-nd-parallel) for more info.
|
||||||
- [MoE expert quantization](https://docs.axolotl.ai/docs/expert_quantization.html) support (via `quantize_moe_experts: true`) greatly reduces VRAM when training MoE models (FSDP2 compat).
|
- Axolotl adds more models: [GPT-OSS](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/gpt-oss), [Gemma 3n](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/gemma3n), [Liquid Foundation Model 2 (LFM2)](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/lfm2), and [Arcee Foundation Models (AFM)](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/afm).
|
||||||
- 2026/02:
|
- FP8 finetuning with fp8 gather op is now possible in Axolotl via `torchao`. Get started [here](https://docs.axolotl.ai/docs/mixed_precision.html#sec-fp8)!
|
||||||
- [ScatterMoE LoRA](https://github.com/axolotl-ai-cloud/axolotl/pull/3410) support. LoRA fine-tuning directly on MoE expert weights using custom Triton kernels.
|
- [Voxtral](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/voxtral), [Magistral 1.1](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/magistral), and [Devstral](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/devstral) with mistral-common tokenizer support has been integrated in Axolotl!
|
||||||
- Axolotl now has support for [SageAttention](https://github.com/axolotl-ai-cloud/axolotl/pull/2823) and [GDPO](https://github.com/axolotl-ai-cloud/axolotl/pull/3353) (Generalized DPO).
|
- TiledMLP support for single-GPU to multi-GPU training with DDP, DeepSpeed and FSDP support has been added to support Arctic Long Sequence Training. (ALST). See [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/alst) for using ALST with Axolotl!
|
||||||
- 2026/01:
|
- 2025/05: Quantization Aware Training (QAT) support has been added to Axolotl. Explore the [docs](https://docs.axolotl.ai/docs/qat.html) to learn more!
|
||||||
- New integration for [EAFT](https://github.com/axolotl-ai-cloud/axolotl/pull/3366) (Entropy-Aware Focal Training), weights loss by entropy of the top-k logit distribution, and [Scalable Softmax](https://github.com/axolotl-ai-cloud/axolotl/pull/3338), improves long context in attention.
|
- 2025/03: Axolotl has implemented Sequence Parallelism (SP) support. Read the [blog](https://huggingface.co/blog/axolotl-ai-co/long-context-with-sequence-parallelism-in-axolotl) and [docs](https://docs.axolotl.ai/docs/sequence_parallelism.html) to learn how to scale your context length when fine-tuning.
|
||||||
- 2025/12:
|
|
||||||
- Axolotl now includes support for [Kimi-Linear](https://docs.axolotl.ai/docs/models/kimi-linear.html), [Plano-Orchestrator](https://docs.axolotl.ai/docs/models/plano.html), [MiMo](https://docs.axolotl.ai/docs/models/mimo.html), [InternVL 3.5](https://docs.axolotl.ai/docs/models/internvl3_5.html), [Olmo3](https://docs.axolotl.ai/docs/models/olmo3.html), [Trinity](https://docs.axolotl.ai/docs/models/trinity.html), and [Ministral3](https://docs.axolotl.ai/docs/models/ministral3.html).
|
|
||||||
- [Distributed Muon Optimizer](https://github.com/axolotl-ai-cloud/axolotl/pull/3264) support has been added for FSDP2 pretraining.
|
|
||||||
- 2025/10: New model support has been added in Axolotl for: [Qwen3 Next](https://docs.axolotl.ai/docs/models/qwen3-next.html), [Qwen2.5-vl, Qwen3-vl](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/qwen2_5-vl), [Qwen3, Qwen3MoE](https://docs.axolotl.ai/docs/models/qwen3.html), [Granite 4](https://docs.axolotl.ai/docs/models/granite4.html), [HunYuan](https://docs.axolotl.ai/docs/models/hunyuan.html), [Magistral 2509](https://docs.axolotl.ai/docs/models/magistral/vision.html), [Apertus](https://docs.axolotl.ai/docs/models/apertus.html), and [Seed-OSS](https://docs.axolotl.ai/docs/models/seed-oss.html).
|
|
||||||
|
|
||||||
<details>
|
<details>
|
||||||
|
|
||||||
<summary>Expand older updates</summary>
|
<summary>Expand older updates</summary>
|
||||||
|
|
||||||
- 2025/09: Axolotl now has text diffusion training. Read more [here](https://github.com/axolotl-ai-cloud/axolotl/tree/main/src/axolotl/integrations/diffusion).
|
- 2025/06: Magistral with mistral-common tokenizer support has been added to Axolotl. See [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/magistral) to start training your own Magistral models with Axolotl!
|
||||||
- 2025/08: QAT has been updated to include NVFP4 support. See [PR](https://github.com/axolotl-ai-cloud/axolotl/pull/3107).
|
- 2025/04: Llama 4 support has been added in Axolotl. See [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/llama-4) to start training your own Llama 4 models with Axolotl's linearized version!
|
||||||
- 2025/07:
|
|
||||||
- ND Parallelism support has been added into Axolotl. Compose Context Parallelism (CP), Tensor Parallelism (TP), and Fully Sharded Data Parallelism (FSDP) within a single node and across multiple nodes. Check out the [blog post](https://huggingface.co/blog/accelerate-nd-parallel) for more info.
|
|
||||||
- Axolotl adds more models: [GPT-OSS](https://docs.axolotl.ai/docs/models/gpt-oss.html), [Gemma 3n](https://docs.axolotl.ai/docs/models/gemma3n.html), [Liquid Foundation Model 2 (LFM2)](https://docs.axolotl.ai/docs/models/LiquidAI.html), and [Arcee Foundation Models (AFM)](https://docs.axolotl.ai/docs/models/arcee.html).
|
|
||||||
- FP8 finetuning with fp8 gather op is now possible in Axolotl via `torchao`. Get started [here](https://docs.axolotl.ai/docs/mixed_precision.html#sec-fp8)!
|
|
||||||
- [Voxtral](https://docs.axolotl.ai/docs/models/voxtral.html), [Magistral 1.1](https://docs.axolotl.ai/docs/models/magistral.html), and [Devstral](https://docs.axolotl.ai/docs/models/devstral.html) with mistral-common tokenizer support has been integrated in Axolotl!
|
|
||||||
- TiledMLP support for single-GPU to multi-GPU training with DDP, DeepSpeed and FSDP support has been added to support Arctic Long Sequence Training. (ALST). See [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/alst) for using ALST with Axolotl!
|
|
||||||
- 2025/06: Magistral with mistral-common tokenizer support has been added to Axolotl. See [docs](https://docs.axolotl.ai/docs/models/magistral.html) to start training your own Magistral models with Axolotl!
|
|
||||||
- 2025/05: Quantization Aware Training (QAT) support has been added to Axolotl. Explore the [docs](https://docs.axolotl.ai/docs/qat.html) to learn more!
|
|
||||||
- 2025/04: Llama 4 support has been added in Axolotl. See [docs](https://docs.axolotl.ai/docs/models/llama-4.html) to start training your own Llama 4 models with Axolotl's linearized version!
|
|
||||||
- 2025/03: Axolotl has implemented Sequence Parallelism (SP) support. Read the [blog](https://huggingface.co/blog/axolotl-ai-co/long-context-with-sequence-parallelism-in-axolotl) and [docs](https://docs.axolotl.ai/docs/sequence_parallelism.html) to learn how to scale your context length when fine-tuning.
|
|
||||||
- 2025/03: (Beta) Fine-tuning Multimodal models is now supported in Axolotl. Check out the [docs](https://docs.axolotl.ai/docs/multimodal.html) to fine-tune your own!
|
- 2025/03: (Beta) Fine-tuning Multimodal models is now supported in Axolotl. Check out the [docs](https://docs.axolotl.ai/docs/multimodal.html) to fine-tune your own!
|
||||||
- 2025/02: Axolotl has added LoRA optimizations to reduce memory usage and improve training speed for LoRA and QLoRA in single GPU and multi-GPU training (DDP and DeepSpeed). Jump into the [docs](https://docs.axolotl.ai/docs/lora_optims.html) to give it a try.
|
- 2025/02: Axolotl has added LoRA optimizations to reduce memory usage and improve training speed for LoRA and QLoRA in single GPU and multi-GPU training (DDP and DeepSpeed). Jump into the [docs](https://docs.axolotl.ai/docs/lora_optims.html) to give it a try.
|
||||||
- 2025/02: Axolotl has added GRPO support. Dive into our [blog](https://huggingface.co/blog/axolotl-ai-co/training-llms-w-interpreter-feedback-wasm) and [GRPO example](https://github.com/axolotl-ai-cloud/grpo_code) and have some fun!
|
- 2025/02: Axolotl has added GRPO support. Dive into our [blog](https://huggingface.co/blog/axolotl-ai-co/training-llms-w-interpreter-feedback-wasm) and [GRPO example](https://github.com/axolotl-ai-cloud/grpo_code) and have some fun!
|
||||||
@@ -72,22 +58,16 @@ Axolotl is a free and open-source tool designed to streamline post-training and
|
|||||||
Features:
|
Features:
|
||||||
|
|
||||||
- **Multiple Model Support**: Train various models like GPT-OSS, LLaMA, Mistral, Mixtral, Pythia, and many more models available on the Hugging Face Hub.
|
- **Multiple Model Support**: Train various models like GPT-OSS, LLaMA, Mistral, Mixtral, Pythia, and many more models available on the Hugging Face Hub.
|
||||||
- **Multimodal Training**: Fine-tune vision-language models (VLMs) including LLaMA-Vision, Qwen2-VL, Pixtral, LLaVA, SmolVLM2, GLM-4.6V, InternVL 3.5, Gemma 3n, and audio models like Voxtral with image, video, and audio support.
|
- **Multimodal Training**: Fine-tune vision-language models (VLMs) including LLaMA-Vision, Qwen2-VL, Pixtral, LLaVA, SmolVLM2, and audio models like Voxtral with image, video, and audio support.
|
||||||
- **Training Methods**: Full fine-tuning, LoRA, QLoRA, GPTQ, QAT, Preference Tuning (DPO, IPO, KTO, ORPO), RL (GRPO, GDPO), and Reward Modelling (RM) / Process Reward Modelling (PRM).
|
- **Training Methods**: Full fine-tuning, LoRA, QLoRA, GPTQ, QAT, Preference Tuning (DPO, IPO, KTO, ORPO), RL (GRPO), and Reward Modelling (RM) / Process Reward Modelling (PRM).
|
||||||
- **Easy Configuration**: Re-use a single YAML configuration file across the full fine-tuning pipeline: dataset preprocessing, training, evaluation, quantization, and inference.
-- **Performance Optimizations**: [Multipacking](https://docs.axolotl.ai/docs/multipack.html), [Flash Attention 2/3/4](https://docs.axolotl.ai/docs/attention.html#flash-attention), [Xformers](https://docs.axolotl.ai/docs/attention.html#xformers), [Flex Attention](https://docs.axolotl.ai/docs/attention.html#flex-attention), [SageAttention](https://docs.axolotl.ai/docs/attention.html#sageattention), [Liger Kernel](https://docs.axolotl.ai/docs/custom_integrations.html#liger-kernels), [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy), [ScatterMoE](https://docs.axolotl.ai/docs/custom_integrations.html#kernels-integration), [Sequence Parallelism (SP)](https://docs.axolotl.ai/docs/sequence_parallelism.html), [LoRA optimizations](https://docs.axolotl.ai/docs/lora_optims.html), [Multi-GPU training (FSDP1, FSDP2, DeepSpeed)](https://docs.axolotl.ai/docs/multi-gpu.html), [Multi-node training (Torchrun, Ray)](https://docs.axolotl.ai/docs/multi-node.html), and many more!
+- **Performance Optimizations**: [Multipacking](https://docs.axolotl.ai/docs/multipack.html), [Flash Attention](https://github.com/Dao-AILab/flash-attention), [Xformers](https://github.com/facebookresearch/xformers), [Flex Attention](https://pytorch.org/blog/flexattention/), [Liger Kernel](https://github.com/linkedin/Liger-Kernel), [Cut Cross Entropy](https://github.com/apple/ml-cross-entropy/tree/main), [Sequence Parallelism (SP)](https://docs.axolotl.ai/docs/sequence_parallelism.html), [LoRA optimizations](https://docs.axolotl.ai/docs/lora_optims.html), [Multi-GPU training (FSDP1, FSDP2, DeepSpeed)](https://docs.axolotl.ai/docs/multi-gpu.html), [Multi-node training (Torchrun, Ray)](https://docs.axolotl.ai/docs/multi-node.html), and many more!
- **Flexible Dataset Handling**: Load from local, HuggingFace, and cloud (S3, Azure, GCP, OCI) datasets.
- **Cloud Ready**: We ship [Docker images](https://hub.docker.com/u/axolotlai) and also [PyPI packages](https://pypi.org/project/axolotl/) for use on cloud platforms and local hardware.

## 🚀 Quick Start - LLM Fine-tuning in Minutes

-**Requirements**:
-
-- NVIDIA GPU (Ampere or newer for `bf16` and Flash Attention) or AMD GPU
-- Python 3.11
-- PyTorch ≥2.8.0
+**Requirements**: NVIDIA GPU (Ampere+) or AMD GPU, Python 3.11+

### Google Colab

@@ -95,15 +75,35 @@ Features:

### Installation

-#### Using pip
+#### Project setup (uv add)

```bash
-pip3 install -U packaging==26.0 setuptools==75.8.0 wheel ninja
-pip3 install --no-build-isolation axolotl[flash-attn,deepspeed]
+# Install uv
+curl -LsSf https://astral.sh/uv/install.sh | sh
+
+# Initialize or enter your project
+uv init my-project && cd my-project
+uv add axolotl
+uv pip install flash-attn --no-build-isolation
+source .venv/bin/activate

# Download example axolotl configs, deepspeed configs
axolotl fetch examples
-axolotl fetch deepspeed_configs  # OPTIONAL
+axolotl fetch deepspeed_configs  # optional
+```
+
+#### Quick try (uv pip)
+
+```bash
+# Install uv if needed
+curl -LsSf https://astral.sh/uv/install.sh | sh
+
+uv pip install axolotl
+uv pip install flash-attn --no-build-isolation
+
+# Download example axolotl configs, deepspeed configs
+axolotl fetch examples
+axolotl fetch deepspeed_configs  # optional
```

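Once the example configs have been fetched by either path above, the usual next step is to point the CLI at one of them. A minimal sketch, assuming a hypothetical config path under the fetched `examples/` directory (substitute whichever YAML you actually downloaded):

```bash
# The config path below is an assumption; use any YAML fetched into examples/
axolotl preprocess examples/llama-3/lora-1b.yml  # optional: prepare/tokenize the dataset ahead of time
axolotl train examples/llama-3/lora-1b.yml
```

The same YAML drives preprocessing, training, evaluation, and inference, which is what the single-file configuration feature above refers to.
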
#### Using Docker

@@ -168,13 +168,6 @@ That's it! Check out our [Getting Started Guide](https://docs.axolotl.ai/docs/ge

Contributions are welcome! Please see our [Contributing Guide](https://github.com/axolotl-ai-cloud/axolotl/blob/main/.github/CONTRIBUTING.md) for details.

-## 📈 Telemetry
-
-Axolotl has opt-out telemetry that helps us understand how the project is being used
-and prioritize improvements. We collect basic system information, model types, and
-error rates—never personal data or file paths. Telemetry is enabled by default. To
-disable it, set AXOLOTL_DO_NOT_TRACK=1. For more details, see our [telemetry documentation](https://docs.axolotl.ai/docs/telemetry.html).
-
## ❤️ Sponsors

Interested in sponsoring? Contact us at [wing@axolotl.ai](mailto:wing@axolotl.ai)
49  _quarto.yml

@@ -1,8 +1,6 @@
project:
  type: website
-  pre-render:
-    - docs/scripts/generate_config_docs.py
-    - docs/scripts/generate_examples_docs.py
+  pre-render: docs/scripts/generate_config_docs.py

quartodoc:
  dir: docs/api
@@ -128,9 +126,11 @@ quartodoc:
    - monkeypatch.mistral_attn_hijack_flash
    - monkeypatch.multipack
    - monkeypatch.relora
+   - monkeypatch.llama_expand_mask
    - monkeypatch.lora_kernels
    - monkeypatch.utils
    - monkeypatch.btlm_attn_hijack_flash
+   - monkeypatch.llama_patch_multipack
    - monkeypatch.stablelm_attn_hijack_flash
    - monkeypatch.trainer_fsdp_optim
    - monkeypatch.transformers_fa_utils
@@ -240,48 +240,7 @@ website:
    - docs/getting-started.qmd
    - docs/installation.qmd
    - docs/inference.qmd
-   - section: "Model Guides"
-     contents:
-       - docs/models/kimi-linear.qmd
-       - docs/models/plano.qmd
-       - docs/models/mimo.qmd
-       - docs/models/internvl3_5.qmd
-       - docs/models/olmo3.qmd
-       - docs/models/trinity.qmd
-       - docs/models/arcee.qmd
-       - section: "Ministral3"
-         contents:
-           - docs/models/ministral3.qmd
-           - docs/models/ministral3/think.qmd
-           - docs/models/ministral3/vision.qmd
-       - section: "Magistral"
-         contents:
-           - docs/models/magistral.qmd
-           - docs/models/magistral/think.qmd
-           - docs/models/magistral/vision.qmd
-       - docs/models/ministral.qmd
-       - docs/models/mistral-small.qmd
-       - docs/models/voxtral.qmd
-       - docs/models/devstral.qmd
-       - docs/models/mistral.qmd
-       - docs/models/llama-4.qmd
-       - docs/models/llama-2.qmd
-       - docs/models/qwen3-next.qmd
-       - docs/models/qwen3.qmd
-       - docs/models/gemma3n.qmd
-       - docs/models/apertus.qmd
-       - docs/models/gpt-oss.qmd
-       - docs/models/seed-oss.qmd
-       - docs/models/phi.qmd
-       - docs/models/smolvlm2.qmd
-       - docs/models/granite4.qmd
-       - docs/models/LiquidAI.qmd
-       - docs/models/hunyuan.qmd
-       - docs/models/jamba.qmd
-       - docs/models/orpheus.qmd
    - docs/cli.qmd
-   - docs/telemetry.qmd
    - docs/config-reference.qmd
    - text: "API Reference"
      href: docs/api
@@ -318,7 +277,6 @@ website:
    - docs/multipack.qmd
    - docs/mixed_precision.qmd
    - docs/optimizers.qmd
-   - docs/attention.qmd

    - section: "Advanced Features"
      contents:
@@ -329,7 +287,6 @@ website:
    - docs/sequence_parallelism.qmd
    - docs/gradient_checkpointing.qmd
    - docs/nd_parallelism.qmd
-   - docs/expert_quantization.qmd

    - section: "Troubleshooting"
      contents:
benchmarks/bench_entropy.py (deleted)
@@ -1,208 +0,0 @@
"""Benchmark for entropy_from_logits Triton kernel vs original chunked implementation.
|
|
||||||
|
|
||||||
Usage: CUDA_VISIBLE_DEVICES=0 python benchmarks/bench_entropy.py
|
|
||||||
"""
|
|
||||||
|
|
||||||
import gc
|
|
||||||
import statistics
|
|
||||||
|
|
||||||
import torch
|
|
||||||
import torch.nn.functional as F
|
|
||||||
|
|
||||||
from axolotl.monkeypatch.trainer.utils import entropy_from_logits
|
|
||||||
|
|
||||||
V = 151936 # Qwen vocab
|
|
||||||
WARMUP = 5
|
|
||||||
BENCH_ITERS = 20
|
|
||||||
MEM_ITERS = 10
|
|
||||||
|
|
||||||
|
|
||||||
def entropy_from_logits_original(logits: torch.Tensor, chunk_size: int = 128):
|
|
||||||
"""Original chunked implementation (reference)."""
|
|
||||||
original_shape = logits.shape[:-1]
|
|
||||||
num_classes = logits.shape[-1]
|
|
||||||
flat_logits = logits.reshape(-1, num_classes)
|
|
||||||
entropies = []
|
|
||||||
for chunk in flat_logits.split(chunk_size, dim=0):
|
|
||||||
logps = F.log_softmax(chunk, dim=-1)
|
|
||||||
chunk_entropy = -(torch.exp(logps) * logps).sum(-1)
|
|
||||||
entropies.append(chunk_entropy)
|
|
||||||
return torch.cat(entropies, dim=0).reshape(original_shape)
|
|
||||||
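Both the chunked reference above and the Triton kernel it is benchmarked against compute per-token entropy directly from the logits; written out, the quantity being measured is

$$
H_t \;=\; -\sum_{v=1}^{V} p_{t,v}\,\log p_{t,v},
\qquad
p_{t,v} \;=\; \frac{e^{z_{t,v}}}{\sum_{v'=1}^{V} e^{z_{t,v'}}},
$$

where $z_t$ is the logit row for one token position and $V = 151936$ is the Qwen vocabulary size used in this benchmark. The chunked loop evaluates it via `log_softmax` followed by $-\sum e^{\ell}\,\ell$ over the vocabulary axis, `chunk_size` rows at a time, and the timings and peak-memory numbers below compare that against the fused kernel.
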
|
|
||||||
|
|
||||||
def _clean_gpu():
|
|
||||||
gc.collect()
|
|
||||||
torch.cuda.empty_cache()
|
|
||||||
torch.cuda.reset_peak_memory_stats()
|
|
||||||
torch.cuda.reset_accumulated_memory_stats()
|
|
||||||
torch.cuda.synchronize()
|
|
||||||
|
|
||||||
|
|
||||||
def profile_time(fn, logits, n_iters=BENCH_ITERS):
|
|
||||||
for _ in range(WARMUP):
|
|
||||||
out = fn(logits, chunk_size=128)
|
|
||||||
del out
|
|
||||||
torch.cuda.synchronize()
|
|
||||||
|
|
||||||
times = []
|
|
||||||
for _ in range(n_iters):
|
|
||||||
s = torch.cuda.Event(enable_timing=True)
|
|
||||||
e = torch.cuda.Event(enable_timing=True)
|
|
||||||
s.record()
|
|
||||||
out = fn(logits, chunk_size=128)
|
|
||||||
e.record()
|
|
||||||
torch.cuda.synchronize()
|
|
||||||
times.append(s.elapsed_time(e))
|
|
||||||
del out
|
|
||||||
return times
|
|
||||||
|
|
||||||
|
|
||||||
def profile_memory(fn, logits, n_iters=MEM_ITERS):
|
|
||||||
for _ in range(WARMUP):
|
|
||||||
out = fn(logits, chunk_size=128)
|
|
||||||
del out
|
|
||||||
torch.cuda.synchronize()
|
|
||||||
|
|
||||||
peaks = []
|
|
||||||
for _ in range(n_iters):
|
|
||||||
_clean_gpu()
|
|
||||||
base = torch.cuda.max_memory_allocated()
|
|
||||||
out = fn(logits, chunk_size=128)
|
|
||||||
torch.cuda.synchronize()
|
|
||||||
peaks.append(torch.cuda.max_memory_allocated() - base)
|
|
||||||
del out
|
|
||||||
return [p / 1e6 for p in peaks]
|
|
||||||
|
|
||||||
|
|
||||||
def fmt(values, unit=""):
|
|
||||||
mean = statistics.mean(values)
|
|
||||||
std = statistics.stdev(values) if len(values) > 1 else 0.0
|
|
||||||
return f"{mean:8.2f} ± {std:5.2f} {unit} [min={min(values):.2f}, max={max(values):.2f}]"
|
|
||||||
|
|
||||||
|
|
||||||
def benchmark_contiguous():
|
|
||||||
print("=" * 60)
|
|
||||||
print(
|
|
||||||
f"CONTIGUOUS BENCHMARK (warmup={WARMUP}, time={BENCH_ITERS}, mem={MEM_ITERS})"
|
|
||||||
)
|
|
||||||
print("=" * 60)
|
|
||||||
|
|
||||||
configs = [
|
|
||||||
(1, 2048),
|
|
||||||
(1, 8192),
|
|
||||||
(1, 16384),
|
|
||||||
(4, 4096),
|
|
||||||
(8, 2048),
|
|
||||||
(16, 2048),
|
|
||||||
(16, 4096),
|
|
||||||
]
|
|
||||||
|
|
||||||
for B, L in configs:
|
|
||||||
mem_gb = B * L * V * 2 / 1e9
|
|
||||||
if mem_gb > 28:
|
|
||||||
print(f"\n skip B={B}, L={L} ({mem_gb:.1f} GB)")
|
|
||||||
continue
|
|
||||||
|
|
||||||
N = B * L
|
|
||||||
print(f"\n{'─' * 60}")
|
|
||||||
print(f"B={B:2d}, L={L:5d} ({N:6d} rows, logits {mem_gb:.2f} GB)")
|
|
||||||
print(f"{'─' * 60}")
|
|
||||||
|
|
||||||
torch.manual_seed(42)
|
|
||||||
logits = torch.randn(B, L, V, device="cuda", dtype=torch.bfloat16)
|
|
||||||
|
|
||||||
t_orig = profile_time(entropy_from_logits_original, logits)
|
|
||||||
t_triton = profile_time(entropy_from_logits, logits)
|
|
||||||
orig_mean = statistics.mean(t_orig)
|
|
||||||
triton_mean = statistics.mean(t_triton)
|
|
||||||
|
|
||||||
print(" TIME (ms):")
|
|
||||||
print(f" original: {fmt(t_orig, 'ms')}")
|
|
||||||
print(f" triton: {fmt(t_triton, 'ms')}")
|
|
||||||
print(f" speedup: {orig_mean / triton_mean:.2f}x")
|
|
||||||
|
|
||||||
m_orig = profile_memory(entropy_from_logits_original, logits)
|
|
||||||
m_triton = profile_memory(entropy_from_logits, logits)
|
|
||||||
orig_peak = statistics.mean(m_orig)
|
|
||||||
triton_peak = statistics.mean(m_triton)
|
|
||||||
|
|
||||||
print(" MEMORY (peak overhead):")
|
|
||||||
print(f" original: {fmt(m_orig, 'MB')}")
|
|
||||||
print(f" triton: {fmt(m_triton, 'MB')}")
|
|
||||||
print(f" saved: {orig_peak - triton_peak:.1f} MB")
|
|
||||||
|
|
||||||
del logits
|
|
||||||
_clean_gpu()
|
|
||||||
|
|
||||||
|
|
||||||
def benchmark_noncontiguous():
|
|
||||||
print("\n" + "=" * 60)
|
|
||||||
print(
|
|
||||||
f"NON-CONTIGUOUS BENCHMARK (warmup={WARMUP}, time={BENCH_ITERS}, mem={MEM_ITERS})"
|
|
||||||
)
|
|
||||||
print("=" * 60)
|
|
||||||
|
|
||||||
configs = [
|
|
||||||
(4, 2048, "transpose"),
|
|
||||||
(4, 8192, "transpose"),
|
|
||||||
(8, 2048, "transpose"),
|
|
||||||
(4, 4096, "slice_batch"),
|
|
||||||
]
|
|
||||||
|
|
||||||
for B, L, method in configs:
|
|
||||||
torch.manual_seed(42)
|
|
||||||
|
|
||||||
if method == "transpose":
|
|
||||||
raw = torch.randn(L, B, V, device="cuda", dtype=torch.bfloat16)
|
|
||||||
logits_nc = raw.transpose(0, 1)
|
|
||||||
raw_gb = L * B * V * 2 / 1e9
|
|
||||||
elif method == "slice_batch":
|
|
||||||
raw = torch.randn(B * 2, L, V, device="cuda", dtype=torch.bfloat16)
|
|
||||||
logits_nc = raw[::2]
|
|
||||||
raw_gb = B * 2 * L * V * 2 / 1e9
|
|
||||||
else:
|
|
||||||
continue
|
|
||||||
|
|
||||||
if raw_gb > 28:
|
|
||||||
print(f"\n skip B={B}, L={L}, {method} ({raw_gb:.1f} GB)")
|
|
||||||
del raw, logits_nc
|
|
||||||
torch.cuda.empty_cache()
|
|
||||||
continue
|
|
||||||
|
|
||||||
N = B * L
|
|
||||||
print(f"\n{'─' * 60}")
|
|
||||||
print(f"B={B}, L={L} {method} ({N} rows, raw {raw_gb:.2f} GB)")
|
|
||||||
print(f"{'─' * 60}")
|
|
||||||
|
|
||||||
def original_with_copy(logits, chunk_size=128):
|
|
||||||
return entropy_from_logits_original(
|
|
||||||
logits.contiguous(), chunk_size=chunk_size
|
|
||||||
)
|
|
||||||
|
|
||||||
t_orig = profile_time(original_with_copy, logits_nc)
|
|
||||||
t_triton = profile_time(entropy_from_logits, logits_nc)
|
|
||||||
orig_mean = statistics.mean(t_orig)
|
|
||||||
triton_mean = statistics.mean(t_triton)
|
|
||||||
|
|
||||||
print(" TIME (ms):")
|
|
||||||
print(f" orig+copy: {fmt(t_orig, 'ms')}")
|
|
||||||
print(f" triton-strided:{fmt(t_triton, 'ms')}")
|
|
||||||
print(f" speedup: {orig_mean / triton_mean:.2f}x")
|
|
||||||
|
|
||||||
m_orig = profile_memory(original_with_copy, logits_nc)
|
|
||||||
m_triton = profile_memory(entropy_from_logits, logits_nc)
|
|
||||||
orig_peak = statistics.mean(m_orig)
|
|
||||||
triton_peak = statistics.mean(m_triton)
|
|
||||||
|
|
||||||
print(" MEMORY (peak overhead):")
|
|
||||||
print(f" orig+copy: {fmt(m_orig, 'MB')}")
|
|
||||||
print(f" triton-strided:{fmt(m_triton, 'MB')}")
|
|
||||||
print(f" saved: {orig_peak - triton_peak:.1f} MB")
|
|
||||||
|
|
||||||
del raw, logits_nc
|
|
||||||
_clean_gpu()
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
benchmark_contiguous()
|
|
||||||
benchmark_noncontiguous()
|
|
||||||
benchmarks/bench_scattermoe_lora.py (deleted)
@@ -1,284 +0,0 @@
"""Benchmark for ScatterMoE LoRA Triton kernels.
|
|
||||||
|
|
||||||
Measures forward, backward dX, and backward dA/dB kernels at common MoE
|
|
||||||
model shapes. Reports per-kernel timings, LoRA overhead vs base scatter2scatter,
|
|
||||||
and full fwd+bwd autograd throughput.
|
|
||||||
|
|
||||||
Usage:
|
|
||||||
CUDA_VISIBLE_DEVICES=0 python benchmarks/bench_scattermoe_lora.py
|
|
||||||
CUDA_VISIBLE_DEVICES=0 python benchmarks/bench_scattermoe_lora.py --ranks 16 64
|
|
||||||
CUDA_VISIBLE_DEVICES=0 python benchmarks/bench_scattermoe_lora.py --models Qwen/Qwen3.5-35B-A3B
|
|
||||||
"""
|
|
||||||
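For orientation, the kernels timed here implement expert-parallel LoRA in the usual low-rank-adapter form. A sketch of the per-expert computation, assuming the standard parameterization implied by `lora_A`, `lora_B`, and `scaling` in this file (the exact per-expert slicing is a kernel implementation detail):

$$
y \;=\; x\,W_e \;+\; s\,\bigl(x A_e^{\top}\bigr) B_e^{\top},
\qquad A_e \in \mathbb{R}^{r \times K},\; B_e \in \mathbb{R}^{N \times r},\; r \ll \min(K, N),
$$

where $W_e \in \mathbb{R}^{K \times N}$ is the frozen expert weight, $s$ is the scaling factor (2.0 in `_setup`), and $r$ is the adapter rank swept via `--ranks`.
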
|
|
||||||
import argparse
|
|
||||||
import gc
|
|
||||||
import time
|
|
||||||
from functools import partial
|
|
||||||
|
|
||||||
import torch
|
|
||||||
|
|
||||||
from axolotl.integrations.kernels.libs.scattermoe_lora.kernels import (
|
|
||||||
lora_ops,
|
|
||||||
ops as base_ops,
|
|
||||||
)
|
|
||||||
from axolotl.integrations.kernels.libs.scattermoe_lora.parallel_experts import (
|
|
||||||
flatten_sort_count,
|
|
||||||
)
|
|
||||||
from axolotl.integrations.kernels.libs.scattermoe_lora.parallel_linear_lora import (
|
|
||||||
ScatterMoELoRA,
|
|
||||||
)
|
|
||||||
|
|
||||||
DEVICE = "cuda"
|
|
||||||
DTYPE = torch.bfloat16
|
|
||||||
WARMUP = 5
|
|
||||||
ITERS = 20
|
|
||||||
|
|
||||||
# ─── Model configs ──────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
BUILTIN_CONFIGS = {
|
|
||||||
"Qwen3.5-35B-A3B": (256, 2048, 512, 8), # E, H, I, k
|
|
||||||
"Qwen3-30B-A3B": (128, 2048, 768, 8),
|
|
||||||
"OLMoE-1B-7B": (64, 2048, 1024, 8),
|
|
||||||
"Mixtral-8x7B": (8, 4096, 14336, 2),
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def _resolve_config(spec):
|
|
||||||
"""Resolve a model spec to (E, H, I, k). Accepts builtin names or HF IDs."""
|
|
||||||
key = spec.lower().replace("/", "-")
|
|
||||||
for name, cfg in BUILTIN_CONFIGS.items():
|
|
||||||
if key in name.lower() or name.lower() in key:
|
|
||||||
return name, cfg
|
|
||||||
|
|
||||||
from transformers import AutoConfig
|
|
||||||
|
|
||||||
hf_cfg = AutoConfig.from_pretrained(spec, trust_remote_code=True)
|
|
||||||
if callable(getattr(hf_cfg, "get_text_config", None)):
|
|
||||||
tc = hf_cfg.get_text_config()
|
|
||||||
if hasattr(tc, "model_type") and tc.model_type != hf_cfg.model_type:
|
|
||||||
hf_cfg = tc
|
|
||||||
hidden = hf_cfg.hidden_size
|
|
||||||
inter = getattr(hf_cfg, "moe_intermediate_size", None) or hf_cfg.intermediate_size
|
|
||||||
experts = (
|
|
||||||
getattr(hf_cfg, "num_experts", None)
|
|
||||||
or getattr(hf_cfg, "num_local_experts", None)
|
|
||||||
or getattr(hf_cfg, "n_routed_experts", None)
|
|
||||||
)
|
|
||||||
top_k = (
|
|
||||||
getattr(hf_cfg, "num_experts_per_tok", None)
|
|
||||||
or getattr(hf_cfg, "num_experts_per_token", None)
|
|
||||||
or 2
|
|
||||||
)
|
|
||||||
name = spec.split("/")[-1]
|
|
||||||
return name, (experts, hidden, inter, top_k)
|
|
||||||
|
|
||||||
|
|
||||||
# ─── Benchmark helpers ──────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
|
|
||||||
def _clean():
|
|
||||||
gc.collect()
|
|
||||||
torch.cuda.empty_cache()
|
|
||||||
torch.cuda.synchronize()
|
|
||||||
|
|
||||||
|
|
||||||
def _bench(fn, warmup=WARMUP, iters=ITERS):
|
|
||||||
for _ in range(warmup):
|
|
||||||
fn()
|
|
||||||
torch.cuda.synchronize()
|
|
||||||
times = []
|
|
||||||
for _ in range(iters):
|
|
||||||
torch.cuda.synchronize()
|
|
||||||
t0 = time.perf_counter()
|
|
||||||
fn()
|
|
||||||
torch.cuda.synchronize()
|
|
||||||
times.append((time.perf_counter() - t0) * 1000)
|
|
||||||
times.sort()
|
|
||||||
return times[len(times) // 2]
|
|
||||||
|
|
||||||
|
|
||||||
def _setup(num_experts, K, N, T, top_k, R):
|
|
||||||
torch.manual_seed(42)
|
|
||||||
x = torch.randn(T, K, device=DEVICE, dtype=DTYPE)
|
|
||||||
W = torch.randn(num_experts, K, N, device=DEVICE, dtype=DTYPE) * 0.02
|
|
||||||
lora_A = torch.randn(R * num_experts, K, device=DEVICE, dtype=DTYPE) * 0.01
|
|
||||||
lora_B = torch.randn(N, R * num_experts, device=DEVICE, dtype=DTYPE) * 0.01
|
|
||||||
logits = torch.randn(T, num_experts, device=DEVICE)
|
|
||||||
_, top_idx = torch.topk(torch.softmax(logits, dim=-1), top_k, dim=-1)
|
|
||||||
sei, ssi, eo = flatten_sort_count(top_idx, num_experts)
|
|
||||||
gx = base_ops.group(x, ssi, fan_out=top_k)
|
|
||||||
dy = torch.randn(gx.size(0), N, device=DEVICE, dtype=DTYPE)
|
|
||||||
return x, W, lora_A, lora_B, sei, ssi, eo, gx, dy
|
|
||||||
|
|
||||||
|
|
||||||
# ─── Kernel wrappers (avoid B023 loop-variable capture) ──────────────────────
|
|
||||||
|
|
||||||
|
|
||||||
def _call_fwd(x, W, sei, ssi, top_k, lA, lB):
|
|
||||||
return lora_ops.scatter2scatter_lora(
|
|
||||||
X=x,
|
|
||||||
W=W,
|
|
||||||
sorted_expert_idxs=sei,
|
|
||||||
sorted_scattered_idxs=ssi,
|
|
||||||
k=top_k,
|
|
||||||
lora_A=lA,
|
|
||||||
lora_B=lB,
|
|
||||||
scaling=2.0,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def _call_base(x, W, sei, ssi, top_k):
|
|
||||||
return base_ops.scatter2scatter(
|
|
||||||
X=x,
|
|
||||||
W=W,
|
|
||||||
sorted_expert_idxs=sei,
|
|
||||||
sorted_scattered_idxs=ssi,
|
|
||||||
k=top_k,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def _call_dx(dy, W, sei, ssi, lA, lB):
|
|
||||||
return lora_ops.scatter2scatter_lora_dX(
|
|
||||||
DY=dy,
|
|
||||||
W=W,
|
|
||||||
sorted_expert_idxs=sei,
|
|
||||||
sorted_scattered_idxs=ssi,
|
|
||||||
k=1,
|
|
||||||
lora_A=lA,
|
|
||||||
lora_B=lB,
|
|
||||||
scaling=2.0,
|
|
||||||
dy_grouped=True,
|
|
||||||
dx_grouped=False,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def _call_bwd(dy, gx, lA, lB, eo, num_experts):
|
|
||||||
return lora_ops.group_bwd_lora(
|
|
||||||
DY=dy,
|
|
||||||
X=gx,
|
|
||||||
lora_A=lA,
|
|
||||||
lora_B=lB,
|
|
||||||
expert_offsets=eo,
|
|
||||||
E=num_experts,
|
|
||||||
scaling=2.0,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
# ─── Main ────────────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
parser = argparse.ArgumentParser(description="ScatterMoE LoRA kernel benchmark")
|
|
||||||
parser.add_argument(
|
|
||||||
"--models",
|
|
||||||
"-m",
|
|
||||||
nargs="+",
|
|
||||||
help="Model names or HF IDs (default: all builtins)",
|
|
||||||
)
|
|
||||||
parser.add_argument("--ranks", "-r", nargs="+", type=int, default=[16, 32, 64])
|
|
||||||
parser.add_argument("--seq-len", "-T", type=int, default=2048)
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
T = args.seq_len
|
|
||||||
print(f"GPU: {torch.cuda.get_device_name()}")
|
|
||||||
print(f"T={T}, ranks={args.ranks}\n")
|
|
||||||
|
|
||||||
if args.models:
|
|
||||||
configs = [_resolve_config(m) for m in args.models]
|
|
||||||
else:
|
|
||||||
configs = list(BUILTIN_CONFIGS.items())
|
|
||||||
|
|
||||||
for model_name, (num_experts, hidden, inter, top_k) in configs:
|
|
||||||
print(f"{'=' * 70}")
|
|
||||||
print(f" {model_name}: E={num_experts}, H={hidden}, I={inter}, k={top_k}")
|
|
||||||
print(f"{'=' * 70}")
|
|
||||||
|
|
||||||
for R in args.ranks:
|
|
||||||
for proj, K, N in [("gate_up", hidden, 2 * inter), ("down", inter, hidden)]:
|
|
||||||
_clean()
|
|
||||||
x, W, lA, lB, sei, ssi, eo, gx, dy = _setup(
|
|
||||||
num_experts, K, N, T, top_k, R
|
|
||||||
)
|
|
||||||
|
|
||||||
# Forward with LoRA (auto-dispatched: fused or split)
|
|
||||||
dispatch = (
|
|
||||||
"split"
|
|
||||||
if (
|
|
||||||
num_experts <= lora_ops._SPLIT_LORA_FWD_MAX_EXPERTS
|
|
||||||
and K * N >= lora_ops._SPLIT_LORA_FWD_THRESHOLD
|
|
||||||
)
|
|
||||||
else "fused"
|
|
||||||
)
|
|
||||||
t_fwd = _bench(partial(_call_fwd, x, W, sei, ssi, top_k, lA, lB))
|
|
||||||
t_base = _bench(partial(_call_base, x, W, sei, ssi, top_k))
|
|
||||||
t_dx = _bench(partial(_call_dx, dy, W, sei, ssi, lA, lB))
|
|
||||||
t_bwd = _bench(partial(_call_bwd, dy, gx, lA, lB, eo, num_experts))
|
|
||||||
|
|
||||||
total = t_fwd + t_dx + t_bwd
|
|
||||||
overhead = t_fwd / t_base - 1 if t_base > 0 else 0
|
|
||||||
|
|
||||||
print(
|
|
||||||
f" R={R:>2} {proj:<8} "
|
|
||||||
f"fwd={t_fwd:>6.2f}ms [{dispatch}] "
|
|
||||||
f"base={t_base:>6.2f}ms "
|
|
||||||
f"(+{overhead * 100:.0f}%) "
|
|
||||||
f"dx={t_dx:>6.2f}ms bwd={t_bwd:>6.2f}ms "
|
|
||||||
f"total={total:>6.2f}ms"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Full autograd fwd+bwd with memory measurement
|
|
||||||
x_ag = x.clone().requires_grad_(True)
|
|
||||||
lA_ag = lA.clone().requires_grad_(True)
|
|
||||||
lB_ag = lB.clone().requires_grad_(True)
|
|
||||||
|
|
||||||
def _run_autograd(
|
|
||||||
_x=x_ag,
|
|
||||||
_W=W,
|
|
||||||
_k=top_k,
|
|
||||||
_sei=sei,
|
|
||||||
_ssi=ssi,
|
|
||||||
_eo=eo,
|
|
||||||
_lA=lA_ag,
|
|
||||||
_lB=lB_ag,
|
|
||||||
):
|
|
||||||
out = ScatterMoELoRA.apply(
|
|
||||||
_x,
|
|
||||||
_W,
|
|
||||||
_k,
|
|
||||||
_sei,
|
|
||||||
_ssi,
|
|
||||||
_eo,
|
|
||||||
_lA,
|
|
||||||
_lB,
|
|
||||||
2.0,
|
|
||||||
None,
|
|
||||||
None,
|
|
||||||
False,
|
|
||||||
False,
|
|
||||||
True,
|
|
||||||
False,
|
|
||||||
)
|
|
||||||
out.sum().backward()
|
|
||||||
_x.grad = None
|
|
||||||
_lA.grad = None
|
|
||||||
_lB.grad = None
|
|
||||||
|
|
||||||
t_full = _bench(_run_autograd)
|
|
||||||
|
|
||||||
_clean()
|
|
||||||
torch.cuda.reset_peak_memory_stats()
|
|
||||||
mem_before = torch.cuda.memory_allocated()
|
|
||||||
_run_autograd()
|
|
||||||
torch.cuda.synchronize()
|
|
||||||
mem_peak = torch.cuda.max_memory_allocated() - mem_before
|
|
||||||
|
|
||||||
print(
|
|
||||||
f" full_fwd_bwd={t_full:>6.2f}ms "
|
|
||||||
f"peak_delta={mem_peak / 1e6:>6.1f}MB"
|
|
||||||
)
|
|
||||||
|
|
||||||
print()
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
||||||
benchmarks/bench_selective_logsoftmax.py (deleted)
@@ -1,191 +0,0 @@
"""Benchmark for selective_log_softmax Triton kernel vs original implementation.
|
|
||||||
|
|
||||||
Usage: CUDA_VISIBLE_DEVICES=0 python benchmarks/bench_selective_logsoftmax.py
|
|
||||||
"""
|
|
||||||
|
|
||||||
import gc
|
|
||||||
import statistics
|
|
||||||
|
|
||||||
import torch
|
|
||||||
|
|
||||||
from axolotl.monkeypatch.trainer.utils import (
|
|
||||||
selective_log_softmax,
|
|
||||||
selective_log_softmax_original,
|
|
||||||
)
|
|
||||||
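As a reminder of the quantity being benchmarked (assuming the usual definition of a selective log-softmax, i.e. gathering the log-probability of one target index per row, which matches how `index` is drawn below):

$$
\operatorname{selective\_log\_softmax}(z, y)_t
\;=\; \log \operatorname{softmax}(z_t)_{y_t}
\;=\; z_{t, y_t} \;-\; \log \sum_{v=1}^{V} e^{z_{t,v}},
$$

so only one scalar per row survives, which is why a fused kernel can avoid materializing a full $B \times L \times V$ log-probability tensor.
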
|
|
||||||
V = 151936 # Qwen vocab
|
|
||||||
WARMUP = 5
|
|
||||||
BENCH_ITERS = 20
|
|
||||||
MEM_ITERS = 10
|
|
||||||
|
|
||||||
|
|
||||||
def _clean_gpu():
|
|
||||||
gc.collect()
|
|
||||||
torch.cuda.empty_cache()
|
|
||||||
torch.cuda.reset_peak_memory_stats()
|
|
||||||
torch.cuda.reset_accumulated_memory_stats()
|
|
||||||
torch.cuda.synchronize()
|
|
||||||
|
|
||||||
|
|
||||||
def profile_time(fn, args, n_iters=BENCH_ITERS):
|
|
||||||
for _ in range(WARMUP):
|
|
||||||
fn(*args)
|
|
||||||
torch.cuda.synchronize()
|
|
||||||
|
|
||||||
times = []
|
|
||||||
for _ in range(n_iters):
|
|
||||||
s = torch.cuda.Event(enable_timing=True)
|
|
||||||
e = torch.cuda.Event(enable_timing=True)
|
|
||||||
s.record()
|
|
||||||
fn(*args)
|
|
||||||
e.record()
|
|
||||||
torch.cuda.synchronize()
|
|
||||||
times.append(s.elapsed_time(e))
|
|
||||||
return times
|
|
||||||
|
|
||||||
|
|
||||||
def profile_memory(fn, args, n_iters=MEM_ITERS):
|
|
||||||
for _ in range(WARMUP):
|
|
||||||
out = fn(*args)
|
|
||||||
del out
|
|
||||||
torch.cuda.synchronize()
|
|
||||||
|
|
||||||
peaks = []
|
|
||||||
for _ in range(n_iters):
|
|
||||||
_clean_gpu()
|
|
||||||
base = torch.cuda.max_memory_allocated()
|
|
||||||
out = fn(*args)
|
|
||||||
torch.cuda.synchronize()
|
|
||||||
peaks.append(torch.cuda.max_memory_allocated() - base)
|
|
||||||
del out
|
|
||||||
return [p / 1e6 for p in peaks]
|
|
||||||
|
|
||||||
|
|
||||||
def fmt(values, unit=""):
|
|
||||||
mean = statistics.mean(values)
|
|
||||||
std = statistics.stdev(values) if len(values) > 1 else 0.0
|
|
||||||
return f"{mean:8.2f} ± {std:5.2f} {unit} [min={min(values):.2f}, max={max(values):.2f}]"
|
|
||||||
|
|
||||||
|
|
||||||
def benchmark_forward():
|
|
||||||
print("=" * 60)
|
|
||||||
print(f"FORWARD BENCHMARK (warmup={WARMUP}, time={BENCH_ITERS}, mem={MEM_ITERS})")
|
|
||||||
print("=" * 60)
|
|
||||||
|
|
||||||
configs = [
|
|
||||||
(1, 2048),
|
|
||||||
(1, 8192),
|
|
||||||
(4, 4096),
|
|
||||||
(8, 2048),
|
|
||||||
(16, 2048),
|
|
||||||
(16, 4096),
|
|
||||||
]
|
|
||||||
|
|
||||||
for B, L in configs:
|
|
||||||
mem_gb = B * L * V * 2 / 1e9
|
|
||||||
if mem_gb > 28:
|
|
||||||
print(f"\n skip B={B}, L={L} ({mem_gb:.1f} GB)")
|
|
||||||
continue
|
|
||||||
|
|
||||||
N = B * L
|
|
||||||
print(f"\n{'─' * 60}")
|
|
||||||
print(f"B={B:2d}, L={L:5d} ({N:6d} rows, logits {mem_gb:.2f} GB)")
|
|
||||||
print(f"{'─' * 60}")
|
|
||||||
|
|
||||||
torch.manual_seed(42)
|
|
||||||
logits = torch.randn(B, L, V, device="cuda", dtype=torch.bfloat16)
|
|
||||||
index = torch.randint(0, V, (B, L), device="cuda")
|
|
||||||
|
|
||||||
t_orig = profile_time(selective_log_softmax_original, (logits, index))
|
|
||||||
t_triton = profile_time(selective_log_softmax, (logits, index))
|
|
||||||
orig_mean = statistics.mean(t_orig)
|
|
||||||
triton_mean = statistics.mean(t_triton)
|
|
||||||
|
|
||||||
print(" TIME (ms):")
|
|
||||||
print(f" original: {fmt(t_orig, 'ms')}")
|
|
||||||
print(f" triton: {fmt(t_triton, 'ms')}")
|
|
||||||
print(f" speedup: {orig_mean / triton_mean:.2f}x")
|
|
||||||
|
|
||||||
m_orig = profile_memory(selective_log_softmax_original, (logits, index))
|
|
||||||
m_triton = profile_memory(selective_log_softmax, (logits, index))
|
|
||||||
orig_peak = statistics.mean(m_orig)
|
|
||||||
triton_peak = statistics.mean(m_triton)
|
|
||||||
|
|
||||||
print(" MEMORY (peak overhead):")
|
|
||||||
print(f" original: {fmt(m_orig, 'MB')}")
|
|
||||||
print(f" triton: {fmt(m_triton, 'MB')}")
|
|
||||||
print(f" saved: {orig_peak - triton_peak:.1f} MB")
|
|
||||||
|
|
||||||
del logits, index
|
|
||||||
_clean_gpu()
|
|
||||||
|
|
||||||
|
|
||||||
def benchmark_backward():
|
|
||||||
print("\n" + "=" * 60)
|
|
||||||
print(f"FWD+BWD BENCHMARK (warmup={WARMUP}, time={BENCH_ITERS}, mem={MEM_ITERS})")
|
|
||||||
print("=" * 60)
|
|
||||||
|
|
||||||
configs = [
|
|
||||||
(1, 2048),
|
|
||||||
(1, 8192),
|
|
||||||
(4, 4096),
|
|
||||||
(8, 2048),
|
|
||||||
(16, 2048),
|
|
||||||
(16, 4096),
|
|
||||||
]
|
|
||||||
|
|
||||||
def fwd_bwd_original(logits, index):
|
|
||||||
logits.grad = None
|
|
||||||
out = selective_log_softmax_original(logits, index)
|
|
||||||
out.sum().backward()
|
|
||||||
|
|
||||||
def fwd_bwd_triton(logits, index):
|
|
||||||
logits.grad = None
|
|
||||||
out = selective_log_softmax(logits, index)
|
|
||||||
out.sum().backward()
|
|
||||||
|
|
||||||
for B, L in configs:
|
|
||||||
mem_gb = B * L * V * 2 / 1e9
|
|
||||||
if mem_gb > 20:
|
|
||||||
print(f"\n skip B={B}, L={L} ({mem_gb:.1f} GB, need room for grads)")
|
|
||||||
continue
|
|
||||||
|
|
||||||
N = B * L
|
|
||||||
print(f"\n{'─' * 60}")
|
|
||||||
print(f"B={B:2d}, L={L:5d} ({N:6d} rows, logits {mem_gb:.2f} GB)")
|
|
||||||
print(f"{'─' * 60}")
|
|
||||||
|
|
||||||
torch.manual_seed(42)
|
|
||||||
logits_orig = torch.randn(
|
|
||||||
B, L, V, device="cuda", dtype=torch.bfloat16, requires_grad=True
|
|
||||||
)
|
|
||||||
logits_tri = logits_orig.detach().clone().requires_grad_(True)
|
|
||||||
index = torch.randint(0, V, (B, L), device="cuda")
|
|
||||||
|
|
||||||
t_orig = profile_time(fwd_bwd_original, (logits_orig, index))
|
|
||||||
t_triton = profile_time(fwd_bwd_triton, (logits_tri, index))
|
|
||||||
orig_mean = statistics.mean(t_orig)
|
|
||||||
triton_mean = statistics.mean(t_triton)
|
|
||||||
|
|
||||||
print(" FWD+BWD TIME (ms):")
|
|
||||||
print(f" original: {fmt(t_orig, 'ms')}")
|
|
||||||
print(f" triton: {fmt(t_triton, 'ms')}")
|
|
||||||
print(f" speedup: {orig_mean / triton_mean:.2f}x")
|
|
||||||
|
|
||||||
m_orig = profile_memory(fwd_bwd_original, (logits_orig, index))
|
|
||||||
m_triton = profile_memory(fwd_bwd_triton, (logits_tri, index))
|
|
||||||
orig_peak = statistics.mean(m_orig)
|
|
||||||
triton_peak = statistics.mean(m_triton)
|
|
||||||
|
|
||||||
print(" FWD+BWD MEMORY (peak overhead):")
|
|
||||||
print(f" original: {fmt(m_orig, 'MB')}")
|
|
||||||
print(f" triton: {fmt(m_triton, 'MB')}")
|
|
||||||
print(f" saved: {orig_peak - triton_peak:.1f} MB")
|
|
||||||
|
|
||||||
del logits_orig, logits_tri, index
|
|
||||||
_clean_gpu()
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
benchmark_forward()
|
|
||||||
benchmark_backward()
|
|
||||||
@@ -1,54 +0,0 @@
|
|||||||
FROM axolotlai/axolotl-base-uv:{{ BASE_TAG }}
|
|
||||||
|
|
||||||
ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
|
|
||||||
ENV AXOLOTL_EXTRAS="{{ AXOLOTL_EXTRAS }}"
|
|
||||||
ENV AXOLOTL_ARGS="{{ AXOLOTL_ARGS }}"
|
|
||||||
ENV CUDA="{{ CUDA }}"
|
|
||||||
ENV PYTORCH_VERSION="{{ PYTORCH_VERSION }}"
|
|
||||||
ENV GITHUB_REF="{{ GITHUB_REF }}"
|
|
||||||
ENV GITHUB_SHA="{{ GITHUB_SHA }}"
|
|
||||||
ENV NIGHTLY_BUILD="{{ NIGHTLY_BUILD }}"
|
|
||||||
ENV HF_HOME="{{ HF_HOME }}"
|
|
||||||
|
|
||||||
RUN apt-get update && \
|
|
||||||
apt-get install -y --allow-change-held-packages vim curl nano zstd libnccl2 libnccl-dev ibverbs-providers ibverbs-utils infiniband-diags librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm
|
|
||||||
|
|
||||||
WORKDIR /workspace
|
|
||||||
|
|
||||||
RUN git clone --depth=1 https://github.com/axolotl-ai-cloud/axolotl.git
|
|
||||||
|
|
||||||
WORKDIR /workspace/axolotl
|
|
||||||
|
|
||||||
RUN git fetch origin +$GITHUB_REF && \
|
|
||||||
git checkout FETCH_HEAD
|
|
||||||
|
|
||||||
# If AXOLOTL_EXTRAS is set, append it in brackets
|
|
||||||
RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
|
|
||||||
sed -i 's#^transformers.*#transformers @ git+https://github.com/huggingface/transformers.git@main#' requirements.txt; \
|
|
||||||
sed -i 's#^peft.*#peft @ git+https://github.com/huggingface/peft.git@main#' requirements.txt; \
|
|
||||||
sed -i 's#^accelerate.*#accelerate @ git+https://github.com/huggingface/accelerate.git@main#' requirements.txt; \
|
|
||||||
sed -i 's#^trl.*#trl @ git+https://github.com/huggingface/trl.git@main#' requirements.txt; \
|
|
||||||
sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
|
|
||||||
fi
|
|
||||||
|
|
||||||
RUN uv pip install packaging==26.0 setuptools==78.1.1
|
|
||||||
RUN uv pip install torchvision
|
|
||||||
RUN uv pip uninstall causal_conv1d
|
|
||||||
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
|
|
||||||
uv pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
|
|
||||||
else \
|
|
||||||
uv pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray] $AXOLOTL_ARGS; \
|
|
||||||
fi
|
|
||||||
|
|
||||||
RUN python scripts/unsloth_install.py --uv | sh
|
|
||||||
RUN python scripts/cutcrossentropy_install.py --uv | sh
|
|
||||||
|
|
||||||
# So we can test the Docker image
|
|
||||||
RUN uv pip install -r requirements-dev.txt -r requirements-tests.txt
|
|
||||||
|
|
||||||
# fix so that git fetch/pull from remote works
|
|
||||||
RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
|
|
||||||
git config --get remote.origin.fetch
|
|
||||||
|
|
||||||
# helper for huggingface-login cli
|
|
||||||
RUN git config --global credential.helper store
|
|
||||||
@@ -1,6 +1,10 @@
-FROM axolotlai/axolotl-base:{{ BASE_TAG }}
+FROM axolotlai/axolotl-base-uv:{{ BASE_TAG }}

-ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
+SHELL ["/bin/bash", "-euxo", "pipefail", "-c"]
+
+ARG VENV_PYTHON="/workspace/axolotl-venv/bin/python"
+
+ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
ENV AXOLOTL_EXTRAS="{{ AXOLOTL_EXTRAS }}"
ENV AXOLOTL_ARGS="{{ AXOLOTL_ARGS }}"
ENV CUDA="{{ CUDA }}"
@@ -9,10 +13,10 @@ ENV GITHUB_REF="{{ GITHUB_REF }}"
ENV GITHUB_SHA="{{ GITHUB_SHA }}"
ENV NIGHTLY_BUILD="{{ NIGHTLY_BUILD }}"
ENV HF_HOME="{{ HF_HOME }}"
-ENV AXOLOTL_DATASET_NUM_PROC="8"
+ENV VENV_PYTHON=$VENV_PYTHON

RUN apt-get update && \
-    apt-get install -y --allow-change-held-packages vim curl nano zstd libnccl2 libnccl-dev ibverbs-providers ibverbs-utils infiniband-diags librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm
+    apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev ibverbs-providers ibverbs-utils infiniband-diags librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm

WORKDIR /workspace
@@ -25,26 +29,27 @@ RUN git fetch origin +$GITHUB_REF && \

# If AXOLOTL_EXTRAS is set, append it in brackets
RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
-    sed -i 's#^transformers.*#transformers @ git+https://github.com/huggingface/transformers.git@main#' requirements.txt; \
-    sed -i 's#^peft.*#peft @ git+https://github.com/huggingface/peft.git@main#' requirements.txt; \
-    sed -i 's#^accelerate.*#accelerate @ git+https://github.com/huggingface/accelerate.git@main#' requirements.txt; \
-    sed -i 's#^trl.*#trl @ git+https://github.com/huggingface/trl.git@main#' requirements.txt; \
-    sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
+    sed -i 's#"transformers[^"]*"#"transformers @ git+https://github.com/huggingface/transformers.git@main"#' pyproject.toml; \
+    sed -i 's#"peft[^"]*"#"peft @ git+https://github.com/huggingface/peft.git@main"#' pyproject.toml; \
+    sed -i 's#"accelerate[^"]*"#"accelerate @ git+https://github.com/huggingface/accelerate.git@main"#' pyproject.toml; \
+    sed -i 's#"trl[^"]*"#"trl @ git+https://github.com/huggingface/trl.git@main"#' pyproject.toml; \
+    sed -i 's#"datasets[^"]*"#"datasets @ git+https://github.com/huggingface/datasets.git@main"#' pyproject.toml; \
    fi

-RUN pip install packaging==26.0 setuptools==78.1.1 psutil
-RUN pip uninstall -y causal_conv1d
+RUN uv pip install --python "$VENV_PYTHON" packaging==23.2 setuptools==75.8.0 pip
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
-    pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
+    uv pip install --python "$VENV_PYTHON" --no-build-isolation -e .[ring-flash-attn,optimizers,ray,${AXOLOTL_EXTRAS}] $AXOLOTL_ARGS; \
    else \
-    pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray] $AXOLOTL_ARGS; \
+    uv pip install --python "$VENV_PYTHON" --no-build-isolation -e .[ring-flash-attn,optimizers,ray] $AXOLOTL_ARGS; \
    fi

-RUN python scripts/unsloth_install.py | sh
-RUN python scripts/cutcrossentropy_install.py | sh
+RUN uv pip install --python "$VENV_PYTHON" --no-build-isolation flash-attn $AXOLOTL_ARGS
+RUN "$VENV_PYTHON" scripts/unsloth_install.py | sh
+RUN "$VENV_PYTHON" scripts/cutcrossentropy_install.py | sh

# So we can test the Docker image
-RUN pip install -r requirements-dev.txt -r requirements-tests.txt
+RUN uv pip install --python "$VENV_PYTHON" -e ".[dev]"

# fix so that git fetch/pull from remote works
RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
24  cicd/cicd.sh

@@ -3,16 +3,8 @@ set -e
python -c "import torch; assert '$PYTORCH_VERSION' in torch.__version__"

-set -o pipefail
-curl --silent --show-error --fail --retry 3 --retry-delay 5 -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst | tar -xpf - -C "${HF_HOME}/hub/" --use-compress-program unzstd --strip-components=1
-# hf download "NousResearch/Meta-Llama-3-8B"
-# hf download "NousResearch/Meta-Llama-3-8B-Instruct"
-# hf download "microsoft/Phi-4-reasoning"
-# hf download "microsoft/Phi-3.5-mini-instruct"
-# hf download "microsoft/Phi-3-medium-128k-instruct"

# Run unit tests with initial coverage report
-pytest -v --durations=10 -n8 \
+uv run pytest -v --durations=10 -n8 \
    --ignore=tests/e2e/ \
    --ignore=tests/patched/ \
    --ignore=tests/cli \
@@ -20,36 +12,36 @@ pytest -v --durations=10 -n8 \
    --cov=axolotl

# Run lora kernels tests with coverage append
-pytest -v --durations=10 \
+uv run pytest -v --durations=10 \
    /workspace/axolotl/tests/e2e/patched/lora_kernels \
    --cov=axolotl \
    --cov-append

# Run patched tests excluding lora kernels with coverage append
-pytest --full-trace -vvv --durations=10 \
+uv run pytest --full-trace -vvv --durations=10 \
    --ignore=tests/e2e/patched/lora_kernels \
    /workspace/axolotl/tests/e2e/patched \
    --cov=axolotl \
    --cov-append

# Run solo tests with coverage append
-pytest -v --durations=10 -n1 \
+uv run pytest -v --durations=10 -n1 \
    /workspace/axolotl/tests/e2e/solo/ \
    --cov=axolotl \
    --cov-append

# Run integration tests with coverage append
-pytest -v --durations=10 \
+uv run pytest -v --durations=10 \
    /workspace/axolotl/tests/e2e/integrations/ \
    --cov=axolotl \
    --cov-append

-pytest -v --durations=10 /workspace/axolotl/tests/cli \
+uv run pytest -v --durations=10 /workspace/axolotl/tests/cli \
    --cov=axolotl \
    --cov-append

# Run remaining e2e tests with coverage append and final report
-pytest -v --durations=10 \
+uv run pytest -v --durations=10 \
    --ignore=tests/e2e/solo/ \
    --ignore=tests/e2e/patched/ \
    --ignore=tests/e2e/multigpu/ \
@@ -60,4 +52,4 @@ pytest -v --durations=10 \
    --cov-append \
    --cov-report=xml:e2e-coverage.xml

-codecov upload-process -t $CODECOV_TOKEN -f e2e-coverage.xml -F e2e,pytorch-${PYTORCH_VERSION} || true
+uv run codecov upload-process -t $CODECOV_TOKEN -f e2e-coverage.xml -F e2e,pytorch-${PYTORCH_VERSION} || true
@@ -17,22 +17,18 @@ template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
template_env = jinja2.Environment(
    loader=template_loader, autoescape=select_autoescape()
)
-dockerfile = os.environ.get("E2E_DOCKERFILE", "Dockerfile.jinja")
-df_template = template_env.get_template(dockerfile)
+df_template = template_env.get_template("Dockerfile.jinja")

df_args = {
    "AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
    "AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.6.0"),
-    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu126-2.6.0"),
+    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-uv-py3.11-cu126-2.6.0"),
    "CUDA": os.environ.get("CUDA", "126"),
    "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
    "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
-    "NIGHTLY_BUILD": os.environ.get("NIGHTLY_BUILD", ""),
    "CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""),
    "HF_HOME": "/workspace/data/huggingface-cache/hub",
-    "PYTHONUNBUFFERED": os.environ.get("PYTHONUNBUFFERED", "1"),
-    "DEEPSPEED_LOG_LEVEL": os.environ.get("DEEPSPEED_LOG_LEVEL", "WARNING"),
}

dockerfile_contents = df_template.render(**df_args)
@@ -2,7 +2,7 @@
set -e

# Only run two tests at a time to avoid OOM on GPU (with coverage collection)
-pytest -v --durations=10 -n2 --maxfail=3 \
+pytest -v --durations=10 -n2 \
    --ignore=/workspace/axolotl/tests/e2e/multigpu/solo/ \
    --ignore=/workspace/axolotl/tests/e2e/multigpu/patched/ \
    /workspace/axolotl/tests/e2e/multigpu/ \
@@ -23,7 +23,7 @@ df_args = {
    "AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
    "AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.6.0"),
-    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu126-2.6.0"),
+    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-uv-py3.11-cu126-2.6.0"),
    "CUDA": os.environ.get("CUDA", "126"),
    "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
    "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
@@ -65,9 +65,8 @@ def run_cmd(cmd: str, run_folder: str):
    import subprocess  # nosec

    sp_env = os.environ.copy()
-    sp_env["AXOLOTL_DATASET_NUM_PROC"] = "8"
+    sp_env["AXOLOTL_DATASET_PROCESSES"] = "8"

    # Propagate errors from subprocess.
-    exit_code = subprocess.call(cmd.split(), cwd=run_folder, env=sp_env)  # nosec
-    if exit_code:
-        raise RuntimeError(f"Command '{cmd}' failed with exit code {exit_code}")
+    if exit_code := subprocess.call(cmd.split(), cwd=run_folder, env=sp_env):  # nosec
+        exit(exit_code)
@@ -37,7 +37,6 @@ coverage:
      only_pulls: false
      flags: null
      paths: null
-      informational: true

parsers:
  gcov:
@@ -13,7 +13,7 @@ datasets:
val_set_size: 0
output_dir: temp_debug/axolotl_outputs/model
dataset_prepared_path: temp_debug/axolotl_outputs/data
-dataset_num_proc: 1
+dataset_processes: 1

sequence_len: 4096
sample_packing: false
@@ -1,14 +1,19 @@
|
|||||||
ARG BASE_TAG=main-base
|
ARG BASE_TAG=main-base-uv
|
||||||
FROM axolotlai/axolotl-base:$BASE_TAG
|
FROM axolotlai/axolotl-base-uv:$BASE_TAG
|
||||||
|
|
||||||
ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
|
ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
|
||||||
ARG AXOLOTL_EXTRAS=""
|
ARG AXOLOTL_EXTRAS=""
|
||||||
ARG AXOLOTL_ARGS=""
|
ARG AXOLOTL_ARGS=""
|
||||||
ARG CUDA="118"
|
ARG CUDA="118"
|
||||||
ARG PYTORCH_VERSION="2.1.2"
|
ARG PYTORCH_VERSION="2.1.2"
|
||||||
ARG TARGETARCH
|
ARG GIT_REF="refs/heads/main"
|
||||||
|
ARG GIT_SHA="HEAD"
|
||||||
|
ARG VENV_PYTHON="/workspace/axolotl-venv/bin/python"
|
||||||
|
|
||||||
ENV PYTORCH_VERSION=$PYTORCH_VERSION
|
ENV PYTORCH_VERSION=$PYTORCH_VERSION
|
||||||
|
ENV GIT_REF=$GIT_REF
|
||||||
|
ENV GIT_SHA=$GIT_SHA
|
||||||
|
ENV VENV_PYTHON=$VENV_PYTHON
|
||||||
|
|
||||||
RUN apt-get update && \
|
RUN apt-get update && \
|
||||||
apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev rsync s3fs && \
|
apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev rsync s3fs && \
|
||||||
@@ -21,21 +26,19 @@ RUN git clone --depth=1 https://github.com/axolotl-ai-cloud/axolotl.git
|
|||||||
|
|
||||||
WORKDIR /workspace/axolotl
|
WORKDIR /workspace/axolotl
|
||||||
|
|
||||||
# If AXOLOTL_EXTRAS is set, append it in brackets; don't install deepspeed with arm64
|
# Ensure we are on the expected commit and break Docker cache between revisions
|
||||||
RUN pip uninstall -y causal_conv1d
|
RUN git fetch origin "$GIT_REF" && git checkout "$GIT_SHA"
|
||||||
RUN if [ "$TARGETARCH" = "arm64" ]; then \
|
|
||||||
BASE_EXTRAS="flash-attn,ring-flash-attn,optimizers,ray"; \
|
# If AXOLOTL_EXTRAS is set, append it in brackets
|
||||||
|
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
|
||||||
|
uv pip install --python "$VENV_PYTHON" --no-build-isolation -e .[ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
|
||||||
else \
|
else \
|
||||||
BASE_EXTRAS="deepspeed,flash-attn,ring-flash-attn,optimizers,ray"; \
|
uv pip install --python "$VENV_PYTHON" --no-build-isolation -e .[ring-flash-attn,optimizers,ray] $AXOLOTL_ARGS; \
|
||||||
fi && \
|
fi && \
|
||||||
if [ "$AXOLOTL_EXTRAS" != "" ]; then \
|
uv pip install --python "$VENV_PYTHON" --no-build-isolation flash-attn $AXOLOTL_ARGS && \
|
||||||
pip install --no-build-isolation -e .[$BASE_EXTRAS,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
|
"$VENV_PYTHON" scripts/unsloth_install.py | sh && \
|
||||||
else \
|
"$VENV_PYTHON" scripts/cutcrossentropy_install.py | sh && \
|
||||||
pip install --no-build-isolation -e .[$BASE_EXTRAS] $AXOLOTL_ARGS; \
|
uv pip install --python "$VENV_PYTHON" pytest
|
||||||
fi && \ python scripts/unsloth_install.py | sh && \
|
|
||||||
python scripts/cutcrossentropy_install.py | sh && \
|
|
||||||
pip install pytest && \
|
|
||||||
pip cache purge
|
|
||||||
|
|
||||||
# fix so that git fetch/pull from remote works with shallow clone
|
# fix so that git fetch/pull from remote works with shallow clone
|
||||||
RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
|
RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
|
||||||
|
|||||||
@@ -2,16 +2,14 @@ ARG CUDA_VERSION="11.8.0"
|
|||||||
ARG CUDNN_VERSION="8"
|
ARG CUDNN_VERSION="8"
|
||||||
ARG UBUNTU_VERSION="22.04"
|
ARG UBUNTU_VERSION="22.04"
|
||||||
ARG MAX_JOBS=4
|
ARG MAX_JOBS=4
|
||||||
ARG TARGETARCH
|
|
||||||
|
|
||||||
FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION AS base-builder
|
FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION AS base-builder
|
||||||
|
|
||||||
ENV PATH="/root/miniconda3/bin:${PATH}"
|
ENV PATH="/root/miniconda3/bin:${PATH}"
|
||||||
|
|
||||||
ARG TARGETARCH
|
ARG PYTHON_VERSION="3.10"
|
||||||
ARG PYTHON_VERSION="3.11"
|
|
||||||
ARG PYTORCH_VERSION="2.1.2"
|
ARG PYTORCH_VERSION="2.1.2"
|
||||||
ARG CUDA="128"
|
ARG CUDA="118"
|
||||||
ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
|
ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
|
||||||
|
|
||||||
ENV PYTHON_VERSION=$PYTHON_VERSION
|
ENV PYTHON_VERSION=$PYTHON_VERSION
|
||||||
@@ -24,17 +22,11 @@ RUN apt-get update \
|
|||||||
librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm \
|
librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm \
|
||||||
&& rm -rf /var/cache/apt/archives \
|
&& rm -rf /var/cache/apt/archives \
|
||||||
&& rm -rf /var/lib/apt/lists/* \
|
&& rm -rf /var/lib/apt/lists/* \
|
||||||
&& if [ "$TARGETARCH" = "amd64" ]; then \
|
&& wget \
|
||||||
MINICONDA_ARCH="x86_64"; \
|
https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
|
||||||
elif [ "$TARGETARCH" = "arm64" ]; then \
|
|
||||||
MINICONDA_ARCH="aarch64"; \
|
|
||||||
else \
|
|
||||||
echo "Unsupported architecture: $TARGETARCH"; exit 1; \
|
|
||||||
fi \
|
|
||||||
&& wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${MINICONDA_ARCH}.sh \
|
|
||||||
&& mkdir /root/.conda \
|
&& mkdir /root/.conda \
|
||||||
&& bash Miniconda3-latest-Linux-${MINICONDA_ARCH}.sh -b \
|
&& bash Miniconda3-latest-Linux-x86_64.sh -b \
|
||||||
&& rm -f Miniconda3-latest-Linux-${MINICONDA_ARCH}.sh \
|
&& rm -f Miniconda3-latest-Linux-x86_64.sh \
|
||||||
&& conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main \
|
&& conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main \
|
||||||
&& conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r \
|
&& conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r \
|
||||||
&& conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}"
|
&& conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}"
|
||||||
@@ -43,34 +35,18 @@ ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
|
|||||||
|
|
||||||
WORKDIR /workspace
|
WORKDIR /workspace
|
||||||
|
|
||||||
RUN python3 -m pip install --upgrade pip && pip3 install -U packaging==26.0 setuptools==75.8.0 wheel psutil && \
|
RUN python3 -m pip install --upgrade pip && pip3 install -U packaging==23.2 setuptools==75.8.0 wheel && \
|
||||||
python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} torchvision --extra-index-url https://download.pytorch.org/whl/cu$CUDA && \
|
python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} torchvision --extra-index-url https://download.pytorch.org/whl/cu$CUDA && \
|
||||||
|
CAUSAL_CONV1D_FORCE_CXX11_ABI=TRUE CAUSAL_CONV1D_FORCE_BUILD=TRUE python3 -m pip install --no-cache-dir causal_conv1d==1.5.2 && \
|
||||||
|
python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" && \
|
||||||
python3 -m pip cache purge
|
python3 -m pip cache purge
|
||||||
|
|
||||||
RUN if [ "$CUDA" != "130" ] ; then \
|
|
||||||
CAUSAL_CONV1D_FORCE_CXX11_ABI=TRUE CAUSAL_CONV1D_FORCE_BUILD=TRUE python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@v1.5.4"; \
|
|
||||||
python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main"; \
|
|
||||||
python3 -m pip cache purge; \
|
|
||||||
fi
|
|
||||||
|
|
||||||
RUN git lfs install --skip-repo && \
|
RUN git lfs install --skip-repo && \
|
||||||
pip3 install awscli && \
|
pip3 install awscli && \
|
||||||
# The base image ships with `pydantic==1.8.2` which is not working
|
# The base image ships with `pydantic==1.8.2` which is not working
|
||||||
pip3 install -U --no-cache-dir pydantic==1.10.10 && \
|
pip3 install -U --no-cache-dir pydantic==1.10.10 && \
|
||||||
pip3 cache purge
|
pip3 cache purge
|
||||||
|
|
||||||
# Map Python version (e.g., 3.12 -> cp312)
|
RUN if [ "$PYTORCH_VERSION" = "2.6.0" ] && [ "$CUDA" = "124" ] ; then \
|
||||||
RUN PYTHON_CP="cp$(echo $PYTHON_VERSION | tr -d '.')" && \
|
FLASH_ATTENTION_FORCE_BUILD="TRUE" uv pip install --no-build-isolation flash-attn==2.8.0.post2; \
|
||||||
# Map PyTorch version (e.g., 2.9.1 -> torch2.9, 2.10.0 -> torch2.10)
|
fi
|
||||||
TORCH_TAG="torch$(echo $PYTORCH_VERSION | grep -oP '^\d+\.\d+')" && \
|
|
||||||
# Map architecture
|
|
||||||
case "$TARGETARCH" in \
|
|
||||||
amd64) ARCH_TAG="x86_64" ;; \
|
|
||||||
arm64) ARCH_TAG="aarch64" ;; \
|
|
||||||
*) echo "Unsupported architecture: $TARGETARCH"; exit 1 ;; \
|
|
||||||
esac && \
|
|
||||||
WHL_VERSION="v0.7.16" && \
|
|
||||||
WHL_FILE="flash_attn-2.8.3+cu${CUDA}${TORCH_TAG}-${PYTHON_CP}-${PYTHON_CP}-linux_${ARCH_TAG}.whl" && \
|
|
||||||
wget -nv "https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/${WHL_VERSION}/${WHL_FILE}" && \
|
|
||||||
pip3 install --no-cache-dir "${WHL_FILE}" && \
|
|
||||||
rm "${WHL_FILE}"
|
|
||||||
|
|||||||
@@ -30,7 +30,7 @@ ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
|
|||||||
|
|
||||||
WORKDIR /workspace
|
WORKDIR /workspace
|
||||||
|
|
||||||
RUN python3 -m pip install --upgrade pip && pip3 install -U packaging==26.0 setuptools==75.8.0 wheel && \
|
RUN python3 -m pip install --upgrade pip && pip3 install -U packaging==23.2 setuptools==75.8.0 wheel && \
|
||||||
python3 -m pip install --no-cache-dir -U torch --extra-index-url https://download.pytorch.org/whl/nightly/cu$CUDA && \
|
python3 -m pip install --no-cache-dir -U torch --extra-index-url https://download.pytorch.org/whl/nightly/cu$CUDA && \
|
||||||
python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \
|
python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \
|
||||||
python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" && \
|
python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" && \
|
||||||
|
|||||||
@@ -12,8 +12,8 @@ EXPOSE 22
|
|||||||
COPY scripts/cloud-entrypoint.sh /root/cloud-entrypoint.sh
|
COPY scripts/cloud-entrypoint.sh /root/cloud-entrypoint.sh
|
||||||
COPY scripts/motd /etc/motd
|
COPY scripts/motd /etc/motd
|
||||||
|
|
||||||
RUN pip install jupyterlab notebook ipywidgets && \
|
RUN uv pip install --python "$VENV_PYTHON" jupyterlab notebook ipywidgets && \
|
||||||
jupyter lab clean
|
"$VENV_PYTHON" -m jupyter lab clean
|
||||||
RUN apt update && \
|
RUN apt update && \
|
||||||
apt install --yes --no-install-recommends openssh-server tmux iproute2 nvtop && \
|
apt install --yes --no-install-recommends openssh-server tmux iproute2 nvtop && \
|
||||||
rm -rf /var/cache/apt/archives && \
|
rm -rf /var/cache/apt/archives && \
|
||||||
|
|||||||
@@ -12,8 +12,8 @@ EXPOSE 22
|
|||||||
COPY scripts/cloud-entrypoint.sh /root/cloud-entrypoint.sh
|
COPY scripts/cloud-entrypoint.sh /root/cloud-entrypoint.sh
|
||||||
COPY scripts/motd /etc/motd
|
COPY scripts/motd /etc/motd
|
||||||
|
|
||||||
RUN pip install jupyterlab notebook ipywidgets && \
|
RUN uv pip install --python "$VENV_PYTHON" jupyterlab notebook ipywidgets && \
|
||||||
jupyter lab clean
|
"$VENV_PYTHON" -m jupyter lab clean
|
||||||
RUN apt update && \
|
RUN apt update && \
|
||||||
apt install --yes --no-install-recommends openssh-server tmux iproute2 nvtop ibverbs-providers ibverbs-utils infiniband-diags librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm && \
|
apt install --yes --no-install-recommends openssh-server tmux iproute2 nvtop ibverbs-providers ibverbs-utils infiniband-diags librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm && \
|
||||||
rm -rf /var/cache/apt/archives && \
|
rm -rf /var/cache/apt/archives && \
|
||||||
|
|||||||
@@ -1,30 +0,0 @@
|
|||||||
ARG BASE_TAG=main
|
|
||||||
FROM axolotlai/axolotl-uv:$BASE_TAG
|
|
||||||
|
|
||||||
ENV HF_DATASETS_CACHE="/workspace/data/huggingface-cache/datasets"
|
|
||||||
ENV HF_HUB_CACHE="/workspace/data/huggingface-cache/hub"
|
|
||||||
ENV HF_HOME="/workspace/data/huggingface-cache/hub"
|
|
||||||
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
|
|
||||||
|
|
||||||
EXPOSE 8888
|
|
||||||
EXPOSE 22
|
|
||||||
|
|
||||||
COPY scripts/cloud-entrypoint.sh /root/cloud-entrypoint.sh
|
|
||||||
COPY scripts/motd /etc/motd
|
|
||||||
|
|
||||||
RUN uv pip install jupyterlab notebook ipywidgets && \
|
|
||||||
jupyter lab clean
|
|
||||||
RUN apt update && \
|
|
||||||
apt install --yes --no-install-recommends openssh-server tmux iproute2 nvtop && \
|
|
||||||
rm -rf /var/cache/apt/archives && \
|
|
||||||
rm -rf /var/lib/apt/lists/* && \
|
|
||||||
mkdir -p ~/.ssh && \
|
|
||||||
chmod 700 ~/.ssh && \
|
|
||||||
printf "\n[[ -z \"\$TMUX\" ]] && { tmux attach-session -t ssh_tmux || tmux new-session -s ssh_tmux; exit; }\n" >> ~/.bashrc && \
|
|
||||||
printf "[ ! -z \"\$TERM\" -a -r /etc/motd ] && cat /etc/motd\n" >> ~/.bashrc && \
|
|
||||||
chmod +x /workspace/axolotl/scripts/cloud-entrypoint.sh && \
|
|
||||||
chmod +x /root/cloud-entrypoint.sh && \
|
|
||||||
echo 'set-option -g history-limit 5000' >> ~/.tmux.conf
|
|
||||||
|
|
||||||
ENTRYPOINT ["/root/cloud-entrypoint.sh"]
|
|
||||||
CMD ["sleep", "infinity"]
|
|
||||||
@@ -24,13 +24,14 @@ RUN git fetch origin +$GITHUB_REF && \
|
|||||||
|
|
||||||
# If AXOLOTL_EXTRAS is set, append it in brackets
|
# If AXOLOTL_EXTRAS is set, append it in brackets
|
||||||
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
|
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
|
||||||
pip install --no-build-isolation -e .[deepspeed,flash-attn,mamba-ssm,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
|
uv pip install --no-build-isolation -e .[deepspeed,mamba-ssm,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
|
||||||
else \
|
else \
|
||||||
pip install --no-build-isolation -e .[deepspeed,flash-attn,mamba-ssm] $AXOLOTL_ARGS; \
|
uv pip install --no-build-isolation -e .[deepspeed,mamba-ssm] $AXOLOTL_ARGS; \
|
||||||
fi
|
fi && \
|
||||||
|
uv pip install --no-build-isolation flash-attn $AXOLOTL_ARGS
|
||||||
|
|
||||||
# So we can test the Docker image
|
# So we can test the Docker image
|
||||||
RUN pip install pytest
|
RUN uv pip install pytest
|
||||||
|
|
||||||
# fix so that git fetch/pull from remote works
|
# fix so that git fetch/pull from remote works
|
||||||
RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
|
RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
|
||||||
|
|||||||
@@ -1,48 +0,0 @@
|
|||||||
ARG BASE_TAG=main-base
|
|
||||||
FROM axolotlai/axolotl-base-uv:$BASE_TAG
|
|
||||||
|
|
||||||
ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
|
|
||||||
ARG AXOLOTL_EXTRAS=""
|
|
||||||
ARG AXOLOTL_ARGS=""
|
|
||||||
ARG CUDA="118"
|
|
||||||
ARG PYTORCH_VERSION="2.1.2"
|
|
||||||
ARG TARGETARCH
|
|
||||||
|
|
||||||
ENV PYTORCH_VERSION=$PYTORCH_VERSION
|
|
||||||
|
|
||||||
RUN apt-get update && \
|
|
||||||
apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev rsync s3fs && \
|
|
||||||
rm -rf /var/cache/apt/archives && \
|
|
||||||
rm -rf /var/lib/apt/lists/*
|
|
||||||
|
|
||||||
WORKDIR /workspace
|
|
||||||
|
|
||||||
RUN git clone --depth=1 https://github.com/axolotl-ai-cloud/axolotl.git
|
|
||||||
|
|
||||||
WORKDIR /workspace/axolotl
|
|
||||||
|
|
||||||
# If AXOLOTL_EXTRAS is set, append it in brackets; don't install deepspeed with arm64
|
|
||||||
RUN uv pip uninstall causal_conv1d
|
|
||||||
RUN if [ "$TARGETARCH" = "arm64" ]; then \
|
|
||||||
BASE_EXTRAS="flash-attn,ring-flash-attn,optimizers,ray"; \
|
|
||||||
else \
|
|
||||||
BASE_EXTRAS="deepspeed,flash-attn,ring-flash-attn,optimizers,ray"; \
|
|
||||||
fi && \
|
|
||||||
if [ "$AXOLOTL_EXTRAS" != "" ]; then \
|
|
||||||
uv pip install --no-build-isolation -e .[$BASE_EXTRAS,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
|
|
||||||
else \
|
|
||||||
uv pip install --no-build-isolation -e .[$BASE_EXTRAS] $AXOLOTL_ARGS; \
|
|
||||||
fi && \
|
|
||||||
python scripts/unsloth_install.py --uv | sh && \
|
|
||||||
python scripts/cutcrossentropy_install.py --uv | sh && \
|
|
||||||
uv pip install pytest && \
|
|
||||||
uv cache clean
|
|
||||||
|
|
||||||
# fix so that git fetch/pull from remote works with shallow clone
|
|
||||||
RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
|
|
||||||
git config --get remote.origin.fetch && \
|
|
||||||
git config --global credential.helper store
|
|
||||||
|
|
||||||
COPY .axolotl-complete.bash /root/.axolotl-complete.bash
|
|
||||||
RUN chmod +x /root/.axolotl-complete.bash && \
|
|
||||||
echo 'source /root/.axolotl-complete.bash' >> ~/.bashrc
|
|
||||||
@@ -2,11 +2,9 @@ ARG CUDA_VERSION="12.6.3"
|
|||||||
ARG CUDNN_VERSION=""
|
ARG CUDNN_VERSION=""
|
||||||
ARG UBUNTU_VERSION="22.04"
|
ARG UBUNTU_VERSION="22.04"
|
||||||
ARG MAX_JOBS=4
|
ARG MAX_JOBS=4
|
||||||
ARG TARGETARCH
|
|
||||||
|
|
||||||
FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION AS base-builder
|
FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION AS base-builder
|
||||||
|
|
||||||
ARG TARGETARCH
|
|
||||||
ARG PYTHON_VERSION="3.11"
|
ARG PYTHON_VERSION="3.11"
|
||||||
ARG PYTORCH_VERSION="2.6.0"
|
ARG PYTORCH_VERSION="2.6.0"
|
||||||
ARG CUDA="126"
|
ARG CUDA="126"
|
||||||
@@ -15,6 +13,7 @@ ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
|
|||||||
ENV PYTHON_VERSION=$PYTHON_VERSION
|
ENV PYTHON_VERSION=$PYTHON_VERSION
|
||||||
ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
|
ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
|
||||||
ENV UV_TORCH_BACKEND="cu${CUDA}"
|
ENV UV_TORCH_BACKEND="cu${CUDA}"
|
||||||
|
ENV VENV_PYTHON=/workspace/axolotl-venv/bin/python
|
||||||
|
|
||||||
RUN apt-get update \
|
RUN apt-get update \
|
||||||
&& apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev pkg-config curl && rm -rf /var/lib/apt/lists/* \
|
&& apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev pkg-config curl && rm -rf /var/lib/apt/lists/* \
|
||||||
@@ -31,27 +30,8 @@ RUN uv venv --no-project --relocatable axolotl-venv
|
|||||||
|
|
||||||
ENV PATH="/workspace/axolotl-venv/bin:${PATH}"
|
ENV PATH="/workspace/axolotl-venv/bin:${PATH}"
|
||||||
|
|
||||||
RUN uv pip install packaging setuptools wheel psutil \
|
RUN uv pip install --python "$VENV_PYTHON" packaging setuptools wheel psutil protobuf grpclib \
|
||||||
&& uv pip install torch==${PYTORCH_VERSION} torchvision \
|
&& uv pip install --python "$VENV_PYTHON" torch==${PYTORCH_VERSION} \
|
||||||
&& uv pip install awscli pydantic
|
&& uv pip install --python "$VENV_PYTHON" --no-build-isolation "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" \
|
||||||
|
&& uv pip install --python "$VENV_PYTHON" "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" \
|
||||||
RUN if [ "$TARGETARCH" = "amd64" ]; then \
|
&& uv pip install --python "$VENV_PYTHON" awscli pydantic
|
||||||
uv pip install --no-build-isolation "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main"; \
|
|
||||||
uv pip install "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main"; \
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Map Python version (e.g., 3.12 -> cp312)
|
|
||||||
RUN PYTHON_CP="cp$(echo $PYTHON_VERSION | tr -d '.')" && \
|
|
||||||
# Map PyTorch version (e.g., 2.9.1 -> torch2.9, 2.10.0 -> torch2.10)
|
|
||||||
TORCH_TAG="torch$(echo $PYTORCH_VERSION | grep -oP '^\d+\.\d+')" && \
|
|
||||||
# Map architecture
|
|
||||||
case "$TARGETARCH" in \
|
|
||||||
amd64) ARCH_TAG="x86_64" ;; \
|
|
||||||
arm64) ARCH_TAG="aarch64" ;; \
|
|
||||||
*) echo "Unsupported architecture: $TARGETARCH"; exit 1 ;; \
|
|
||||||
esac && \
|
|
||||||
WHL_VERSION="v0.7.16" && \
|
|
||||||
WHL_FILE="flash_attn-2.8.3+cu${CUDA}${TORCH_TAG}-${PYTHON_CP}-${PYTHON_CP}-linux_${ARCH_TAG}.whl" && \
|
|
||||||
wget -nv "https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/${WHL_VERSION}/${WHL_FILE}" && \
|
|
||||||
uv pip install --no-cache-dir "${WHL_FILE}" && \
|
|
||||||
rm "${WHL_FILE}"
|
|
||||||
|
|||||||
2 docs/.gitignore vendored
@@ -3,5 +3,3 @@ _site/
|
|||||||
/api/*.qmd
|
/api/*.qmd
|
||||||
/api/*.html
|
/api/*.html
|
||||||
config-reference.qmd
|
config-reference.qmd
|
||||||
models/**/*.qmd
|
|
||||||
models/**/*.html
|
|
||||||
|
|||||||
@@ -86,7 +86,7 @@ export HF_DATASETS_OFFLINE=1
|
|||||||
Download a base model using the Hugging Face CLI:
|
Download a base model using the Hugging Face CLI:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
hf download meta-llama/Meta-Llama-3.1-8B --local-dir ~/hfdata/llama3.1-8B
|
huggingface-cli download meta-llama/Meta-Llama-3.1-8B --local-dir ~/hfdata/llama3.1-8B
|
||||||
```
|
```
|
||||||
|
|
||||||
### 10. Create Axolotl Configuration
|
### 10. Create Axolotl Configuration
|
||||||
|
|||||||
@@ -1,178 +0,0 @@
|
|||||||
---
|
|
||||||
title: Attention
|
|
||||||
description: Supported attention modules in Axolotl
|
|
||||||
---
|
|
||||||
|
|
||||||
## SDP Attention
|
|
||||||
|
|
||||||
This is the default built-in attention in PyTorch.
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
sdp_attention: true
|
|
||||||
```
|
|
||||||
|
|
||||||
For more details: [PyTorch docs](https://docs.pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)
|
|
||||||
|
|
||||||
## Flash Attention
|
|
||||||
|
|
||||||
Axolotl supports Flash Attention 2, 3, and 4. The best available version is used automatically
|
|
||||||
based on your installed packages and GPU.
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
flash_attention: true
|
|
||||||
```
|
|
||||||
|
|
||||||
For more details: [Flash Attention](https://github.com/Dao-AILab/flash-attention/)
|
|
||||||
|
|
||||||
### Flash Attention 2
|
|
||||||
|
|
||||||
Requirements: Ampere, Ada, or Hopper GPUs (Turing or lower not supported)
|
|
||||||
|
|
||||||
```bash
|
|
||||||
pip install flash-attn --no-build-isolation
|
|
||||||
```
|
|
||||||
|
|
||||||
::: {.callout-tip}
|
|
||||||
|
|
||||||
If you get `undefined symbol` while training, ensure you installed PyTorch prior to Axolotl.
|
|
||||||
Alternatively, try reinstalling or downgrading the version.
|
|
||||||
|
|
||||||
:::
|
|
||||||
|
|
||||||
### Flash Attention 3
|
|
||||||
|
|
||||||
Requirements: Hopper GPUs only; CUDA 12.8 (recommended)
|
|
||||||
|
|
||||||
```bash
|
|
||||||
git clone https://github.com/Dao-AILab/flash-attention.git
|
|
||||||
cd flash-attention/hopper
|
|
||||||
|
|
||||||
python setup.py install
|
|
||||||
```
|
|
||||||
|
|
||||||
### Flash Attention 4
|
|
||||||
|
|
||||||
Requirements: Hopper or Blackwell GPUs
|
|
||||||
|
|
||||||
```bash
|
|
||||||
pip install flash-attn-4
|
|
||||||
```
|
|
||||||
|
|
||||||
Or from source:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
git clone https://github.com/Dao-AILab/flash-attention.git
|
|
||||||
cd flash-attention/flash_attn/cute
|
|
||||||
|
|
||||||
pip install -e .
|
|
||||||
|
|
||||||
# FA2's flash_attn package includes a cute/ stub that shadows FA4.
|
|
||||||
# Remove it so Python can find the real FA4 module:
|
|
||||||
rm -r $(python -c "import flash_attn; print(flash_attn.__path__[0])")/cute
|
|
||||||
```
|
|
||||||
|
|
||||||
::: {.callout-note}
|
|
||||||
|
|
||||||
**Hopper (SM90) users**: The backward kernel is not yet included in the pip package. To use FA4
|
|
||||||
for training on Hopper, install from source using the instructions above.
|
|
||||||
|
|
||||||
:::
|
|
||||||
|
|
||||||
::: {.callout-warning}
|
|
||||||
|
|
||||||
FA4 only supports head dimensions up to 128 (`d ≤ 128`). The DeepSeek shape `(192, 128)` is
|
|
||||||
also supported but only on Blackwell. Axolotl automatically detects incompatible head dimensions
|
|
||||||
and falls back to FA2/3.
|
|
||||||
|
|
||||||
:::
|
|
||||||
|
|
||||||
For more details: [flash-attention/flash_attn/cute](https://github.com/Dao-AILab/flash-attention/tree/main/flash_attn/cute)
|
|
||||||
|
|
||||||
### AMD
|
|
||||||
|
|
||||||
Requirements: ROCm 6.0 and above.
|
|
||||||
|
|
||||||
See [Flash Attention AMD docs](https://github.com/Dao-AILab/flash-attention/tree/main?tab=readme-ov-file#amd-rocm-support).
|
|
||||||
|
|
||||||
## Flex Attention
|
|
||||||
|
|
||||||
A flexible PyTorch API for attention used in combination with `torch.compile`.
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
flex_attention: true
|
|
||||||
|
|
||||||
# recommended
|
|
||||||
torch_compile: true
|
|
||||||
```
|
|
||||||
|
|
||||||
::: {.callout-note}
|
|
||||||
|
|
||||||
We recommend using the latest stable version of PyTorch for best performance.
|
|
||||||
|
|
||||||
:::
|
|
||||||
|
|
||||||
For more details: [PyTorch docs](https://pytorch.org/blog/flexattention/)
|
|
||||||
|
|
||||||
## SageAttention
|
|
||||||
|
|
||||||
Attention kernels with QK Int8 and PV FP16 accumulator.
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
sage_attention: true
|
|
||||||
```
|
|
||||||
|
|
||||||
Requirements: Ampere, Ada, or Hopper GPUs
|
|
||||||
|
|
||||||
```bash
|
|
||||||
pip install sageattention==2.2.0 --no-build-isolation
|
|
||||||
```
|
|
||||||
|
|
||||||
::: {.callout-warning}
|
|
||||||
|
|
||||||
Only LoRA/QLoRA recommended at the moment. We found loss drop to 0 for full finetuning. See [GitHub Issue](https://github.com/thu-ml/SageAttention/issues/198).
|
|
||||||
|
|
||||||
:::
|
|
||||||
|
|
||||||
For more details: [Sage Attention](https://github.com/thu-ml/SageAttention)
|
|
||||||
|
|
||||||
::: {.callout-note}
|
|
||||||
|
|
||||||
We do not support SageAttention 3 at the moment. If you are interested in adding this or improving the SageAttention implementation, please open an issue.
|
|
||||||
|
|
||||||
:::
|
|
||||||
|
|
||||||
|
|
||||||
## xFormers
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
xformers_attention: true
|
|
||||||
```
|
|
||||||
|
|
||||||
::: {.callout-tip}
|
|
||||||
|
|
||||||
We recommend using this with Turing GPUs or older (such as on Colab).
|
|
||||||
|
|
||||||
:::
|
|
||||||
|
|
||||||
For more details: [xFormers](https://github.com/facebookresearch/xformers)
|
|
||||||
|
|
||||||
## Shifted Sparse Attention
|
|
||||||
|
|
||||||
::: {.callout-warning}
|
|
||||||
|
|
||||||
We plan to deprecate this! If you use this feature, we recommend switching to methods above.
|
|
||||||
|
|
||||||
:::
|
|
||||||
|
|
||||||
Requirements: LLaMA model architecture
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
flash_attention: true
|
|
||||||
s2_attention: true
|
|
||||||
```
|
|
||||||
|
|
||||||
::: {.callout-tip}
|
|
||||||
|
|
||||||
No sample packing support!
|
|
||||||
|
|
||||||
:::
|
|
||||||
@@ -1,86 +0,0 @@
|
|||||||
---
|
|
||||||
title: "Checkpoint Saving"
|
|
||||||
format:
|
|
||||||
html:
|
|
||||||
toc: true
|
|
||||||
toc-depth: 2
|
|
||||||
number-sections: true
|
|
||||||
execute:
|
|
||||||
enabled: false
|
|
||||||
---
|
|
||||||
|
|
||||||
## Overview
|
|
||||||
|
|
||||||
Axolotl supports on-demand checkpoint saving during training. You can trigger checkpoints via file-based triggers (for programmatic control) or Control+C (for interactive use).
|
|
||||||
|
|
||||||
## File-Based Checkpoint Trigger
|
|
||||||
|
|
||||||
### Configuration
|
|
||||||
|
|
||||||
Enable in your config:
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
dynamic_checkpoint:
|
|
||||||
enabled: true
|
|
||||||
check_interval: 100 # Optional: check every N steps (default: 100)
|
|
||||||
trigger_file_path: "axolotl_checkpoint.save" # Optional: custom filename
|
|
||||||
```
|
|
||||||
|
|
||||||
**Options:**
|
|
||||||
- `enabled`: `true` to enable (required)
|
|
||||||
- `check_interval`: Steps between file checks. Default: 100. Lower = faster response, higher I/O overhead.
|
|
||||||
- `trigger_file_path`: Custom trigger filename. Default: `axolotl_checkpoint.save`
|
|
||||||
|
|
||||||
### How It Works
|
|
||||||
|
|
||||||
1. Rank 0 checks for trigger file every `check_interval` steps in `output_dir`
|
|
||||||
2. When detected, file is deleted and checkpoint is saved
|
|
||||||
3. In distributed training, rank 0 broadcasts to synchronize all ranks
|
|
||||||
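A minimal sketch of what the rank-0 check can look like is below (a hypothetical helper, not Axolotl's actual callback; the real implementation also broadcasts the decision so all ranks save together):

```python
import os

def should_save_checkpoint(output_dir: str, trigger_file: str = "axolotl_checkpoint.save") -> bool:
    """Rank 0 only: return True (and consume the trigger file) if a save was requested."""
    path = os.path.join(output_dir, trigger_file)
    if os.path.exists(path):
        os.remove(path)  # delete so the trigger can be re-created for later saves
        return True
    return False

# Called every `check_interval` steps inside the training loop, e.g.:
# if is_rank_zero and should_save_checkpoint(cfg.output_dir):
#     save_full_checkpoint()  # placeholder for the trainer's checkpoint call
```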
|
|
||||||
### Usage
|
|
||||||
|
|
||||||
**Command line:**
|
|
||||||
```bash
|
|
||||||
touch /path/to/output_dir/axolotl_checkpoint.save
|
|
||||||
```
|
|
||||||
|
|
||||||
**Programmatic:**
|
|
||||||
```python
|
|
||||||
from pathlib import Path
|
|
||||||
Path("/path/to/output_dir/axolotl_checkpoint.save").touch()
|
|
||||||
```
|
|
||||||
|
|
||||||
The checkpoint is saved within the next `check_interval` steps. The trigger file is auto-deleted after detection, so you can create it again to trigger further saves.
|
|
||||||
|
|
||||||
**Custom filename:**
|
|
||||||
```yaml
|
|
||||||
dynamic_checkpoint:
|
|
||||||
enabled: true
|
|
||||||
trigger_file_path: "my_trigger.save"
|
|
||||||
```
|
|
||||||
```bash
|
|
||||||
touch /path/to/output_dir/my_trigger.save
|
|
||||||
```
|
|
||||||
|
|
||||||
## Control+C (SIGINT) Checkpoint
|
|
||||||
|
|
||||||
Pressing `Ctrl+C` during training saves the model state and exits gracefully. **Note:** This saves only the model weights, not optimizer state. For resumable checkpoints, use the file-based trigger.
|
|
||||||
|
|
||||||
## Best Practices
|
|
||||||
|
|
||||||
- **Check interval**: Lower values (10-50) for fast training, default 100 for slower training
|
|
||||||
- **Distributed training**: Create trigger file once; rank 0 handles synchronization
|
|
||||||
- **Resume**: Dynamic checkpoints can be resumed like regular checkpoints via `resume_from_checkpoint`
|
|
||||||
|
|
||||||
## Example
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
output_dir: ./outputs/lora-out
|
|
||||||
save_steps: 500 # Scheduled checkpoints
|
|
||||||
|
|
||||||
dynamic_checkpoint:
|
|
||||||
enabled: true
|
|
||||||
check_interval: 50
|
|
||||||
```
|
|
||||||
|
|
||||||
This enables scheduled checkpoints every 500 steps plus on-demand saves via file trigger (checked every 50 steps).
|
|
||||||
@@ -210,8 +210,6 @@ axolotl lm-eval config.yml
|
|||||||
Configuration options:
|
Configuration options:
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
lm_eval_model: # model to evaluate (local or hf path)
|
|
||||||
|
|
||||||
# List of tasks to evaluate
|
# List of tasks to evaluate
|
||||||
lm_eval_tasks:
|
lm_eval_tasks:
|
||||||
- arc_challenge
|
- arc_challenge
|
||||||
@@ -220,7 +218,7 @@ lm_eval_batch_size: # Batch size for evaluation
|
|||||||
output_dir: # Directory to save evaluation results
|
output_dir: # Directory to save evaluation results
|
||||||
```
|
```
|
||||||
|
|
||||||
See [LM Eval Harness integration docs](https://docs.axolotl.ai/docs/custom_integrations.html#language-model-evaluation-harness-lm-eval) for full configuration details.
|
See [LM Eval Harness](https://github.com/EleutherAI/lm-evaluation-harness) for more details.
|
||||||
|
|
||||||
### delinearize-llama4
|
### delinearize-llama4
|
||||||
|
|
||||||
|
|||||||
@@ -218,13 +218,6 @@ If you have tool arguments with same name but different dtypes (like `"time": st
|
|||||||
```
|
```
|
||||||
"arguments": "{\"...\": \"...\"}"
|
"arguments": "{\"...\": \"...\"}"
|
||||||
```
|
```
|
||||||
|
|
||||||
The same is applicable for tool parameters.
|
|
||||||
|
|
||||||
```
|
|
||||||
"parameters": "{\"...\": \"...\"}"
|
|
||||||
```
|
|
||||||
|
|
||||||
:::
|
:::
|
||||||
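For illustration only, a tool-call record with stringified `arguments` might look like the hypothetical sample below (field names follow the OpenAI-style tool-call message layout; your dataset schema may differ):

```json
{
  "role": "assistant",
  "tool_calls": [
    {
      "type": "function",
      "function": {
        "name": "get_weather",
        "arguments": "{\"location\": \"Paris\", \"time\": \"2024-06-01T10:00:00\"}"
      }
    }
  ]
}
```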
|
|
||||||
Example config for Llama4:
|
Example config for Llama4:
|
||||||
|
|||||||
@@ -29,7 +29,7 @@ While debugging it's helpful to simplify your test scenario as much as possible.
|
|||||||
1. **Make sure you are using the latest version of axolotl**: This project changes often and bugs get fixed fast. Check your git branch and make sure you have pulled the latest changes from `main`.
|
1. **Make sure you are using the latest version of axolotl**: This project changes often and bugs get fixed fast. Check your git branch and make sure you have pulled the latest changes from `main`.
|
||||||
1. **Eliminate concurrency**: Restrict the number of processes to 1 for both training and data preprocessing:
|
1. **Eliminate concurrency**: Restrict the number of processes to 1 for both training and data preprocessing:
|
||||||
- Set `CUDA_VISIBLE_DEVICES` to a single GPU, ex: `export CUDA_VISIBLE_DEVICES=0`.
|
- Set `CUDA_VISIBLE_DEVICES` to a single GPU, ex: `export CUDA_VISIBLE_DEVICES=0`.
|
||||||
- Set `dataset_num_proc: 1` in your axolotl config or run the training command with `--dataset_num_proc=1`.
|
- Set `dataset_processes: 1` in your axolotl config or run the training command with `--dataset_processes=1`.
|
||||||
2. **Use a small dataset**: Construct or use a small dataset from HF Hub. When using a small dataset, you will often have to make sure `sample_packing: False` and `eval_sample_packing: False` to avoid errors. If you are in a pinch and don't have time to construct a small dataset but want to use one from the HF Hub, you can shard the data. This will still tokenize the entire dataset, but will only use a fraction of the data for training. For example, to shard the dataset into 20 pieces, add the following to your axolotl config:
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
@@ -72,8 +72,8 @@ datasets:
|
|||||||
Make sure you have an [editable install](https://setuptools.pypa.io/en/latest/userguide/development_mode.html) of Axolotl, which ensures that changes you make to the code are reflected at runtime. Run the following commands from the root of this project:
|
Make sure you have an [editable install](https://setuptools.pypa.io/en/latest/userguide/development_mode.html) of Axolotl, which ensures that changes you make to the code are reflected at runtime. Run the following commands from the root of this project:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
pip3 install packaging
|
uv sync --extra deepspeed
|
||||||
pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
|
uv pip install flash-attn --no-build-isolation
|
||||||
```
|
```
|
||||||
|
|
||||||
#### Remote Hosts
|
#### Remote Hosts
|
||||||
@@ -101,7 +101,7 @@ For example, to mimic the command `cd devtools && CUDA_VISIBLE_DEVICES=0 acceler
|
|||||||
"-m", "axolotl.cli.train", "dev_chat_template.yml",
|
"-m", "axolotl.cli.train", "dev_chat_template.yml",
|
||||||
// The flags below simplify debugging by overriding the axolotl config
|
// The flags below simplify debugging by overriding the axolotl config
|
||||||
// with the debugging tips above. Modify as needed.
|
// with the debugging tips above. Modify as needed.
|
||||||
"--dataset_num_proc=1", // limits data preprocessing to one process
|
"--dataset_processes=1", // limits data preprocessing to one process
|
||||||
"--max_steps=1", // limits training to just one step
|
"--max_steps=1", // limits training to just one step
|
||||||
"--batch_size=1", // minimizes batch size
|
"--batch_size=1", // minimizes batch size
|
||||||
"--micro_batch_size=1", // minimizes batch size
|
"--micro_batch_size=1", // minimizes batch size
|
||||||
@@ -213,8 +213,8 @@ docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --
|
|||||||
You will now be in the container. Next, perform an editable install of Axolotl:
|
You will now be in the container. Next, perform an editable install of Axolotl:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
pip3 install packaging
|
uv sync --extra deepspeed
|
||||||
pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
|
uv pip install flash-attn --no-build-isolation
|
||||||
```
|
```
|
||||||
|
|
||||||
### Attach To Container
|
### Attach To Container
|
||||||
|
|||||||
@@ -32,8 +32,11 @@ main-base-py{python_version}-cu{cuda_version}-{pytorch_version}
|
|||||||
|
|
||||||
Tags examples:
|
Tags examples:
|
||||||
|
|
||||||
- `main-base-py3.11-cu128-2.8.0`
|
- `main-base-py3.11-cu128-2.7.1`
|
||||||
- `main-base-py3.11-cu128-2.9.1`
|
- `main-base-py3.11-cu126-2.7.1`
|
||||||
|
- `main-base-py3.11-cu126-2.7.0`
|
||||||
|
- `main-base-py3.11-cu126-2.6.0`
|
||||||
|
- `main-base-py3.11-cu124-2.6.0`
|
||||||
|
|
||||||
## Main
|
## Main
|
||||||
|
|
||||||
@@ -71,12 +74,15 @@ There may be some extra tags appended to the image, like `-vllm` which installs
|
|||||||
|
|
||||||
Tags examples:
|
Tags examples:
|
||||||
|
|
||||||
- `main-py3.11-cu128-2.8.0`
|
- `main-py3.11-cu128-2.7.1`
|
||||||
- `main-py3.11-cu128-2.9.1`
|
- `main-py3.11-cu126-2.7.1`
|
||||||
|
- `main-py3.11-cu126-2.7.0`
|
||||||
|
- `main-py3.11-cu126-2.6.0`
|
||||||
|
- `main-py3.11-cu124-2.6.0`
|
||||||
- `main-latest`
|
- `main-latest`
|
||||||
- `main-20250303-py3.11-cu124-2.6.0`
|
- `main-20250303-py3.11-cu124-2.6.0`
|
||||||
- `main-20250303-py3.11-cu126-2.6.0`
|
- `main-20250303-py3.11-cu126-2.6.0`
|
||||||
- `0.12.0`
|
- `0.10.1`
|
||||||
|
|
||||||
## Cloud
|
## Cloud
|
||||||
|
|
||||||
|
|||||||
@@ -1,67 +0,0 @@
|
|||||||
---
|
|
||||||
title: "MoE Expert Quantization"
|
|
||||||
description: "Reduce VRAM usage when training MoE model adapters by quantizing expert weights on load"
|
|
||||||
---
|
|
||||||
|
|
||||||
Transformers v5 changed MoE expert layers from `nn.Linear` to fused `nn.Parameter` (3D+ tensors).
|
|
||||||
This means `bitsandbytes` can no longer quantize them during model loading, resulting in all expert
|
|
||||||
weights being loaded in full bf16 precision and causing massive VRAM usage.
|
|
||||||
|
|
||||||
`quantize_moe_experts` solves this by quantizing expert weights during model loading.
|
|
||||||
It intercepts the weight loading process, quantizes each expert tensor on the fly, and
|
|
||||||
immediately frees the original bf16 tensor from VRAM. This dramatically reduces peak memory.
|
|
||||||
For example, GLM-4.7-Flash QLoRA drops from ~127GiB to ~23GiB reserved memory.
|
|
||||||
|
|
||||||
## Usage
|
|
||||||
|
|
||||||
Enable expert quantization in your Axolotl config:
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
quantize_moe_experts: true
|
|
||||||
```
|
|
||||||
|
|
||||||
This works with both 4-bit (QLoRA) and 8-bit (LoRA) quantization.
|
|
||||||
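As a point of reference, a minimal QLoRA-style config combining these options might look like the sketch below (placeholder model name and illustrative values; see the Requirements section for the supported adapter/quantization combinations):

```yaml
base_model: your-org/your-moe-model   # placeholder, e.g. the GLM-4.7-Flash model mentioned above
adapter: qlora
load_in_4bit: true
quantize_moe_experts: true
```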
|
|
||||||
### Expert LoRA targeting
|
|
||||||
|
|
||||||
You can optionally apply LoRA adapters directly to expert weights using `lora_target_parameters`:
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
lora_target_parameters:
|
|
||||||
- mlp.experts.gate_up_proj
|
|
||||||
- mlp.experts.down_proj
|
|
||||||
# - mlp.gate.weight # router
|
|
||||||
```
|
|
||||||
|
|
||||||
::: {.callout-note}
|
|
||||||
`lora_dropout` must be `0` when using `lora_target_parameters`.
|
|
||||||
:::
|
|
||||||
|
|
||||||
## Requirements
|
|
||||||
|
|
||||||
- Requires (`adapter: lora` and `load_in_8bit: true`) or (`adapter: qlora` and `load_in_4bit: true`)
|
|
||||||
- CUDA GPUs only (not tested with ROCm or other backends)
|
|
||||||
- FSDP2 compatible for distributed training
|
|
||||||
|
|
||||||
## Limitations
|
|
||||||
|
|
||||||
- `lora_target_linear` is not compatible with `quantize_moe_experts`. See [Expert LoRA targeting](#expert-lora-targeting) instead.
|
|
||||||
- `cpu_ram_efficient_loading` hangs or takes a long time with FSDP2 + QLoRA.
|
|
||||||
- Total model parameter count may display incorrectly (trainable param count is correct).
|
|
||||||
- FSDP LoRA (8-bit) may have a large initial VRAM spike at the first 1-2 steps, which then drops. QLoRA does not exhibit this.
|
|
||||||
- FSDP2 may use more VRAM per GPU than single GPU training due to not all layers being properly sharded across ranks.
|
|
||||||
- Model loading takes longer due to on-demand quantization, even on consecutive runs.
|
|
||||||
- DeepSpeed has not been tested.
|
|
||||||
|
|
||||||
## Implementation details
|
|
||||||
|
|
||||||
The quantization is applied by patching transformers to intercept weight loading.
|
|
||||||
When a 3D+ CUDA tensor with "expert" in its name is detected:
|
|
||||||
|
|
||||||
- **4-bit mode:** Uses bitsandbytes NF4 parametrization (configurable via `bnb_4bit_quant_type`).
|
|
||||||
- **8-bit mode:** Uses a custom row-wise int8 parametrization with bitsandbytes dequantization.
|
|
||||||
|
|
||||||
The original bf16 tensor is freed immediately after quantization. Multiple sub-patches are applied to
|
|
||||||
transformers, PEFT and accelerate FSDP2 to support these parametrized expert modules.
|
|
||||||
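As a rough illustration of the 8-bit path, row-wise int8 quantization of a fused expert tensor can be sketched in plain PyTorch as follows (a simplified standalone example, not the actual patched code path):

```python
import torch

def quantize_rowwise_int8(experts: torch.Tensor):
    """Quantize a fused expert weight tensor (e.g. [num_experts, out, in]) row-wise to int8."""
    flat = experts.reshape(-1, experts.shape[-1])                     # one scale per row
    scales = flat.abs().amax(dim=-1, keepdim=True).clamp(min=1e-8) / 127.0
    q = torch.round(flat / scales).to(torch.int8)                     # int8 payload
    return q.reshape(experts.shape), scales.reshape(*experts.shape[:-1], 1)

def dequantize_rowwise_int8(q: torch.Tensor, scales: torch.Tensor, dtype=torch.bfloat16):
    return q.to(dtype) * scales.to(dtype)                             # approximate bf16 weights

# The stored parameter becomes int8 data plus per-row scales; it is dequantized
# on the fly when the expert is used in the forward pass.
```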
|
|
||||||
For full implementation details, see [PR #3439](https://github.com/axolotl-ai-cloud/axolotl/pull/3439).
|
|
||||||
@@ -63,14 +63,6 @@ description: Frequently asked questions
|
|||||||
|
|
||||||
> A: There seems to be a wheel issue with FA2 2.8.0 on CUDA 12.4. Try CUDA 12.6 instead or downgrade to FA2 2.7.4. Please refer to the upstream issue: https://github.com/Dao-AILab/flash-attention/issues/1717.
|
> A: There seems to be a wheel issue with FA2 2.8.0 on CUDA 12.4. Try CUDA 12.6 instead or downgrade to FA2 2.7.4. Please refer to the upstream issue: https://github.com/Dao-AILab/flash-attention/issues/1717.
|
||||||
|
|
||||||
**Q: Can we mix text and text+image datasets for VLM training?**
|
|
||||||
|
|
||||||
> A: Yes, you can for newer VLM architectures. The ones that do not work are the LLaVA / Pixtral architectures. If you notice one not working, please let us know!
|
|
||||||
|
|
||||||
**Q: Why is `memory/max_*` different from `nvidia-smi`?**
|
|
||||||
|
|
||||||
> A: We use `torch` APIs to retrieve this information. You can see https://docs.pytorch.org/docs/stable/notes/cuda.html#cuda-memory-management for more information.
|
|
||||||
|
|
||||||
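For reference, these are the kinds of PyTorch calls involved (a small sketch; the exact metrics Axolotl logs may be named differently):

```python
import torch

# Peak memory as seen by PyTorch's caching allocator (per device). This is usually
# lower than what nvidia-smi reports, since the CUDA context, allocator overhead,
# and other processes on the GPU are not included.
max_allocated = torch.cuda.max_memory_allocated() / 1024**3  # GiB used by tensors
max_reserved = torch.cuda.max_memory_reserved() / 1024**3    # GiB reserved by the allocator
print(f"max allocated: {max_allocated:.2f} GiB, max reserved: {max_reserved:.2f} GiB")
```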
### Chat templates
|
### Chat templates
|
||||||
|
|
||||||
**Q: `jinja2.exceptions.UndefinedError: 'dict object' has no attribute 'content' / 'role' / ____`**
|
**Q: `jinja2.exceptions.UndefinedError: 'dict object' has no attribute 'content' / 'role' / ____`**
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
---
|
---
|
||||||
title: Gradient Checkpointing, Activation Offloading, and Layer Offloading
|
title: Gradient Checkpointing and Activation Offloading
|
||||||
---
|
---
|
||||||
|
|
||||||
Gradient checkpointing and activation offloading are techniques used to optimize the performance of deep learning
|
Gradient checkpointing and activation offloading are techniques used to optimize the performance of deep learning
|
||||||
@@ -27,33 +27,3 @@ The `activation_offloading: legacy` naively offloads activations to CPU and with
|
|||||||
|
|
||||||
For resource constrained environments with limited CPU memory, `activation_offloading: disk` offloads
|
For resource constrained environments with limited CPU memory, `activation_offloading: disk` offloads
|
||||||
activations to disk instead of CPU RAM so that much larger context lengths can be trained with minimal memory.
|
activations to disk instead of CPU RAM so that much larger context lengths can be trained with minimal memory.
|
||||||
|
|
||||||
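Putting these options together, a config snippet could look like the following sketch (illustrative values):

```yaml
gradient_checkpointing: true

# Offload activations to disk instead of CPU RAM, as described above;
# `legacy` is the naive CPU-offloading variant.
activation_offloading: disk
```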
### Enabling Layer Offloading
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
layer_offloading: true
|
|
||||||
```
|
|
||||||
|
|
||||||
Layer offloading reduces GPU memory usage by moving frozen (non-trainable) decoder layer parameters to CPU
|
|
||||||
and streaming them back to GPU one layer at a time during the forward and backward passes. This is
|
|
||||||
particularly useful for LoRA/QLoRA training where most of the model's parameters are frozen — only the
|
|
||||||
trainable adapter weights stay on GPU permanently.
|
|
||||||
|
|
||||||
During training, forward and backward hooks on each decoder layer handle the transfer automatically:
|
|
||||||
|
|
||||||
- **Forward pass:** Before a layer executes, its frozen params are loaded to GPU. The next layer is
|
|
||||||
prefetched asynchronously on a separate CUDA stream for overlap.
|
|
||||||
- **Backward pass:** Same pattern in reverse — the current layer's frozen params are loaded and the
|
|
||||||
previous layer is prefetched.
|
|
||||||
|
|
||||||
After each layer finishes, its frozen params are offloaded back to CPU pinned memory.
|
|
||||||
|
|
||||||
This approach trades some CPU-GPU transfer overhead for significant GPU memory savings — the freed memory
|
|
||||||
is roughly equal to the size of all frozen parameters across all decoder layers, minus one layer's worth
|
|
||||||
that is kept on GPU at any given time.
|
|
||||||
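A minimal sketch of the forward-path hook pattern described above (illustrative only; the real implementation also re-loads parameters for the backward pass, prefetches the next layer on a separate CUDA stream, and uses pinned CPU memory):

```python
import torch

def attach_layer_offload_hooks(layer: torch.nn.Module, device: torch.device):
    """Keep a frozen decoder layer on CPU and stream it to the GPU only while it runs."""

    def load_to_gpu(module, args):  # forward pre-hook: bring frozen params onto the GPU
        for p in module.parameters(recurse=True):
            if not p.requires_grad:
                p.data = p.data.to(device, non_blocking=True)

    def offload_to_cpu(module, args, output):  # forward hook: send them back to CPU
        for p in module.parameters(recurse=True):
            if not p.requires_grad:
                p.data = p.data.to("cpu", non_blocking=True)

    layer.register_forward_pre_hook(load_to_gpu)
    layer.register_forward_hook(offload_to_cpu)
```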
|
|
||||||
**Requirements:**
|
|
||||||
|
|
||||||
- CUDA GPU (CPU-only training is not supported for this feature)
|
|
||||||
- Works with any HuggingFace model architecture that uses decoder layers (Llama, Mistral, Qwen, etc.)
|
|
||||||
- Best combined with LoRA/QLoRA where most parameters are frozen
|
|
||||||
|
|||||||
@@ -26,22 +26,43 @@ Follow the instructions at: [https://pytorch.org/get-started/locally/](https://p
|
|||||||
:::
|
:::
|
||||||
|
|
||||||
::: {.callout-important}
|
::: {.callout-important}
|
||||||
For Blackwell GPUs, please use PyTorch 2.9.1 and CUDA 12.8.
|
For Blackwell GPUs, please use PyTorch 2.7.0 and CUDA 12.8.
|
||||||
:::
|
:::
|
||||||
|
|
||||||
### PyPI Installation (Recommended) {#sec-pypi}
|
### uv Installation (Recommended) {#sec-uv-quick}
|
||||||
|
|
||||||
```{.bash}
|
```{.bash}
|
||||||
pip3 install -U packaging setuptools wheel ninja
|
# Install uv if not already installed
|
||||||
pip3 install --no-build-isolation axolotl[flash-attn,deepspeed]
|
curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||||
|
|
||||||
|
# Add Axolotl to a project (recommended)
|
||||||
|
uv init my-project && cd my-project
|
||||||
|
uv add axolotl
|
||||||
|
uv pip install flash-attn --no-build-isolation
|
||||||
|
source .venv/bin/activate
|
||||||
|
```
|
||||||
|
|
||||||
|
For a quick one-off install without creating a project:
|
||||||
|
|
||||||
|
```{.bash}
|
||||||
|
uv pip install axolotl
|
||||||
|
uv pip install flash-attn --no-build-isolation
|
||||||
|
```
|
||||||
|
|
||||||
|
### pip Installation {#sec-pypi}
|
||||||
|
|
||||||
|
```{.bash}
|
||||||
|
pip install --no-build-isolation axolotl[deepspeed]
|
||||||
|
pip install --no-build-isolation flash-attn
|
||||||
```
|
```
|
||||||
|
|
||||||
We use `--no-build-isolation` so that the build can detect an already-installed PyTorch version, avoid clobbering it, and select the dependency versions that match that PyTorch version and other installed co-dependencies. Flash Attention is resolved separately so it can be built against the environment configured by the previous step.
|
|
||||||
### uv Installation {#sec-uv}
|
### Advanced uv Installation {#sec-uv}
|
||||||
|
|
||||||
uv is a fast, reliable Python package installer and resolver built in Rust. It offers significant performance improvements over pip and provides better dependency resolution, making it an excellent choice for complex environments.
|
uv is a fast, reliable Python package installer and resolver built in Rust. It offers significant performance improvements over pip and provides better dependency resolution, making it an excellent choice for complex environments.
|
||||||
|
|
||||||
@@ -62,28 +83,38 @@ source .venv/bin/activate
|
|||||||
Install PyTorch
|
Install PyTorch
|
||||||
- PyTorch 2.6.0 recommended
|
- PyTorch 2.6.0 recommended
|
||||||
```{.bash}
|
```{.bash}
|
||||||
uv pip install packaging setuptools wheel
|
|
||||||
uv pip install torch==2.6.0
|
uv pip install torch==2.6.0
|
||||||
uv pip install awscli pydantic
|
uv pip install awscli pydantic
|
||||||
```
|
```
|
||||||
|
|
||||||
Install axolotl from PyPi
|
Install axolotl from PyPi
|
||||||
```{.bash}
|
```{.bash}
|
||||||
uv pip install --no-build-isolation axolotl[deepspeed,flash-attn]
|
uv pip install --no-build-isolation axolotl[deepspeed]
|
||||||
|
|
||||||
# optionally install with vLLM if you're using torch==2.6.0 and want to train w/ GRPO
|
# optionally install with vLLM if you're using torch==2.6.0 and want to train w/ GRPO
|
||||||
uv pip install --no-build-isolation axolotl[deepspeed,flash-attn,vllm]
|
# uv pip install --no-build-isolation axolotl[deepspeed,vllm]
|
||||||
|
|
||||||
|
uv pip install flash-attn --no-build-isolation
|
||||||
```
|
```
|
||||||
|
|
||||||
### Edge/Development Build {#sec-edge-build}
|
### Edge/Development Build {#sec-edge-build}
|
||||||
|
|
||||||
For the latest features between releases:
|
For the latest features between releases:
|
||||||
|
|
||||||
|
#### Using uv (recommended)
|
||||||
```{.bash}
|
```{.bash}
|
||||||
git clone https://github.com/axolotl-ai-cloud/axolotl.git
|
git clone https://github.com/axolotl-ai-cloud/axolotl.git
|
||||||
cd axolotl
|
cd axolotl
|
||||||
pip3 install -U packaging setuptools wheel ninja
|
curl -LsSf https://astral.sh/uv/install.sh | sh # If not already installed
|
||||||
pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
|
uv sync
|
||||||
|
uv pip install flash-attn --no-build-isolation
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Using pip
|
||||||
|
```{.bash}
|
||||||
|
git clone https://github.com/axolotl-ai-cloud/axolotl.git
|
||||||
|
cd axolotl
|
||||||
|
pip install --no-build-isolation -e '.[deepspeed]'
|
||||||
|
pip install --no-build-isolation flash-attn
|
||||||
```
|
```
|
||||||
|
|
||||||
### Docker {#sec-docker}
|
### Docker {#sec-docker}
|
||||||
@@ -111,7 +142,7 @@ docker run --privileged --gpus '"all"' --shm-size 10g --rm -it \
|
|||||||
:::
|
:::
|
||||||
|
|
||||||
::: {.callout-important}
|
::: {.callout-important}
|
||||||
For Blackwell GPUs, please use `axolotlai/axolotl:main-py3.11-cu128-2.9.1` or the cloud variant `axolotlai/axolotl-cloud:main-py3.11-cu128-2.9.1`.
|
For Blackwell GPUs, please use `axolotlai/axolotl:main-py3.11-cu128-2.7.0` or the cloud variant `axolotlai/axolotl-cloud:main-py3.11-cu128-2.7.0`.
|
||||||
:::
|
:::
|
||||||
|
|
||||||
Please refer to the [Docker documentation](docker.qmd) for more information on the different Docker images that are available.
|
Please refer to the [Docker documentation](docker.qmd) for more information on the different Docker images that are available.
|
||||||
@@ -141,7 +172,7 @@ For providers supporting Docker:
|
|||||||
### macOS {#sec-macos}
|
### macOS {#sec-macos}
|
||||||
|
|
||||||
```{.bash}
|
```{.bash}
|
||||||
pip3 install --no-build-isolation -e '.'
|
uv pip install --no-build-isolation -e '.'
|
||||||
```
|
```
|
||||||
|
|
||||||
See @sec-troubleshooting for Mac-specific issues.
|
See @sec-troubleshooting for Mac-specific issues.
|
||||||
@@ -160,12 +191,17 @@ We recommend using WSL2 (Windows Subsystem for Linux) or Docker.
|
|||||||
2. Install PyTorch: https://pytorch.org/get-started/locally/
|
2. Install PyTorch: https://pytorch.org/get-started/locally/
|
||||||
3. Install Axolotl:
|
3. Install Axolotl:
|
||||||
```{.bash}
|
```{.bash}
|
||||||
pip3 install -U packaging setuptools wheel ninja
|
# Option A: add Axolotl to the environment
|
||||||
pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
|
uv add axolotl
|
||||||
|
uv pip install flash-attn --no-build-isolation
|
||||||
|
|
||||||
|
# Option B: quick install
|
||||||
|
uv pip install axolotl
|
||||||
|
uv pip install flash-attn --no-build-isolation
|
||||||
```
|
```
|
||||||
4. (Optional) Login to Hugging Face:
|
4. (Optional) Login to Hugging Face:
|
||||||
```{.bash}
|
```{.bash}
|
||||||
hf auth login
|
huggingface-cli login
|
||||||
```
|
```
|
||||||
|
|
||||||
## Troubleshooting {#sec-troubleshooting}
|
## Troubleshooting {#sec-troubleshooting}
|
||||||
|
|||||||
@@ -89,10 +89,6 @@ lora_o_kernel: true
|
|||||||
Currently, LoRA kernels are not supported for RLHF training, only SFT.
|
Currently, LoRA kernels are not supported for RLHF training, only SFT.
|
||||||
:::
|
:::
|
||||||
|
|
||||||
::: {.callout-warning}
|
|
||||||
LoRA kernels do not support remote modeling code.
|
|
||||||
:::
|
|
||||||
|
|
||||||
## Requirements
|
## Requirements
|
||||||
|
|
||||||
- One or more NVIDIA or AMD GPUs (in order to use the Triton kernels)
|
- One or more NVIDIA or AMD GPUs (in order to use the Triton kernels)
|
||||||
|
|||||||
@@ -27,9 +27,3 @@ learning_rate: 2e-5
|
|||||||
In this example, we have a default learning rate of 2e-5 across the entire model, but we apply a separate learning rate of 1e-6 to all the self attention `o_proj` modules across all layers, and a learning rate of 1e-5 to the 3rd layer's self attention `q_proj` module.
|
||||||
|
|
||||||
::: {.callout-note}
|
|
||||||
|
|
||||||
We currently only support varying `lr`. If you're interested in adding support for other options (e.g., `weight_decay`), we welcome PRs. See https://github.com/axolotl-ai-cloud/axolotl/blob/613bcf90e58f3ab81d3827e7fc572319908db9fb/src/axolotl/core/trainers/mixins/optimizer.py#L17
|
|
||||||
|
|
||||||
:::
|
|
||||||
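Under the hood this maps onto standard PyTorch optimizer parameter groups; conceptually it is equivalent to the following sketch (plain PyTorch, not Axolotl's actual optimizer mixin):

```python
import torch

def build_param_groups(model, default_lr=2e-5, overrides=None):
    """Assign per-module learning rates by matching parameter-name substrings."""
    # the 3rd layer is index 2 in most Hugging Face decoder stacks
    overrides = overrides or {"self_attn.o_proj": 1e-6, "layers.2.self_attn.q_proj": 1e-5}
    groups, default_params = [], []
    for name, param in model.named_parameters():
        for pattern, lr in overrides.items():
            if pattern in name:
                groups.append({"params": [param], "lr": lr})
                break
        else:
            default_params.append(param)
    groups.append({"params": default_params, "lr": default_lr})
    return groups

# optimizer = torch.optim.AdamW(build_param_groups(model))
```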
|
|||||||
@@ -4,7 +4,7 @@ format:
|
|||||||
html:
|
html:
|
||||||
toc: true
|
toc: true
|
||||||
toc-depth: 3
|
toc-depth: 3
|
||||||
# number-sections: true
|
number-sections: true
|
||||||
code-tools: true
|
code-tools: true
|
||||||
execute:
|
execute:
|
||||||
enabled: false
|
enabled: false
|
||||||
@@ -14,18 +14,12 @@ This guide covers advanced training configurations for multi-GPU setups using Ax
|
|||||||
|
|
||||||
## Overview {#sec-overview}
|
## Overview {#sec-overview}
|
||||||
|
|
||||||
When training on multiple GPUs, Axolotl supports 3 sharding/parallelism strategies. Additionally, you can layer specific optimization features on top of that strategy.
|
Axolotl supports several methods for multi-GPU training:
|
||||||
|
|
||||||
You generally cannot combine these strategies; they are mutually exclusive.
|
- DeepSpeed (recommended)
|
||||||
|
- FSDP (Fully Sharded Data Parallel)
|
||||||
1. **DeepSpeed**: Powerful optimization library, supports ZeRO stages 1-3.
|
- Sequence parallelism
|
||||||
2. **FSDP (Fully Sharded Data Parallel)**: PyTorch's native sharding implementation (Recommended).
|
- FSDP + QLoRA
|
||||||
3. **DDP (Distributed Data Parallel)**: PyTorch's native parallelism implementation (Default if neither of the above are selected).
|
|
||||||
|
|
||||||
These features can often be combined with the strategies above:
|
|
||||||
|
|
||||||
* **Sequence Parallelism**: Splits long sequences across GPUs (Compatible with DDP, DeepSpeed, and FSDP).
|
|
||||||
* **FSDP + QLoRA**: Combines 4-bit quantization with FSDP (Specific to FSDP).
|
|
||||||
|
|
||||||
## DeepSpeed {#sec-deepspeed}
|
## DeepSpeed {#sec-deepspeed}
|
||||||
|
|
||||||
@@ -71,18 +65,12 @@ Start from Stage 1 -> Stage 2 -> Stage 3.
|
|||||||
|
|
||||||
## Fully Sharded Data Parallel (FSDP) {#sec-fsdp}
|
## Fully Sharded Data Parallel (FSDP) {#sec-fsdp}
|
||||||
|
|
||||||
FSDP allows you to shard model parameters, gradients, and optimizer states across data parallel workers.
|
|
||||||
|
|
||||||
::: {.callout-note}
|
::: {.callout-note}
|
||||||
|
|
||||||
FSDP2 is recommended for new users. FSDP1 is deprecated and will be removed in an upcoming release of Axolotl.
|
FSDP2 is recommended for new users. FSDP1 is deprecated and will be removed in an upcoming release of Axolotl.
|
||||||
|
|
||||||
:::
|
:::
|
||||||
|
|
||||||
### FSDP + QLoRA {#sec-fsdp-qlora}
|
|
||||||
|
|
||||||
For combining FSDP with QLoRA, see our [dedicated guide](fsdp_qlora.qmd).
|
|
||||||
|
|
||||||
### Migrating from FSDP1 to FSDP2 {#sec-migrate-fsdp1-fsdp2}
|
### Migrating from FSDP1 to FSDP2 {#sec-migrate-fsdp1-fsdp2}
|
||||||
|
|
||||||
To migrate your config from FSDP1 to FSDP2, you must use the `fsdp_version` top-level config field to specify the FSDP version, and
|
To migrate your config from FSDP1 to FSDP2, you must use the `fsdp_version` top-level config field to specify the FSDP version, and
|
||||||
@@ -100,7 +88,6 @@ fsdp_sync_module_states | **REMOVED**
|
|||||||
fsdp_cpu_ram_efficient_loading | cpu_ram_efficient_loading
|
fsdp_cpu_ram_efficient_loading | cpu_ram_efficient_loading
|
||||||
fsdp_state_dict_type | state_dict_type
|
fsdp_state_dict_type | state_dict_type
|
||||||
fsdp_use_orig_params | **REMOVED**
|
fsdp_use_orig_params | **REMOVED**
|
||||||
fsdp_activation_checkpointing | activation_checkpointing
|
|
||||||
|
|
||||||
For more details, please see the migration guide in the [torchtitan repo](https://github.com/pytorch/torchtitan/blob/main/docs/fsdp.md). In Axolotl,
|
For more details, please see the migration guide in the [torchtitan repo](https://github.com/pytorch/torchtitan/blob/main/docs/fsdp.md). In Axolotl,
|
||||||
if you were using the following FSDP1 config:
|
if you were using the following FSDP1 config:
|
||||||
@@ -157,6 +144,10 @@ single sequence causes OOM errors during model training.
|
|||||||
|
|
||||||
See our [dedicated guide](sequence_parallelism.qmd) for more information.
|
See our [dedicated guide](sequence_parallelism.qmd) for more information.
|
||||||
|
|
||||||
|
### FSDP + QLoRA {#sec-fsdp-qlora}
|
||||||
|
|
||||||
|
For combining FSDP with QLoRA, see our [dedicated guide](fsdp_qlora.qmd).
|
||||||
|
|
||||||
## Performance Optimization {#sec-performance}
|
## Performance Optimization {#sec-performance}
|
||||||
|
|
||||||
### Liger Kernel Integration {#sec-liger}
|
### Liger Kernel Integration {#sec-liger}
|
||||||
|
|||||||
@@ -13,18 +13,14 @@ format:
|
|||||||
- [Pixtral](#sec-pixtral)
|
- [Pixtral](#sec-pixtral)
|
||||||
- [Llava-1.5](#sec-llava-15)
|
- [Llava-1.5](#sec-llava-15)
|
||||||
- [Mistral-Small-3.1](#sec-mistral-small-31)
|
- [Mistral-Small-3.1](#sec-mistral-small-31)
|
||||||
- [Mistral-Small-4](#sec-mistral-small-4)
|
|
||||||
- [Magistral-Small-2509](#sec-magistral-small-2509)
|
- [Magistral-Small-2509](#sec-magistral-small-2509)
|
||||||
- [Voxtral](#sec-voxtral)
|
- [Voxtral](#sec-voxtral)
|
||||||
- [Gemma-3](#sec-gemma-3)
|
- [Gemma-3](#sec-gemma-3)
|
||||||
- [Gemma-3n](#sec-gemma-3n)
|
- [Gemma-3n](#sec-gemma-3n)
|
||||||
- [Qwen2-VL](#sec-qwen2-vl)
|
- [Qwen2-VL](#sec-qwen2-vl)
|
||||||
- [Qwen2.5-VL](#sec-qwen25-vl)
|
- [Qwen2.5-VL](#sec-qwen25-vl)
|
||||||
- [Qwen3.5](#sec-qwen3-5)
|
|
||||||
- [GLM-4.6V](#sec-glm-4-6v)
|
|
||||||
- [SmolVLM2](#sec-smolvlm2)
|
- [SmolVLM2](#sec-smolvlm2)
|
||||||
- [LFM2-VL](#sec-lfm2-vl)
|
- [LFM2-VL](#sec-lfm2-vl)
|
||||||
- [Intern-VL](#sec-intern-vl)
|
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
@@ -60,14 +56,10 @@ image_resize_algorithm: bilinear
|
|||||||
|
|
||||||
Please see [examples](https://github.com/axolotl-ai/axolotl/tree/main/examples) folder for full configs.
|
Please see [examples](https://github.com/axolotl-ai/axolotl/tree/main/examples) folder for full configs.
|
||||||
|
|
||||||
::: {.callout-tip}
|
::: {.callout-warning}
|
||||||
Some of our chat_templates have been extended to support broader dataset types. This should not break any existing configs.
|
Some of our chat_templates have been extended to support broader dataset types. This should not break any existing configs.
|
||||||
:::
|
:::
|
||||||
|
|
||||||
::: {.callout-note}
|
|
||||||
As of now, we do not truncate or drop samples based on `sequence_len`, as each architecture has a different way of processing non-text tokens. We are looking for help on this.
|
|
||||||
:::
|
|
||||||
|
|
||||||
### Mllama {#sec-mllama}
|
### Mllama {#sec-mllama}
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
@@ -103,23 +95,17 @@ chat_template: llava
|
|||||||
### Mistral-Small-3.1 {#sec-mistral-small-31}
|
### Mistral-Small-3.1 {#sec-mistral-small-31}
|
||||||
|
|
||||||
::: {.callout-tip}
|
::: {.callout-tip}
|
||||||
Please make sure to install vision lib via `pip install 'mistral-common[opencv]==1.8.5'`
|
Please make sure to install vision lib via `uv pip install 'mistral-common[opencv]==1.8.5'`
|
||||||
:::
|
:::
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
base_model: mistralai/Mistral-Small-3.1-24B-Instruct-2503
|
base_model: mistralai/Mistral-Small-3.1-24B-Instruct-2503
|
||||||
```
|
```
|
||||||
|
|
||||||
### Mistral-Small-4 {#sec-mistral-small-4}
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
base_model: mistralai/Mistral-Small-4-119B-2603
|
|
||||||
```
|
|
||||||
|
|
||||||
### Magistral-Small-2509 {#sec-magistral-small-2509}
|
### Magistral-Small-2509 {#sec-magistral-small-2509}
|
||||||
|
|
||||||
::: {.callout-tip}
|
::: {.callout-tip}
|
||||||
Please make sure to install vision lib via `pip install 'mistral-common[opencv]==1.8.5'`
|
Please make sure to install vision lib via `uv pip install 'mistral-common[opencv]==1.8.5'`
|
||||||
:::
|
:::
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
@@ -129,13 +115,11 @@ base_model: mistralai/Magistral-Small-2509
|
|||||||
### Voxtral {#sec-voxtral}
|
### Voxtral {#sec-voxtral}
|
||||||
|
|
||||||
::: {.callout-tip}
|
::: {.callout-tip}
|
||||||
Please make sure to install audio lib via `pip3 install librosa==0.11.0 'mistral_common[audio]==1.8.3'`
|
Please make sure to install audio lib via `uv pip install librosa==0.11.0 'mistral_common[audio]==1.8.3'`
|
||||||
:::
|
:::
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
base_model: mistralai/Voxtral-Mini-3B-2507
|
base_model: mistralai/Voxtral-Mini-3B-2507
|
||||||
|
|
||||||
processor_type: VoxtralProcessor
|
|
||||||
```
|
```
|
||||||
|
|
||||||
### Gemma-3 {#sec-gemma-3}
|
### Gemma-3 {#sec-gemma-3}
|
||||||
@@ -159,7 +143,7 @@ The model's initial loss and grad norm will be very high. We suspect this to be
|
|||||||
:::
|
:::
|
||||||
|
|
||||||
::: {.callout-tip}
|
::: {.callout-tip}
|
||||||
Please make sure to install `timm` via `pip3 install timm==1.0.17`
|
Please make sure to install `timm` via `uv pip install timm==1.0.17`
|
||||||
:::
|
:::
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
@@ -184,38 +168,10 @@ base_model: Qwen/Qwen2.5-VL-7B-Instruct
|
|||||||
chat_template: qwen2_vl # same as qwen2-vl
|
chat_template: qwen2_vl # same as qwen2-vl
|
||||||
```
|
```
|
||||||
|
|
||||||
### Qwen3-VL {#sec-qwen3-vl}
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
base_model: Qwen/Qwen3-VL-4B-Instruct
|
|
||||||
|
|
||||||
chat_template: qwen2_vl # same as qwen2-vl
|
|
||||||
```
|
|
||||||
|
|
||||||
### Qwen3.5 {#sec-qwen3-5}
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
base_model: Qwen/Qwen3.5-9B
|
|
||||||
|
|
||||||
chat_template: qwen3_5
|
|
||||||
```
|
|
||||||
|
|
||||||
### GLM-4.6V {#sec-glm-4-6v}
|
|
||||||
|
|
||||||
Both GLM-4.6V (106B MoE) and GLM-4.6V-Flash (9B) are supported.
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
# GLM-4.6V (106B MoE version)
|
|
||||||
base_model: zai-org/GLM-4.6V
|
|
||||||
|
|
||||||
# OR GLM-4.6V-Flash (9B version)
|
|
||||||
base_model: zai-org/GLM-4.6V-Flash
|
|
||||||
```
|
|
||||||
|
|
||||||
### SmolVLM2 {#sec-smolvlm2}
|
### SmolVLM2 {#sec-smolvlm2}
|
||||||
|
|
||||||
::: {.callout-tip}
|
::: {.callout-tip}
|
||||||
Please make sure to install `num2words` via `pip3 install num2words==0.5.14`
|
Please make sure to install `num2words` via `uv pip install num2words==0.5.14`
|
||||||
:::
|
:::
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
@@ -225,23 +181,13 @@ base_model: HuggingFaceTB/SmolVLM2-500M-Video-Instruct
|
|||||||
### LFM2-VL {#sec-lfm2-vl}
|
### LFM2-VL {#sec-lfm2-vl}
|
||||||
|
|
||||||
::: {.callout-warning}
|
::: {.callout-warning}
|
||||||
Please uninstall `causal-conv1d` via `pip3 uninstall -y causal-conv1d`
|
Please uninstall `causal-conv1d` via `uv pip uninstall -y causal-conv1d`
|
||||||
:::
|
:::
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
base_model: LiquidAI/LFM2-VL-450M
|
base_model: LiquidAI/LFM2-VL-450M
|
||||||
```
|
```
|
||||||
|
|
||||||
### Intern-VL {#sec-intern-vl}
|
|
||||||
|
|
||||||
::: {.callout-tip}
|
|
||||||
Please make sure to install `timm` via `pip3 install timm==1.0.19`
|
|
||||||
:::
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
base_model: OpenGVLab/InternVL3_5-8B
|
|
||||||
```
|
|
||||||
|
|
||||||
## Dataset Format
|
## Dataset Format
|
||||||
|
|
||||||
For multi-modal datasets, we adopt an extended `chat_template` format similar to OpenAI's Message format.
|
For multi-modal datasets, we adopt an extended `chat_template` format similar to OpenAI's Message format.
|
||||||
@@ -276,7 +222,7 @@ For audio loading, you can use the following keys within `content` alongside `"t
|
|||||||
|
|
||||||
::: {.callout-tip}
|
::: {.callout-tip}
|
||||||
|
|
||||||
You may need to install `librosa` via `pip3 install librosa==0.11.0`.
|
You may need to install `librosa` via `uv pip install librosa==0.11.0`.
|
||||||
|
|
||||||
:::
|
:::
|
||||||
|
|
||||||
|
|||||||
@@ -54,13 +54,6 @@ These techniques save VRAM by changing how activations are handled.
|
|||||||
- Activation Offloading: moves activations to CPU RAM or disk, trading I/O overhead for VRAM.
|
- Activation Offloading: moves activations to CPU RAM or disk, trading I/O overhead for VRAM.
|
||||||
- Learn more: [Gradient Checkpointing and Offloading Docs](gradient_checkpointing.qmd)
|
- Learn more: [Gradient Checkpointing and Offloading Docs](gradient_checkpointing.qmd)
|
||||||
|
|
||||||
### Layer Offloading
|
|
||||||
|
|
||||||
Offloads frozen (non-trainable) decoder layer parameters to CPU and streams them back to GPU one layer at a time during forward/backward passes using CUDA stream prefetching. Especially effective for LoRA/QLoRA where most parameters are frozen.
|
|
||||||
|
|
||||||
- **Config:** `layer_offloading: true`
|
|
||||||
- **Learn more:** [Layer Offloading Docs](gradient_checkpointing.qmd#enabling-layer-offloading)
|
|
||||||
|
|
||||||
### Cut Cross Entropy (CCE)
|
### Cut Cross Entropy (CCE)
|
||||||
|
|
||||||
Reduces VRAM usage by using an optimized cross-entropy loss calculation.
|
Reduces VRAM usage by using an optimized cross-entropy loss calculation.
|
||||||
@@ -73,15 +66,6 @@ Provides efficient Triton kernels to improve training speed and reduce memory us
|
|||||||
|
|
||||||
- **Learn more:** [Custom Integrations - Liger Kernels](custom_integrations.qmd#liger-kernels)
|
- **Learn more:** [Custom Integrations - Liger Kernels](custom_integrations.qmd#liger-kernels)
|
||||||
|
|
||||||
### Expert Kernels
|
|
||||||
|
|
||||||
Optimized kernel implementations for Mixture of Experts (MoE) model training.
|
|
||||||
|
|
||||||
- **ScatterMoE**: Triton-based MoE kernels with fused LoRA support.
|
|
||||||
- **SonicMoE**: CUTLASS-based MoE kernels for NVIDIA Hopper and Blackwell GPUs.
|
|
||||||
|
|
||||||
- **Learn more:** [Custom Integrations - Kernels Integration](custom_integrations.qmd#kernels-integration)
|
|
||||||
|
|
||||||
## Long Context Models
|
## Long Context Models
|
||||||
|
|
||||||
Techniques to train models on sequences longer than their original context window.
|
Techniques to train models on sequences longer than their original context window.
|
||||||
@@ -147,10 +131,3 @@ Simulates quantization effects during training, helping the model adapt and pote
|
|||||||
Allows you to finetune LoRA adapters on top of a model that has already been quantized using the GPTQ method.
|
Allows you to finetune LoRA adapters on top of a model that has already been quantized using the GPTQ method.
|
||||||
|
|
||||||
- **Example:** [GPTQ LoRA Example](https://github.com/axolotl-ai-cloud/axolotl/blob/main/examples/llama-2/gptq-lora.yml)
|
- **Example:** [GPTQ LoRA Example](https://github.com/axolotl-ai-cloud/axolotl/blob/main/examples/llama-2/gptq-lora.yml)
|
||||||
|
|
||||||
### MoE Expert Quantization
|
|
||||||
|
|
||||||
Quantizes MoE expert weights on load to reduce VRAM when training MoE models with adapters. Required for Transformers v5+ MoE models where experts use fused `nn.Parameter` tensors.
|
|
||||||
|
|
||||||
- **Config:** `quantize_moe_experts: true`
|
|
||||||
- **Learn more:** [MoE Expert Quantization](expert_quantization.qmd)
|
|
||||||
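A minimal sketch using the config key above with an adapter (the model name is illustrative, taken from the MoE example elsewhere in this compare; whether quantization is required depends on your Transformers version, as noted above):

```yaml
base_model: LiquidAI/LFM2-8B-A1B   # illustrative MoE model

adapter: lora
quantize_moe_experts: true
```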
|
|||||||
429
docs/rlhf.qmd
@@ -17,7 +17,6 @@ feedback. Various methods include, but not limited to:
|
|||||||
- [Kahneman-Tversky Optimization (KTO)](#kto)
|
- [Kahneman-Tversky Optimization (KTO)](#kto)
|
||||||
- [Odds Ratio Preference Optimization (ORPO)](#orpo)
|
- [Odds Ratio Preference Optimization (ORPO)](#orpo)
|
||||||
- [Group Relative Policy Optimization (GRPO)](#grpo)
|
- [Group Relative Policy Optimization (GRPO)](#grpo)
|
||||||
- [Group Reward-Decoupled Policy Optimization (GDPO)](#gdpo)
|
|
||||||
|
|
||||||
|
|
||||||
## RLHF using Axolotl
|
## RLHF using Axolotl
|
||||||
@@ -220,21 +219,6 @@ DPO supports the following types with the following dataset format:
|
|||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
#### chat_template.argilla_chat
|
|
||||||
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"chosen": [
|
|
||||||
{"role": "user", "content": "..."},
|
|
||||||
{"role": "assistant", "content": "..."}
|
|
||||||
],
|
|
||||||
"rejected": [
|
|
||||||
{"role": "user", "content": "..."},
|
|
||||||
{"role": "assistant", "content": "..."}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
#### chat_template.default
|
#### chat_template.default
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
@@ -598,116 +582,6 @@ To see other examples of custom reward functions, please see [TRL GRPO Docs](htt
|
|||||||
|
|
||||||
To see all configs, please see [TRLConfig](https://github.com/axolotl-ai-cloud/axolotl/blob/v0.9.2/src/axolotl/utils/schemas/trl.py).
|
To see all configs, please see [TRLConfig](https://github.com/axolotl-ai-cloud/axolotl/blob/v0.9.2/src/axolotl/utils/schemas/trl.py).
|
||||||
|
|
||||||
#### OpenEnv Rollout Functions
|
|
||||||
|
|
||||||
GRPO supports custom rollout functions for OpenEnv-style environments, enabling interactive tasks like web browsing, code execution, or tool use. This allows you to implement custom generation logic that interacts with external environments.
|
|
||||||
|
|
||||||
For example, to implement a simple math-solving environment with step-by-step verification:
|
|
||||||
|
|
||||||
```python
|
|
||||||
# math_env.py
|
|
||||||
import re
|
|
||||||
|
|
||||||
def math_solver_rollout(model, processing_class, prompts, generation_config=None):
|
|
||||||
"""
|
|
||||||
Custom rollout function that generates step-by-step math solutions.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
model: The language model
|
|
||||||
processing_class: The tokenizer/processing_class
|
|
||||||
prompts: List of prompt dicts (with 'messages' key for chat format)
|
|
||||||
generation_config: Optional generation configuration
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
List of completion strings
|
|
||||||
"""
|
|
||||||
completions = []
|
|
||||||
|
|
||||||
for prompt in prompts:
|
|
||||||
# Apply chat template to prompt
|
|
||||||
messages = prompt.get("messages", [])
|
|
||||||
formatted_prompt = processing_class.apply_chat_template(
|
|
||||||
messages, tokenize=False, add_generation_prompt=True
|
|
||||||
)
|
|
||||||
|
|
||||||
# Generate step-by-step solution
|
|
||||||
full_response = ""
|
|
||||||
for step in range(5): # Max 5 reasoning steps
|
|
||||||
current_input = formatted_prompt + full_response + "\nNext step:"
|
|
||||||
inputs = processing_class(current_input, return_tensors="pt").to(model.device)
|
|
||||||
|
|
||||||
outputs = model.generate(
|
|
||||||
**inputs,
|
|
||||||
max_new_tokens=100,
|
|
||||||
generation_config=generation_config,
|
|
||||||
)
|
|
||||||
step_text = processing_class.decode(
|
|
||||||
outputs[0][inputs.input_ids.shape[1]:],
|
|
||||||
skip_special_tokens=True
|
|
||||||
)
|
|
||||||
|
|
||||||
# Check if solution is complete
|
|
||||||
if "FINAL ANSWER:" in step_text:
|
|
||||||
full_response += step_text
|
|
||||||
break
|
|
||||||
full_response += step_text + "\n"
|
|
||||||
|
|
||||||
completions.append(full_response)
|
|
||||||
|
|
||||||
return completions
|
|
||||||
|
|
||||||
def math_reward(prompts, completions, answers, **kwargs):
|
|
||||||
"""Reward function that checks mathematical correctness"""
|
|
||||||
rewards = []
|
|
||||||
for completion, correct_answer in zip(completions, answers):
|
|
||||||
# Extract predicted answer
|
|
||||||
match = re.search(r"FINAL ANSWER:\s*(.+)", completion)
|
|
||||||
predicted = match.group(1).strip() if match else ""
|
|
||||||
|
|
||||||
# Compare with correct answer
|
|
||||||
reward = 1.0 if predicted == str(correct_answer) else 0.0
|
|
||||||
rewards.append(reward)
|
|
||||||
|
|
||||||
return rewards
|
|
||||||
|
|
||||||
def math_transform(cfg, *args, **kwargs):
|
|
||||||
"""Transform dataset to GRPO format with answer field"""
|
|
||||||
def transform_fn(example, processing_class=None):
|
|
||||||
return {
|
|
||||||
"prompt": [{"role": "user", "content": example["question"]}],
|
|
||||||
"answer": str(example["answer"]),
|
|
||||||
}
|
|
||||||
return transform_fn, {"remove_columns": ["question"]}
|
|
||||||
```
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
rl: grpo
|
|
||||||
|
|
||||||
trl:
|
|
||||||
beta: 0.001
|
|
||||||
max_completion_length: 512
|
|
||||||
num_generations: 4
|
|
||||||
rollout_func: "math_env.math_solver_rollout" # Custom rollout function
|
|
||||||
reward_funcs: ["math_env.math_reward"]
|
|
||||||
reward_weights: [1.0]
|
|
||||||
|
|
||||||
datasets:
|
|
||||||
- path: openai/gsm8k
|
|
||||||
name: main
|
|
||||||
type: math_env.math_transform
|
|
||||||
```
|
|
||||||
|
|
||||||
The `rollout_func` parameter accepts a fully qualified name (e.g., `module_name.function_name`) that points to a callable function in your local directory. The function receives:
|
|
||||||
|
|
||||||
- `model`: The language model
|
|
||||||
- `processing_class`: The tokenizer/processing class
|
|
||||||
- `prompts`: List of prompt dictionaries
|
|
||||||
- `generation_config` (optional): Generation configuration
|
|
||||||
|
|
||||||
The function should return a list of completion strings.
|
|
||||||
|
|
||||||
For more OpenEnv examples, see [TRL OpenEnv Documentation](https://huggingface.co/docs/trl/main/en/openenv).
|
|
||||||
|
|
||||||
#### GRPO with DAPO/Dr. GRPO loss
|
#### GRPO with DAPO/Dr. GRPO loss
|
||||||
|
|
||||||
The DAPO paper, and subsequently the Dr. GRPO paper, proposed alternative loss functions for GRPO to remediate the implicit penalty on longer responses.
|
The DAPO paper, and subsequently the Dr. GRPO paper, proposed alternative loss functions for GRPO to remediate the implicit penalty on longer responses.
|
||||||
@@ -721,309 +595,6 @@ trl:
|
|||||||
|
|
||||||
For more information, see [GRPO docs](https://huggingface.co/docs/trl/v0.17.0/en/grpo_trainer#loss-types).
|
For more information, see [GRPO docs](https://huggingface.co/docs/trl/v0.17.0/en/grpo_trainer#loss-types).
|
||||||
|
|
||||||
#### Async GRPO
|
|
||||||
|
|
||||||
Async GRPO overlaps vLLM generation with training by producing rollouts in a background thread. While the model trains on the current batch, the next batch is already being generated. This can significantly reduce wall-clock time per step.
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
trl:
|
|
||||||
use_data_producer: true # Enable data producer protocol
|
|
||||||
use_vllm: true
|
|
||||||
async_prefetch: true # Generate rollouts in background thread
|
|
||||||
prefetch_depth: 1 # Number of rollouts to prefetch
|
|
||||||
vllm_sync_interval: 2 # Sync weights to vLLM every N steps
|
|
||||||
```
|
|
||||||
|
|
||||||
::: {.callout-note}
|
|
||||||
Because the background thread generates completions with slightly stale model weights, async GRPO uses importance sampling correction to account for the distribution shift. This is controlled by `vllm_importance_sampling_correction: true` (default when async is enabled).
|
|
||||||
:::
|
|
||||||
|
|
||||||
##### vLLM LoRA Sync
|
|
||||||
|
|
||||||
By default, weight sync to vLLM merges the LoRA adapter into the base model and broadcasts all parameters via NCCL. LoRA sync is a faster alternative that saves only the adapter weights to the filesystem and has vLLM load them natively using Punica kernels.
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
adapter: lora
|
|
||||||
lora_r: 32
|
|
||||||
lora_alpha: 64
|
|
||||||
lora_target_linear: true
|
|
||||||
|
|
||||||
trl:
|
|
||||||
vllm_lora_sync: true # Enable native LoRA sync
|
|
||||||
```
|
|
||||||
|
|
||||||
When `vllm_lora_sync: true` is set, axolotl automatically selects the LoRA-aware vLLM serve module. Start vLLM as usual:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
CUDA_VISIBLE_DEVICES=0 axolotl vllm-serve config.yaml
|
|
||||||
```
|
|
||||||
|
|
||||||
Then start training on a separate GPU:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
CUDA_VISIBLE_DEVICES=1 axolotl train config.yaml
|
|
||||||
```
|
|
||||||
|
|
||||||
::: {.callout-tip}
|
|
||||||
LoRA sync is especially beneficial with multi-GPU training (FSDP/DeepSpeed), where NCCL merge-sync can cause GPU contention with vLLM generation.
|
|
||||||
:::
|
|
||||||
|
|
||||||
##### Streaming Partial Batch
|
|
||||||
|
|
||||||
Instead of scoring the entire batch at once, streaming mode scores one prompt group at a time. This enables finer-grained zero-advantage skipping and reduces peak memory usage during scoring.
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
trl:
|
|
||||||
streaming_partial_batch: true
|
|
||||||
```
|
|
||||||
|
|
||||||
##### Importance Sampling Correction
|
|
||||||
|
|
||||||
When using async prefetch, completions are generated from a slightly older version of the model. Importance sampling (IS) correction adjusts the policy gradient to account for this distribution shift.
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
trl:
|
|
||||||
vllm_importance_sampling_correction: true # Enable IS correction
|
|
||||||
importance_sampling_level: token # 'token' or 'sequence'
|
|
||||||
off_policy_mask_threshold: 0.5 # Mask sequences with IS ratio below this
|
|
||||||
```
|
|
||||||
|
|
||||||
- `importance_sampling_level: token` applies per-token IS ratios (recommended with Liger kernel)
|
|
||||||
- `importance_sampling_level: sequence` applies per-sequence IS ratios
|
|
||||||
- `off_policy_mask_threshold` masks out sequences where the IS ratio indicates they are too far off-policy
|
|
||||||
|
|
||||||
##### Replay Buffer
|
|
||||||
|
|
||||||
The replay buffer caches rollout groups that had learning signal (non-zero reward variance) and uses them to replace zero-signal groups in later batches.
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
trl:
|
|
||||||
replay_buffer_size: 100 # Max cached groups (0 = disabled)
|
|
||||||
replay_recompute_logps: true # Recompute log-probs for replayed data (recommended)
|
|
||||||
```
|
|
||||||
|
|
||||||
::: {.callout-note}
|
|
||||||
When `replay_recompute_logps: true` (default), old log-probabilities are recomputed using the current model weights. This fixes the IS mismatch that would otherwise occur when replaying stale data.
|
|
||||||
:::
|
|
||||||
|
|
||||||
##### Deferred Re-rolling
|
|
||||||
|
|
||||||
Failed prompts (where the model produces zero reward for all generations) are buffered and re-injected into later batches when the model may be better equipped to solve them.
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
trl:
|
|
||||||
reroll_start_fraction: 0.5 # Start re-rolling after 50% of training
|
|
||||||
reroll_max_groups: 1 # Max groups to replace per batch
|
|
||||||
```
|
|
||||||
|
|
||||||
##### Zero-Advantage Batch Skipping
|
|
||||||
|
|
||||||
When all advantages in a micro-batch are zero (no learning signal), the forward/backward pass is skipped entirely. This is enabled by default and logged as `skipped_zero_adv_batches=1`.
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
trl:
|
|
||||||
skip_zero_advantage_batches: true # default
|
|
||||||
```
|
|
||||||
|
|
||||||
##### Parallel Reward Workers
|
|
||||||
|
|
||||||
Reward functions that use `signal.alarm()` (e.g., `math_verify`) must run in the main thread. Parallel reward workers use subprocesses to work around this limitation while enabling concurrent reward computation.
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
trl:
|
|
||||||
reward_num_workers: 4 # Number of subprocess workers (1 = no parallelism)
|
|
||||||
```
|
|
||||||
|
|
||||||
##### Full Async GRPO Example
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
base_model: Qwen/Qwen2.5-1.5B-Instruct
|
|
||||||
|
|
||||||
vllm:
|
|
||||||
host: 0.0.0.0
|
|
||||||
port: 8000
|
|
||||||
gpu_memory_utilization: 0.35
|
|
||||||
dtype: auto
|
|
||||||
|
|
||||||
adapter: lora
|
|
||||||
lora_r: 32
|
|
||||||
lora_alpha: 64
|
|
||||||
lora_target_linear: true
|
|
||||||
|
|
||||||
rl: grpo
|
|
||||||
trl:
|
|
||||||
use_data_producer: true
|
|
||||||
use_vllm: true
|
|
||||||
async_prefetch: true
|
|
||||||
prefetch_depth: 1
|
|
||||||
vllm_sync_interval: 2
|
|
||||||
vllm_lora_sync: true
|
|
||||||
streaming_partial_batch: true
|
|
||||||
vllm_importance_sampling_correction: true
|
|
||||||
off_policy_mask_threshold: 0.5
|
|
||||||
importance_sampling_level: token
|
|
||||||
num_generations: 8
|
|
||||||
max_completion_length: 512
|
|
||||||
reward_funcs:
|
|
||||||
- rewards.accuracy_reward
|
|
||||||
reroll_start_fraction: 0.5
|
|
||||||
replay_buffer_size: 100
|
|
||||||
reward_num_workers: 4
|
|
||||||
skip_zero_advantage_batches: true
|
|
||||||
|
|
||||||
datasets:
|
|
||||||
- path: AI-MO/NuminaMath-TIR
|
|
||||||
type: rewards.prompt_transform
|
|
||||||
split: train
|
|
||||||
|
|
||||||
gradient_accumulation_steps: 4
|
|
||||||
micro_batch_size: 2
|
|
||||||
max_steps: 500
|
|
||||||
learning_rate: 1e-5
|
|
||||||
bf16: true
|
|
||||||
gradient_checkpointing: true
|
|
||||||
```
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Terminal 1: Start vLLM on GPU 0
|
|
||||||
CUDA_VISIBLE_DEVICES=0 axolotl vllm-serve config.yaml
|
|
||||||
|
|
||||||
# Terminal 2: Train on GPU 1
|
|
||||||
CUDA_VISIBLE_DEVICES=1 axolotl train config.yaml
|
|
||||||
```
|
|
||||||
|
|
||||||
##### Multi-GPU Async GRPO
|
|
||||||
|
|
||||||
Async GRPO supports FSDP and DeepSpeed ZeRO-3 for multi-GPU training. vLLM runs on one GPU while training is distributed across the remaining GPUs.
|
|
||||||
|
|
||||||
**FSDP:**
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
fsdp:
|
|
||||||
- full_shard
|
|
||||||
- auto_wrap
|
|
||||||
fsdp_config:
|
|
||||||
fsdp_transformer_layer_cls_to_wrap: Qwen2DecoderLayer
|
|
||||||
gradient_checkpointing_kwargs:
|
|
||||||
use_reentrant: false
|
|
||||||
```
|
|
||||||
|
|
||||||
**DeepSpeed ZeRO-3:**
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
deepspeed: deepspeed_configs/zero3_bf16.json
|
|
||||||
gradient_checkpointing_kwargs:
|
|
||||||
use_reentrant: true # Required for ZeRO-3
|
|
||||||
```
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Terminal 1: Start vLLM on GPU 0
|
|
||||||
CUDA_VISIBLE_DEVICES=0 axolotl vllm-serve config.yaml
|
|
||||||
|
|
||||||
# Terminal 2: Train on GPUs 0,1
|
|
||||||
CUDA_VISIBLE_DEVICES=0,1 accelerate launch --num_processes 2 -m axolotl.cli.train config.yaml
|
|
||||||
```
|
|
||||||
|
|
||||||
::: {.callout-important}
|
|
||||||
With multi-GPU async prefetch, only rank 0 generates completions in the background thread. Results are broadcast to all ranks on the main thread. This avoids FSDP/DeepSpeed collective deadlocks from unsynchronized background threads.
|
|
||||||
:::
|
|
||||||
|
|
||||||
### GDPO
|
|
||||||
|
|
||||||
GDPO (Group Reward-Decoupled Policy Optimization) extends GRPO for multi-reward training. It addresses the **reward advantage collapse** problem by normalizing each reward function independently before combining them.
|
|
||||||
|
|
||||||
::: {.callout-tip}
|
|
||||||
Use GDPO when training with multiple reward functions. With a single reward, GRPO and GDPO produce equivalent results.
|
|
||||||
:::
|
|
||||||
|
|
||||||
Paper: [https://arxiv.org/pdf/2501.05242](https://arxiv.org/pdf/2501.05242)
|
|
||||||
|
|
||||||
GDPO uses TRL's native `multi_objective_aggregation` parameter under the hood. When you set `rl: gdpo`, axolotl automatically configures TRL to use `normalize_then_sum` aggregation.
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
base_model: Qwen/Qwen2.5-1.5B-Instruct
|
|
||||||
|
|
||||||
vllm:
|
|
||||||
host: 0.0.0.0
|
|
||||||
port: 8000
|
|
||||||
tensor_parallel_size: 2
|
|
||||||
gpu_memory_utilization: 0.85
|
|
||||||
|
|
||||||
rl: gdpo
|
|
||||||
|
|
||||||
trl:
|
|
||||||
beta: 0.001
|
|
||||||
max_completion_length: 256
|
|
||||||
use_vllm: true
|
|
||||||
num_generations: 4
|
|
||||||
reward_funcs:
|
|
||||||
- rewards.format_reward
|
|
||||||
- rewards.correctness_reward
|
|
||||||
reward_weights: [1.0, 2.0]
|
|
||||||
|
|
||||||
datasets:
|
|
||||||
- path: openai/gsm8k
|
|
||||||
name: main
|
|
||||||
type: rewards.oai_gsm8k_transform
|
|
||||||
```
|
|
||||||
|
|
||||||
You can also use GRPO with explicit aggregation control:
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
rl: grpo
|
|
||||||
trl:
|
|
||||||
multi_objective_aggregation: normalize_then_sum # GDPO behavior
|
|
||||||
# or: sum_then_normalize # Default GRPO behavior
|
|
||||||
```
|
|
||||||
|
|
||||||
#### GDPO vs GRPO
|
|
||||||
|
|
||||||
| Aspect | GRPO | GDPO |
|
|
||||||
|--------|------|------|
|
|
||||||
| **Aggregation** | `sum_then_normalize` | `normalize_then_sum` |
|
|
||||||
| **Multi-reward** | May collapse advantages | Preserves reward signals |
|
|
||||||
| **Single reward** | Standard behavior | Equivalent to GRPO |
|
|
||||||
|
|
||||||
#### Why GDPO?
|
|
||||||
|
|
||||||
When using multiple rewards with GRPO, different reward combinations can produce identical advantages:
|
|
||||||
|
|
||||||
```
|
|
||||||
# Example: format + correctness rewards
|
|
||||||
[format=0, correct=3] → sum=3
|
|
||||||
[format=1, correct=2] → sum=3 ← GRPO sees these as equal!
|
|
||||||
[format=2, correct=1] → sum=3
|
|
||||||
[format=3, correct=0] → sum=3
|
|
||||||
```
|
|
||||||
|
|
||||||
GDPO normalizes each reward independently, preserving their relative differences.
|
|
||||||
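As a rough numerical sketch (illustrative arithmetic only, not the TRL/axolotl implementation), per-reward normalization also keeps a small-scale reward from being drowned out by a large-scale one:

```python
# Illustrative only: compare sum-then-normalize (GRPO default) with
# normalize-then-sum (GDPO) when the two rewards live on different scales.
format_r  = [0.0, 1.0, 1.0, 0.0]      # small-scale reward
correct_r = [0.0, 0.0, 100.0, 100.0]  # large-scale reward

def z(xs):
    """Group-wise z-score normalization."""
    mean = sum(xs) / len(xs)
    std = (sum((x - mean) ** 2 for x in xs) / len(xs)) ** 0.5
    return [(x - mean) / std if std else 0.0 for x in xs]

# sum_then_normalize: the large-scale reward dominates the advantages
print(z([f + c for f, c in zip(format_r, correct_r)]))

# normalize_then_sum: each reward contributes on a comparable scale
print([zf + zc for zf, zc in zip(z(format_r), z(correct_r))])
```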
|
|
||||||
#### Reward Functions
|
|
||||||
|
|
||||||
GDPO uses the same reward function format as GRPO:
|
|
||||||
|
|
||||||
```python
|
|
||||||
# rewards.py
|
|
||||||
def format_reward(completions, **kwargs) -> list[float]:
|
|
||||||
return [1.0 if len(c) > 10 else 0.0 for c in completions]
|
|
||||||
|
|
||||||
def correctness_reward(completions, answers, **kwargs) -> list[float]:
|
|
||||||
rewards = []
|
|
||||||
for completion, answer in zip(completions, answers):
|
|
||||||
# Placeholder scoring logic; replace with your own correctness check
score = 1.0 if str(answer) in completion else 0.0
|
|
||||||
rewards.append(score)
|
|
||||||
return rewards
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Sequence Parallelism
|
|
||||||
|
|
||||||
GDPO supports sequence parallelism for long-context training:
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
rl: gdpo
|
|
||||||
context_parallel_size: 2
|
|
||||||
```
|
|
||||||
|
|
||||||
### SimPO
|
### SimPO
|
||||||
|
|
||||||
SimPO uses [CPOTrainer](https://huggingface.co/docs/trl/main/en/cpo_trainer) but with an alternative loss function.
|
SimPO uses [CPOTrainer](https://huggingface.co/docs/trl/main/en/cpo_trainer) but with an alternative loss function.
|
||||||
|
|||||||
@@ -1,90 +0,0 @@
|
|||||||
examples:
|
|
||||||
# December 2025
|
|
||||||
- name: kimi-linear
|
|
||||||
title: Kimi Linear
|
|
||||||
- name: plano
|
|
||||||
title: Plano Orchestrator
|
|
||||||
- name: mimo
|
|
||||||
title: MiMo
|
|
||||||
- name: internvl3_5
|
|
||||||
title: InternVL 3.5
|
|
||||||
|
|
||||||
# AllenAI
|
|
||||||
- name: olmo3
|
|
||||||
title: OLMo 3
|
|
||||||
|
|
||||||
# ArceeAI
|
|
||||||
- name: trinity
|
|
||||||
title: Trinity
|
|
||||||
- name: arcee
|
|
||||||
title: Arcee AFM
|
|
||||||
|
|
||||||
# MistralAI
|
|
||||||
- name: ministral3/think
|
|
||||||
title: Ministral 3 Thinking
|
|
||||||
- name: ministral3/vision
|
|
||||||
title: Ministral 3 Vision
|
|
||||||
- name: magistral/think
|
|
||||||
title: Magistral Thinking
|
|
||||||
- name: magistral/vision
|
|
||||||
title: Magistral Vision
|
|
||||||
- name: ministral
|
|
||||||
title: Ministral
|
|
||||||
- name: mistral-small
|
|
||||||
title: Mistral Small 3.1/3.2
|
|
||||||
- name: voxtral
|
|
||||||
title: Voxtral
|
|
||||||
- name: devstral
|
|
||||||
title: Devstral
|
|
||||||
- name: mistral
|
|
||||||
title: Mistral 7B
|
|
||||||
|
|
||||||
# Meta
|
|
||||||
- name: llama-4
|
|
||||||
title: Llama 4
|
|
||||||
- name: llama-2
|
|
||||||
title: Llama 2
|
|
||||||
|
|
||||||
# Alibaba
|
|
||||||
- name: qwen3-next
|
|
||||||
title: Qwen 3 Next
|
|
||||||
- name: qwen3
|
|
||||||
title: Qwen 3
|
|
||||||
|
|
||||||
# Google
|
|
||||||
- name: gemma3n
|
|
||||||
title: Gemma 3n
|
|
||||||
|
|
||||||
# Swiss AI
|
|
||||||
- name: apertus
|
|
||||||
title: Apertus
|
|
||||||
|
|
||||||
# GPT-OSS
|
|
||||||
- name: gpt-oss
|
|
||||||
title: GPT-OSS
|
|
||||||
- name: seed-oss
|
|
||||||
title: Seed-OSS
|
|
||||||
|
|
||||||
# Microsoft
|
|
||||||
- name: phi
|
|
||||||
title: Phi
|
|
||||||
|
|
||||||
# SmolVLM
|
|
||||||
- name: smolvlm2
|
|
||||||
title: SmolVLM 2
|
|
||||||
|
|
||||||
# IBM
|
|
||||||
- name: granite4
|
|
||||||
title: Granite 4
|
|
||||||
|
|
||||||
# LiquidAI
|
|
||||||
- name: LiquidAI
|
|
||||||
title: Liquid Foundation Models 2
|
|
||||||
|
|
||||||
# Other
|
|
||||||
- name: hunyuan
|
|
||||||
title: Hunyuan
|
|
||||||
- name: jamba
|
|
||||||
title: Jamba
|
|
||||||
- name: orpheus
|
|
||||||
title: Orpheus
|
|
||||||
@@ -1,424 +0,0 @@
|
|||||||
"""
|
|
||||||
auto generate example docs from allowlist
|
|
||||||
"""
|
|
||||||
|
|
||||||
import re
|
|
||||||
import shutil
|
|
||||||
import sys
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
import yaml
|
|
||||||
|
|
||||||
# Paths
|
|
||||||
THIS = Path(__file__).resolve()
|
|
||||||
ROOT = THIS.parents[2] # repo root (docs/scripts -> docs -> ROOT)
|
|
||||||
EXAMPLES_DIR = ROOT / "examples"
|
|
||||||
OUTPUT_DIR = ROOT / "docs" / "models"
|
|
||||||
ALLOWLIST_YML = THIS.parent / "examples-allowlist.yml"
|
|
||||||
|
|
||||||
|
|
||||||
def slugify(name: str) -> str:
|
|
||||||
"""Convert a name to a slug (lowercase, hyphens for spaces)."""
|
|
||||||
s = re.sub(r"[^a-zA-Z0-9\s\-]+", "", name.strip())
|
|
||||||
s = re.sub(r"\s+", "-", s).strip("-").lower()
|
|
||||||
return s or "example"
|
|
||||||
|
|
||||||
|
|
||||||
def read_allowlist():
|
|
||||||
with open(ALLOWLIST_YML, "r", encoding="utf-8") as f:
|
|
||||||
data = yaml.safe_load(f) or {}
|
|
||||||
items = data.get("examples", [])
|
|
||||||
if not isinstance(items, list):
|
|
||||||
raise ValueError("`examples` must be a list in examples-allowlist.yml")
|
|
||||||
return items
|
|
||||||
|
|
||||||
|
|
||||||
def find_readme(folder: Path) -> Path | None:
|
|
||||||
for name in ("README.md", "Readme.md", "readme.md"):
|
|
||||||
p = folder / name
|
|
||||||
if p.exists():
|
|
||||||
return p
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def remove_first_h1(md: str) -> tuple[str, str | None]:
|
|
||||||
"""
|
|
||||||
Remove the first H1 from markdown and return (modified_md, h1_title).
|
|
||||||
The H1 is removed since we use the frontmatter title instead.
|
|
||||||
"""
|
|
||||||
lines = md.splitlines()
|
|
||||||
result = []
|
|
||||||
h1_title = None
|
|
||||||
skipped_first = False
|
|
||||||
|
|
||||||
for line in lines:
|
|
||||||
if not skipped_first and line.startswith("# "):
|
|
||||||
h1_title = line[2:].strip()
|
|
||||||
skipped_first = True
|
|
||||||
continue
|
|
||||||
result.append(line)
|
|
||||||
|
|
||||||
return "\n".join(result), h1_title
|
|
||||||
|
|
||||||
|
|
||||||
IMG_RE = re.compile(r"!\[[^\]]*\]\(([^)]+)\)")
|
|
||||||
LINK_RE = re.compile(r"\[([^\]]+)\]\(([^)]+)\)")
|
|
||||||
|
|
||||||
|
|
||||||
def rewrite_and_copy_assets(md: str, src_dir: Path, dest_assets_root: Path) -> str:
|
|
||||||
"""
|
|
||||||
Copy local image assets referenced in markdown to
|
|
||||||
docs/examples/assets/... and rewrite the links.
|
|
||||||
"""
|
|
||||||
dest_assets = dest_assets_root / "assets"
|
|
||||||
|
|
||||||
def repl(m):
|
|
||||||
url = m.group(1).strip()
|
|
||||||
if re.match(r"^(https?:)?//", url):
|
|
||||||
return m.group(0) # leave remote URLs
|
|
||||||
src_path = (src_dir / url).resolve()
|
|
||||||
if not src_path.exists():
|
|
||||||
return m.group(0) # leave as-is if not found
|
|
||||||
rel = src_path.relative_to(src_dir)
|
|
||||||
# Create a unique asset path based on source directory name
|
|
||||||
asset_name = src_dir.name.replace("/", "-")
|
|
||||||
dest_path = dest_assets / asset_name / rel
|
|
||||||
dest_path.parent.mkdir(parents=True, exist_ok=True)
|
|
||||||
shutil.copy2(src_path, dest_path)
|
|
||||||
new_rel = f"assets/{asset_name}/{rel.as_posix()}"
|
|
||||||
return m.group(0).replace(url, new_rel)
|
|
||||||
|
|
||||||
return IMG_RE.sub(repl, md)
|
|
||||||
|
|
||||||
|
|
||||||
def rewrite_readme_links(
|
|
||||||
md: str,
|
|
||||||
src_dir: Path,
|
|
||||||
examples_dir: Path,
|
|
||||||
parent_index_only: set,
|
|
||||||
current_src_path: str,
|
|
||||||
allowlist_entries: set,
|
|
||||||
current_output_path: str,
|
|
||||||
) -> str:
|
|
||||||
"""
|
|
||||||
Rewrite links between README.md files to point to the correct .qmd files.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def repl(m):
|
|
||||||
text = m.group(1)
|
|
||||||
url = m.group(2).strip()
|
|
||||||
|
|
||||||
# Skip remote URLs and anchor links
|
|
||||||
if re.match(r"^(https?:)?//", url) or url.startswith("#"):
|
|
||||||
return m.group(0)
|
|
||||||
|
|
||||||
# Skip non-markdown files
|
|
||||||
if not url.lower().endswith(".md"):
|
|
||||||
return m.group(0)
|
|
||||||
|
|
||||||
# Resolve the target path
|
|
||||||
try:
|
|
||||||
target_path = (src_dir / url).resolve()
|
|
||||||
|
|
||||||
# Check if target is outside examples_dir
|
|
||||||
try:
|
|
||||||
rel_path = target_path.relative_to(examples_dir)
|
|
||||||
except ValueError:
|
|
||||||
# Target is outside examples_dir, leave as-is
|
|
||||||
return m.group(0)
|
|
||||||
|
|
||||||
parts = list(rel_path.parts)
|
|
||||||
|
|
||||||
# Determine the output path for the target
|
|
||||||
if len(parts) > 0 and parts[-1].lower() in ("readme.md", "readme"):
|
|
||||||
# This is a README link
|
|
||||||
if len(parts) == 1:
|
|
||||||
# Link to root README -> index.qmd
|
|
||||||
target_output = "index.qmd"
|
|
||||||
elif len(parts) == 2:
|
|
||||||
if parts[0] == ".":
|
|
||||||
# Current directory README
|
|
||||||
target_output = "index.qmd"
|
|
||||||
else:
|
|
||||||
# subdir/README.md
|
|
||||||
parent_dir = parts[0]
|
|
||||||
if parent_dir in parent_index_only:
|
|
||||||
target_output = f"{parent_dir}/index.qmd"
|
|
||||||
else:
|
|
||||||
target_output = f"{parent_dir}.qmd"
|
|
||||||
else:
|
|
||||||
# Deeper nesting: parent/subdir/README.md
|
|
||||||
# Build the full path like "parent/subdir"
|
|
||||||
full_path = "/".join(parts[:-1]) # Remove README.md
|
|
||||||
# Check if this exact path is in allowlist
|
|
||||||
if full_path in allowlist_entries:
|
|
||||||
# This is a sub-entry with its own entry -> use .qmd
|
|
||||||
target_output = f"{full_path}.qmd"
|
|
||||||
elif parts[0] == ".":
|
|
||||||
# ./subdir/README.md -> check if subdir has own entry
|
|
||||||
subdir = parts[1]
|
|
||||||
if subdir in parent_index_only:
|
|
||||||
target_output = f"{subdir}/index.qmd"
|
|
||||||
else:
|
|
||||||
target_output = f"{subdir}.qmd"
|
|
||||||
else:
|
|
||||||
# parent/subdir where parent doesn't have own entry
|
|
||||||
target_output = f"{full_path}/index.qmd"
|
|
||||||
else:
|
|
||||||
# Regular .md file -> convert to .qmd, keep path structure
|
|
||||||
target_output = "/".join(parts)[:-2] + "qmd"
|
|
||||||
|
|
||||||
# Compute relative path from current output file to target
|
|
||||||
current_parts = current_output_path.split("/")
|
|
||||||
target_parts = target_output.split("/")
|
|
||||||
|
|
||||||
# Special case: if current is a subdir file and target is a single-component file at root
|
|
||||||
# Example: current="magistral/vision", target="magistral.qmd"
|
|
||||||
if len(current_parts) > 1 and len(target_parts) == 1:
|
|
||||||
# Current is in subdir, target is at root level
|
|
||||||
# Go up to root: ../ for each level
|
|
||||||
up_count = len(current_parts) - 1
|
|
||||||
rel_parts = [".."] * up_count + [target_parts[0]]
|
|
||||||
new_url = "/".join(rel_parts)
|
|
||||||
else:
|
|
||||||
# Find common prefix
|
|
||||||
i = 0
|
|
||||||
while (
|
|
||||||
i < min(len(current_parts) - 1, len(target_parts))
|
|
||||||
and current_parts[i] == target_parts[i]
|
|
||||||
):
|
|
||||||
i += 1
|
|
||||||
|
|
||||||
# Build relative path: go up (../) then down to target
|
|
||||||
up_count = len(current_parts) - 1 - i
|
|
||||||
rel_parts = [".."] * up_count + target_parts[i:]
|
|
||||||
|
|
||||||
if not rel_parts or rel_parts == [".."]:
|
|
||||||
# Points to same directory or parent
|
|
||||||
new_url = "/".join(rel_parts) if rel_parts else "."
|
|
||||||
else:
|
|
||||||
new_url = "/".join(rel_parts)
|
|
||||||
|
|
||||||
return f"[{text}]({new_url})"
|
|
||||||
except (ValueError, IndexError):
|
|
||||||
return m.group(0)
|
|
||||||
|
|
||||||
return LINK_RE.sub(repl, md)
|
|
||||||
|
|
||||||
|
|
||||||
def write_qmd(out_path: Path, title: str, body_md: str):
|
|
||||||
out_path.parent.mkdir(parents=True, exist_ok=True)
|
|
||||||
fm = f"---\ntitle: {title!r}\nexecute:\n eval: false\nformat:\n html:\n toc: true\n---\n\n"
|
|
||||||
out_path.write_text(fm + body_md, encoding="utf-8")
|
|
||||||
|
|
||||||
|
|
||||||
def update_quarto_yml(generated: list[tuple[str, str, str]]):
|
|
||||||
"""
|
|
||||||
Update _quarto.yml with the generated example files in the correct order.
|
|
||||||
This keeps the sidebar in sync with the allowlist.
|
|
||||||
|
|
||||||
Model Guides is now nested under "Getting Started" section.
|
|
||||||
Creates nested sections for models with sub-entries (e.g., magistral, ministral3).
|
|
||||||
Parent pages are now flat files (e.g., ministral3.qmd) with sub-pages in subdirs.
|
|
||||||
"""
|
|
||||||
quarto_yml = ROOT / "_quarto.yml"
|
|
||||||
if not quarto_yml.exists():
|
|
||||||
print(f"[WARN] {quarto_yml} not found, skipping update", file=sys.stderr)
|
|
||||||
return
|
|
||||||
|
|
||||||
content = quarto_yml.read_text(encoding="utf-8")
|
|
||||||
|
|
||||||
# First pass: find all parents that have sub-entries
|
|
||||||
parents_with_subs = set()
|
|
||||||
for path, _name, _title in generated:
|
|
||||||
if "/" in path:
|
|
||||||
parent = path.split("/")[0]
|
|
||||||
parents_with_subs.add(parent)
|
|
||||||
|
|
||||||
# Build the YAML contents while preserving allowlist order
|
|
||||||
lines = []
|
|
||||||
processed_sections = set()
|
|
||||||
|
|
||||||
for path, _name, title in generated:
|
|
||||||
# Check if this is a parent page that has sub-pages
|
|
||||||
if path in parents_with_subs:
|
|
||||||
# This is a parent page with sub-pages - create a nested section
|
|
||||||
if path not in processed_sections:
|
|
||||||
processed_sections.add(path)
|
|
||||||
section_title = (
|
|
||||||
title or path.replace("-", " ").replace("_", " ").title()
|
|
||||||
)
|
|
||||||
lines.append(f' - section: "{section_title}"')
|
|
||||||
lines.append(" contents:")
|
|
||||||
# Add the parent page first
|
|
||||||
lines.append(f" - docs/models/{path}.qmd")
|
|
||||||
# Then add all sub-pages
|
|
||||||
for sub_path, _sub_name, _sub_title in generated:
|
|
||||||
if "/" in sub_path and sub_path.split("/")[0] == path:
|
|
||||||
lines.append(
|
|
||||||
f" - docs/models/{sub_path}.qmd"
|
|
||||||
)
|
|
||||||
elif "/" not in path:
|
|
||||||
# This is a flat item with no sub-pages
|
|
||||||
# Skip if it was already included as part of a parent section
|
|
||||||
if path not in processed_sections:
|
|
||||||
lines.append(f" - docs/models/{path}.qmd")
|
|
||||||
|
|
||||||
yaml_content = "\n".join(lines) + "\n"
|
|
||||||
|
|
||||||
# Pattern to match only the Model Guides contents, stopping at the next item
|
|
||||||
# in Getting Started (lines starting with 12 spaces: same level as the section)
|
|
||||||
pattern = r'( - section: "Model Guides"\n contents:)([^\n]*|.*?)(?=\n - |\n - section:|\n\nformat:)'
|
|
||||||
|
|
||||||
def replacement(match):
|
|
||||||
prefix = match.group(1)
|
|
||||||
return prefix + "\n" + yaml_content
|
|
||||||
|
|
||||||
new_content = re.sub(pattern, replacement, content, flags=re.DOTALL)
|
|
||||||
|
|
||||||
if new_content != content:
|
|
||||||
quarto_yml.write_text(new_content, encoding="utf-8")
|
|
||||||
print(f"Updated {quarto_yml}")
|
|
||||||
else:
|
|
||||||
print(f"No changes needed for {quarto_yml}")
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
allow = read_allowlist()
|
|
||||||
if not EXAMPLES_DIR.exists():
|
|
||||||
print(f"[WARN] {EXAMPLES_DIR} not found", file=sys.stderr)
|
|
||||||
return
|
|
||||||
|
|
||||||
(OUTPUT_DIR / "assets").mkdir(parents=True, exist_ok=True)
|
|
||||||
|
|
||||||
# First pass: identify which parents have their own entry vs only sub-entries
|
|
||||||
parent_entries = set() # Parents that have their own entry
|
|
||||||
parent_with_subs = set() # Parents that have sub-entries
|
|
||||||
allowlist_entries = set() # All entries in allowlist
|
|
||||||
|
|
||||||
for item in allow:
|
|
||||||
if isinstance(item, str):
|
|
||||||
name = item
|
|
||||||
else:
|
|
||||||
name = item.get("name")
|
|
||||||
|
|
||||||
allowlist_entries.add(name)
|
|
||||||
|
|
||||||
if "/" in name:
|
|
||||||
parent = name.split("/")[0]
|
|
||||||
parent_with_subs.add(parent)
|
|
||||||
else:
|
|
||||||
parent_entries.add(name)
|
|
||||||
|
|
||||||
# Parents with subs that DON'T have their own entry -> use index.qmd
|
|
||||||
parent_index_only = parent_with_subs - parent_entries
|
|
||||||
|
|
||||||
generated = []
|
|
||||||
seen_dirs = set() # Track which parent directories we've created index for
|
|
||||||
|
|
||||||
for item in allow:
|
|
||||||
if isinstance(item, str):
|
|
||||||
name = item
|
|
||||||
title = None
|
|
||||||
else:
|
|
||||||
name = item.get("name")
|
|
||||||
title = item.get("title")
|
|
||||||
|
|
||||||
if not name:
|
|
||||||
print(f"[WARN] Skipping item without name: {item}", file=sys.stderr)
|
|
||||||
continue
|
|
||||||
|
|
||||||
src_dir = EXAMPLES_DIR / name
|
|
||||||
if not src_dir.exists() or not src_dir.is_dir():
|
|
||||||
print(f"[WARN] Skipping {name} (not a directory)", file=sys.stderr)
|
|
||||||
continue
|
|
||||||
|
|
||||||
readme = find_readme(src_dir)
|
|
||||||
if not readme:
|
|
||||||
print(f"[WARN] Skipping {name} (no README.md)", file=sys.stderr)
|
|
||||||
continue
|
|
||||||
|
|
||||||
md = readme.read_text(encoding="utf-8")
|
|
||||||
|
|
||||||
# Determine output path first (needed for link rewriting)
|
|
||||||
parts = name.split("/")
|
|
||||||
if len(parts) == 1:
|
|
||||||
# Simple case: no subdirectory
|
|
||||||
out_path = OUTPUT_DIR / f"{parts[0]}.qmd"
|
|
||||||
sidebar_path = parts[0]
|
|
||||||
else:
|
|
||||||
# Has subdirectory: e.g., magistral/think
|
|
||||||
parent = parts[0]
|
|
||||||
child = "-".join(parts[1:]) # handle nested subdirs
|
|
||||||
out_path = OUTPUT_DIR / parent / f"{child}.qmd"
|
|
||||||
sidebar_path = f"{parent}/{child}"
|
|
||||||
|
|
||||||
# Remove the first H1 (we use frontmatter title instead)
|
|
||||||
md, _ = remove_first_h1(md)
|
|
||||||
# Rewrite links between README files
|
|
||||||
md = rewrite_readme_links(
|
|
||||||
md,
|
|
||||||
src_dir,
|
|
||||||
EXAMPLES_DIR,
|
|
||||||
parent_index_only,
|
|
||||||
name,
|
|
||||||
allowlist_entries,
|
|
||||||
sidebar_path,
|
|
||||||
)
|
|
||||||
md = rewrite_and_copy_assets(md, src_dir, OUTPUT_DIR)
|
|
||||||
|
|
||||||
# Handle parent page generation for sub-entries
|
|
||||||
if len(parts) > 1:
|
|
||||||
# Has subdirectory: e.g., magistral/think
|
|
||||||
parent = parts[0]
|
|
||||||
|
|
||||||
# Create parent.qmd if not already done and parent doesn't have own entry
|
|
||||||
if parent not in seen_dirs and parent in parent_index_only:
|
|
||||||
parent_readme = find_readme(EXAMPLES_DIR / parent)
|
|
||||||
if parent_readme:
|
|
||||||
parent_md = parent_readme.read_text(encoding="utf-8")
|
|
||||||
parent_md, _ = remove_first_h1(parent_md)
|
|
||||||
parent_md = rewrite_readme_links(
|
|
||||||
parent_md,
|
|
||||||
EXAMPLES_DIR / parent,
|
|
||||||
EXAMPLES_DIR,
|
|
||||||
parent_index_only,
|
|
||||||
parent,
|
|
||||||
allowlist_entries,
|
|
||||||
parent,
|
|
||||||
)
|
|
||||||
parent_md = rewrite_and_copy_assets(
|
|
||||||
parent_md, EXAMPLES_DIR / parent, OUTPUT_DIR
|
|
||||||
)
|
|
||||||
parent_title = parent.replace("-", " ").replace("_", " ").title()
|
|
||||||
write_qmd(OUTPUT_DIR / f"{parent}.qmd", parent_title, parent_md)
|
|
||||||
generated.append((parent, parent, parent_title))
|
|
||||||
seen_dirs.add(parent)
|
|
||||||
|
|
||||||
if not title:
|
|
||||||
title = name.replace("/", " ").replace("-", " ").title()
|
|
||||||
|
|
||||||
write_qmd(out_path, title, md)
|
|
||||||
generated.append((sidebar_path, name, title))
|
|
||||||
|
|
||||||
# Index page - preserve allowlist order
|
|
||||||
if generated:
|
|
||||||
listing = "\n".join(
|
|
||||||
[f"- [{title}]({path}.qmd)" for path, name, title in generated]
|
|
||||||
)
|
|
||||||
index_md = (
|
|
||||||
"# Model Guides\n\nBelow are the curated examples for training various model architectures:\n\n"
|
|
||||||
+ listing
|
|
||||||
+ "\n"
|
|
||||||
)
|
|
||||||
index_fm = (
|
|
||||||
"---\nexecute:\n eval: false\nformat:\n html:\n toc: true\n---\n\n"
|
|
||||||
)
|
|
||||||
(OUTPUT_DIR / "index.qmd").write_text(index_fm + index_md, encoding="utf-8")
|
|
||||||
|
|
||||||
# Auto-update _quarto.yml to keep sidebar in sync
|
|
||||||
update_quarto_yml(generated)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
||||||
@@ -49,9 +49,9 @@ When sequence parallelism is enabled:
|
|||||||
To use sequence parallelism, you need:
|
To use sequence parallelism, you need:
|
||||||
|
|
||||||
- Multiple GPUs (at least 2)
|
- Multiple GPUs (at least 2)
|
||||||
- The `ring-flash-attn` package. Install with:
|
- The `ring-flash-attn` package. Install with either `uv sync --extra ring-flash-attn`
|
||||||
- `pip install axolotl[ring-flash-attn]` (preferred)
|
(from a cloned repository) or `uv pip install ring-flash-attn>=0.1.4`.
|
||||||
- `pip install ring-flash-attn>=0.1.4`
|
- Flash Attention installed separately with `uv pip install flash-attn --no-build-isolation`.
|
||||||
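Once the packages above are installed, enabling it is a config change; a minimal sketch (the `context_parallel_size` key is the one used in the RLHF examples in this compare):

```yaml
context_parallel_size: 2   # split each sequence across 2 GPUs
flash_attention: true
```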
|
|
||||||
## Limitations
|
## Limitations
|
||||||
|
|
||||||
|
|||||||
@@ -1,61 +0,0 @@
|
|||||||
---
|
|
||||||
title: Telemetry
|
|
||||||
description: A description of the telemetry implementation in Axolotl.
|
|
||||||
---
|
|
||||||
|
|
||||||
# Telemetry in Axolotl
|
|
||||||
|
|
||||||
Axolotl implements anonymous telemetry to help maintainers understand how the library
|
|
||||||
is used and where users encounter issues. This data helps prioritize features, optimize
|
|
||||||
performance, and fix bugs.
|
|
||||||
|
|
||||||
## Data Collection
|
|
||||||
|
|
||||||
We collect:
|
|
||||||
|
|
||||||
- System info: OS, Python version, Axolotl version, PyTorch version, Transformers
|
|
||||||
version, etc.
|
|
||||||
- Hardware info: CPU count, memory, GPU count and models
|
|
||||||
- Runtime metrics: Training progress, memory usage, timing information
|
|
||||||
- Usage patterns: Models (from a whitelist) and configurations used
|
|
||||||
- Error tracking: Stack traces and error messages (sanitized to remove personal
|
|
||||||
information)
|
|
||||||
|
|
||||||
Personally identifiable information (PII) is not collected.
|
|
||||||
|
|
||||||
## Implementation
|
|
||||||
|
|
||||||
Telemetry is implemented using PostHog and consists of:
|
|
||||||
|
|
||||||
- `axolotl.telemetry.TelemetryManager`: A singleton class that initializes the
|
|
||||||
telemetry system and provides methods for tracking events.
|
|
||||||
- `axolotl.telemetry.errors.send_errors`: A decorator that captures exceptions and
|
|
||||||
sends sanitized stack traces.
|
|
||||||
- `axolotl.telemetry.runtime_metrics.RuntimeMetricsTracker`: A class that tracks
|
|
||||||
runtime metrics during training.
|
|
||||||
- `axolotl.telemetry.callbacks.TelemetryCallback`: A Trainer callback that sends
|
|
||||||
runtime metrics telemetry.
|
|
||||||
|
|
||||||
The telemetry system will block training startup for 10 seconds to ensure users are
|
|
||||||
aware of data collection, unless telemetry is explicitly enabled or disabled.
|
|
||||||
|
|
||||||
## Opt-Out Mechanism
|
|
||||||
|
|
||||||
Telemetry is **enabled by default** on an opt-out basis. To disable it, set
|
|
||||||
`AXOLOTL_DO_NOT_TRACK=1` or `DO_NOT_TRACK=1`.
|
|
||||||
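For example, to opt out for a single invocation (using the `axolotl train` entry point shown elsewhere in these docs):

```bash
# Disable telemetry for this run only
AXOLOTL_DO_NOT_TRACK=1 axolotl train config.yaml
```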
|
|
||||||
A warning message will be logged on start to clearly inform users about telemetry.
|
|
||||||
We will remove this warning after some period.
|
|
||||||
|
|
||||||
To hide the telemetry warning message that is displayed at startup (e.g., when running `train`),
|
|
||||||
explicitly set: `AXOLOTL_DO_NOT_TRACK=0` (enable telemetry) or `AXOLOTL_DO_NOT_TRACK=1`
|
|
||||||
(explicitly disable telemetry).
|
|
||||||
|
|
||||||
## Privacy
|
|
||||||
|
|
||||||
- All path-like config information is automatically redacted from telemetry data
|
|
||||||
- Model information is only collected for whitelisted organizations
|
|
||||||
- See `axolotl/telemetry/whitelist.yaml` for the set of whitelisted organizations
|
|
||||||
- Each run generates a unique anonymous ID
|
|
||||||
- This allows us to link different telemetry events within the same training run
|
|
||||||
- Telemetry is only sent from the main process to avoid duplicate events
|
|
||||||
@@ -6,17 +6,20 @@ LFM2 features a new hybrid Liquid architecture with multiplicative gates, short-
|
|||||||
|
|
||||||
This guide shows how to fine-tune both the LFM2 and LFM2-VL models with Axolotl.
|
This guide shows how to fine-tune both the LFM2 and LFM2-VL models with Axolotl.
|
||||||
|
|
||||||
Thanks to the team at LiquidAI for giving us early access to prepare for these releases.
|
|
||||||
|
|
||||||
## Getting Started
|
## Getting Started
|
||||||
|
|
||||||
1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).
|
1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).
|
||||||
|
|
||||||
Here is an example of how to install:
|
Here is an example of how to install:
|
||||||
```bash
|
```bash
|
||||||
# Ensure you have a compatible version of Pytorch installed
|
# Ensure you have a compatible version of PyTorch installed
|
||||||
pip3 install packaging setuptools wheel ninja
|
# Option A: manage dependencies in your project
|
||||||
pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
|
uv add 'axolotl>=0.12.0'
|
||||||
|
uv pip install flash-attn --no-build-isolation
|
||||||
|
|
||||||
|
# Option B: quick install
|
||||||
|
uv pip install 'axolotl>=0.12.0'
|
||||||
|
uv pip install flash-attn --no-build-isolation
|
||||||
```
|
```
|
||||||
|
|
||||||
2. Run one of the finetuning examples below.
|
2. Run one of the finetuning examples below.
|
||||||
@@ -33,19 +36,11 @@ Thanks to the team at LiquidAI for giving us early access to prepare for these r
|
|||||||
axolotl train examples/LiquidAI/lfm2-vl-lora.yaml
|
axolotl train examples/LiquidAI/lfm2-vl-lora.yaml
|
||||||
```
|
```
|
||||||
|
|
||||||
**LFM2-MoE**
|
|
||||||
```bash
|
|
||||||
pip install git+https://github.com/huggingface/transformers.git@0c9a72e4576fe4c84077f066e585129c97bfd4e6
|
|
||||||
|
|
||||||
# LoRA SFT (1x48GB @ 16.2GiB)
|
|
||||||
axolotl train examples/LiquidAI/lfm2-8b-a1b-lora.yaml
|
|
||||||
```
|
|
||||||
|
|
||||||
### TIPS
|
### TIPS
|
||||||
|
|
||||||
- **Installation Error**: If you encounter `ImportError: ... undefined symbol ...` or `ModuleNotFoundError: No module named 'causal_conv1d_cuda'`, the `causal-conv1d` package may have been installed incorrectly. Try uninstalling it:
|
- **Installation Error**: If you encounter `ImportError: ... undefined symbol ...` or `ModuleNotFoundError: No module named 'causal_conv1d_cuda'`, the `causal-conv1d` package may have been installed incorrectly. Try uninstalling it:
|
||||||
```bash
|
```bash
|
||||||
pip uninstall -y causal-conv1d
|
uv pip uninstall -y causal-conv1d
|
||||||
```
|
```
|
||||||
|
|
||||||
- **Dataset Loading**: Read more on how to load your own dataset in our [documentation](https://docs.axolotl.ai/docs/dataset_loading.html).
|
- **Dataset Loading**: Read more on how to load your own dataset in our [documentation](https://docs.axolotl.ai/docs/dataset_loading.html).
|
||||||
@@ -55,13 +50,14 @@ Thanks to the team at LiquidAI for giving us early access to prepare for these r
|
|||||||
|
|
||||||
## Optimization Guides
|
## Optimization Guides
|
||||||
|
|
||||||
- [Optimizations Guide](https://docs.axolotl.ai/docs/optimizations.html)
|
- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
|
||||||
|
- [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html)
|
||||||
|
- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
|
||||||
|
|
||||||
## Related Resources
|
## Related Resources
|
||||||
|
|
||||||
- [LFM2 Blog](https://www.liquid.ai/blog/liquid-foundation-models-v2-our-second-series-of-generative-ai-models)
|
- [LFM2 Blog](https://www.liquid.ai/blog/liquid-foundation-models-v2-our-second-series-of-generative-ai-models)
|
||||||
- [LFM2-VL Blog](https://www.liquid.ai/blog/lfm2-vl-efficient-vision-language-models)
|
- [LFM2-VL Blog](https://www.liquid.ai/blog/lfm2-vl-efficient-vision-language-models)
|
||||||
- [LFM2-MoE Blog](https://www.liquid.ai/blog/lfm2-8b-a1b-an-efficient-on-device-mixture-of-experts)
|
|
||||||
- [Axolotl Docs](https://docs.axolotl.ai)
|
- [Axolotl Docs](https://docs.axolotl.ai)
|
||||||
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
|
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
|
||||||
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
|
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
|
||||||
|
|||||||
@@ -1,7 +1,6 @@
|
|||||||
base_model: LiquidAI/LFM2-350M
|
base_model: LiquidAI/LFM2-350M
|
||||||
|
|
||||||
plugins:
|
chunked_cross_entropy: true
|
||||||
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
|
|
||||||
|
|
||||||
eot_tokens:
|
eot_tokens:
|
||||||
- "<|im_end|>"
|
- "<|im_end|>"
|
||||||
|
|||||||
@@ -1,59 +0,0 @@
|
|||||||
base_model: LiquidAI/LFM2-8B-A1B
|
|
||||||
|
|
||||||
plugins:
|
|
||||||
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
|
|
||||||
|
|
||||||
load_in_8bit: true
|
|
||||||
|
|
||||||
eot_tokens:
|
|
||||||
- "<|im_end|>"
|
|
||||||
datasets:
|
|
||||||
- path: mlabonne/FineTome-100k
|
|
||||||
type: chat_template
|
|
||||||
split: train[:20%]
|
|
||||||
field_messages: conversations
|
|
||||||
message_field_role: from
|
|
||||||
message_field_content: value
|
|
||||||
dataset_prepared_path: last_run_prepared
|
|
||||||
val_set_size: 0.05
|
|
||||||
output_dir: ./outputs/out
|
|
||||||
|
|
||||||
sequence_len: 4096
|
|
||||||
sample_packing: true
|
|
||||||
|
|
||||||
adapter: lora
|
|
||||||
lora_model_dir:
|
|
||||||
|
|
||||||
lora_r: 32
|
|
||||||
lora_alpha: 16
|
|
||||||
lora_dropout: 0.05
|
|
||||||
lora_target_modules: 'model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
|
|
||||||
|
|
||||||
wandb_project:
|
|
||||||
wandb_entity:
|
|
||||||
wandb_watch:
|
|
||||||
wandb_name:
|
|
||||||
wandb_log_model:
|
|
||||||
|
|
||||||
gradient_accumulation_steps: 2
|
|
||||||
micro_batch_size: 4
|
|
||||||
num_epochs: 1
|
|
||||||
optimizer: adamw_torch_fused
|
|
||||||
lr_scheduler: cosine
|
|
||||||
learning_rate: 5e-5
|
|
||||||
|
|
||||||
bf16: true
|
|
||||||
tf32: true
|
|
||||||
|
|
||||||
gradient_checkpointing: true
|
|
||||||
resume_from_checkpoint:
|
|
||||||
logging_steps: 1
|
|
||||||
flash_attention: true
|
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
|
||||||
evals_per_epoch: 2
|
|
||||||
saves_per_epoch: 1
|
|
||||||
|
|
||||||
weight_decay: 0.0
|
|
||||||
|
|
||||||
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
|
||||||
@@ -3,9 +3,6 @@ trust_remote_code: true
|
|||||||
model_type: AutoModelForImageTextToText
|
model_type: AutoModelForImageTextToText
|
||||||
processor_type: AutoProcessor
|
processor_type: AutoProcessor
|
||||||
|
|
||||||
plugins:
|
|
||||||
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
|
|
||||||
|
|
||||||
# these 3 lines are needed for now to handle vision chat templates w images
|
# these 3 lines are needed for now to handle vision chat templates w images
|
||||||
skip_prepare_dataset: true
|
skip_prepare_dataset: true
|
||||||
remove_unused_columns: false
|
remove_unused_columns: false
|
||||||
|
|||||||
@@ -15,8 +15,8 @@ This guide shows how to fine-tune it with Axolotl with multi-turn conversations
|
|||||||
git clone https://github.com/axolotl-ai-cloud/axolotl.git
|
git clone https://github.com/axolotl-ai-cloud/axolotl.git
|
||||||
cd axolotl
|
cd axolotl
|
||||||
|
|
||||||
pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
|
uv sync
|
||||||
pip3 install --no-build-isolation -e '.[flash-attn]'
|
uv pip install flash-attn --no-build-isolation
|
||||||
|
|
||||||
# Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
|
# Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
|
||||||
python scripts/cutcrossentropy_install.py | sh
|
python scripts/cutcrossentropy_install.py | sh
|
||||||
@@ -31,7 +31,7 @@ python scripts/cutcrossentropy_install.py | sh
|
|||||||
# For those using our Docker image, use the below path.
|
# For those using our Docker image, use the below path.
|
||||||
export CUDA_HOME=/usr/local/cuda
|
export CUDA_HOME=/usr/local/cuda
|
||||||
|
|
||||||
pip3 install git+https://github.com/nickjbrowning/XIELU@59d6031 --no-build-isolation --no-deps
|
uv pip install git+https://github.com/nickjbrowning/XIELU@59d6031 --no-build-isolation --no-deps
|
||||||
```
|
```
|
||||||
|
|
||||||
For any installation errors, see [XIELU Installation Issues](#xielu-installation-issues)
|
For any installation errors, see [XIELU Installation Issues](#xielu-installation-issues)
|
||||||
@@ -67,7 +67,7 @@ If those didn't help, please try the below solutions:
|
|||||||
1. Pass env for CMAKE and try install again:
|
1. Pass env for CMAKE and try install again:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
Python_EXECUTABLE=$(which python) pip3 install git+https://github.com/nickjbrowning/XIELU@59d6031 --no-build-isolation --no-deps
|
Python_EXECUTABLE=$(which python) uv pip install git+https://github.com/nickjbrowning/XIELU@59d6031 --no-build-isolation --no-deps
|
||||||
```
|
```
|
||||||
|
|
||||||
2. Git clone the repo and manually hardcode python path:
|
2. Git clone the repo and manually hardcode python path:
|
||||||
@@ -92,7 +92,7 @@ If those didn't help, please try the below solutions:
|
|||||||
```
|
```
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
pip3 install . --no-build-isolation --no-deps
|
uv pip install . --no-build-isolation --no-deps
|
||||||
```
|
```
|
||||||
|
|
||||||
## Optimization Guides
|
## Optimization Guides
|
||||||
|
|||||||
@@ -17,8 +17,8 @@ Thanks to the team at Arcee.ai for using Axolotl in supervised fine-tuning the A
|
|||||||
git clone https://github.com/axolotl-ai-cloud/axolotl.git
|
git clone https://github.com/axolotl-ai-cloud/axolotl.git
|
||||||
cd axolotl
|
cd axolotl
|
||||||
|
|
||||||
pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
|
uv sync
|
||||||
pip3 install --no-build-isolation -e '.[flash-attn]'
|
uv pip install flash-attn --no-build-isolation
|
||||||
|
|
||||||
# Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
|
# Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
|
||||||
python scripts/cutcrossentropy_install.py | sh
|
python scripts/cutcrossentropy_install.py | sh
|
||||||
|
|||||||
@@ -12,10 +12,10 @@
|
|||||||
"\n",
|
"\n",
|
||||||
"Axolotl is the most performant LLM post-training framework available, delivering faster training with efficient, consistent and stable performance. Train your workload and ship your product 30% faster; saving you both time and money.\n",
|
"Axolotl is the most performant LLM post-training framework available, delivering faster training with efficient, consistent and stable performance. Train your workload and ship your product 30% faster; saving you both time and money.\n",
|
||||||
"\n",
|
"\n",
|
||||||
"- ⭐ us on [GitHub](https://github.com/axolotl-ai-cloud/axolotl)\n",
|
"- \u2b50 us on [GitHub](https://github.com/axolotl-ai-cloud/axolotl)\n",
|
||||||
"- 📜 Read the [Docs](http://docs.axolotl.ai/)\n",
|
"- \ud83d\udcdc Read the [Docs](http://docs.axolotl.ai/)\n",
|
||||||
"- 💬 Chat with us on [Discord](https://discord.gg/mnpEYgRUmD)\n",
|
"- \ud83d\udcac Chat with us on [Discord](https://discord.gg/mnpEYgRUmD)\n",
|
||||||
"- 📰 Get updates on [X/Twitter](https://x.com/axolotl_ai)\n"
|
"- \ud83d\udcf0 Get updates on [X/Twitter](https://x.com/axolotl_ai)\n"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -39,8 +39,8 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"%%capture\n",
|
"%%capture\n",
|
||||||
"# This step can take ~5-10 minutes to install dependencies\n",
|
"# This step can take ~5-10 minutes to install dependencies\n",
|
||||||
"!pip install --no-build-isolation axolotl[flash-attn]>=0.9.1\n",
|
"!uv pip install --no-build-isolation axolotl>=0.9.1\n!uv pip install flash-attn --no-build-isolation\n",
|
||||||
"!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@63b15e6\""
|
"!uv pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@147ea28\""
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -253,6 +253,7 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"from axolotl.utils import set_pytorch_cuda_alloc_conf\n",
|
"from axolotl.utils import set_pytorch_cuda_alloc_conf\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
"# Set \"PYTORCH_CUDA_ALLOC_CONF\" env to save memory\n",
|
||||||
"set_pytorch_cuda_alloc_conf()"
|
"set_pytorch_cuda_alloc_conf()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
[Hunks @@ -1370 through @@ -9930 touch only Jupyter widget/output metadata in this notebook: progress-bar labels, placeholders, and byte-count strings are re-encoded with Unicode escapes (zero-width space \u200b, figure space \u2007, ellipsis \u2026). No code or prose cells change.]
@@ -16,8 +16,13 @@ Thanks to the team at MistralAI for giving us early access to prepare for this r

 ```bash
 # Ensure you have Pytorch installed (Pytorch 2.6.0 min)
-pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
-pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
+# Option A: manage dependencies in your project
+uv add 'axolotl>=0.12.0'
+uv pip install flash-attn --no-build-isolation
+
+# Option B: quick install
+uv pip install 'axolotl>=0.12.0'
+uv pip install flash-attn --no-build-isolation
 ```

 2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage
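After installing with either option above, a quick sanity check that the core packages import cleanly can save a failed first run. This is a minimal sketch; it assumes both `axolotl` and `flash_attn` expose a `__version__` attribute:

```bash
python -c "import axolotl, flash_attn; print(axolotl.__version__, flash_attn.__version__)"
```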
@@ -52,7 +52,6 @@ gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-scaling_softmax: true

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
@@ -1,77 +0,0 @@
|
|||||||
base_model: google/gemma-3-1b-it
|
|
||||||
|
|
||||||
model_type: Gemma3ForCausalLM
|
|
||||||
cls_model_config: Gemma3TextConfig
|
|
||||||
|
|
||||||
# gemma3 doesn't seem to play nice with ddp
|
|
||||||
ddp_find_unused_parameters: true
|
|
||||||
|
|
||||||
chat_template: gemma3
|
|
||||||
eot_tokens:
|
|
||||||
- <end_of_turn>
|
|
||||||
|
|
||||||
load_in_8bit: false
|
|
||||||
load_in_4bit: false
|
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
|
||||||
- path: cgato/SlimOrcaDedupCleaned
|
|
||||||
type: chat_template
|
|
||||||
field_messages: conversations
|
|
||||||
message_property_mappings:
|
|
||||||
role: from
|
|
||||||
content: value
|
|
||||||
|
|
||||||
dataset_prepared_path:
|
|
||||||
val_set_size: 0
|
|
||||||
output_dir: ./outputs/eaft-gemma-3-1b
|
|
||||||
|
|
||||||
use_eaft: true
|
|
||||||
eaft_alpha: 1.0
|
|
||||||
eaft_k: 20
|
|
||||||
|
|
||||||
sequence_len: 1024
|
|
||||||
sample_packing: false
|
|
||||||
|
|
||||||
adapter:
|
|
||||||
lora_model_dir:
|
|
||||||
|
|
||||||
wandb_project:
|
|
||||||
wandb_entity:
|
|
||||||
wandb_watch:
|
|
||||||
wandb_name:
|
|
||||||
wandb_log_model:
|
|
||||||
|
|
||||||
gradient_accumulation_steps: 4
|
|
||||||
micro_batch_size: 1
|
|
||||||
eval_batch_size: 1
|
|
||||||
max_steps: 1000
|
|
||||||
evaluation_strategy: "no"
|
|
||||||
optimizer: adamw_torch_fused
|
|
||||||
lr_scheduler: cosine
|
|
||||||
learning_rate: 5e-5
|
|
||||||
|
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: auto
|
|
||||||
fp16:
|
|
||||||
tf32: true
|
|
||||||
|
|
||||||
gradient_checkpointing: true
|
|
||||||
gradient_checkpointing_kwargs:
|
|
||||||
use_reentrant: false
|
|
||||||
|
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
|
||||||
local_rank:
|
|
||||||
logging_steps: 1
|
|
||||||
xformers_attention:
|
|
||||||
flash_attention: true
|
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
|
||||||
weight_decay: 0.0
|
|
||||||
debug:
|
|
||||||
deepspeed:
|
|
||||||
fsdp:
|
|
||||||
fsdp_config:
|
|
||||||
special_tokens:
|
|
||||||
@@ -1,5 +1,7 @@
 base_model: google/gemma-3-1b-it
+# optionally might have model_type or tokenizer_type
+model_type: AutoModelForCausalLM
+tokenizer_type: AutoTokenizer
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name

@@ -24,15 +26,10 @@ datasets:
 val_set_size: 0.0
 output_dir: ./outputs/out

-# Freeze vision tower
-unfrozen_parameters:
-  - ^model\.language_model\..*
-  - ^lm_head\..*
-
 adapter: qlora
 lora_r: 32
 lora_alpha: 16
-lora_dropout: 0
+lora_dropout: 0.05
 lora_target_linear: true

 sequence_len: 2048
@@ -1,5 +1,7 @@
 base_model: google/gemma-3-270m-it
+# optionally might have model_type or tokenizer_type
+model_type: AutoModelForCausalLM
+tokenizer_type: AutoTokenizer
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name

@@ -24,15 +26,10 @@ datasets:
 val_set_size: 0.0
 output_dir: ./outputs/out

-# Freeze vision tower
-unfrozen_parameters:
-  - ^model\.language_model\..*
-  - ^lm_head\..*
-
 adapter: qlora
 lora_r: 32
 lora_alpha: 16
-lora_dropout: 0
+lora_dropout: 0.05
 lora_target_linear: true

 sequence_len: 2048
@@ -20,11 +20,6 @@ dataset_prepared_path: last_run_prepared
 val_set_size: 0.01
 output_dir: ./outputs/out

-# Freeze vision tower
-unfrozen_parameters:
-  - ^model\.language_model\..*
-  - ^lm_head\..*
-
 adapter: qlora
 lora_model_dir:

@@ -34,8 +29,8 @@ sample_packing: true

 lora_r: 32
 lora_alpha: 16
-lora_dropout: 0
-lora_target_linear: true
+lora_dropout: 0.05
+lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'

 wandb_project:
 wandb_entity:
@@ -31,7 +31,7 @@ pad_to_sequence_len: false

 lora_r: 32
 lora_alpha: 16
-lora_dropout: 0
+lora_dropout: 0.05
 lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'

 wandb_project:
@@ -10,17 +10,22 @@ Gemma-3n is a family of multimodal models from Google found on [HuggingFace](htt

 ```bash
 # Ensure you have Pytorch installed (Pytorch 2.6.0 min)
-pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
-pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
+# Option A: manage dependencies in your project
+uv add 'axolotl>=0.12.0'
+uv pip install flash-attn --no-build-isolation
+
+# Option B: quick install
+uv pip install 'axolotl>=0.12.0'
+uv pip install flash-attn --no-build-isolation
 ```

 2. In addition to Axolotl's requirements, Gemma-3n requires:

 ```bash
-pip3 install timm==1.0.17
+uv pip install timm==1.0.17

 # for loading audio data
-pip3 install librosa==0.11.0
+uv pip install librosa==0.11.0
 ```

 3. Download sample dataset files
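Since `librosa` is only needed for loading audio data, a minimal sketch of what that step looks like may help; the file name and the 16 kHz target rate below are illustrative assumptions, not taken from the example configs:

```python
import librosa

# Decode an audio clip to a mono float32 waveform; sr=16000 resamples on load.
waveform, sample_rate = librosa.load("sample_clip.wav", sr=16000, mono=True)
print(waveform.shape, sample_rate)
```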
@@ -1,72 +0,0 @@
# Finetune Z.ai's GLM-4.5-Air with Axolotl

[GLM-4.5-Air](https://huggingface.co/zai-org/GLM-4.5-Air) is a MoE model by Z.ai.

This guide shows how to fine-tune it with Axolotl.

## Getting started

1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).

2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage.

3. Run the finetuning example:

```bash
# QLoRA (1x80GB @ ~63.4GiB/GPU)
axolotl train examples/glm45/glm-45-air-qlora.yaml
```

### Dataset

In addition to the standard OpenAI Messages format, GLM-4.5 supports an extra parameter for thinking in the assistant section.

```json
{
    "role": "assistant",
    "reasoning_content": "...", // or have <think>...</think> in `content`
    "content": "..."
}
```

Make sure you set the below extra attributes if needed:

```yaml
datasets:
  - path: ...
    type: chat_template
    message_property_mappings:
      role: role
      content: content

      # tool_calls: tool_calls # uncomment if using tools
      # reasoning_content: reasoning_content # uncomment if have reasoning

# Uncomment if training on tool role (you would rarely if ever need this)
# eot_tokens:
#   - <|observation|>
```

### Tips

- The role name for tools in this template is `tool`.
- You will see the following Axolotl warning, which is expected as the template does not use EOS:
```
EOS token '<|endoftext|>' not found in chat_template. Please check if your template/EOS token is correct.
```
- You can run a full finetuning by removing `adapter: qlora`, `load_in_4bit: true`, and `quantize_moe_experts: true` from the config.
- **LoRA kernels**: Incompatible with this model. Must be explicitly disabled (`lora_*_kernel: false`).
- Read more on how to load your own dataset in the [dataset loading docs](https://docs.axolotl.ai/docs/dataset_loading.html).

## Optimization Guides

Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html).

## Related Resources

- [GLM-4.5-Air on HuggingFace](https://huggingface.co/zai-org/GLM-4.5-Air)
- [GLM-4.5 Blog](https://z.ai/blog/glm-4.5)
- [Axolotl Docs](https://docs.axolotl.ai)
- [Axolotl Website](https://axolotl.ai)
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
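To make the assistant-side `reasoning_content` field described above concrete, here is a hypothetical chat sample in OpenAI Messages format; the conversation itself is made up for illustration and only the field names follow the mapping shown above:

```json
{
  "messages": [
    {"role": "user", "content": "What is 17 * 4?"},
    {
      "role": "assistant",
      "reasoning_content": "17 * 4 = 17 * 2 * 2 = 34 * 2 = 68.",
      "content": "17 * 4 = 68."
    }
  ]
}
```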
@@ -1,64 +0,0 @@
|
|||||||
base_model: zai-org/GLM-4.5-Air
|
|
||||||
|
|
||||||
# Automatically upload checkpoint and final model to HF
|
|
||||||
# hub_model_id: username/custom_model_name
|
|
||||||
|
|
||||||
plugins:
|
|
||||||
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
|
|
||||||
|
|
||||||
load_in_8bit: false
|
|
||||||
load_in_4bit: true
|
|
||||||
|
|
||||||
quantize_moe_experts: true # important
|
|
||||||
|
|
||||||
datasets:
|
|
||||||
- path: fozziethebeat/alpaca_messages_2k_test
|
|
||||||
type: chat_template
|
|
||||||
|
|
||||||
dataset_prepared_path: last_run_prepared
|
|
||||||
val_set_size: 0.1
|
|
||||||
output_dir: ./outputs/lora-out
|
|
||||||
|
|
||||||
adapter: qlora
|
|
||||||
lora_model_dir:
|
|
||||||
|
|
||||||
sequence_len: 2048
|
|
||||||
sample_packing: true
|
|
||||||
|
|
||||||
lora_r: 16
|
|
||||||
lora_alpha: 8
|
|
||||||
lora_dropout: 0
|
|
||||||
lora_target_modules:
|
|
||||||
- q_proj
|
|
||||||
- v_proj
|
|
||||||
- k_proj
|
|
||||||
- o_proj
|
|
||||||
|
|
||||||
# lora_target_parameters:
|
|
||||||
# - mlp.experts.gate_up_proj
|
|
||||||
# - mlp.experts.down_proj
|
|
||||||
|
|
||||||
lora_mlp_kernel: false
|
|
||||||
lora_qkv_kernel: false
|
|
||||||
lora_o_kernel: false
|
|
||||||
|
|
||||||
gradient_accumulation_steps: 2
|
|
||||||
micro_batch_size: 2
|
|
||||||
num_epochs: 1
|
|
||||||
optimizer: adamw_bnb_8bit
|
|
||||||
lr_scheduler: cosine
|
|
||||||
learning_rate: 0.0002
|
|
||||||
|
|
||||||
bf16: auto
|
|
||||||
tf32: false
|
|
||||||
|
|
||||||
gradient_checkpointing: true
|
|
||||||
resume_from_checkpoint:
|
|
||||||
logging_steps: 1
|
|
||||||
flash_attention: true
|
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
|
||||||
evals_per_epoch: 1
|
|
||||||
saves_per_epoch: 1
|
|
||||||
|
|
||||||
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
|
||||||
@@ -1,44 +0,0 @@
# Finetune GLM-4.6V with Axolotl

GLM-4.6V is a family of vision-language models from ZhipuAI found on [HuggingFace](https://huggingface.co/zai-org/GLM-4.6V). This guide shows how to fine-tune it with Axolotl for vision-language tasks.

## Getting started

1. Install Axolotl from source following the [installation guide](https://docs.axolotl.ai/docs/installation.html#sec-edge-build).

2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage.

3. Run the fine-tuning for glm-4-6v-flash (9B):

```bash
axolotl train examples/glm46v/glm-4-6v-flash-qlora.yaml
```

Let us know how it goes. Happy finetuning! 🚀

## Tips

- Vision datasets should follow the format described in the [multimodal docs](https://docs.axolotl.ai/docs/multimodal.html#dataset-format); a rough sketch of that shape follows this file.
- You can run a **full finetuning** by removing `adapter: qlora` and `load_in_4bit: true` from the config.
- Read more on how to load your own dataset in the [dataset loading docs](https://docs.axolotl.ai/docs/dataset_loading.html).

## Supported Models

- **GLM-4.6V**: Full vision-language model (`zai-org/GLM-4.6V`)
- **GLM-4.6V-Flash**: Faster variant (`zai-org/GLM-4.6V-Flash`)

## Optimization Guides

Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html).

## Related Resources

- [ZhipuAI GLM-4.6V](https://huggingface.co/zai-org/GLM-4.6V)
- [Axolotl Docs](https://docs.axolotl.ai)
- [Axolotl Website](https://axolotl.ai)
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
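As referenced in the Tips above, multimodal samples pair each image with a chat turn. The exact keys are defined in the multimodal docs linked there; the sketch below only illustrates the general messages-with-image-content shape and is not copied from those docs:

```json
{
  "messages": [
    {
      "role": "user",
      "content": [
        {"type": "image"},
        {"type": "text", "text": "What is shown in this photo?"}
      ]
    },
    {
      "role": "assistant",
      "content": [{"type": "text", "text": "An axolotl resting on a rock."}]
    }
  ],
  "images": ["photo_0001.jpg"]
}
```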
@@ -1,53 +0,0 @@
|
|||||||
base_model: zai-org/GLM-4.6V-Flash
|
|
||||||
trust_remote_code: true
|
|
||||||
|
|
||||||
processor_type: AutoProcessor
|
|
||||||
load_in_4bit: true
|
|
||||||
|
|
||||||
# these 3 lines are needed for now to handle vision chat templates w images
|
|
||||||
skip_prepare_dataset: true
|
|
||||||
remove_unused_columns: false
|
|
||||||
sample_packing: false
|
|
||||||
ddp_find_unused_parameters: true
|
|
||||||
|
|
||||||
output_dir: ./outputs/glm-4-6v-flash-qlora
|
|
||||||
datasets:
|
|
||||||
- path: HuggingFaceH4/llava-instruct-mix-vsft
|
|
||||||
type: chat_template
|
|
||||||
split: train[:1%]
|
|
||||||
|
|
||||||
adapter: qlora
|
|
||||||
lora_r: 16
|
|
||||||
lora_alpha: 32
|
|
||||||
lora_dropout: 0.05
|
|
||||||
lora_target_modules:
|
|
||||||
- gate_proj
|
|
||||||
- down_proj
|
|
||||||
- up_proj
|
|
||||||
- q_proj
|
|
||||||
- v_proj
|
|
||||||
- k_proj
|
|
||||||
- o_proj
|
|
||||||
|
|
||||||
sequence_len: 2048
|
|
||||||
|
|
||||||
gradient_accumulation_steps: 4
|
|
||||||
micro_batch_size: 1
|
|
||||||
num_epochs: 1
|
|
||||||
optimizer: adamw_8bit
|
|
||||||
lr_scheduler: cosine
|
|
||||||
learning_rate: 0.0002
|
|
||||||
|
|
||||||
bf16: auto
|
|
||||||
tf32: false
|
|
||||||
|
|
||||||
gradient_checkpointing: true
|
|
||||||
gradient_checkpointing_kwargs:
|
|
||||||
use_reentrant: false
|
|
||||||
logging_steps: 1
|
|
||||||
sdp_attention: true
|
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
|
||||||
evals_per_epoch: 0
|
|
||||||
saves_per_epoch: 1
|
|
||||||
weight_decay: 0.0
|
|
||||||
@@ -1,50 +0,0 @@
|
|||||||
base_model: zai-org/GLM-4.6V-Flash
|
|
||||||
trust_remote_code: true
|
|
||||||
|
|
||||||
processor_type: AutoProcessor
|
|
||||||
load_in_4bit: true
|
|
||||||
|
|
||||||
# these 3 lines are needed for now to handle vision chat templates w images
|
|
||||||
skip_prepare_dataset: true
|
|
||||||
remove_unused_columns: false
|
|
||||||
sample_packing: false
|
|
||||||
|
|
||||||
output_dir: ./outputs/glm-4-6v-flash-qlora
|
|
||||||
datasets:
|
|
||||||
- path: HuggingFaceH4/llava-instruct-mix-vsft
|
|
||||||
type: chat_template
|
|
||||||
split: train[:1%]
|
|
||||||
|
|
||||||
adapter: qlora
|
|
||||||
lora_r: 16
|
|
||||||
lora_alpha: 32
|
|
||||||
lora_dropout: 0.05
|
|
||||||
lora_target_modules:
|
|
||||||
- gate_proj
|
|
||||||
- down_proj
|
|
||||||
- up_proj
|
|
||||||
- q_proj
|
|
||||||
- v_proj
|
|
||||||
- k_proj
|
|
||||||
- o_proj
|
|
||||||
|
|
||||||
sequence_len: 2048
|
|
||||||
|
|
||||||
gradient_accumulation_steps: 4
|
|
||||||
micro_batch_size: 1
|
|
||||||
num_epochs: 1
|
|
||||||
optimizer: adamw_8bit
|
|
||||||
lr_scheduler: cosine
|
|
||||||
learning_rate: 0.0002
|
|
||||||
|
|
||||||
bf16: auto
|
|
||||||
tf32: false
|
|
||||||
|
|
||||||
gradient_checkpointing: true
|
|
||||||
logging_steps: 1
|
|
||||||
sdp_attention: true
|
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
|
||||||
evals_per_epoch: 0
|
|
||||||
saves_per_epoch: 1
|
|
||||||
weight_decay: 0.0
|
|
||||||
@@ -1,65 +0,0 @@
# Finetune Z.ai's GLM-4.7-Flash with Axolotl

[GLM-4.7-Flash](https://huggingface.co/zai-org/GLM-4.7-Flash) is a 30B-A3B MoE model by Z.ai.

This guide shows how to fine-tune it with Axolotl.

## Getting started

1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).

2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage.

3. Run the finetuning example:

```bash
# QLoRA
# - no target experts (1x48GB @ ~24GiB/GPU)
# - target experts (1x48GB @ ~34GiB/GPU)
axolotl train examples/glm47-flash/qlora.yaml

# QLoRA FSDP2 no target experts (2x48GB @ ~29GiB/GPU)
axolotl train examples/glm47-flash/qlora_fsdp.yaml
```

```bash
# LoRA
# - no target experts (1x48GB @ ~35GiB/GPU)
# - target experts (1x48GB @ OOM. Projected ~45-50GiB/GPU)
axolotl train examples/glm47-flash/lora.yaml

# LoRA FSDP2 no target experts (2x48GB @ ~43GiB/GPU)
axolotl train examples/glm47-flash/lora_fsdp.yaml
```

### MoE Expert Quantization & Expert LoRA

This model quantizes expert weights on load. To learn about expert quantization, expert LoRA targeting, and related limitations, see the [MoE Expert Quantization](https://docs.axolotl.ai/docs/expert_quantization.html) docs. A minimal config sketch follows this file.

## Limitations

- **lora_target_linear**: Incompatible with this model.
- **LoRA kernels**: Incompatible with this model due to non-standard attention projections (DSA). Must be explicitly disabled (`lora_*_kernel: false`).

### Tips

- For inference, the official Z.ai team recommends these default settings (most tasks):
  - `temperature: 1.0`
  - `top_p: 0.95`
  - `max_new_tokens: 131072`
- You can run a full finetuning by removing `adapter: qlora`, `load_in_4bit: true`, and `quantize_moe_experts: true` from the config. This is heavy, so we have not tested it.
- Read more on how to load your own dataset at the [docs](https://docs.axolotl.ai/docs/dataset_loading.html).

## Optimization Guides

Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html).

## Related Resources

- [GLM-4.7-Flash on HuggingFace](https://huggingface.co/zai-org/GLM-4.7-Flash)
- [GLM-4.7 Blog](https://z.ai/blog/glm-4.7)
- [Axolotl Docs](https://docs.axolotl.ai)
- [Axolotl Website](https://axolotl.ai)
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
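As mentioned under "MoE Expert Quantization & Expert LoRA" above, expert quantization and expert LoRA targeting are driven entirely from the training config. A minimal sketch of the relevant keys, assembled from the example configs removed in this change (the r/alpha values are simply the ones those examples use, not a recommendation):

```yaml
load_in_4bit: true
quantize_moe_experts: true   # quantize expert weights on load

adapter: qlora
lora_r: 32
lora_alpha: 16
lora_target_modules:
  - q_proj
  - k_proj
  - v_proj
  - o_proj

# Optionally also place LoRA adapters on the MoE expert weights:
# lora_target_parameters:
#   - mlp.experts.gate_up_proj
#   - mlp.experts.down_proj

# LoRA kernels are incompatible with this model's DSA attention:
lora_mlp_kernel: false
lora_qkv_kernel: false
lora_o_kernel: false
```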
@@ -1,65 +0,0 @@
base_model: zai-org/GLM-4.7-Flash

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

load_in_8bit: true
quantize_moe_experts: true

datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template

dataset_prepared_path: last_run_prepared
val_set_size: 0.1
output_dir: ./outputs/glm4.7-flash-lora-8bit-out

adapter: lora
lora_model_dir:

sequence_len: 2048
sample_packing: true

lora_r: 32
lora_alpha: 16
lora_dropout: 0
lora_target_modules:
  - q_proj
  - v_proj
  - k_proj
  - o_proj

# Uncomment to also target MoE expert weights:
# lora_target_parameters:
#   - mlp.experts.gate_up_proj
#   - mlp.experts.down_proj

# LoRA kernels incompatible with DSA attention
lora_mlp_kernel: false
lora_qkv_kernel: false
lora_o_kernel: false

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_torch_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
@@ -1,75 +0,0 @@
base_model: zai-org/GLM-4.7-Flash

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

load_in_8bit: true
quantize_moe_experts: true

datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template

dataset_prepared_path: last_run_prepared
val_set_size: 0.1
output_dir: ./outputs/glm4.7-flash-lora-8bit-fsdp-out

adapter: lora
lora_model_dir:

sequence_len: 2048
sample_packing: true

lora_r: 32
lora_alpha: 16
lora_dropout: 0
lora_target_modules:
  - q_proj
  - v_proj
  - k_proj
  - o_proj

# Uncomment to also target MoE expert weights:
# lora_target_parameters:
#   - mlp.experts.gate_up_proj
#   - mlp.experts.down_proj

# LoRA kernels incompatible with DSA attention
lora_mlp_kernel: false
lora_qkv_kernel: false
lora_o_kernel: false

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_torch_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1

fsdp_config:
  fsdp_version: 2
  offload_params: false
  cpu_ram_efficient_loading: false
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: Glm4MoeLiteDecoderLayer
  state_dict_type: FULL_STATE_DICT
  sharding_strategy: FULL_SHARD
  reshard_after_forward: true
  activation_checkpointing: true
@@ -1,65 +0,0 @@
base_model: zai-org/GLM-4.7-Flash

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

load_in_4bit: true
quantize_moe_experts: true

datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template

dataset_prepared_path: last_run_prepared
val_set_size: 0.1
output_dir: ./outputs/glm4.7-flash-qlora-out

adapter: qlora
lora_model_dir:

sequence_len: 2048
sample_packing: true

lora_r: 32
lora_alpha: 16
lora_dropout: 0
lora_target_modules:
  - q_proj
  - v_proj
  - k_proj
  - o_proj

# Uncomment to also target MoE expert weights:
# lora_target_parameters:
#   - mlp.experts.gate_up_proj
#   - mlp.experts.down_proj

# LoRA kernels incompatible with DSA attention
lora_mlp_kernel: false
lora_qkv_kernel: false
lora_o_kernel: false

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_torch_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
@@ -1,75 +0,0 @@
base_model: zai-org/GLM-4.7-Flash

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

load_in_4bit: true
quantize_moe_experts: true

datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template

dataset_prepared_path: last_run_prepared
val_set_size: 0.1
output_dir: ./outputs/glm4.7-flash-qlora-fsdp-out

adapter: qlora
lora_model_dir:

sequence_len: 2048
sample_packing: true

lora_r: 32
lora_alpha: 16
lora_dropout: 0
lora_target_modules:
  - q_proj
  - v_proj
  - k_proj
  - o_proj

# Uncomment to also target MoE expert weights:
# lora_target_parameters:
#   - mlp.experts.gate_up_proj
#   - mlp.experts.down_proj

# LoRA kernels incompatible with DSA attention
lora_mlp_kernel: false
lora_qkv_kernel: false
lora_o_kernel: false

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_torch_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1

fsdp_config:
  fsdp_version: 2
  offload_params: false
  cpu_ram_efficient_loading: false
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: Glm4MoeLiteDecoderLayer
  state_dict_type: FULL_STATE_DICT
  sharding_strategy: FULL_SHARD
  reshard_after_forward: true
  activation_checkpointing: true
@@ -2,8 +2,6 @@
 [GPT-OSS](https://huggingface.co/collections/openai/gpt-oss-68911959590a1634ba11c7a4) are a family of open-weight MoE models trained by OpenAI, released in August 2025. There are two variants: 20B and 120B.
 
-In October 2025, OpenAI released safeguard models built upon GPT-OSS called [GPT-OSS-Safeguard](https://huggingface.co/collections/openai/gpt-oss-safeguard). They use the same architecture, so the same examples below can be re-used.
-
 This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking.
 
 ## Getting started
@@ -14,8 +12,13 @@ This guide shows how to fine-tune it with Axolotl with multi-turn conversations
 ```bash
 # Ensure you have Pytorch installed (Pytorch 2.6.0 min)
-pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
-pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
+# Option A: manage dependencies in your project
+uv add 'axolotl>=0.12.0'
+uv pip install flash-attn --no-build-isolation
+
+# Option B: quick install
+uv pip install 'axolotl>=0.12.0'
+uv pip install flash-attn --no-build-isolation
 ```
 
 2. Choose one of the following configs below for training the 20B model. (for 120B, see [below](#training-120b))
@@ -66,16 +69,6 @@ axolotl merge-sharded-fsdp-weights examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offlo
 mv ./outputs/gpt-oss-out/merged/* ./outputs/gpt-oss-out/
 ```
 
-### How to set reasoning_effort in template?
-
-The harmony template has a feature to set the `reasoning_effort` during prompt building. The default is `medium`. If you would like to adjust this, you can add the following to your config:
-
-```yaml
-chat_template_kwargs:
-  reasoning_effort: "high" # low | medium | high
-```
-
-Currently, this applies globally. There is no method to apply per sample yet. If you are interested in adding this, please feel free to create an Issue to discuss.
-
 ### Inferencing your fine-tuned model
 
@@ -87,7 +80,7 @@ for more information about using a special vllm-openai docker image for inferenc
 Optionally, vLLM can be installed from nightly:
 
 ```bash
-pip install --no-build-isolation --pre -U vllm --extra-index-url https://wheels.vllm.ai/nightly
+uv pip install --no-build-isolation --pre -U vllm --extra-index-url https://wheels.vllm.ai/nightly
 ```
 and the vLLM server can be started with the following command (modify `--tensor-parallel-size 8` to match your environment):
 ```bash
@@ -32,10 +32,6 @@ wandb_watch:
 wandb_name:
 wandb_log_model:
 
-trackio_project_name:
-trackio_run_name:
-trackio_space_id:
-
 gradient_accumulation_steps: 2
 micro_batch_size: 1
 num_epochs: 1
@@ -28,10 +28,6 @@ wandb_watch:
 wandb_name:
 wandb_log_model:
 
-trackio_project_name:
-trackio_run_name:
-trackio_space_id:
-
 gradient_accumulation_steps: 2
 micro_batch_size: 1
 num_epochs: 1
@@ -29,10 +29,6 @@ wandb_watch:
 wandb_name:
 wandb_log_model:
 
-trackio_project_name:
-trackio_run_name:
-trackio_space_id:
-
 gradient_accumulation_steps: 2
 micro_batch_size: 1
 num_epochs: 1
@@ -28,10 +28,6 @@ wandb_watch:
 wandb_name:
 wandb_log_model:
 
-trackio_project_name:
-trackio_run_name:
-trackio_space_id:
-
 gradient_accumulation_steps: 2
 micro_batch_size: 1
 num_epochs: 1
@@ -41,10 +41,6 @@ wandb_watch:
 wandb_name:
 wandb_log_model:
 
-trackio_project_name:
-trackio_run_name:
-trackio_space_id:
-
 gradient_accumulation_steps: 8
 micro_batch_size: 1
 num_epochs: 1
@@ -1,71 +0,0 @@
base_model: openai/gpt-oss-safeguard-20b
use_kernels: true
model_quantization_config: Mxfp4Config
model_quantization_config_kwargs:
  dequantize: true

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

experimental_skip_move_to_device: true # prevent OOM by not putting model to GPU before sharding

datasets:
  - path: HuggingFaceH4/Multilingual-Thinking
    type: chat_template
    field_thinking: thinking
    template_thinking_key: thinking

dataset_prepared_path: last_run_prepared
val_set_size: 0
output_dir: ./outputs/gpt-oss-safeguard-out/

sequence_len: 4096
sample_packing: true

adapter: lora
lora_r: 8
lora_alpha: 16
lora_dropout: 0.0 # dropout not supported when using LoRA over expert parameters
lora_target_linear: true

# TODO: not supported for now, see peft#2710
#lora_target_parameters: # target the experts in the last two layers
#  - "22._checkpoint_wrapped_module.mlp.experts.gate_up_proj"
#  - "22._checkpoint_wrapped_module.mlp.experts.down_proj"
#  - "23._checkpoint_wrapped_module.mlp.experts.gate_up_proj"
#  - "23._checkpoint_wrapped_module.mlp.experts.down_proj"

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

trackio_project_name:
trackio_run_name:
trackio_space_id:

gradient_accumulation_steps: 8
micro_batch_size: 1
num_epochs: 1

optimizer: adamw_torch_8bit
lr_scheduler: constant_with_warmup
learning_rate: 2e-4

bf16: true
tf32: true

flash_attention: true
attn_implementation: kernels-community/vllm-flash-attn3 # this is not needed if using flash_attn >= 2.8.3

gradient_checkpointing: true
activation_offloading: true

logging_steps: 1
saves_per_epoch: 1
warmup_ratio: 0.1

special_tokens:
eot_tokens:
  - "<|end|>"
@@ -1,65 +0,0 @@
# Finetune IBM's Granite 4.0 with Axolotl

[Granite 4.0](https://huggingface.co/collections/ibm-granite/granite-40-language-models) are a family of open-source models trained by IBM Research.

This guide shows how to fine-tune them with Axolotl on multi-turn conversations with proper masking.

## Getting started

1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). You need to install from main, as Granite 4 support is only available on nightly, or use our latest [Docker images](https://docs.axolotl.ai/docs/docker.html).

Here is an example of how to install from main with pip:

```bash
# Ensure you have Pytorch installed (Pytorch 2.7.1 min)
git clone https://github.com/axolotl-ai-cloud/axolotl.git
cd axolotl

pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
pip3 install --no-build-isolation -e '.[flash-attn]'

# Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
python scripts/cutcrossentropy_install.py | sh
```

2. Run the finetuning example:

```bash
axolotl train examples/granite4/granite-4.0-tiny-fft.yaml
```

This config uses about 40.8GiB VRAM.

Let us know how it goes. Happy finetuning! 🚀

### TIPS

- Read more on how to load your own dataset in the [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template); a minimal example is sketched below.
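
For example, a minimal sketch of pointing the config at your own messages-format data (the local path and file layout here are hypothetical):

```yaml
# Hypothetical local JSONL file, one conversation per line:
# {"messages": [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]}
datasets:
  - path: ./data/my_conversations.jsonl
    type: chat_template
```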
### Limitations

Adapter finetuning does not work at the moment. It errors with:

```bash
RuntimeError: mat1 and mat2 shapes cannot be multiplied (4096x3072 and 1x1179648)
```

In addition, even if adapter training did work, `lora_target_linear: true` would not work due to:

```bash
ValueError: Target module GraniteMoeHybridParallelExperts() is not supported.
```

## Optimization Guides

- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
- [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html)

## Related Resources

- [Granite Docs](https://www.ibm.com/granite/docs/models/granite)
- [Axolotl Docs](https://docs.axolotl.ai)
- [Axolotl Website](https://axolotl.ai)
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
@@ -1,45 +0,0 @@
base_model: ibm-granite/granite-4.0-tiny-preview

# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template

dataset_prepared_path: last_run_prepared
val_set_size: 0.1
output_dir: ./outputs/model-out

sequence_len: 2048
sample_packing: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1

# save_first_step: true # uncomment this to validate checkpoint saving works with your config
Some files were not shown because too many files have changed in this diff.