Compare commits
9 Commits
activeblue
...
attn-patch
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
587dbbfc02 | ||
|
|
6c306d9186 | ||
|
|
7565fb9d63 | ||
|
|
a6b737d5ff | ||
|
|
cf95b57c0a | ||
|
|
13f7efaf74 | ||
|
|
d773384f74 | ||
|
|
985dcbc051 | ||
|
|
5d0b27e5a1 |
@@ -1,41 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
_axolotl_completions() {
|
||||
local cur prev
|
||||
COMPREPLY=()
|
||||
cur="${COMP_WORDS[COMP_CWORD]}"
|
||||
prev="${COMP_WORDS[COMP_CWORD-1]}"
|
||||
|
||||
# If we're completing the first argument (the command)
|
||||
if [[ $COMP_CWORD -eq 1 ]]; then
|
||||
mapfile -t COMPREPLY < <(compgen -W "delinearize-llama4 fetch lm-eval merge-sharded-fsdp-weights quantize vllm-serve evaluate inference merge-lora preprocess train" -- "$cur")
|
||||
return 0
|
||||
fi
|
||||
|
||||
# Commands that should complete with directories and YAML files
|
||||
local -a yaml_commands=("merge-sharded-fsdp-weights" "quantize" "vllm-serve" "evaluate" "inference" "merge-lora" "preprocess" "train")
|
||||
|
||||
# Check if previous word is in our list
|
||||
if [[ " ${yaml_commands[*]} " =~ (^|[[:space:]])$prev($|[[:space:]]) ]]; then
|
||||
# Use filename completion which handles directories properly
|
||||
compopt -o filenames
|
||||
mapfile -t COMPREPLY < <(compgen -f -- "$cur")
|
||||
|
||||
# Filter to only include directories and YAML files
|
||||
local -a filtered=()
|
||||
for item in "${COMPREPLY[@]}"; do
|
||||
if [[ -d "$item" ]] || [[ "$item" == *.yaml ]] || [[ "$item" == *.yml ]]; then
|
||||
filtered+=("$item")
|
||||
fi
|
||||
done
|
||||
COMPREPLY=("${filtered[@]}")
|
||||
|
||||
return 0
|
||||
fi
|
||||
|
||||
# Default: no completion
|
||||
return 0
|
||||
}
|
||||
|
||||
# Remove the -o nospace option - let filenames handle it
|
||||
complete -F _axolotl_completions axolotl
|
||||
2
.bandit
2
.bandit
@@ -1,3 +1,3 @@
|
||||
[bandit]
|
||||
exclude = tests
|
||||
skips = B101,B615,B102,B110
|
||||
skips = B101
|
||||
|
||||
@@ -1,17 +0,0 @@
|
||||
# yaml-language-server: $schema=https://coderabbit.ai/integrations/schema.v2.json
|
||||
language: "en-US"
|
||||
early_access: false
|
||||
reviews:
|
||||
profile: "chill"
|
||||
request_changes_workflow: false
|
||||
high_level_summary: true
|
||||
review_status: true
|
||||
collapse_walkthrough: true
|
||||
poem: false
|
||||
sequence_diagrams: false
|
||||
auto_review:
|
||||
enabled: true
|
||||
drafts: false
|
||||
auto_incremental_review: false
|
||||
chat:
|
||||
auto_reply: true
|
||||
14
.coveragerc
14
.coveragerc
@@ -1,14 +0,0 @@
|
||||
[run]
|
||||
source = axolotl
|
||||
omit =
|
||||
*/tests/*
|
||||
setup.py
|
||||
|
||||
[report]
|
||||
exclude_lines =
|
||||
pragma: no cover
|
||||
def __repr__
|
||||
raise NotImplementedError
|
||||
if __name__ == .__main__.:
|
||||
pass
|
||||
raise ImportError
|
||||
5
.flake8
Normal file
5
.flake8
Normal file
@@ -0,0 +1,5 @@
|
||||
[flake8]
|
||||
max-line-length = 88
|
||||
|
||||
select = C,E,F,W,B,B950
|
||||
extend-ignore = E203, E501, W503
|
||||
32
.github/CONTRIBUTING.md
vendored
32
.github/CONTRIBUTING.md
vendored
@@ -15,27 +15,23 @@ First of all, thank you for your interest in contributing to axolotl! We appreci
|
||||
- [Commit Messages](#commit-messages)
|
||||
- [Additional Resources](#additional-resources)
|
||||
|
||||
## Code of Conduct
|
||||
## Code of Conductcode
|
||||
|
||||
All contributors are expected to adhere to our [Code of Conduct](CODE_OF_CONDUCT.md). Please read it before participating in the axolotl community.
|
||||
|
||||
## Getting Started
|
||||
|
||||
Bugs? Please check for open issue else create a new [Issue](https://github.com/axolotl-ai-cloud/axolotl/issues/new).
|
||||
Bugs? Please check for open issue else create a new [Issue](https://github.com/OpenAccess-AI-Collective/axolotl/issues/new).
|
||||
|
||||
PRs are **greatly welcome**!
|
||||
|
||||
1. Fork the repository and clone it to your local machine.
|
||||
2. Set up the development environment by following the instructions in the [README.md](https://github.com/axolotl-ai-cloud/axolotl/tree/main/README.md) file.
|
||||
2. Set up the development environment by following the instructions in the [README.md](https://github.com/OpenAccess-AI-Collective/axolotl/tree/main/README.md) file.
|
||||
3. Explore the codebase, run tests, and verify that everything works as expected.
|
||||
|
||||
Please run below to setup env
|
||||
```bash
|
||||
# Install axolotl + dev and test dependencies
|
||||
export UV_TORCH_BACKEND=cu128 # or cu130
|
||||
uv venv --no-project --relocatable
|
||||
source .venv/bin/activate
|
||||
uv pip install --no-build-isolation -e '.[deepspeed]' --group dev --group test
|
||||
pip3 install -r requirements-dev.txt -r requirements-tests.txt
|
||||
pre-commit install
|
||||
|
||||
# test
|
||||
@@ -46,11 +42,11 @@ pytest tests/
|
||||
|
||||
### Reporting Bugs
|
||||
|
||||
If you encounter a bug or issue while using axolotl, please open a new issue on the [GitHub Issues](https://github.com/axolotl-ai-cloud/axolotl/issues) page. Provide a clear and concise description of the problem, steps to reproduce it, and any relevant error messages or logs.
|
||||
If you encounter a bug or issue while using axolotl, please open a new issue on the [GitHub Issues](https://github.com/OpenAccess-AI-Collective/axolotl/issues) page. Provide a clear and concise description of the problem, steps to reproduce it, and any relevant error messages or logs.
|
||||
|
||||
### Suggesting Enhancements
|
||||
|
||||
We welcome ideas for improvements and new features. To suggest an enhancement, open a new issue on the [GitHub Issues](https://github.com/axolotl-ai-cloud/axolotl/issues) page. Describe the enhancement in detail, explain the use case, and outline the benefits it would bring to the project.
|
||||
We welcome ideas for improvements and new features. To suggest an enhancement, open a new issue on the [GitHub Issues](https://github.com/OpenAccess-AI-Collective/axolotl/issues) page. Describe the enhancement in detail, explain the use case, and outline the benefits it would bring to the project.
|
||||
|
||||
### Submitting Pull Requests
|
||||
|
||||
@@ -61,23 +57,11 @@ We welcome ideas for improvements and new features. To suggest an enhancement, o
|
||||
5. Push your branch to your fork on GitHub.
|
||||
6. Open a new pull request against the `main` branch of the axolotl repository. Include a clear and concise description of your changes, referencing any related issues.
|
||||
|
||||
#### Skipping CI Checks
|
||||
|
||||
You can skip certain CI checks by including specific keywords in your commit messages:
|
||||
|
||||
- `[skip ci]` or `skip ci` - Skips all CI checks for that commit
|
||||
- `[skip-e2e]` or `skip-e2e` - Skips only end-to-end tests while running other CI checks. You may also include this in the title of your PR to disable end-to-end tests for the entire PR.
|
||||
|
||||
## Style Guidelines
|
||||
|
||||
### Code Style
|
||||
|
||||
axolotl uses [Ruff](https://docs.astral.sh/ruff/) as its code style guide. Please ensure that your code follows these guidelines.
|
||||
|
||||
Use the pre-commit linter to ensure that your code is formatted consistently.
|
||||
```bash
|
||||
pre-commit run --all-files
|
||||
```
|
||||
axolotl uses [{codestyle}]({URLofCodestyle}) as its code style guide. Please ensure that your code follows these guidelines.
|
||||
|
||||
### Commit Messages
|
||||
|
||||
@@ -87,6 +71,6 @@ Write clear and concise commit messages that briefly describe the changes made i
|
||||
|
||||
- [GitHub Help](https://help.github.com/)
|
||||
- [GitHub Pull Request Documentation](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests)
|
||||
- [Ruff](https://docs.astral.sh/ruff/)
|
||||
- [{codestyle}]({URLofCodestyle})
|
||||
|
||||
Thank you once again for your interest in contributing to axolotl. We look forward to collaborating with you and creating an even better project together!
|
||||
|
||||
2
.github/FUNDING.yml
vendored
2
.github/FUNDING.yml
vendored
@@ -1,6 +1,6 @@
|
||||
# These are supported funding model platforms
|
||||
|
||||
github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
|
||||
github: OpenAccess-AI-Collective # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
|
||||
patreon: # Replace with a single Patreon username
|
||||
open_collective: # Replace with a single Open Collective username
|
||||
ko_fi: # Replace with a single Ko-fi username
|
||||
|
||||
10
.github/ISSUE_TEMPLATE/bug-report.yaml
vendored
10
.github/ISSUE_TEMPLATE/bug-report.yaml
vendored
@@ -15,7 +15,7 @@ body:
|
||||
label: "Please check that this issue hasn't been reported before."
|
||||
description: "The **Label filters** may help make your search more focussed."
|
||||
options:
|
||||
- label: "I searched previous [Bug Reports](https://github.com/axolotl-ai-cloud/axolotl/labels/bug) didn't find any similar reports."
|
||||
- label: "I searched previous [Bug Reports](https://github.com/OpenAccess-AI-Collective/axolotl/labels/bug) didn't find any similar reports."
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
@@ -53,14 +53,6 @@ body:
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: config
|
||||
attributes:
|
||||
label: Config yaml
|
||||
description: |
|
||||
Please attach the config yaml!
|
||||
render: yaml
|
||||
|
||||
- type: textarea
|
||||
id: possible-solution
|
||||
attributes:
|
||||
|
||||
2
.github/ISSUE_TEMPLATE/config.yml
vendored
2
.github/ISSUE_TEMPLATE/config.yml
vendored
@@ -1,7 +1,7 @@
|
||||
blank_issues_enabled: false
|
||||
contact_links:
|
||||
- name: Ask a question
|
||||
url: https://github.com/axolotl-ai-cloud/axolotl/discussions/categories/q-a
|
||||
url: https://github.com/OpenAccess-AI-Collective/axolotl/discussions/categories/q-a
|
||||
about: Ask questions and discuss with other community members
|
||||
- name: Discuss the Project in Discord
|
||||
url: https://discord.gg/HhrNrHJPRb
|
||||
|
||||
2
.github/ISSUE_TEMPLATE/docs.yml
vendored
2
.github/ISSUE_TEMPLATE/docs.yml
vendored
@@ -10,7 +10,7 @@ body:
|
||||
value: |
|
||||
* Ask questions in [Discord](https://discord.gg/HhrNrHJPRb).
|
||||
* Before you file an issue read the [Contributing guide](./CONTRIBUTING.md).
|
||||
* Check to make sure someone hasn't already opened a [similar issue](https://github.com/axolotl-ai-cloud/axolotl/issues).
|
||||
* Check to make sure someone hasn't already opened a [similar issue](https://github.com/OpenAccess-AI-Collective/axolotl/issues).
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: What piece of documentation is affected?
|
||||
|
||||
4
.github/ISSUE_TEMPLATE/feature-request.yaml
vendored
4
.github/ISSUE_TEMPLATE/feature-request.yaml
vendored
@@ -8,9 +8,9 @@ body:
|
||||
label: "⚠️ Please check that this feature request hasn't been suggested before."
|
||||
description: "There are two locations for previous feature requests. Please search in both. Thank you. The **Label filters** may help make your search more focussed."
|
||||
options:
|
||||
- label: "I searched previous [Ideas in Discussions](https://github.com/axolotl-ai-cloud/axolotl/discussions/categories/ideas) didn't find any similar feature requests."
|
||||
- label: "I searched previous [Ideas in Discussions](https://github.com/OpenAccess-AI-Collective/axolotl/discussions/categories/ideas) didn't find any similar feature requests."
|
||||
required: true
|
||||
- label: "I searched previous [Issues](https://github.com/axolotl-ai-cloud/axolotl/labels/enhancement) didn't find any similar feature requests."
|
||||
- label: "I searched previous [Issues](https://github.com/OpenAccess-AI-Collective/axolotl/labels/enhancement) didn't find any similar feature requests."
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
|
||||
@@ -15,18 +15,8 @@
|
||||
<!--- Include details of your testing environment, tests ran to see how -->
|
||||
<!--- your change affects other areas of the code, etc. -->
|
||||
|
||||
## AI Usage Disclaimer
|
||||
|
||||
<!--- Was AI (e.g., ChatGPT, Claude, Copilot) used to generate or assist with this PR? -->
|
||||
<!--- Please indicate: No / Yes (specify which tool and to what extent) -->
|
||||
|
||||
## Screenshots (if appropriate)
|
||||
|
||||
## Types of changes
|
||||
|
||||
<!--- What types of changes does your code introduce? Put an `x` in all the boxes that apply: -->
|
||||
|
||||
## Social Handles (Optional)
|
||||
|
||||
<!-- Thanks for submitting a bugfix or enhancement. -->
|
||||
<!-- We'd love to show our thanks to you on Twitter & Discord if you provide your handle -->
|
||||
231
.github/workflows/base.yml
vendored
231
.github/workflows/base.yml
vendored
@@ -3,240 +3,53 @@ name: ci-cd-base
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- "main"
|
||||
paths:
|
||||
- 'docker/Dockerfile-base'
|
||||
- 'docker/Dockerfile-uv-base'
|
||||
- '.github/workflows/base.yml'
|
||||
pull_request:
|
||||
paths:
|
||||
- 'docker/Dockerfile-base'
|
||||
- 'docker/Dockerfile-uv-base'
|
||||
- '.github/workflows/base.yml'
|
||||
workflow_dispatch:
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
- "main-base"
|
||||
- "dev-base"
|
||||
|
||||
jobs:
|
||||
build-base:
|
||||
if: ${{ github.repository_owner == 'axolotl-ai-cloud' && (github.event_name != 'pull_request' || !github.event.pull_request.draft) }}
|
||||
timeout-minutes: 480
|
||||
if: github.repository_owner == 'OpenAccess-AI-Collective'
|
||||
# this job needs to be run on self-hosted GPU runners...
|
||||
runs-on: ubuntu-latest-m
|
||||
env:
|
||||
HAS_DOCKERHUB_CREDS: ${{ secrets.DOCKERHUB_USERNAME != '' && secrets.DOCKERHUB_TOKEN != '' }}
|
||||
runs-on: self-hosted
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- cuda: "128"
|
||||
cuda_version: 12.8.1
|
||||
cudnn_version: ""
|
||||
python_version: "3.11"
|
||||
pytorch: 2.9.1
|
||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||
dockerfile: "Dockerfile-base"
|
||||
platforms: "linux/amd64,linux/arm64"
|
||||
- cuda: "128"
|
||||
cuda_version: 12.8.1
|
||||
cudnn_version: ""
|
||||
python_version: "3.11"
|
||||
pytorch: 2.10.0
|
||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||
dockerfile: "Dockerfile-base"
|
||||
platforms: "linux/amd64,linux/arm64"
|
||||
- cuda: "128"
|
||||
cuda_version: 12.8.1
|
||||
cudnn_version: ""
|
||||
python_version: "3.12"
|
||||
pytorch: 2.10.0
|
||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||
dockerfile: "Dockerfile-base"
|
||||
platforms: "linux/amd64,linux/arm64"
|
||||
# - cuda: "129"
|
||||
# cuda_version: 12.9.1
|
||||
# cudnn_version: ""
|
||||
# python_version: "3.12"
|
||||
# pytorch: 2.9.1
|
||||
# torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||
# dockerfile: "Dockerfile-base"
|
||||
# platforms: "linux/amd64,linux/arm64"
|
||||
- cuda: "130"
|
||||
cuda_version: 13.0.0
|
||||
cudnn_version: ""
|
||||
python_version: "3.11"
|
||||
pytorch: 2.9.1
|
||||
torch_cuda_arch_list: "9.0+PTX"
|
||||
dockerfile: "Dockerfile-base"
|
||||
platforms: "linux/amd64,linux/arm64"
|
||||
- cuda: "130"
|
||||
cuda_version: 13.0.0
|
||||
cudnn_version: ""
|
||||
python_version: "3.12"
|
||||
pytorch: 2.9.1
|
||||
torch_cuda_arch_list: "9.0+PTX"
|
||||
dockerfile: "Dockerfile-base"
|
||||
platforms: "linux/amd64,linux/arm64"
|
||||
- cuda: "130"
|
||||
cuda_version: 13.0.0
|
||||
cudnn_version: ""
|
||||
python_version: "3.12"
|
||||
pytorch: 2.10.0
|
||||
torch_cuda_arch_list: "9.0+PTX"
|
||||
dockerfile: "Dockerfile-base"
|
||||
platforms: "linux/amd64,linux/arm64"
|
||||
# - cuda: "128"
|
||||
# cuda_version: 12.8.1
|
||||
# cudnn_version: ""
|
||||
# python_version: "3.11"
|
||||
# pytorch: nightly
|
||||
# torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||
# dockerfile: "Dockerfile-base-nightly"
|
||||
# # "next" is for release candidates of pytorch
|
||||
# - cuda: "128"
|
||||
# cuda_version: 12.8.1
|
||||
# cudnn_version: ""
|
||||
# python_version: "3.11"
|
||||
# pytorch: next
|
||||
# torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||
# dockerfile: "Dockerfile-base-next"
|
||||
- cuda: "118"
|
||||
cuda_version: 11.8.0
|
||||
python_version: "3.9"
|
||||
pytorch: 2.0.1
|
||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 9.0+PTX"
|
||||
- cuda: "118"
|
||||
cuda_version: 11.8.0
|
||||
python_version: "3.10"
|
||||
pytorch: 2.0.1
|
||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 9.0+PTX"
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
uses: actions/checkout@v3
|
||||
- name: Docker metadata
|
||||
id: metadata
|
||||
uses: docker/metadata-action@v5
|
||||
uses: docker/metadata-action@v3
|
||||
with:
|
||||
images: |
|
||||
axolotlai/axolotl-base
|
||||
images: winglian/axolotl-base
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@v3
|
||||
if: ${{ github.event_name != 'pull_request' && env.HAS_DOCKERHUB_CREDS == 'true' }}
|
||||
uses: docker/login-action@v2
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
uses: docker/setup-buildx-action@v2
|
||||
- name: Build
|
||||
uses: docker/build-push-action@v5
|
||||
uses: docker/build-push-action@v4
|
||||
with:
|
||||
context: .
|
||||
file: ./docker/${{ matrix.dockerfile }}
|
||||
platforms: ${{ matrix.platforms }}
|
||||
file: ./docker/Dockerfile-base
|
||||
push: ${{ github.event_name != 'pull_request' }}
|
||||
tags: ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
|
||||
tags: ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
|
||||
labels: ${{ steps.metadata.outputs.labels }}
|
||||
build-args: |
|
||||
CUDA_VERSION=${{ matrix.cuda_version }}
|
||||
CUDNN_VERSION=${{ matrix.cudnn_version }}
|
||||
CUDA=${{ matrix.cuda }}
|
||||
PYTHON_VERSION=${{ matrix.python_version }}
|
||||
PYTORCH_VERSION=${{ matrix.pytorch }}
|
||||
TORCH_CUDA_ARCH_LIST=${{ matrix.torch_cuda_arch_list }}
|
||||
build-base-uv:
|
||||
if: ${{ github.repository_owner == 'axolotl-ai-cloud' && (github.event_name != 'pull_request' || !github.event.pull_request.draft) }}
|
||||
timeout-minutes: 480
|
||||
runs-on: ubuntu-latest-m
|
||||
env:
|
||||
HAS_DOCKERHUB_CREDS: ${{ secrets.DOCKERHUB_USERNAME != '' && secrets.DOCKERHUB_TOKEN != '' }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- cuda: "128"
|
||||
cuda_version: 12.8.1
|
||||
cudnn_version: ""
|
||||
python_version: "3.11"
|
||||
pytorch: 2.9.1
|
||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||
dockerfile: "Dockerfile-uv-base"
|
||||
platforms: "linux/amd64,linux/arm64"
|
||||
- cuda: "128"
|
||||
cuda_version: 12.8.1
|
||||
cudnn_version: ""
|
||||
python_version: "3.12"
|
||||
pytorch: 2.9.1
|
||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||
dockerfile: "Dockerfile-uv-base"
|
||||
platforms: "linux/amd64,linux/arm64"
|
||||
- cuda: "128"
|
||||
cuda_version: 12.8.1
|
||||
cudnn_version: ""
|
||||
python_version: "3.11"
|
||||
pytorch: 2.10.0
|
||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||
dockerfile: "Dockerfile-uv-base"
|
||||
platforms: "linux/amd64,linux/arm64"
|
||||
- cuda: "128"
|
||||
cuda_version: 12.8.1
|
||||
cudnn_version: ""
|
||||
python_version: "3.12"
|
||||
pytorch: 2.10.0
|
||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||
dockerfile: "Dockerfile-uv-base"
|
||||
platforms: "linux/amd64,linux/arm64"
|
||||
# - cuda: "129"
|
||||
# cuda_version: 12.9.1
|
||||
# cudnn_version: ""
|
||||
# python_version: "3.12"
|
||||
# pytorch: 2.9.1
|
||||
# torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||
# dockerfile: "Dockerfile-uv-base"
|
||||
# platforms: "linux/amd64,linux/arm64"
|
||||
- cuda: "130"
|
||||
cuda_version: 13.0.0
|
||||
cudnn_version: ""
|
||||
python_version: "3.11"
|
||||
pytorch: 2.9.1
|
||||
torch_cuda_arch_list: "9.0+PTX"
|
||||
dockerfile: "Dockerfile-uv-base"
|
||||
platforms: "linux/amd64,linux/arm64"
|
||||
- cuda: "130"
|
||||
cuda_version: 13.0.0
|
||||
cudnn_version: ""
|
||||
python_version: "3.12"
|
||||
pytorch: 2.9.1
|
||||
torch_cuda_arch_list: "9.0+PTX"
|
||||
dockerfile: "Dockerfile-uv-base"
|
||||
platforms: "linux/amd64,linux/arm64"
|
||||
- cuda: "130"
|
||||
cuda_version: 13.0.0
|
||||
cudnn_version: ""
|
||||
python_version: "3.12"
|
||||
pytorch: 2.10.0
|
||||
torch_cuda_arch_list: "9.0+PTX"
|
||||
dockerfile: "Dockerfile-uv-base"
|
||||
platforms: "linux/amd64,linux/arm64"
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
- name: Docker metadata
|
||||
id: metadata
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: |
|
||||
axolotlai/axolotl-base-uv
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@v3
|
||||
if: ${{ github.event_name != 'pull_request' && env.HAS_DOCKERHUB_CREDS == 'true' }}
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
- name: Build
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
file: ./docker/${{ matrix.dockerfile }}
|
||||
platforms: ${{ matrix.platforms }}
|
||||
push: ${{ github.event_name != 'pull_request' }}
|
||||
tags: ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
|
||||
labels: ${{ steps.metadata.outputs.labels }}
|
||||
build-args: |
|
||||
CUDA_VERSION=${{ matrix.cuda_version }}
|
||||
CUDNN_VERSION=${{ matrix.cudnn_version }}
|
||||
CUDA=${{ matrix.cuda }}
|
||||
PYTHON_VERSION=${{ matrix.python_version }}
|
||||
PYTORCH_VERSION=${{ matrix.pytorch }}
|
||||
|
||||
37
.github/workflows/docs.yml
vendored
37
.github/workflows/docs.yml
vendored
@@ -1,37 +0,0 @@
|
||||
name: Publish Docs
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
|
||||
permissions:
|
||||
contents: write
|
||||
pages: write
|
||||
|
||||
jobs:
|
||||
build-deploy:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: cleanup node
|
||||
run: |
|
||||
sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /opt/hostedtoolcache/CodeQL
|
||||
- name: Check out repository
|
||||
uses: actions/checkout@v4
|
||||
- name: Set up Quarto
|
||||
uses: quarto-dev/quarto-actions/setup@v2
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.11'
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python3 -m pip install jupyter quartodoc
|
||||
python3 -m pip install -e .
|
||||
- name: Build autodoc
|
||||
run: quartodoc build
|
||||
- name: Publish to GitHub Pages (and render)
|
||||
uses: quarto-dev/quarto-actions/publish@v2
|
||||
with:
|
||||
target: gh-pages
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
30
.github/workflows/lint.yml
vendored
30
.github/workflows/lint.yml
vendored
@@ -1,30 +0,0 @@
|
||||
name: lint
|
||||
on:
|
||||
# check on PRs, and manual triggers
|
||||
merge_group:
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened, ready_for_review]
|
||||
paths:
|
||||
- '**.py'
|
||||
- 'pyproject.toml'
|
||||
- '.github/workflows/*.yml'
|
||||
- "*.[q]md"
|
||||
- "examples/**/*.y[a]?ml"
|
||||
- ".pre-commit-config.yaml"
|
||||
workflow_dispatch:
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
pre-commit:
|
||||
name: pre-commit
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ !github.event.pull_request.draft }}
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.11"
|
||||
cache: 'pip' # caching pip dependencies
|
||||
- uses: pre-commit/action@v3.0.1
|
||||
364
.github/workflows/main.yml
vendored
364
.github/workflows/main.yml
vendored
@@ -4,365 +4,103 @@ on:
|
||||
push:
|
||||
branches:
|
||||
- "main"
|
||||
tags:
|
||||
- "v*"
|
||||
workflow_dispatch:
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
build-axolotl:
|
||||
if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
|
||||
if: github.repository_owner == 'OpenAccess-AI-Collective'
|
||||
# this job needs to be run on self-hosted GPU runners...
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.9.1
|
||||
- cuda: cu118
|
||||
cuda_version: 11.8.0
|
||||
python_version: "3.9"
|
||||
pytorch: 2.0.1
|
||||
axolotl_extras:
|
||||
platforms: "linux/amd64,linux/arm64"
|
||||
is_latest: true
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.12"
|
||||
pytorch: 2.10.0
|
||||
- cuda: cu118
|
||||
cuda_version: 11.8.0
|
||||
python_version: "3.10"
|
||||
pytorch: 2.0.1
|
||||
axolotl_extras:
|
||||
platforms: "linux/amd64,linux/arm64"
|
||||
# - cuda: 129
|
||||
# cuda_version: 12.9.1
|
||||
# python_version: "3.12"
|
||||
# pytorch: 2.9.1
|
||||
# axolotl_extras:
|
||||
# platforms: "linux/amd64,linux/arm64"
|
||||
- cuda: 130
|
||||
cuda_version: 13.0.0
|
||||
python_version: "3.11"
|
||||
pytorch: 2.9.1
|
||||
axolotl_extras:
|
||||
platforms: "linux/amd64,linux/arm64"
|
||||
- cuda: 130
|
||||
cuda_version: 13.0.0
|
||||
python_version: "3.12"
|
||||
pytorch: 2.10.0
|
||||
axolotl_extras:
|
||||
platforms: "linux/amd64,linux/arm64"
|
||||
runs-on: axolotl-gpu-runner
|
||||
- cuda: cu118
|
||||
cuda_version: 11.8.0
|
||||
python_version: "3.9"
|
||||
pytorch: 2.0.1
|
||||
axolotl_extras: gptq
|
||||
runs-on: self-hosted
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
uses: actions/checkout@v3
|
||||
- name: Docker metadata
|
||||
id: metadata
|
||||
uses: docker/metadata-action@v5
|
||||
uses: docker/metadata-action@v3
|
||||
with:
|
||||
images: |
|
||||
axolotlai/axolotl
|
||||
tags: |
|
||||
type=ref,event=branch
|
||||
type=pep440,pattern={{version}}
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
images: winglian/axolotl
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@v3
|
||||
uses: docker/login-action@v2
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
||||
# guidance for testing before pushing: https://docs.docker.com/build/ci/github-actions/test-before-push/
|
||||
- name: Build and export to Docker
|
||||
uses: docker/build-push-action@v5
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v2
|
||||
- name: Build
|
||||
uses: docker/build-push-action@v4
|
||||
with:
|
||||
context: .
|
||||
platforms: ${{ matrix.platforms }}
|
||||
build-args: |
|
||||
BASE_TAG=${{ github.ref_type == 'tag' && 'main' || github.ref_name }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
|
||||
CUDA=${{ matrix.cuda }}
|
||||
PYTORCH_VERSION=${{ matrix.pytorch }}
|
||||
AXOLOTL_ARGS=${{ matrix.axolotl_args }}
|
||||
AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}
|
||||
BASE_TAG=${{ github.ref_name }}-base-py${{ matrix.python_version }}-${{ matrix.cuda }}-${{ matrix.pytorch }}
|
||||
file: ./docker/Dockerfile
|
||||
push: ${{ github.event_name != 'pull_request' }}
|
||||
tags: |
|
||||
${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
|
||||
${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
|
||||
${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }}
|
||||
tags: ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
|
||||
labels: ${{ steps.metadata.outputs.labels }}
|
||||
|
||||
build-axolotl-uv:
|
||||
if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.9.1
|
||||
axolotl_extras:
|
||||
platforms: "linux/amd64,linux/arm64"
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.12"
|
||||
pytorch: 2.9.1
|
||||
axolotl_extras:
|
||||
platforms: "linux/amd64,linux/arm64"
|
||||
is_latest: true
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.12"
|
||||
pytorch: 2.10.0
|
||||
axolotl_extras:
|
||||
platforms: "linux/amd64,linux/arm64"
|
||||
- cuda: 130
|
||||
cuda_version: 13.0.0
|
||||
python_version: "3.11"
|
||||
pytorch: 2.9.1
|
||||
axolotl_extras:
|
||||
platforms: "linux/amd64,linux/arm64"
|
||||
- cuda: 130
|
||||
cuda_version: 13.0.0
|
||||
python_version: "3.12"
|
||||
pytorch: 2.10.0
|
||||
axolotl_extras:
|
||||
platforms: "linux/amd64,linux/arm64"
|
||||
runs-on: axolotl-gpu-runner
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
- name: Docker metadata
|
||||
id: metadata
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: |
|
||||
axolotlai/axolotl-uv
|
||||
tags: |
|
||||
type=ref,event=branch
|
||||
type=pep440,pattern={{version}}
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
||||
# guidance for testing before pushing: https://docs.docker.com/build/ci/github-actions/test-before-push/
|
||||
- name: Build and export to Docker
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
platforms: ${{ matrix.platforms }}
|
||||
build-args: |
|
||||
BASE_TAG=${{ github.ref_type == 'tag' && 'main' || github.ref_name }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
|
||||
CUDA=${{ matrix.cuda }}
|
||||
PYTORCH_VERSION=${{ matrix.pytorch }}
|
||||
AXOLOTL_ARGS=${{ matrix.axolotl_args }}
|
||||
AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}
|
||||
file: ./docker/Dockerfile-uv
|
||||
push: ${{ github.event_name != 'pull_request' }}
|
||||
tags: |
|
||||
${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
|
||||
${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
|
||||
${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }}
|
||||
labels: ${{ steps.metadata.outputs.labels }}
|
||||
|
||||
build-axolotl-cloud:
|
||||
build-axolotl-runpod:
|
||||
needs: build-axolotl
|
||||
if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
|
||||
if: github.repository_owner == 'OpenAccess-AI-Collective'
|
||||
# this job needs to be run on self-hosted GPU runners...
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.9.1
|
||||
- cuda: 118
|
||||
cuda_version: 11.8.0
|
||||
python_version: "3.9"
|
||||
pytorch: 2.0.1
|
||||
axolotl_extras:
|
||||
- cuda: 118
|
||||
cuda_version: 11.8.0
|
||||
python_version: "3.10"
|
||||
pytorch: 2.0.1
|
||||
axolotl_extras:
|
||||
is_latest: true
|
||||
platforms: "linux/amd64,linux/arm64"
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.12"
|
||||
pytorch: 2.10.0
|
||||
axolotl_extras:
|
||||
platforms: "linux/amd64,linux/arm64"
|
||||
# - cuda: 129
|
||||
# cuda_version: 12.9.1
|
||||
# python_version: "3.12"
|
||||
# pytorch: 2.9.1
|
||||
# axolotl_extras:
|
||||
# platforms: "linux/amd64,linux/arm64"
|
||||
- cuda: 130
|
||||
cuda_version: 13.0.0
|
||||
python_version: "3.11"
|
||||
pytorch: 2.9.1
|
||||
axolotl_extras:
|
||||
platforms: "linux/amd64,linux/arm64"
|
||||
- cuda: 130
|
||||
cuda_version: 13.0.0
|
||||
python_version: "3.12"
|
||||
pytorch: 2.10.0
|
||||
axolotl_extras:
|
||||
platforms: "linux/amd64,linux/arm64"
|
||||
runs-on: axolotl-gpu-runner
|
||||
- cuda: 118
|
||||
cuda_version: 11.8.0
|
||||
python_version: "3.9"
|
||||
pytorch: 2.0.1
|
||||
axolotl_extras: gptq
|
||||
runs-on: self-hosted
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
uses: actions/checkout@v3
|
||||
- name: Docker metadata
|
||||
id: metadata
|
||||
uses: docker/metadata-action@v5
|
||||
uses: docker/metadata-action@v3
|
||||
with:
|
||||
images: |
|
||||
axolotlai/axolotl-cloud
|
||||
tags: |
|
||||
type=ref,event=branch
|
||||
type=pep440,pattern={{version}}
|
||||
images: winglian/axolotl-runpod
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@v3
|
||||
uses: docker/login-action@v2
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
uses: docker/setup-buildx-action@v2
|
||||
- name: Build
|
||||
uses: docker/build-push-action@v5
|
||||
uses: docker/build-push-action@v4
|
||||
with:
|
||||
context: .
|
||||
platforms: ${{ matrix.platforms }}
|
||||
build-args: |
|
||||
BASE_TAG=${{ github.ref_type == 'tag' && 'main' || github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
|
||||
BASE_TAG=${{ github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
|
||||
CUDA=${{ matrix.cuda }}
|
||||
file: ./docker/Dockerfile-cloud
|
||||
push: ${{ github.event_name != 'pull_request' }}
|
||||
tags: |
|
||||
${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
|
||||
${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }}
|
||||
labels: ${{ steps.metadata.outputs.labels }}
|
||||
|
||||
build-axolotl-cloud-uv:
|
||||
needs: build-axolotl-uv
|
||||
if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
|
||||
# this job needs to be run on self-hosted GPU runners...
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.9.1
|
||||
axolotl_extras:
|
||||
platforms: "linux/amd64,linux/arm64"
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.12"
|
||||
pytorch: 2.9.1
|
||||
axolotl_extras:
|
||||
is_latest: true
|
||||
platforms: "linux/amd64,linux/arm64"
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.12"
|
||||
pytorch: 2.10.0
|
||||
axolotl_extras:
|
||||
platforms: "linux/amd64,linux/arm64"
|
||||
- cuda: 130
|
||||
cuda_version: 13.0.0
|
||||
python_version: "3.11"
|
||||
pytorch: 2.9.1
|
||||
axolotl_extras:
|
||||
platforms: "linux/amd64,linux/arm64"
|
||||
- cuda: 130
|
||||
cuda_version: 13.0.0
|
||||
python_version: "3.12"
|
||||
pytorch: 2.10.0
|
||||
axolotl_extras:
|
||||
platforms: "linux/amd64,linux/arm64"
|
||||
runs-on: axolotl-gpu-runner
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
- name: Docker metadata
|
||||
id: metadata
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: |
|
||||
axolotlai/axolotl-cloud-uv
|
||||
tags: |
|
||||
type=ref,event=branch
|
||||
type=pep440,pattern={{version}}
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
- name: Build
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
platforms: ${{ matrix.platforms }}
|
||||
build-args: |
|
||||
BASE_TAG=${{ github.ref_type == 'tag' && 'main' || github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
|
||||
CUDA=${{ matrix.cuda }}
|
||||
file: ./docker/Dockerfile-cloud-uv
|
||||
push: ${{ github.event_name != 'pull_request' }}
|
||||
tags: |
|
||||
${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
|
||||
${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }}
|
||||
labels: ${{ steps.metadata.outputs.labels }}
|
||||
|
||||
build-axolotl-cloud-no-tmux:
|
||||
needs: build-axolotl
|
||||
if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
|
||||
# this job needs to be run on self-hosted GPU runners...
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.9.1
|
||||
axolotl_extras:
|
||||
is_latest: true
|
||||
- cuda: 130
|
||||
cuda_version: 13.0.0
|
||||
python_version: "3.11"
|
||||
pytorch: 2.9.1
|
||||
axolotl_extras:
|
||||
is_latest:
|
||||
runs-on: axolotl-gpu-runner
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
- name: Docker metadata
|
||||
id: metadata
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: |
|
||||
axolotlai/axolotl-cloud-term
|
||||
tags: |
|
||||
type=ref,event=branch
|
||||
type=pep440,pattern={{version}}
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
- name: Build
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
platforms: linux/amd64,linux/arm64
|
||||
build-args: |
|
||||
BASE_TAG=${{ github.ref_type == 'tag' && 'main' || github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
|
||||
CUDA=${{ matrix.cuda }}
|
||||
file: ./docker/Dockerfile-cloud-no-tmux
|
||||
file: ./docker/Dockerfile-runpod
|
||||
push: ${{ github.event_name != 'pull_request' }}
|
||||
tags: |
|
||||
${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
|
||||
|
||||
80
.github/workflows/multi-gpu-e2e.yml
vendored
80
.github/workflows/multi-gpu-e2e.yml
vendored
@@ -1,80 +0,0 @@
|
||||
name: docker-multigpu-tests-biweekly
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
paths:
|
||||
- "tests/e2e/multigpu/**.py"
|
||||
- "pyproject.toml"
|
||||
- ".github/workflows/multi-gpu-e2e.yml"
|
||||
- "scripts/cutcrossentropy_install.py"
|
||||
- "src/axolotl/core/trainers/mixins/sequence_parallel.py"
|
||||
- "src/axolotl/utils/distributed.py"
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
- cron: "0 0 * * 1,4" # Runs at 00:00 UTC every monday & thursday
|
||||
|
||||
# Cancel jobs on the same ref if a new one is triggered
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}
|
||||
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
env:
|
||||
MODAL_IMAGE_BUILDER_VERSION: "2025.06"
|
||||
|
||||
jobs:
|
||||
test-axolotl-multigpu:
|
||||
if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' && (github.event_name != 'pull_request' || !github.event.pull_request.draft) }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
# - cuda: 129
|
||||
# cuda_version: 12.9.1
|
||||
# python_version: "3.12"
|
||||
# pytorch: 2.9.1
|
||||
# axolotl_extras: "fbgemm-gpu"
|
||||
# num_gpus: 2
|
||||
# dockerfile: "Dockerfile-uv.jinja"
|
||||
- cuda: 130
|
||||
cuda_version: 13.0.0
|
||||
python_version: "3.11"
|
||||
pytorch: 2.9.1
|
||||
axolotl_extras:
|
||||
# axolotl_extras: fbgemm-gpu
|
||||
num_gpus: 2
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.10.0
|
||||
axolotl_extras: "fbgemm-gpu"
|
||||
num_gpus: 2
|
||||
runs-on: [self-hosted, modal]
|
||||
timeout-minutes: 120
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
- name: Install Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.11"
|
||||
- name: Install Modal
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install modal==1.3.0.post1 jinja2
|
||||
- name: Update env vars
|
||||
run: |
|
||||
echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
|
||||
echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
|
||||
echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
|
||||
echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
|
||||
echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
|
||||
echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
|
||||
echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile-uv.jinja'}}" >> $GITHUB_ENV
|
||||
- name: Run tests job on Modal
|
||||
env:
|
||||
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
|
||||
run: |
|
||||
modal run -m cicd.multigpu
|
||||
100
.github/workflows/nightlies.yml
vendored
100
.github/workflows/nightlies.yml
vendored
@@ -1,100 +0,0 @@
|
||||
name: docker-nightlies
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
- cron: '0 0 * * *' # Runs at 00:00 UTC every day
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
build-axolotl:
|
||||
if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.9.1
|
||||
axolotl_extras:
|
||||
runs-on: axolotl-gpu-runner
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
- name: Docker metadata
|
||||
id: metadata
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: |
|
||||
axolotlai/axolotl
|
||||
tags: |
|
||||
type=raw,value={{ branch }}-{{ date 'YYYYMMDD' }}
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
||||
# guidance for testing before pushing: https://docs.docker.com/build/ci/github-actions/test-before-push/
|
||||
- name: Build and export to Docker
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
build-args: |
|
||||
BASE_TAG=${{ github.ref_name }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
|
||||
CUDA=${{ matrix.cuda }}
|
||||
PYTORCH_VERSION=${{ matrix.pytorch }}
|
||||
AXOLOTL_ARGS=${{ matrix.axolotl_args }}
|
||||
file: ./docker/Dockerfile
|
||||
push: ${{ github.event_name != 'pull_request' }}
|
||||
tags: |
|
||||
${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
|
||||
labels: ${{ steps.metadata.outputs.labels }}
|
||||
|
||||
build-axolotl-cloud:
|
||||
needs: build-axolotl
|
||||
if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
|
||||
# this job needs to be run on self-hosted GPU runners...
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.9.1
|
||||
axolotl_extras:
|
||||
runs-on: axolotl-gpu-runner
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
- name: Docker metadata
|
||||
id: metadata
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: |
|
||||
axolotlai/axolotl-cloud
|
||||
tags: |
|
||||
type=raw,value={{ branch }}-{{ date 'YYYYMMDD' }}
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
- name: Build
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
build-args: |
|
||||
BASE_TAG=${{ github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
|
||||
CUDA=${{ matrix.cuda }}
|
||||
file: ./docker/Dockerfile-cloud
|
||||
push: ${{ github.event_name != 'pull_request' }}
|
||||
tags: |
|
||||
${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
|
||||
labels: ${{ steps.metadata.outputs.labels }}
|
||||
16
.github/workflows/pre-commit.yml
vendored
Normal file
16
.github/workflows/pre-commit.yml
vendored
Normal file
@@ -0,0 +1,16 @@
|
||||
name: pre-commit
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
push:
|
||||
|
||||
jobs:
|
||||
pre-commit:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: "3.9"
|
||||
cache: 'pip' # caching pip dependencies
|
||||
- uses: pre-commit/action@v3.0.0
|
||||
42
.github/workflows/precommit-autoupdate.yml
vendored
42
.github/workflows/precommit-autoupdate.yml
vendored
@@ -1,42 +0,0 @@
|
||||
name: Pre-commit auto-update
|
||||
|
||||
on:
|
||||
schedule:
|
||||
- cron: '0 0 1 * *' # Run monthly
|
||||
workflow_dispatch: # Manual kickoff
|
||||
|
||||
permissions: {}
|
||||
|
||||
jobs:
|
||||
auto-update:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: write
|
||||
pull-requests: write
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.11'
|
||||
|
||||
- name: Update pre-commit hooks
|
||||
id: update
|
||||
run: |
|
||||
pip install pre-commit
|
||||
pre-commit autoupdate
|
||||
if [[ -n $(git status --porcelain) ]]; then
|
||||
echo "changes=true" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
- name: Create Pull Request
|
||||
if: steps.update.outputs.changes == 'true'
|
||||
uses: peter-evans/create-pull-request@v6
|
||||
with:
|
||||
token: ${{ secrets.GITHUB_TOKEN }}
|
||||
branch: update/pre-commit-hooks
|
||||
delete-branch: true
|
||||
title: "chore: update pre-commit hooks"
|
||||
commit-message: "chore: update pre-commit hooks"
|
||||
body: |
|
||||
Automated PR to update pre-commit hooks to their latest versions.
|
||||
77
.github/workflows/preview-docs.yml
vendored
77
.github/workflows/preview-docs.yml
vendored
@@ -1,77 +0,0 @@
|
||||
name: Preview
|
||||
on:
|
||||
workflow_dispatch:
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened, ready_for_review]
|
||||
|
||||
# Run the workflow only when one of these files changes
|
||||
paths:
|
||||
- '**/*.md' # any Markdown file
|
||||
- '**/*.qmd' # any Quarto file
|
||||
- '_quarto.yml'
|
||||
- docs/scripts/generate_config_docs.py
|
||||
- src/axolotl/utils/schemas/**.py
|
||||
- .github/workflows/preview-docs.yml
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
pull-requests: write
|
||||
|
||||
jobs:
|
||||
preview:
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ !github.event.pull_request.draft }}
|
||||
steps:
|
||||
- name: cleanup node
|
||||
run: |
|
||||
sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /opt/hostedtoolcache/CodeQL
|
||||
|
||||
- name: Check out repository
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
ref: ${{ github.event.pull_request.head.sha }}
|
||||
|
||||
- name: Set up Quarto
|
||||
uses: quarto-dev/quarto-actions/setup@v2
|
||||
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.11'
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python3 -m pip install jupyter quartodoc
|
||||
python3 -m pip install -e .
|
||||
|
||||
- name: Build autodoc
|
||||
run: quartodoc build
|
||||
|
||||
- name: Quarto render
|
||||
run: quarto render
|
||||
|
||||
- name: Netlify Publish
|
||||
uses: nwtgck/actions-netlify@v3.0
|
||||
if: ${{ github.event.pull_request.head.repo.full_name == github.repository }}
|
||||
id: netlify
|
||||
with:
|
||||
publish-dir: './_site'
|
||||
enable-pull-request-comment: false
|
||||
enable-github-deployment: false
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
deploy-message: "Deployed On Netlify"
|
||||
github-deployment-environment: 'preview'
|
||||
github-deployment-description: 'Preview Deployment'
|
||||
env:
|
||||
NETLIFY_AUTH_TOKEN: ${{ secrets.NETLIFY_AUTH_TOKEN }}
|
||||
NETLIFY_SITE_ID: ${{ secrets.NETLIFY_SITE_ID }}
|
||||
|
||||
- name: Update PR with preview link
|
||||
if: ${{ steps.netlify.outcome == 'success' }}
|
||||
uses: marocchino/sticky-pull-request-comment@v2
|
||||
with:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
message: |
|
||||
📖 **Documentation Preview**: ${{ steps.netlify.outputs.deploy-url }}
|
||||
|
||||
Deployed on Netlify from commit ${{ github.event.pull_request.head.sha }}
|
||||
70
.github/workflows/pypi.yml
vendored
70
.github/workflows/pypi.yml
vendored
@@ -1,70 +0,0 @@
|
||||
name: publish pypi
|
||||
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- "v*"
|
||||
workflow_dispatch:
|
||||
|
||||
permissions: {}
|
||||
|
||||
env:
|
||||
UV_SYSTEM_PYTHON: "1"
|
||||
|
||||
jobs:
|
||||
setup_release:
|
||||
name: Create Release
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: write
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Create release
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
run: gh release create "$GITHUB_REF_NAME" --generate-notes
|
||||
pypi-publish:
|
||||
name: Upload release to PyPI
|
||||
runs-on: ubuntu-latest
|
||||
needs: [setup_release]
|
||||
environment:
|
||||
name: pypi
|
||||
url: https://pypi.org/p/axolotl
|
||||
permissions:
|
||||
contents: read
|
||||
id-token: write # IMPORTANT: this permission is mandatory for trusted publishing
|
||||
steps:
|
||||
- name: Check out repository code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.11"
|
||||
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@v7
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
uv pip install wheel packaging
|
||||
uv pip install --no-build-isolation -e .
|
||||
uv pip install black mypy pre-commit types-requests quartodoc jupyter blobfile tiktoken \
|
||||
codecov codecov-cli pytest pytest-cov pytest-retry pytest-sugar pytest-xdist tbparse
|
||||
|
||||
- name: Extract tag name
|
||||
id: tag
|
||||
run: echo "TAG_NAME=$(echo $GITHUB_REF | cut -d / -f 3)" >> "$GITHUB_OUTPUT"
|
||||
|
||||
- name: Update version in VERSION file
|
||||
run: |
|
||||
echo "${{ steps.tag.outputs.TAG_NAME }}" | sed 's/^v//' > VERSION
|
||||
|
||||
- name: Build a source dist
|
||||
run: |
|
||||
python setup.py sdist
|
||||
|
||||
- name: Publish package distributions to PyPI
|
||||
uses: pypa/gh-action-pypi-publish@release/v1
|
||||
206
.github/workflows/tests-nightly.yml
vendored
206
.github/workflows/tests-nightly.yml
vendored
@@ -1,206 +0,0 @@
|
||||
name: Tests Nightly against upstream main
|
||||
on:
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
- cron: "0 0 * * *" # Runs at 00:00 UTC every day
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened, ready_for_review]
|
||||
paths:
|
||||
- ".github/workflows/tests-nightly.yml"
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
env:
|
||||
UV_SYSTEM_PYTHON: "1"
|
||||
|
||||
jobs:
|
||||
pre-commit:
|
||||
name: pre-commit
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.11"
|
||||
cache: "pip" # caching pip dependencies
|
||||
- uses: pre-commit/action@v3.0.1
|
||||
env:
|
||||
SKIP: no-commit-to-branch
|
||||
|
||||
prime-cdn-s3-cache:
|
||||
name: Prefetch S3 once to prime the CDN cache
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ !github.event.pull_request.draft }}
|
||||
timeout-minutes: 10
|
||||
steps:
|
||||
- name: Restore Cache from S3
|
||||
id: hf-cache-restore-s3
|
||||
run: |
|
||||
curl -v -H "Range: bytes=0-1023" -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst > /dev/null
|
||||
|
||||
pytest:
|
||||
name: PyTest
|
||||
runs-on: ubuntu-latest
|
||||
needs: [prime-cdn-s3-cache]
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
python_version: ["3.12"] # TODO include py3.14 once https://github.com/mistralai/mistral-common/pull/194 is merged
|
||||
pytorch_version: ["2.9.1", "2.10.0"]
|
||||
timeout-minutes: 20
|
||||
|
||||
steps:
|
||||
- name: Check out repository code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Restore Cache from S3
|
||||
id: hf-cache-restore-s3
|
||||
run: |
|
||||
mkdir -p /home/runner/.cache/huggingface/hub
|
||||
curl -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst | tar -xf - -C /home/runner/.cache/huggingface/hub/ --use-compress-program unzstd
|
||||
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: ${{ matrix.python_version }}
|
||||
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@v7
|
||||
|
||||
- name: Install PyTorch
|
||||
run: |
|
||||
uv pip install torch==${{ matrix.pytorch_version }} torchvision
|
||||
uv pip freeze | grep -E "^(torch|torchvision)==" > /tmp/torch-pin.txt
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
uv pip install --no-build-isolation -e . --override /tmp/torch-pin.txt
|
||||
python scripts/cutcrossentropy_install.py --uv | sh
|
||||
uv pip install black mypy pre-commit types-requests quartodoc jupyter blobfile tiktoken \
|
||||
codecov codecov-cli pytest pytest-cov pytest-retry pytest-sugar pytest-xdist tbparse
|
||||
|
||||
- name: Override with nightly HF packages
|
||||
run: |
|
||||
uv pip install --no-deps \
|
||||
"transformers @ git+https://github.com/huggingface/transformers.git@main" \
|
||||
"peft @ git+https://github.com/huggingface/peft.git@main" \
|
||||
"accelerate @ git+https://github.com/huggingface/accelerate.git@main" \
|
||||
"trl @ git+https://github.com/huggingface/trl.git@main" \
|
||||
"datasets @ git+https://github.com/huggingface/datasets.git@main"
|
||||
|
||||
- name: Make sure PyTorch version wasn't clobbered
|
||||
run: |
|
||||
python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__, f'Expected torch ${{ matrix.pytorch_version }} but got {torch.__version__}'"
|
||||
|
||||
- name: Ensure axolotl CLI was installed
|
||||
run: |
|
||||
axolotl --help
|
||||
|
||||
- name: Run tests
|
||||
run: |
|
||||
pytest -v --durations=10 -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/
|
||||
pytest -v --durations=10 tests/patched/
|
||||
pytest -v --durations=10 tests/cli/
|
||||
|
||||
|
||||
docker-e2e-tests:
|
||||
if: github.repository_owner == 'axolotl-ai-cloud'
|
||||
# this job needs to be run on self-hosted GPU runners...
|
||||
runs-on: [self-hosted, modal]
|
||||
timeout-minutes: 120
|
||||
needs: [pre-commit, pytest]
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.9.1
|
||||
num_gpus: 1
|
||||
axolotl_extras:
|
||||
nightly_build: "true"
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.10.0
|
||||
num_gpus: 1
|
||||
axolotl_extras:
|
||||
- cuda: 130
|
||||
cuda_version: 13.0.0
|
||||
python_version: "3.12"
|
||||
pytorch: 2.9.1
|
||||
num_gpus: 1
|
||||
axolotl_extras:
|
||||
nightly_build: "true"
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
- name: Install Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.11"
|
||||
- name: Install Modal
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install modal==1.3.0.post1 jinja2
|
||||
- name: Update env vars
|
||||
run: |
|
||||
echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
|
||||
echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
|
||||
echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
|
||||
echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
|
||||
echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
|
||||
echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
|
||||
echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile-uv.jinja'}}" >> $GITHUB_ENV
|
||||
echo "NIGHTLY_BUILD=${{ matrix.nightly_build }}" >> $GITHUB_ENV
|
||||
- name: Run tests job on Modal
|
||||
env:
|
||||
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
|
||||
run: |
|
||||
modal run cicd.e2e_tests
|
||||
docker-e2e-multigpu-tests:
|
||||
if: github.repository_owner == 'axolotl-ai-cloud'
|
||||
# this job needs to be run on self-hosted GPU runners...
|
||||
runs-on: [self-hosted, modal]
|
||||
timeout-minutes: 120
|
||||
needs: [pre-commit, pytest, docker-e2e-tests]
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.9.1
|
||||
num_gpus: 2
|
||||
axolotl_extras:
|
||||
nightly_build: "true"
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
- name: Install Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.11"
|
||||
- name: Install Modal
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install modal==1.3.0.post1 jinja2
|
||||
- name: Update env vars
|
||||
run: |
|
||||
echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
|
||||
echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
|
||||
echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
|
||||
echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
|
||||
echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
|
||||
echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
|
||||
echo "NIGHTLY_BUILD=${{ matrix.nightly_build }}" >> $GITHUB_ENV
|
||||
- name: Run tests job on Modal
|
||||
env:
|
||||
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
|
||||
run: |
|
||||
modal run cicd.multigpu
|
||||
399
.github/workflows/tests.yml
vendored
399
.github/workflows/tests.yml
vendored
@@ -1,407 +1,32 @@
|
||||
name: Tests
|
||||
name: PyTest
|
||||
on:
|
||||
# check on push/merge to main, PRs, and manual triggers
|
||||
merge_group:
|
||||
push:
|
||||
branches:
|
||||
- "main"
|
||||
paths:
|
||||
- "**.py"
|
||||
- "pyproject.toml"
|
||||
- ".github/workflows/*.yml"
|
||||
- "cicd/cicd.sh"
|
||||
- "cicd/Dockerfile-uv.jinja"
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened, ready_for_review]
|
||||
paths:
|
||||
- "**.py"
|
||||
- "pyproject.toml"
|
||||
- ".github/workflows/*.yml"
|
||||
- "cicd/cicd.sh"
|
||||
- "cicd/Dockerfile-uv.jinja"
|
||||
workflow_dispatch:
|
||||
|
||||
# Cancel jobs on the same ref if a new one is triggered
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}
|
||||
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
env:
|
||||
TRANSFORMERS_IS_CI: "yes"
|
||||
UV_SYSTEM_PYTHON: "1"
|
||||
|
||||
jobs:
|
||||
pre-commit:
|
||||
name: pre-commit
|
||||
test:
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ !github.event.pull_request.draft }}
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.11"
|
||||
cache: "pip" # caching pip dependencies
|
||||
- uses: pre-commit/action@v3.0.1
|
||||
env:
|
||||
SKIP: no-commit-to-branch
|
||||
|
||||
prime-cdn-s3-cache:
|
||||
name: Prefetch S3 once to prime the CDN cache
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ !github.event.pull_request.draft }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
python_version: ["3.9", "3.10"]
|
||||
timeout-minutes: 10
|
||||
steps:
|
||||
- name: Restore Cache from S3
|
||||
id: hf-cache-restore-s3
|
||||
run: |
|
||||
curl -v -H "Range: bytes=0-1023" -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst > /dev/null
|
||||
|
||||
pytest:
|
||||
name: PyTest
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ !github.event.pull_request.draft }}
|
||||
needs: [prime-cdn-s3-cache]
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
python_version: ["3.12", "3.14"]
|
||||
pytorch_version: ["2.9.1", "2.10.0"]
|
||||
exclude:
|
||||
- python_version: "3.14"
|
||||
pytorch_version: "2.9.1"
|
||||
timeout-minutes: 25
|
||||
|
||||
steps:
|
||||
- name: cleanup node
|
||||
run: |
|
||||
sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /opt/hostedtoolcache/CodeQL
|
||||
|
||||
- name: Check out repository code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Restore Cache from S3
|
||||
id: hf-cache-restore-s3
|
||||
run: |
|
||||
mkdir -p ~/.cache/huggingface/hub
|
||||
curl -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst | tar -xpf - -C ~/.cache/huggingface/hub/ --use-compress-program unzstd --strip-components=1
|
||||
ls -ltr ~/.cache/huggingface/hub/
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v5
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: ${{ matrix.python_version }}
|
||||
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@v7
|
||||
|
||||
- name: Install PyTorch
|
||||
run: |
|
||||
uv pip install torch==${{ matrix.pytorch_version }} torchvision
|
||||
uv pip freeze | grep -E "^(torch|torchvision)==" > /tmp/torch-pin.txt
|
||||
cache: 'pip' # caching pip dependencies
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
uv pip install --no-build-isolation -e . --override /tmp/torch-pin.txt
|
||||
python scripts/cutcrossentropy_install.py --uv | sh
|
||||
uv pip install black mypy pre-commit types-requests quartodoc jupyter blobfile tiktoken \
|
||||
codecov codecov-cli pytest pytest-cov pytest-retry pytest-sugar pytest-xdist tbparse
|
||||
|
||||
- name: Make sure PyTorch version wasn't clobbered
|
||||
run: |
|
||||
python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__, f'Expected torch ${{ matrix.pytorch_version }} but got {torch.__version__}'"
|
||||
|
||||
- name: Ensure axolotl CLI was installed
|
||||
run: |
|
||||
axolotl --help
|
||||
|
||||
- name: Pre-Download dataset fixture
|
||||
run: |
|
||||
hf download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
|
||||
|
||||
- name: Show HF cache
|
||||
run: hf cache ls
|
||||
pip install -e .
|
||||
pip install -r requirements-tests.txt
|
||||
|
||||
- name: Run tests
|
||||
run: |
|
||||
df -h
|
||||
pytest -v --durations=10 -n4 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ --ignore=tests/monkeypatch/ tests/ --cov=axolotl --cov-report=xml
|
||||
df -h
|
||||
pytest -v --durations=10 tests/monkeypatch/ --cov=axolotl --cov-append --cov-report=xml
|
||||
df -h
|
||||
pytest -v --durations=10 tests/patched/ --cov=axolotl --cov-append --cov-report=xml
|
||||
df -h
|
||||
pytest -v --durations=10 tests/cli/ --cov=axolotl --cov-append --cov-report=xml
|
||||
|
||||
- name: Show HF cache
|
||||
run: hf cache ls
|
||||
|
||||
- name: Upload coverage to Codecov
|
||||
uses: codecov/codecov-action@v5
|
||||
with:
|
||||
token: ${{ secrets.CODECOV_TOKEN }}
|
||||
files: ./coverage.xml
|
||||
flags: unittests,pytorch-${{ matrix.pytorch_version }}
|
||||
fail_ci_if_error: false
|
||||
|
||||
pytest-sdist:
|
||||
name: PyTest from Source Dist
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ !github.event.pull_request.draft }}
|
||||
needs: [prime-cdn-s3-cache]
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
python_version: ["3.12", "3.14"]
|
||||
pytorch_version: ["2.9.1", "2.10.0"]
|
||||
exclude:
|
||||
- python_version: "3.14"
|
||||
pytorch_version: "2.9.1"
|
||||
timeout-minutes: 30
|
||||
|
||||
steps:
|
||||
- name: cleanup node
|
||||
run: |
|
||||
sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /opt/hostedtoolcache/CodeQL
|
||||
|
||||
- name: Check out repository code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Restore Cache from S3
|
||||
id: hf-cache-restore-s3
|
||||
run: |
|
||||
mkdir -p ~/.cache/huggingface/hub
|
||||
curl -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst | tar -xpf - -C ~/.cache/huggingface/hub/ --use-compress-program unzstd --strip-components=1
|
||||
ls -ltr ~/.cache/huggingface/hub/
|
||||
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: ${{ matrix.python_version }}
|
||||
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@v7
|
||||
|
||||
- name: Install PyTorch
|
||||
run: |
|
||||
uv pip install torch==${{ matrix.pytorch_version }} torchvision
|
||||
uv pip freeze | grep -E "^(torch|torchvision)==" > /tmp/torch-pin.txt
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
uv pip install packaging setuptools_scm build wheel psutil
|
||||
python -m build --no-isolation --sdist
|
||||
uv pip install --no-build-isolation dist/axolotl*.tar.gz --override /tmp/torch-pin.txt
|
||||
python scripts/cutcrossentropy_install.py --uv | sh
|
||||
uv pip install black mypy pre-commit types-requests quartodoc jupyter blobfile tiktoken \
|
||||
codecov codecov-cli pytest pytest-cov pytest-retry pytest-sugar pytest-xdist tbparse
|
||||
|
||||
- name: Make sure PyTorch version wasn't clobbered
|
||||
run: |
|
||||
python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__, f'Expected torch ${{ matrix.pytorch_version }} but got {torch.__version__}'"
|
||||
|
||||
- name: Ensure axolotl CLI was installed
|
||||
run: |
|
||||
axolotl --help
|
||||
|
||||
- name: Verify agent docs are discoverable
|
||||
run: |
|
||||
# Agent docs live in docs/agents/ (source of truth) and are resolved
|
||||
# at runtime from the repo checkout or via `axolotl fetch docs`
|
||||
axolotl agent-docs --list
|
||||
axolotl agent-docs | grep -q "Fine-tuning framework"
|
||||
axolotl agent-docs grpo | grep -q "GRPO"
|
||||
axolotl agent-docs sft | grep -q "SFT"
|
||||
python -c "from axolotl.cli.agent_docs import get_doc, list_topics; assert len(list_topics()) >= 5; assert 'GRPO' in get_doc('grpo')"
|
||||
|
||||
- name: Show HF cache
|
||||
run: hf cache ls
|
||||
|
||||
- name: Run tests
|
||||
run: |
|
||||
pytest -v --durations=10 -n4 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ --ignore=tests/monkeypatch/ tests/ --cov=axolotl --cov-report=xml
|
||||
pytest -v --durations=10 tests/monkeypatch/ --cov=axolotl --cov-append --cov-report=xml
|
||||
pytest -v --durations=10 tests/cli/
|
||||
|
||||
- name: Show HF cache
|
||||
run: hf cache ls
|
||||
|
||||
gate-skip-e2e:
|
||||
needs: [pre-commit]
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
skip: ${{ steps.compute.outputs.skip }}
|
||||
steps:
|
||||
- uses: actions/github-script@v7
|
||||
id: compute
|
||||
with:
|
||||
script: |
|
||||
const token = /\[skip-e2e\]/i;
|
||||
let msg = '';
|
||||
if (context.eventName === 'push') {
|
||||
msg = context.payload.head_commit?.message || '';
|
||||
} else if (context.eventName === 'pull_request') {
|
||||
const { owner, repo } = context.repo;
|
||||
const prNumber = context.payload.pull_request.number;
|
||||
const commits = await github.paginate(
|
||||
github.rest.pulls.listCommits,
|
||||
{ owner, repo, pull_number: prNumber, per_page: 100 }
|
||||
);
|
||||
msg = commits.at(-1)?.commit?.message || '';
|
||||
}
|
||||
const title = context.payload.pull_request?.title || '';
|
||||
const body = context.payload.pull_request?.body || '';
|
||||
const skip = token.test(msg) || token.test(title) || token.test(body);
|
||||
core.setOutput('skip', String(skip));
|
||||
|
||||
docker-e2e-tests-1st:
|
||||
# Run this job first as a gate for running the remainder of the test matrix
|
||||
if: >
|
||||
github.repository_owner == 'axolotl-ai-cloud' &&
|
||||
(github.event_name != 'pull_request' || !github.event.pull_request.draft) &&
|
||||
needs.gate-skip-e2e.outputs.skip != 'true'
|
||||
# this job needs to be run on self-hosted GPU runners...
|
||||
runs-on: [self-hosted, modal]
|
||||
timeout-minutes: 120
|
||||
needs: [pre-commit, pytest]
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- cuda: 130
|
||||
cuda_version: 13.0.0
|
||||
python_version: "3.12"
|
||||
pytorch: 2.9.1
|
||||
num_gpus: 1
|
||||
axolotl_extras:
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
- name: Install Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.11"
|
||||
- name: Install Modal
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install modal==1.3.0.post1 jinja2
|
||||
- name: Update env vars
|
||||
run: |
|
||||
echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
|
||||
echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
|
||||
echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
|
||||
echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
|
||||
echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
|
||||
echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
|
||||
echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
|
||||
echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile-uv.jinja'}}" >> $GITHUB_ENV
|
||||
- name: Run tests job on Modal
|
||||
env:
|
||||
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
|
||||
run: |
|
||||
modal run cicd.e2e_tests
|
||||
|
||||
docker-e2e-tests:
|
||||
if: >
|
||||
github.repository_owner == 'axolotl-ai-cloud' &&
|
||||
(github.event_name != 'pull_request' || !github.event.pull_request.draft) &&
|
||||
needs.gate-skip-e2e.outputs.skip != 'true'
|
||||
# this job needs to be run on self-hosted GPU runners...
|
||||
runs-on: [self-hosted, modal]
|
||||
timeout-minutes: 120
|
||||
# Only run the remainder of the matrix if the first e2e check passed;
|
||||
# this is to save on wasted compute costs for known failures that get caught in the first run
|
||||
needs: [pre-commit, pytest, gate-skip-e2e, docker-e2e-tests-1st]
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.9.1
|
||||
num_gpus: 1
|
||||
axolotl_extras:
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.10.0
|
||||
num_gpus: 1
|
||||
axolotl_extras:
|
||||
- cuda: 130
|
||||
cuda_version: 13.0.0
|
||||
python_version: "3.11"
|
||||
pytorch: 2.9.1
|
||||
num_gpus: 1
|
||||
axolotl_extras:
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
- name: Install Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.11"
|
||||
- name: Install Modal
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install modal==1.3.0.post1 jinja2
|
||||
- name: Update env vars
|
||||
run: |
|
||||
echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
|
||||
echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
|
||||
echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
|
||||
echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
|
||||
echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
|
||||
echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
|
||||
echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
|
||||
echo "GPU_TYPE=${{ matrix.gpu_type || 'L40S'}}" >> $GITHUB_ENV
|
||||
echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile-uv.jinja'}}" >> $GITHUB_ENV
|
||||
- name: Run tests job on Modal
|
||||
env:
|
||||
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
|
||||
run: |
|
||||
modal run cicd.e2e_tests
|
||||
|
||||
docker-e2e-cleanup:
|
||||
runs-on: [self-hosted, modal]
|
||||
timeout-minutes: 90
|
||||
needs: [docker-e2e-tests]
|
||||
if: ${{ !github.event.pull_request.draft }}
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.9.1
|
||||
num_gpus: 1
|
||||
axolotl_extras:
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
- name: Install Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.11"
|
||||
- name: Install Modal
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install modal==1.3.0.post1 jinja2
|
||||
- name: Update env vars
|
||||
run: |
|
||||
echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
|
||||
echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
|
||||
echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
|
||||
echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
|
||||
echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
|
||||
echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
|
||||
echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
|
||||
- name: Run tests job on Modal
|
||||
run: |
|
||||
modal run cicd.cleanup
|
||||
pytest tests/
|
||||
|
||||
32
.gitignore
vendored
32
.gitignore
vendored
@@ -1,9 +1,5 @@
|
||||
**/axolotl.egg-info
|
||||
configs
|
||||
last_run_prepared/
|
||||
outputs
|
||||
.vscode
|
||||
_site/
|
||||
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
@@ -134,7 +130,6 @@ venv/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
venv3.10/
|
||||
|
||||
# Spyder project settings
|
||||
.spyderproject
|
||||
@@ -166,30 +161,3 @@ cython_debug/
|
||||
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||
.idea/
|
||||
|
||||
# WandB
|
||||
# wandb creates a folder to store logs for training runs
|
||||
wandb
|
||||
|
||||
# Runs
|
||||
lora-out/*
|
||||
qlora-out/*
|
||||
mlruns/*
|
||||
|
||||
/.quarto/
|
||||
prepared-datasets/
|
||||
submit.sh
|
||||
*.out*
|
||||
|
||||
# Quartodoc generated files
|
||||
objects.json
|
||||
site_libs/
|
||||
|
||||
typings/
|
||||
out/
|
||||
|
||||
# vim
|
||||
*.swp
|
||||
|
||||
# scm auto-versioning
|
||||
src/axolotl/_version.py
|
||||
|
||||
2
.isort.cfg
Normal file
2
.isort.cfg
Normal file
@@ -0,0 +1,2 @@
|
||||
[settings]
|
||||
profile=black
|
||||
17
.mypy.ini
17
.mypy.ini
@@ -1,5 +1,5 @@
|
||||
[mypy]
|
||||
plugins = pydantic.mypy
|
||||
|
||||
exclude = venv
|
||||
|
||||
[mypy-alpaca_lora_4bit.*]
|
||||
@@ -8,15 +8,6 @@ ignore_missing_imports = True
|
||||
[mypy-axolotl.monkeypatch.*]
|
||||
ignore_errors = True
|
||||
|
||||
[mypy-axolotl.models.mixtral.*]
|
||||
ignore_errors = True
|
||||
|
||||
[mypy-axolotl.integrations.liger.models.*]
|
||||
ignore_errors = True
|
||||
|
||||
[mypy-axolotl.models.phi.*]
|
||||
ignore_errors = True
|
||||
|
||||
[mypy-flash_attn.*]
|
||||
ignore_missing_imports = True
|
||||
|
||||
@@ -29,15 +20,9 @@ ignore_missing_imports = True
|
||||
[mypy-peft]
|
||||
ignore_missing_imports = True
|
||||
|
||||
[mypy-wandb]
|
||||
ignore_missing_imports = True
|
||||
|
||||
[mypy-bitsandbytes]
|
||||
ignore_missing_imports = True
|
||||
|
||||
[mypy-requests]
|
||||
ignore_missing_imports = True
|
||||
|
||||
[mypy-datasets]
|
||||
ignore_missing_imports = True
|
||||
|
||||
|
||||
@@ -3,30 +3,37 @@ default_language_version:
|
||||
|
||||
repos:
|
||||
- repo: https://github.com/pre-commit/pre-commit-hooks
|
||||
rev: v6.0.0
|
||||
rev: v4.4.0
|
||||
hooks:
|
||||
- id: check-yaml
|
||||
- id: end-of-file-fixer
|
||||
- id: trailing-whitespace
|
||||
- id: no-commit-to-branch
|
||||
args: ['--branch', 'main']
|
||||
- repo: https://github.com/astral-sh/ruff-pre-commit
|
||||
rev: v0.15.8
|
||||
- repo: https://github.com/psf/black
|
||||
rev: 23.3.0
|
||||
hooks:
|
||||
- id: ruff
|
||||
args: [--fix]
|
||||
- id: ruff-format
|
||||
- id: black
|
||||
- repo: https://github.com/pycqa/isort
|
||||
rev: 5.12.0
|
||||
hooks:
|
||||
- id: isort
|
||||
- repo: https://github.com/PyCQA/flake8
|
||||
rev: 6.0.0
|
||||
hooks:
|
||||
- id: flake8
|
||||
- repo: https://github.com/PyCQA/pylint
|
||||
rev: v2.17.4
|
||||
hooks:
|
||||
- id: pylint
|
||||
- repo: https://github.com/pre-commit/mirrors-mypy
|
||||
rev: v1.19.1
|
||||
rev: v1.3.0
|
||||
hooks:
|
||||
- id: mypy
|
||||
additional_dependencies:
|
||||
[
|
||||
'types-PyYAML',
|
||||
'pydantic>=2.5.3',
|
||||
]
|
||||
- repo: https://github.com/PyCQA/bandit
|
||||
rev: 1.9.4
|
||||
rev: 1.7.5
|
||||
hooks:
|
||||
- id: bandit
|
||||
args: [
|
||||
|
||||
14
.pylintrc
Normal file
14
.pylintrc
Normal file
@@ -0,0 +1,14 @@
|
||||
[MASTER]
|
||||
init-hook="from pylint.config import find_pylintrc; import os, sys; sys.path.append(os.path.dirname(find_pylintrc()))"
|
||||
|
||||
[TYPECHECK]
|
||||
|
||||
# List of members which are set dynamically and missed by Pylint inference
|
||||
# system, and so shouldn't trigger E1101 when accessed.
|
||||
generated-members=numpy.*, torch.*
|
||||
|
||||
|
||||
[pylint.messages_control]
|
||||
disable=missing-function-docstring, line-too-long, import-error,
|
||||
too-many-arguments, too-many-locals, too-many-statements, too-many-branches, too-few-public-methods,
|
||||
too-many-instance-attributes, fixme, import-outside-toplevel, logging-fstring-interpolation,
|
||||
161
.runpod/.gitignore
vendored
161
.runpod/.gitignore
vendored
@@ -1,161 +0,0 @@
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
share/python-wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
# PyInstaller
|
||||
# Usually these files are written by a python script from a template
|
||||
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||
*.manifest
|
||||
*.spec
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
pip-delete-this-directory.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
.nox/
|
||||
.coverage
|
||||
.coverage.*
|
||||
.cache
|
||||
nosetests.xml
|
||||
coverage.xml
|
||||
*.cover
|
||||
*.py,cover
|
||||
.hypothesis/
|
||||
.pytest_cache/
|
||||
cover/
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
*.pot
|
||||
|
||||
# Django stuff:
|
||||
*.log
|
||||
local_settings.py
|
||||
db.sqlite3
|
||||
db.sqlite3-journal
|
||||
|
||||
# Flask stuff:
|
||||
instance/
|
||||
.webassets-cache
|
||||
|
||||
# Scrapy stuff:
|
||||
.scrapy
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
|
||||
# PyBuilder
|
||||
.pybuilder/
|
||||
target/
|
||||
|
||||
# Jupyter Notebook
|
||||
.ipynb_checkpoints
|
||||
|
||||
# IPython
|
||||
profile_default/
|
||||
ipython_config.py
|
||||
|
||||
# pyenv
|
||||
# For a library or package, you might want to ignore these files since the code is
|
||||
# intended to run in multiple environments; otherwise, check them in:
|
||||
# .python-version
|
||||
|
||||
# pipenv
|
||||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||
# install all needed dependencies.
|
||||
#Pipfile.lock
|
||||
|
||||
# poetry
|
||||
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
||||
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||
# commonly ignored for libraries.
|
||||
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
||||
#poetry.lock
|
||||
|
||||
# pdm
|
||||
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
||||
#pdm.lock
|
||||
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
||||
# in version control.
|
||||
# https://pdm.fming.dev/#use-with-ide
|
||||
.pdm.toml
|
||||
|
||||
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
||||
__pypackages__/
|
||||
|
||||
# Celery stuff
|
||||
celerybeat-schedule
|
||||
celerybeat.pid
|
||||
|
||||
# SageMath parsed files
|
||||
*.sage.py
|
||||
|
||||
# Environments
|
||||
.env
|
||||
.venv
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
|
||||
# Spyder project settings
|
||||
.spyderproject
|
||||
.spyproject
|
||||
|
||||
# Rope project settings
|
||||
.ropeproject
|
||||
|
||||
# mkdocs documentation
|
||||
/site
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
.dmypy.json
|
||||
dmypy.json
|
||||
|
||||
# Pyre type checker
|
||||
.pyre/
|
||||
|
||||
# pytype static type analyzer
|
||||
.pytype/
|
||||
|
||||
# Cython debug symbols
|
||||
cython_debug/
|
||||
|
||||
# PyCharm
|
||||
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
||||
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
||||
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||
#.idea/
|
||||
pod/scripts/config.yaml
|
||||
@@ -1,19 +0,0 @@
|
||||
FROM axolotlai/axolotl-cloud:main-py3.11-cu124-2.6.0
|
||||
|
||||
COPY .runpod/requirements.txt /requirements.txt
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
python3 -m pip install --upgrade pip && \
|
||||
python3 -m pip install --upgrade -r /requirements.txt
|
||||
|
||||
# Environment settings
|
||||
ARG BASE_VOLUME="/runpod-volume"
|
||||
ENV BASE_VOLUME=$BASE_VOLUME
|
||||
ENV HF_DATASETS_CACHE="${BASE_VOLUME}/huggingface-cache/datasets"
|
||||
ENV HUGGINGFACE_HUB_CACHE="${BASE_VOLUME}/huggingface-cache/hub"
|
||||
ENV HF_HUB_CACHE="${BASE_VOLUME}/huggingface-cache/hub"
|
||||
ENV TRANSFORMERS_CACHE="${BASE_VOLUME}/huggingface-cache/hub"
|
||||
|
||||
COPY .runpod/src /src
|
||||
|
||||
WORKDIR /src
|
||||
CMD ["python3", "/src/handler.py"]
|
||||
@@ -1,335 +0,0 @@
|
||||
<h1>LLM Post Training- Full fine-tune, LoRA, QLoRa etc. Llama/Mistral/Gemma and more</h1>
|
||||
|
||||
# Configuration Options
|
||||
|
||||
This document outlines all available configuration options for training models. The configuration can be provided as a JSON request.
|
||||
|
||||
## Usage
|
||||
|
||||
You can use these configuration Options:
|
||||
|
||||
1. As a JSON request body:
|
||||
|
||||
```json
|
||||
{
|
||||
"input": {
|
||||
"user_id": "user",
|
||||
"model_id": "model-name",
|
||||
"run_id": "run-id",
|
||||
"credentials": {
|
||||
"wandb_api_key": "", # add your Weights & biases key. TODO: you will be able to set this in Enviornment variables.
|
||||
"hf_token": "", # add your HF_token. TODO: you will be able to set this in Enviornment variables.
|
||||
},
|
||||
"args": {
|
||||
"base_model": "NousResearch/Llama-3.2-1B",
|
||||
// ... other options
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Configuration Options
|
||||
|
||||
### Model Configuration
|
||||
|
||||
| Option | Description | Default |
|
||||
| ------------------- | --------------------------------------------------------------------------------------------- | -------------------- |
|
||||
| `base_model` | Path to the base model (local or HuggingFace) | Required |
|
||||
| `base_model_config` | Configuration path for the base model | Same as base_model |
|
||||
| `revision_of_model` | Specific model revision from HuggingFace hub | Latest |
|
||||
| `tokenizer_config` | Custom tokenizer configuration path | Optional |
|
||||
| `model_type` | Type of model to load | AutoModelForCausalLM |
|
||||
| `tokenizer_type` | Type of tokenizer to use | AutoTokenizer |
|
||||
| `hub_model_id` | Repository ID where the model will be pushed on Hugging Face Hub (format: username/repo-name) | Optional |
|
||||
|
||||
## Model Family Identification
|
||||
|
||||
| Option | Default | Description |
|
||||
| -------------------------- | ------- | ------------------------------ |
|
||||
| `is_falcon_derived_model` | `false` | Whether model is Falcon-based |
|
||||
| `is_llama_derived_model` | `false` | Whether model is LLaMA-based |
|
||||
| `is_qwen_derived_model` | `false` | Whether model is Qwen-based |
|
||||
| `is_mistral_derived_model` | `false` | Whether model is Mistral-based |
|
||||
|
||||
## Model Configuration Overrides
|
||||
|
||||
| Option | Default | Description |
|
||||
| ----------------------------------------------- | ---------- | ---------------------------------- |
|
||||
| `overrides_of_model_config.rope_scaling.type` | `"linear"` | RoPE scaling type (linear/dynamic) |
|
||||
| `overrides_of_model_config.rope_scaling.factor` | `1.0` | RoPE scaling factor |
|
||||
|
||||
### Model Loading Options
|
||||
|
||||
| Option | Description | Default |
|
||||
| -------------- | ----------------------------- | ------- |
|
||||
| `load_in_8bit` | Load model in 8-bit precision | false |
|
||||
| `load_in_4bit` | Load model in 4-bit precision | false |
|
||||
| `bf16` | Use bfloat16 precision | false |
|
||||
| `fp16` | Use float16 precision | false |
|
||||
| `tf32` | Use tensor float 32 precision | false |
|
||||
|
||||
## Memory and Device Settings
|
||||
|
||||
| Option | Default | Description |
|
||||
| ------------------ | --------- | ----------------------- |
|
||||
| `gpu_memory_limit` | `"20GiB"` | GPU memory limit |
|
||||
| `lora_on_cpu` | `false` | Load LoRA on CPU |
|
||||
| `device_map` | `"auto"` | Device mapping strategy |
|
||||
| `max_memory` | `null` | Max memory per device |
|
||||
|
||||
## Training Hyperparameters
|
||||
|
||||
| Option | Default | Description |
|
||||
| ----------------------------- | --------- | --------------------------- |
|
||||
| `gradient_accumulation_steps` | `1` | Gradient accumulation steps |
|
||||
| `micro_batch_size` | `2` | Batch size per GPU |
|
||||
| `eval_batch_size` | `null` | Evaluation batch size |
|
||||
| `num_epochs` | `4` | Number of training epochs |
|
||||
| `warmup_steps` | `100` | Warmup steps |
|
||||
| `warmup_ratio` | `0.05` | Warmup ratio |
|
||||
| `learning_rate` | `0.00003` | Learning rate |
|
||||
| `lr_quadratic_warmup` | `false` | Quadratic warmup |
|
||||
| `logging_steps` | `null` | Logging frequency |
|
||||
| `eval_steps` | `null` | Evaluation frequency |
|
||||
| `evals_per_epoch` | `null` | Evaluations per epoch |
|
||||
| `save_strategy` | `"epoch"` | Checkpoint saving strategy |
|
||||
| `save_steps` | `null` | Saving frequency |
|
||||
| `saves_per_epoch` | `null` | Saves per epoch |
|
||||
| `save_total_limit` | `null` | Maximum checkpoints to keep |
|
||||
| `max_steps` | `null` | Maximum training steps |
|
||||
|
||||
### Dataset Configuration
|
||||
|
||||
```yaml
|
||||
datasets:
|
||||
- path: vicgalle/alpaca-gpt4 # HuggingFace dataset or TODO: You will be able to add the local path.
|
||||
type: alpaca # Format type (alpaca, gpteacher, oasst, etc.)
|
||||
ds_type: json # Dataset type
|
||||
data_files: path/to/data # Source data files
|
||||
train_on_split: train # Dataset split to use
|
||||
```
|
||||
|
||||
## Chat Template Settings
|
||||
|
||||
| Option | Default | Description |
|
||||
| ------------------------ | -------------------------------- | ---------------------- |
|
||||
| `chat_template` | `"tokenizer_default"` | Chat template type |
|
||||
| `chat_template_jinja` | `null` | Custom Jinja template |
|
||||
| `default_system_message` | `"You are a helpful assistant."` | Default system message |
|
||||
|
||||
## Dataset Processing
|
||||
|
||||
| Option | Default | Description |
|
||||
| --------------------------------- | -------------------------- | ----------------------------------- |
|
||||
| `dataset_prepared_path` | `"data/last_run_prepared"` | Path for prepared dataset |
|
||||
| `push_dataset_to_hub` | `""` | Push dataset to HF hub |
|
||||
| `dataset_num_proc` | `4` | Number of preprocessing processes |
|
||||
| `dataset_keep_in_memory` | `false` | Keep dataset in memory |
|
||||
| `shuffle_merged_datasets` | `true` | Shuffle merged datasets |
|
||||
| `shuffle_before_merging_datasets` | `false` | Shuffle each dataset before merging |
|
||||
| `dataset_exact_deduplication` | `true` | Deduplicate datasets |
|
||||
|
||||
## LoRA Configuration
|
||||
|
||||
| Option | Default | Description |
|
||||
| -------------------------- | ---------------------- | ------------------------------ |
|
||||
| `adapter` | `"lora"` | Adapter type (lora/qlora) |
|
||||
| `lora_model_dir` | `""` | Directory with pretrained LoRA |
|
||||
| `lora_r` | `8` | LoRA attention dimension |
|
||||
| `lora_alpha` | `16` | LoRA alpha parameter |
|
||||
| `lora_dropout` | `0.05` | LoRA dropout |
|
||||
| `lora_target_modules` | `["q_proj", "v_proj"]` | Modules to apply LoRA |
|
||||
| `lora_target_linear` | `false` | Target all linear modules |
|
||||
| `peft_layers_to_transform` | `[]` | Layers to transform |
|
||||
| `lora_modules_to_save` | `[]` | Modules to save |
|
||||
| `lora_fan_in_fan_out` | `false` | Fan in/out structure |
|
||||
|
||||
## Optimization Settings
|
||||
|
||||
| Option | Default | Description |
|
||||
| ------------------------- | ------- | -------------------------- |
|
||||
| `train_on_inputs` | `false` | Train on input prompts |
|
||||
| `group_by_length` | `false` | Group by sequence length |
|
||||
| `gradient_checkpointing` | `false` | Use gradient checkpointing |
|
||||
| `early_stopping_patience` | `3` | Early stopping patience |
|
||||
|
||||
## Learning Rate Scheduling
|
||||
|
||||
| Option | Default | Description |
|
||||
| -------------------------- | ---------- | -------------------- |
|
||||
| `lr_scheduler` | `"cosine"` | Scheduler type |
|
||||
| `lr_scheduler_kwargs` | `{}` | Scheduler parameters |
|
||||
| `cosine_min_lr_ratio` | `null` | Minimum LR ratio |
|
||||
| `cosine_constant_lr_ratio` | `null` | Constant LR ratio |
|
||||
| `lr_div_factor` | `null` | LR division factor |
|
||||
|
||||
## Optimizer Settings
|
||||
|
||||
| Option | Default | Description |
|
||||
| ---------------------- | ------------ | ------------------- |
|
||||
| `optimizer` | `"adamw_hf"` | Optimizer choice |
|
||||
| `optim_args` | `{}` | Optimizer arguments |
|
||||
| `optim_target_modules` | `[]` | Target modules |
|
||||
| `weight_decay` | `null` | Weight decay |
|
||||
| `adam_beta1` | `null` | Adam beta1 |
|
||||
| `adam_beta2` | `null` | Adam beta2 |
|
||||
| `adam_epsilon` | `null` | Adam epsilon |
|
||||
| `max_grad_norm` | `null` | Gradient clipping |
|
||||
|
||||
## Attention Implementations
|
||||
|
||||
| Option | Default | Description |
|
||||
| -------------------------- | ------- | ----------------------------- |
|
||||
| `flash_optimum` | `false` | Use better transformers |
|
||||
| `xformers_attention` | `false` | Use xformers |
|
||||
| `flash_attention` | `false` | Use flash attention |
|
||||
| `flash_attn_cross_entropy` | `false` | Flash attention cross entropy |
|
||||
| `flash_attn_rms_norm` | `false` | Flash attention RMS norm |
|
||||
| `flash_attn_fuse_mlp` | `false` | Fuse MLP operations |
|
||||
| `sdp_attention` | `false` | Use scaled dot product |
|
||||
| `s2_attention` | `false` | Use shifted sparse attention |
|
||||
|
||||
## Tokenizer Modifications
|
||||
|
||||
| Option | Default | Description |
|
||||
| ---------------- | ------- | ---------------------------- |
|
||||
| `special_tokens` | - | Special tokens to add/modify |
|
||||
| `tokens` | `[]` | Additional tokens |
|
||||
|
||||
## Distributed Training
|
||||
|
||||
| Option | Default | Description |
|
||||
| ----------------------- | ------- | --------------------- |
|
||||
| `fsdp` | `null` | FSDP configuration |
|
||||
| `fsdp_config` | `null` | FSDP config options |
|
||||
| `deepspeed` | `null` | Deepspeed config path |
|
||||
| `ddp_timeout` | `null` | DDP timeout |
|
||||
| `ddp_bucket_cap_mb` | `null` | DDP bucket capacity |
|
||||
| `ddp_broadcast_buffers` | `null` | DDP broadcast buffers |
|
||||
|
||||
<details>
|
||||
<summary><h3>Example Configuration Request:</h3></summary>
|
||||
|
||||
Here's a complete example for fine-tuning a LLaMA model using LoRA:
|
||||
|
||||
```json
|
||||
{
|
||||
"input": {
|
||||
"user_id": "user",
|
||||
"model_id": "llama-test",
|
||||
"run_id": "test-run",
|
||||
"credentials": {
|
||||
"wandb_api_key": "",
|
||||
"hf_token": ""
|
||||
},
|
||||
"args": {
|
||||
"base_model": "NousResearch/Llama-3.2-1B",
|
||||
"load_in_8bit": false,
|
||||
"load_in_4bit": false,
|
||||
"strict": false,
|
||||
"datasets": [
|
||||
{
|
||||
"path": "teknium/GPT4-LLM-Cleaned",
|
||||
"type": "alpaca"
|
||||
}
|
||||
],
|
||||
"dataset_prepared_path": "last_run_prepared",
|
||||
"val_set_size": 0.1,
|
||||
"output_dir": "./outputs/lora-out",
|
||||
"adapter": "lora",
|
||||
"sequence_len": 2048,
|
||||
"sample_packing": true,
|
||||
"eval_sample_packing": true,
|
||||
"pad_to_sequence_len": true,
|
||||
"lora_r": 16,
|
||||
"lora_alpha": 32,
|
||||
"lora_dropout": 0.05,
|
||||
"lora_target_modules": [
|
||||
"gate_proj",
|
||||
"down_proj",
|
||||
"up_proj",
|
||||
"q_proj",
|
||||
"v_proj",
|
||||
"k_proj",
|
||||
"o_proj"
|
||||
],
|
||||
"gradient_accumulation_steps": 2,
|
||||
"micro_batch_size": 2,
|
||||
"num_epochs": 1,
|
||||
"optimizer": "adamw_8bit",
|
||||
"lr_scheduler": "cosine",
|
||||
"learning_rate": 0.0002,
|
||||
"train_on_inputs": false,
|
||||
"group_by_length": false,
|
||||
"bf16": "auto",
|
||||
"tf32": false,
|
||||
"gradient_checkpointing": true,
|
||||
"logging_steps": 1,
|
||||
"flash_attention": true,
|
||||
"loss_watchdog_threshold": 5,
|
||||
"loss_watchdog_patience": 3,
|
||||
"warmup_steps": 10,
|
||||
"evals_per_epoch": 4,
|
||||
"saves_per_epoch": 1,
|
||||
"weight_decay": 0,
|
||||
"hub_model_id": "runpod/llama-fr-lora",
|
||||
"wandb_name": "test-run-1",
|
||||
"wandb_project": "test-run-1",
|
||||
"wandb_entity": "axo-test",
|
||||
"special_tokens": {
|
||||
"pad_token": "<|end_of_text|>"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
### Advanced Features
|
||||
|
||||
#### Wandb Integration
|
||||
|
||||
- `wandb_project`: Project name for Weights & Biases
|
||||
- `wandb_entity`: Team name in W&B
|
||||
- `wandb_watch`: Monitor model with W&B
|
||||
- `wandb_name`: Name of the W&B run
|
||||
- `wandb_run_id`: ID for the W&B run
|
||||
|
||||
#### Performance Optimization
|
||||
|
||||
- `sample_packing`: Enable efficient sequence packing
|
||||
- `eval_sample_packing`: Use sequence packing during evaluation
|
||||
- `torch_compile`: Enable PyTorch 2.0 compilation
|
||||
- `flash_attention`: Use Flash Attention implementation
|
||||
- `xformers_attention`: Use xFormers attention implementation
|
||||
|
||||
### Available Optimizers
|
||||
|
||||
The following optimizers are supported:
|
||||
|
||||
- `adamw_hf`: HuggingFace's AdamW implementation
|
||||
- `adamw_torch`: PyTorch's AdamW
|
||||
- `adamw_torch_fused`: Fused AdamW implementation
|
||||
- `adamw_torch_xla`: XLA-optimized AdamW
|
||||
- `adamw_apex_fused`: NVIDIA Apex fused AdamW
|
||||
- `adafactor`: Adafactor optimizer
|
||||
- `adamw_anyprecision`: Anyprecision AdamW
|
||||
- `adamw_bnb_8bit`: 8-bit AdamW from bitsandbytes
|
||||
- `lion_8bit`: 8-bit Lion optimizer
|
||||
- `lion_32bit`: 32-bit Lion optimizer
|
||||
- `sgd`: Stochastic Gradient Descent
|
||||
- `adagrad`: Adagrad optimizer
|
||||
|
||||
## Notes
|
||||
|
||||
- Set `load_in_8bit: true` or `load_in_4bit: true` for memory-efficient training
|
||||
- Enable `flash_attention: true` for faster training on modern GPUs
|
||||
- Use `gradient_checkpointing: true` to reduce memory usage
|
||||
- Adjust `micro_batch_size` and `gradient_accumulation_steps` based on your GPU memory
|
||||
|
||||
For more detailed information, please refer to the [documentation](https://axolotl-ai-cloud.github.io/axolotl/docs/config-reference.html).
|
||||
|
||||
### Errors:
|
||||
|
||||
- if you face any issues with the Flash Attention-2, Delete yoor worker and Re-start.
|
||||
@@ -1,93 +0,0 @@
|
||||
{
|
||||
"title": "Axolotl Fine-Tuning",
|
||||
"description": "Serverless fine-tuning of open-source LLMs with Axolotl. Supports LoRA, QLoRA, DPO, and more using Hugging Face models and datasets.",
|
||||
"type": "serverless",
|
||||
"category": "language",
|
||||
"iconUrl": "https://avatars.githubusercontent.com/u/167502477",
|
||||
"config": {
|
||||
"runsOn": "GPU",
|
||||
"containerDiskInGb": 200,
|
||||
"gpuCount": 1,
|
||||
"allowedCudaVersions": [
|
||||
"12.8",
|
||||
"12.7",
|
||||
"12.6",
|
||||
"12.5",
|
||||
"12.4"
|
||||
],
|
||||
"presets": [],
|
||||
"env": [
|
||||
{
|
||||
"key": "TOKENIZER",
|
||||
"input": {
|
||||
"name": "Tokenizer",
|
||||
"type": "string",
|
||||
"description": "Name or path of the Hugging Face tokenizer to use.",
|
||||
"default": "",
|
||||
"advanced": true
|
||||
}
|
||||
},
|
||||
{
|
||||
"key": "MAX_NUM_SEQS",
|
||||
"input": {
|
||||
"name": "Max Num Seqs",
|
||||
"type": "number",
|
||||
"description": "Maximum number of sequences per iteration.",
|
||||
"default": 256,
|
||||
"advanced": true
|
||||
}
|
||||
},
|
||||
{
|
||||
"key": "DISABLE_LOG_STATS",
|
||||
"input": {
|
||||
"name": "Disable Log Stats",
|
||||
"type": "boolean",
|
||||
"description": "Disable logging statistics.",
|
||||
"default": false,
|
||||
"trueValue": "true",
|
||||
"falseValue": "false"
|
||||
}
|
||||
},
|
||||
{
|
||||
"key": "LOAD_FORMAT",
|
||||
"input": {
|
||||
"name": "Load Format",
|
||||
"type": "string",
|
||||
"description": "The format of the model weights to load.",
|
||||
"default": "auto",
|
||||
"options": [
|
||||
{
|
||||
"label": "auto",
|
||||
"value": "auto"
|
||||
},
|
||||
{
|
||||
"label": "pt",
|
||||
"value": "pt"
|
||||
},
|
||||
{
|
||||
"label": "safetensors",
|
||||
"value": "safetensors"
|
||||
},
|
||||
{
|
||||
"label": "npcache",
|
||||
"value": "npcache"
|
||||
},
|
||||
{
|
||||
"label": "dummy",
|
||||
"value": "dummy"
|
||||
},
|
||||
{
|
||||
"label": "tensorizer",
|
||||
"value": "tensorizer"
|
||||
},
|
||||
{
|
||||
"label": "bitsandbytes",
|
||||
"value": "bitsandbytes"
|
||||
}
|
||||
],
|
||||
"advanced": true
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
@@ -1,7 +0,0 @@
|
||||
# Required Python packages get listed here, one per line.
|
||||
# Reccomended to lock the version number to avoid unexpected changes.
|
||||
|
||||
# You can also install packages from a git repository, e.g.:
|
||||
# git+https://github.com/runpod/runpod-python.git
|
||||
# To learn more, see https://pip.pypa.io/en/stable/reference/requirements-file-format/
|
||||
runpod~=1.7.0
|
||||
@@ -1,564 +0,0 @@
|
||||
# # This is the huggingface model that contains *.pt, *.safetensors, or *.bin files
|
||||
# # This can also be a relative path to a model on disk
|
||||
# base_model: ./llama-7b-hf
|
||||
# # You can specify an ignore pattern if the model repo contains more than 1 model type (*.pt, etc)
|
||||
# base_model_ignore_patterns:
|
||||
# # If the base_model repo on hf hub doesn't include configuration .json files,
|
||||
# # You can set that here, or leave this empty to default to base_model
|
||||
# base_model_config: ./llama-7b-hf
|
||||
# # You can specify to choose a specific model revision from huggingface hub
|
||||
# model_revision:
|
||||
# # Optional tokenizer configuration override in case you want to use a different tokenizer
|
||||
# # than the one defined in the base model
|
||||
# tokenizer_config:
|
||||
# # If you want to specify the type of model to load, AutoModelForCausalLM is a good choice too
|
||||
# model_type: AutoModelForCausalLM
|
||||
# # Corresponding tokenizer for the model AutoTokenizer is a good choice
|
||||
# tokenizer_type: AutoTokenizer
|
||||
# # Trust remote code for untrusted source
|
||||
# trust_remote_code:
|
||||
# # use_fast option for tokenizer loading from_pretrained, default to True
|
||||
# tokenizer_use_fast:
|
||||
# # Whether to use the legacy tokenizer setting, defaults to True
|
||||
# tokenizer_legacy:
|
||||
# # Resize the model embeddings when new tokens are added to multiples of 32
|
||||
# # This is reported to improve training speed on some models
|
||||
# resize_token_embeddings_to_32x:
|
||||
|
||||
# # Used to identify which the model is based on
|
||||
# is_falcon_derived_model:
|
||||
# is_llama_derived_model:
|
||||
# # Please note that if you set this to true, `padding_side` will be set to "left" by default
|
||||
# is_mistral_derived_model:
|
||||
# is_qwen_derived_model:
|
||||
|
||||
# # optional overrides to the base model configuration
|
||||
# model_config:
|
||||
# # RoPE Scaling https://github.com/huggingface/transformers/pull/24653
|
||||
# rope_scaling:
|
||||
# type: # linear | dynamic
|
||||
# factor: # float
|
||||
|
||||
# # Whether you are training a 4-bit GPTQ quantized model
|
||||
# gptq: true
|
||||
# gptq_groupsize: 128 # group size
|
||||
# gptq_model_v1: false # v1 or v2
|
||||
|
||||
# # This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer
|
||||
# load_in_8bit: true
|
||||
# # Use bitsandbytes 4 bit
|
||||
# load_in_4bit:
|
||||
|
||||
# # Use CUDA bf16
|
||||
# bf16: true # bool or 'full' for `bf16_full_eval`. require >=ampere
|
||||
# # Use CUDA fp16
|
||||
# fp16: true
|
||||
# # Use CUDA tf32
|
||||
# tf32: true # require >=ampere
|
||||
|
||||
# # No AMP (automatic mixed precision)
|
||||
# bfloat16: true # require >=ampere
|
||||
# float16: true
|
||||
|
||||
# # A list of one or more datasets to finetune the model with
|
||||
# datasets:
|
||||
# # HuggingFace dataset repo | s3://,gs:// path | "json" for local dataset, make sure to fill data_files
|
||||
# - path: vicgalle/alpaca-gpt4
|
||||
# # The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
|
||||
# type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
|
||||
# ds_type: # Optional[str] (json|arrow|parquet|text|csv) defines the datatype when path is a file
|
||||
# data_files: # Optional[str] path to source data files
|
||||
# shards: # Optional[int] number of shards to split data into
|
||||
# name: # Optional[str] name of dataset configuration to load
|
||||
# train_on_split: train # Optional[str] name of dataset split to load from
|
||||
|
||||
# # Optional[str] fastchat conversation type, only used with type: sharegpt
|
||||
# conversation: # Options (see Conversation 'name'): https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
|
||||
# field_human: # Optional[str]. Human key to use for conversation.
|
||||
# field_model: # Optional[str]. Assistant key to use for conversation.
|
||||
|
||||
# # Custom user prompt
|
||||
# - path: repo
|
||||
# type:
|
||||
# # The below are defaults. only set what's needed.
|
||||
# system_prompt: ""
|
||||
# system_format: "{system}"
|
||||
# field_system: system
|
||||
# field_instruction: instruction
|
||||
# field_input: input
|
||||
# field_output: output
|
||||
|
||||
# # Customizable to be single line or multi-line
|
||||
# # 'format' can include {input}
|
||||
# format: |-
|
||||
# User: {instruction} {input}
|
||||
# Assistant:
|
||||
# # 'no_input_format' cannot include {input}
|
||||
# no_input_format: "{instruction} "
|
||||
|
||||
# # For `completion` datasets only, uses the provided field instead of `text` column
|
||||
# field:
|
||||
|
||||
# # Axolotl attempts to save the dataset as an arrow after packing the data together so
|
||||
# # subsequent training attempts load faster, relative path
|
||||
# dataset_prepared_path: data/last_run_prepared
|
||||
# # Push prepared dataset to hub
|
||||
# push_dataset_to_hub: # repo path
|
||||
# # The maximum number of processes to use while preprocessing your input dataset. This defaults to `os.cpu_count()`
|
||||
# # if not set.
|
||||
# dataset_num_proc: # defaults to os.cpu_count() if not set
|
||||
# # push checkpoints to hub
|
||||
# hub_model_id: # repo path to push finetuned model
|
||||
# # how to push checkpoints to hub
|
||||
# # https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/trainer#transformers.TrainingArguments.hub_strategy
|
||||
# hub_strategy:
|
||||
# # Whether to use hf `use_auth_token` for loading datasets. Useful for fetching private datasets
|
||||
# # Required to be true when used in combination with `push_dataset_to_hub`
|
||||
# hf_use_auth_token: # boolean
|
||||
# # How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 0 for no eval.
|
||||
# val_set_size: 0.04
|
||||
# # Num shards for whole dataset
|
||||
# dataset_shard_num:
|
||||
# # Index of shard to use for whole dataset
|
||||
# dataset_shard_idx:
|
||||
|
||||
# # The maximum length of an input to train with, this should typically be less than 2048
|
||||
# # as most models have a token/context limit of 2048
|
||||
# sequence_len: 2048
|
||||
# # Pad inputs so each step uses constant sized buffers
|
||||
# # This will reduce memory fragmentation and may prevent OOMs, by re-using memory more efficiently
|
||||
# pad_to_sequence_len:
|
||||
# # Max sequence length to concatenate training samples together up to
|
||||
# # Inspired by StackLLaMA. see https://huggingface.co/blog/stackllama#supervised-fine-tuning
|
||||
# # FutureWarning: This will soon be DEPRECATED
|
||||
# max_packed_sequence_len: 1024
|
||||
# # Use efficient multi-packing with block diagonal attention and per sequence position_ids. Recommend set to 'true'
|
||||
# sample_packing:
|
||||
# # Set to 'false' if getting errors during eval with sample_packing on.
|
||||
# eval_sample_packing:
|
||||
# # You can set these packing optimizations AFTER starting a training at least once.
|
||||
# # The trainer will provide recommended values for these values.
|
||||
# sample_packing_eff_est:
|
||||
# total_num_tokens:
|
||||
|
||||
# # If you want to use 'lora' or 'qlora' or leave blank to train all parameters in original model
|
||||
# adapter: lora
|
||||
# # If you already have a lora model trained that you want to load, put that here.
|
||||
# # This means after training, if you want to test the model, you should set this to the value of `lora_out_dir`.
|
||||
# lora_model_dir:
|
||||
|
||||
# # LoRA hyperparameters
|
||||
# # For more details about the following options, see:
|
||||
# # https://www.anyscale.com/blog/fine-tuning-llms-lora-or-full-parameter-an-in-depth-analysis-with-llama-2
|
||||
# lora_r: 8
|
||||
# lora_alpha: 16
|
||||
# lora_dropout: 0.05
|
||||
# lora_target_modules:
|
||||
# - q_proj
|
||||
# - v_proj
|
||||
# # - k_proj
|
||||
# # - o_proj
|
||||
# # - gate_proj
|
||||
# # - down_proj
|
||||
# # - up_proj
|
||||
# lora_target_linear: # If true, will target all linear layers
|
||||
|
||||
# # If you added new tokens to the tokenizer, you may need to save some LoRA modules because they need to know the new tokens.
|
||||
# # For LLaMA and Mistral, you need to save `embed_tokens` and `lm_head`. It may vary for other models.
|
||||
# # `embed_tokens` converts tokens to embeddings, and `lm_head` converts embeddings to token probabilities.
|
||||
# # https://github.com/huggingface/peft/issues/334#issuecomment-1561727994
|
||||
# lora_modules_to_save:
|
||||
# # - embed_tokens
|
||||
# # - lm_head
|
||||
|
||||
# # Once you complete training, the model will be saved to the following directory.
|
||||
# # If you merge the adapter to the base model, a subdirectory `merged` will be created under this directory.
|
||||
# # Make sure `lora_model_dir` points to this directory if you want to use the trained model.
|
||||
# lora_out_dir:
|
||||
# lora_fan_in_fan_out: false
|
||||
|
||||
# # ReLoRA configuration
|
||||
# # Must use either 'lora' or 'qlora' adapter, and does not support fsdp or deepspeed
|
||||
# relora_steps: # Number of steps per ReLoRA restart
|
||||
# relora_warmup_steps: # Number of per-restart warmup steps
|
||||
# relora_cpu_offload: # True to perform lora weight merges on cpu during restarts, for modest gpu memory savings
|
||||
|
||||
# # wandb configuration if you're using it
|
||||
# wandb_mode: # "offline" to save run metadata locally and not sync to the server, "disabled" to turn off wandb
|
||||
# wandb_project: # Your wandb project name
|
||||
# wandb_entity: # A wandb Team name if using a Team
|
||||
# wandb_watch:
|
||||
# wandb_run_id: # Set the name of your wandb run
|
||||
# wandb_log_model: # "checkpoint" to log model to wandb Artifacts every `save_steps` or "end" to log only at the end of training
|
||||
|
||||
# # Where to save the full-finetuned model to
|
||||
# output_dir: ./completed-model
|
||||
|
||||
# # Whether to use torch.compile and which backend to use
|
||||
# torch_compile: # bool
|
||||
# torch_compile_backend: # Optional[str]
|
||||
|
||||
# # Training hyperparameters
|
||||
|
||||
# # If greater than 1, backpropagation will be skipped and the gradients will be accumulated for the given number of steps.
|
||||
# gradient_accumulation_steps: 1
|
||||
# # The number of samples to include in each batch. This is the number of samples sent to each GPU.
|
||||
# micro_batch_size: 2
|
||||
# eval_batch_size:
|
||||
# num_epochs: 4
|
||||
# warmup_steps: 100 # cannot use with warmup_ratio
|
||||
# warmup_ratio: 0.05 # cannot use with warmup_steps
|
||||
# learning_rate: 0.00003
|
||||
# lr_quadratic_warmup:
|
||||
# logging_steps:
|
||||
# save_strategy: # Set to `no` to skip checkpoint saves
|
||||
# save_steps: # Leave empty to save at each epoch
|
||||
# eval_steps: # Leave empty to eval at each epoch, integers for every N steps. decimal for fraction of total steps
|
||||
# save_total_limit: # Checkpoints saved at a time
|
||||
# # Maximum number of iterations to train for. It precedes num_epochs which means that
|
||||
# # if both are set, num_epochs will not be guaranteed.
|
||||
# # e.g., when 1 epoch is 1000 steps => `num_epochs: 2` and `max_steps: 100` will train for 100 steps
|
||||
# max_steps:
|
||||
|
||||
# eval_table_size: # Approximate number of predictions sent to wandb depending on batch size. Enabled above 0. Default is 0
|
||||
# eval_table_max_new_tokens: # Total number of tokens generated for predictions sent to wandb. Default is 128
|
||||
|
||||
# # Whether to mask out or include the human's prompt from the training labels
|
||||
# train_on_inputs: false
|
||||
# # Group similarly sized data to minimize padding.
|
||||
# # May be slower to start, as it must download and sort the entire dataset.
|
||||
# # Note that training loss may have an oscillating pattern with this enabled.
|
||||
# group_by_length: false
|
||||
|
||||
# # Whether to use gradient checkpointing https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing
|
||||
# gradient_checkpointing: false
|
||||
|
||||
# # Stop training after this many evaluation losses have increased in a row
|
||||
# # https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
|
||||
# early_stopping_patience: 3
|
||||
|
||||
# # Specify a scheduler and kwargs to use with the optimizer
|
||||
# lr_scheduler: # 'one_cycle' | empty for cosine
|
||||
# lr_scheduler_kwargs:
|
||||
|
||||
# # For one_cycle optim
|
||||
# lr_div_factor: # Learning rate div factor
|
||||
|
||||
# # Specify optimizer
|
||||
# # Valid values are driven by the Transformers OptimizerNames class, see:
|
||||
# # https://github.com/huggingface/transformers/blob/95b374952dc27d8511541d6f5a4e22c9ec11fb24/src/transformers/training_args.py#L134
|
||||
# #
|
||||
# # Note that not all optimizers may be available in your environment, ex: 'adamw_anyprecision' is part of
|
||||
# # torchdistx, 'adamw_bnb_8bit' is part of bnb.optim.Adam8bit, etc. When in doubt, it is recommended to start with the optimizer used
|
||||
# # in the examples/ for your model and fine-tuning use case.
|
||||
# #
|
||||
# # Valid values for 'optimizer' include:
|
||||
# # - adamw_hf
|
||||
# # - adamw_torch
|
||||
# # - adamw_torch_fused
|
||||
# # - adamw_torch_xla
|
||||
# # - adamw_apex_fused
|
||||
# # - adafactor
|
||||
# # - adamw_anyprecision
|
||||
# # - sgd
|
||||
# # - adagrad
|
||||
# # - adamw_bnb_8bit
|
||||
# # - lion_8bit
|
||||
# # - lion_32bit
|
||||
# # - paged_adamw_32bit
|
||||
# # - paged_adamw_8bit
|
||||
# # - paged_lion_32bit
|
||||
# # - paged_lion_8bit
|
||||
# optimizer:
|
||||
# # Specify weight decay
|
||||
# weight_decay:
|
||||
# # adamw hyperparams
|
||||
# adam_beta1:
|
||||
# adam_beta2:
|
||||
# adam_epsilon:
|
||||
# # Gradient clipping max norm
|
||||
# max_grad_norm:
|
||||
|
||||
# # Augmentation techniques
|
||||
# # NEFT https://arxiv.org/abs/2310.05914, set this to a number (paper default is 5) to add noise to embeddings
|
||||
# # currently only supported on Llama and Mistral
|
||||
# noisy_embedding_alpha:
|
||||
|
||||
# # Whether to bettertransformers
|
||||
# flash_optimum:
|
||||
# # Whether to use xformers attention patch https://github.com/facebookresearch/xformers:
|
||||
# xformers_attention:
|
||||
# # Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention:
|
||||
# flash_attention:
|
||||
# flash_attn_cross_entropy: # Whether to use flash-attention cross entropy implementation - advanced use only
|
||||
# flash_attn_rms_norm: # Whether to use flash-attention rms norm implementation - advanced use only
|
||||
# flash_attn_fuse_mlp: # Whether to fuse part of the MLP into a single operation
|
||||
# # Whether to use scaled-dot-product attention
|
||||
# # https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
|
||||
# sdp_attention:
|
||||
# # Landmark attention (only llama)
|
||||
# landmark_attention:
|
||||
# # xpos RoPE see https://github.com/kaiokendev/cutoff-len-is-context-len/blob/main/util/xpos_rope_llama_monkey_patch.py
|
||||
# # LLaMA only
|
||||
# xpos_rope:
|
||||
|
||||
# # Resume from a specific checkpoint dir
|
||||
# resume_from_checkpoint:
|
||||
# # If resume_from_checkpoint isn't set and you simply want it to start where it left off.
|
||||
# # Be careful with this being turned on between different models.
|
||||
# auto_resume_from_checkpoints: false
|
||||
|
||||
# # Don't mess with this, it's here for accelerate and torchrun
|
||||
# local_rank:
|
||||
|
||||
# # Add or change special tokens.
|
||||
# # If you add tokens here, you don't need to add them to the `tokens` list.
|
||||
# special_tokens:
|
||||
# # bos_token: "<s>"
|
||||
# # eos_token: "</s>"
|
||||
# # unk_token: "<unk>"
|
||||
|
||||
# # Add extra tokens.
|
||||
# tokens:
|
||||
|
||||
# # FSDP
|
||||
# fsdp:
|
||||
# fsdp_config:
|
||||
|
||||
# # Deepspeed config path. e.g., deepspeed/zero3.json
|
||||
# deepspeed:
|
||||
|
||||
# # Advanced DDP Arguments
|
||||
# ddp_timeout:
|
||||
# ddp_bucket_cap_mb:
|
||||
# ddp_broadcast_buffers:
|
||||
|
||||
# # Path to torch distx for optim 'adamw_anyprecision'
|
||||
# torchdistx_path:
|
||||
|
||||
# # Set to HF dataset for type: 'completion' for streaming instead of pre-tokenize
|
||||
# pretraining_dataset:
|
||||
|
||||
# # Debug mode
|
||||
# debug:
|
||||
|
||||
# # Seed
|
||||
# seed:
|
||||
|
||||
# # Allow overwrite yml config using from cli
|
||||
# strict:
|
||||
|
||||
base_model: ${BASE_MODEL}
|
||||
base_model_ignore_patterns: ${BASE_MODEL_IGNORE_PATTERNS}
|
||||
base_model_config: ${BASE_MODEL_CONFIG}
|
||||
revision_of_model: ${REVISION_OF_MODEL}
|
||||
tokenizer_config: ${TOKENIZER_CONFIG}
|
||||
model_type: ${MODEL_TYPE}
|
||||
tokenizer_type: ${TOKENIZER_TYPE}
|
||||
trust_remote_code: ${TRUST_REMOTE_CODE}
|
||||
tokenizer_use_fast: ${TOKENIZER_USE_FAST}
|
||||
tokenizer_legacy: ${TOKENIZER_LEGACY}
|
||||
resize_token_embeddings_to_32x: ${RESIZE_TOKEN_EMBEDDINGS_TO_32X}
|
||||
|
||||
is_falcon_derived_model: ${IS_FALCON_DERIVED_MODEL}
|
||||
is_llama_derived_model: ${IS_LLAMA_DERIVED_MODEL}
|
||||
is_qwen_derived_model: ${IS_QWEN_DERIVED_MODEL}
|
||||
is_mistral_derived_model: ${IS_MISTRAL_DERIVED_MODEL}
|
||||
|
||||
overrides_of_model_config:
|
||||
rope_scaling:
|
||||
type: ${ROPE_SCALING_TYPE}
|
||||
factor: ${ROPE_SCALING_FACTOR}
|
||||
|
||||
bnb_config_kwargs:
|
||||
llm_int8_has_fp16_weight: ${BNB_LLM_INT8_HAS_FP16_WEIGHT}
|
||||
bnb_4bit_quant_type: ${BNB_4BIT_QUANT_TYPE}
|
||||
bnb_4bit_use_double_quant: ${BNB_4BIT_USE_DOUBLE_QUANT}
|
||||
|
||||
gptq: ${GPTQ}
|
||||
load_in_8bit: ${LOAD_IN_8BIT}
|
||||
load_in_4bit: ${LOAD_IN_4BIT}
|
||||
bf16: ${BF16}
|
||||
fp16: ${FP16}
|
||||
tf32: ${TF32}
|
||||
bfloat16: ${BFLOAT16}
|
||||
float16: ${FLOAT16}
|
||||
|
||||
gpu_memory_limit: ${GPU_MEMORY_LIMIT}
|
||||
lora_on_cpu: ${LORA_ON_CPU}
|
||||
|
||||
datasets:
|
||||
- path: ${DATASET_PATH}
|
||||
type: ${DATASET_TYPE}
|
||||
ds_type: ${DATASET_DS_TYPE}
|
||||
data_files: ${DATASET_DATA_FILES}
|
||||
shards: ${DATASET_SHARDS}
|
||||
name: ${DATASET_NAME}
|
||||
train_on_split: ${DATASET_TRAIN_ON_SPLIT}
|
||||
revision: ${DATASET_REVISION}
|
||||
trust_remote_code: ${DATASET_TRUST_REMOTE_CODE}
|
||||
|
||||
rl: ${RL}
|
||||
dpo_use_weighting: ${DPO_USE_WEIGHTING}
|
||||
|
||||
chat_template: ${CHAT_TEMPLATE}
|
||||
chat_template_jinja: ${CHAT_TEMPLATE_JINJA}
|
||||
default_system_message: ${DEFAULT_SYSTEM_MESSAGE}
|
||||
dataset_prepared_path: ${DATASET_PREPARED_PATH}
|
||||
push_dataset_to_hub: ${PUSH_DATASET_TO_HUB}
|
||||
dataset_num_proc: ${DATASET_NUM_PROC}
|
||||
dataset_keep_in_memory: ${DATASET_KEEP_IN_MEMORY}
|
||||
hub_model_id: ${HUB_MODEL_ID}
|
||||
hub_strategy: ${HUB_STRATEGY}
|
||||
hf_use_auth_token: ${HF_USE_AUTH_TOKEN}
|
||||
val_set_size: ${VAL_SET_SIZE}
|
||||
dataset_shard_num: ${DATASET_SHARD_NUM}
|
||||
dataset_shard_idx: ${DATASET_SHARD_IDX}
|
||||
|
||||
sequence_len: ${SEQUENCE_LEN}
|
||||
pad_to_sequence_len: ${PAD_TO_SEQUENCE_LEN}
|
||||
sample_packing: ${SAMPLE_PACKING}
|
||||
eval_sample_packing: ${EVAL_SAMPLE_PACKING}
|
||||
sample_packing_eff_est: ${SAMPLE_PACKING_EFF_EST}
|
||||
total_num_tokens: ${TOTAL_NUM_TOKENS}
|
||||
sample_packing_group_size: ${SAMPLE_PACKING_GROUP_SIZE}
|
||||
sample_packing_bin_size: ${SAMPLE_PACKING_BIN_SIZE}
|
||||
|
||||
batch_flattening: ${BATCH_FLATTENING}
|
||||
device_map: ${DEVICE_MAP}
|
||||
max_memory: ${MAX_MEMORY}
|
||||
|
||||
adapter: ${ADAPTER}
|
||||
lora_model_dir: ${LORA_MODEL_DIR}
|
||||
|
||||
lora_r: ${LORA_R}
|
||||
lora_alpha: ${LORA_ALPHA}
|
||||
lora_dropout: ${LORA_DROPOUT}
|
||||
lora_target_modules:
|
||||
- ${LORA_TARGET_MODULES}
|
||||
lora_target_linear: ${LORA_TARGET_LINEAR}
|
||||
peft_layers_to_transform: ${PEFT_LAYERS_TO_TRANSFORM}
|
||||
lora_modules_to_save: ${LORA_MODULES_TO_SAVE}
|
||||
lora_fan_in_fan_out: ${LORA_FAN_IN_FAN_OUT}
|
||||
|
||||
loraplus_lr_ratio: ${LORAPLUS_LR_RATIO}
|
||||
loraplus_lr_embedding: ${LORAPLUS_LR_EMBEDDING}
|
||||
|
||||
peft:
|
||||
loftq_config:
|
||||
loftq_bits: ${LOFTQ_BITS}
|
||||
|
||||
relora_steps: ${RELORA_STEPS}
|
||||
relora_warmup_steps: ${RELORA_WARMUP_STEPS}
|
||||
relora_anneal_steps: ${RELORA_ANNEAL_STEPS}
|
||||
relora_prune_ratio: ${RELORA_PRUNE_RATIO}
|
||||
relora_cpu_offload: ${RELORA_CPU_OFFLOAD}
|
||||
|
||||
wandb_mode: ${WANDB_MODE}
|
||||
wandb_project: ${WANDB_PROJECT}
|
||||
wandb_entity: ${WANDB_ENTITY}
|
||||
wandb_watch: ${WANDB_WATCH}
|
||||
wandb_name: ${WANDB_NAME}
|
||||
wandb_run_id: ${WANDB_RUN_ID}
|
||||
wandb_log_model: ${WANDB_LOG_MODEL}
|
||||
|
||||
mlflow_tracking_uri: ${MLFLOW_TRACKING_URI}
|
||||
mlflow_experiment_name: ${MLFLOW_EXPERIMENT_NAME}
|
||||
mlflow_run_name: ${MLFLOW_RUN_NAME}
|
||||
hf_mlflow_log_artifacts: ${HF_MLFLOW_LOG_ARTIFACTS}
|
||||
|
||||
use_comet: ${USE_COMET}
|
||||
comet_api_key: ${COMET_API_KEY}
|
||||
comet_workspace: ${COMET_WORKSPACE}
|
||||
comet_project_name: ${COMET_PROJECT_NAME}
|
||||
comet_experiment_key: ${COMET_EXPERIMENT_KEY}
|
||||
comet_mode: ${COMET_MODE}
|
||||
comet_online: ${COMET_ONLINE}
|
||||
comet_experiment_config: ${COMET_EXPERIMENT_CONFIG}
|
||||
|
||||
output_dir: ${OUTPUT_DIR}
|
||||
|
||||
torch_compile: ${TORCH_COMPILE}
|
||||
torch_compile_backend: ${TORCH_COMPILE_BACKEND}
|
||||
|
||||
gradient_accumulation_steps: ${GRADIENT_ACCUMULATION_STEPS}
|
||||
micro_batch_size: ${MICRO_BATCH_SIZE}
|
||||
eval_batch_size: ${EVAL_BATCH_SIZE}
|
||||
num_epochs: ${NUM_EPOCHS}
|
||||
warmup_steps: ${WARMUP_STEPS}
|
||||
warmup_ratio: ${WARMUP_RATIO}
|
||||
learning_rate: ${LEARNING_RATE}
|
||||
lr_quadratic_warmup: ${LR_QUADRATIC_WARMUP}
|
||||
logging_steps: ${LOGGING_STEPS}
|
||||
eval_steps: ${EVAL_STEPS}
|
||||
evals_per_epoch: ${EVALS_PER_EPOCH}
|
||||
save_strategy: ${SAVE_STRATEGY}
|
||||
save_steps: ${SAVE_STEPS}
|
||||
saves_per_epoch: ${SAVES_PER_EPOCH}
|
||||
save_total_limit: ${SAVE_TOTAL_LIMIT}
|
||||
max_steps: ${MAX_STEPS}
|
||||
|
||||
eval_table_size: ${EVAL_TABLE_SIZE}
|
||||
eval_max_new_tokens: ${EVAL_MAX_NEW_TOKENS}
|
||||
eval_causal_lm_metrics: ${EVAL_CAUSAL_LM_METRICS}
|
||||
|
||||
profiler_steps: ${PROFILER_STEPS}
|
||||
loss_watchdog_threshold: ${LOSS_WATCHDOG_THRESHOLD}
|
||||
loss_watchdog_patience: ${LOSS_WATCHDOG_PATIENCE}
|
||||
|
||||
train_on_inputs: ${TRAIN_ON_INPUTS}
|
||||
group_by_length: ${GROUP_BY_LENGTH}
|
||||
gradient_checkpointing: ${GRADIENT_CHECKPOINTING}
|
||||
early_stopping_patience: ${EARLY_STOPPING_PATIENCE}
|
||||
|
||||
lr_scheduler: ${LR_SCHEDULER}
|
||||
lr_scheduler_kwargs: ${LR_SCHEDULER_KWARGS}
|
||||
cosine_min_lr_ratio: ${COSINE_MIN_LR_RATIO}
|
||||
cosine_constant_lr_ratio: ${COSINE_CONSTANT_LR_RATIO}
|
||||
lr_div_factor: ${LR_DIV_FACTOR}
|
||||
|
||||
optimizer: ${OPTIMIZER}
|
||||
optim_args: ${OPTIM_ARGS}
|
||||
optim_target_modules: ${OPTIM_TARGET_MODULES}
|
||||
weight_decay: ${WEIGHT_DECAY}
|
||||
adam_beta1: ${ADAM_BETA1}
|
||||
adam_beta2: ${ADAM_BETA2}
|
||||
adam_epsilon: ${ADAM_EPSILON}
|
||||
max_grad_norm: ${MAX_GRAD_NORM}
|
||||
|
||||
neftune_noise_alpha: ${NEFTUNE_NOISE_ALPHA}
|
||||
|
||||
flash_optimum: ${FLASH_OPTIMUM}
|
||||
xformers_attention: ${XFORMERS_ATTENTION}
|
||||
flash_attention: ${FLASH_ATTENTION}
|
||||
flash_attn_cross_entropy: ${FLASH_ATTN_CROSS_ENTROPY}
|
||||
flash_attn_rms_norm: ${FLASH_ATTN_RMS_NORM}
|
||||
flash_attn_fuse_mlp: ${FLASH_ATTN_FUSE_MLP}
|
||||
sdp_attention: ${SDP_ATTENTION}
|
||||
s2_attention: ${S2_ATTENTION}
|
||||
resume_from_checkpoint: ${RESUME_FROM_CHECKPOINT}
|
||||
auto_resume_from_checkpoints: ${AUTO_RESUME_FROM_CHECKPOINTS}
|
||||
|
||||
local_rank: ${LOCAL_RANK}
|
||||
|
||||
special_tokens:
|
||||
bos_token: ${SPECIAL_TOKEN_BOS}
|
||||
eos_token: ${SPECIAL_TOKEN_EOS}
|
||||
unk_token: ${SPECIAL_TOKEN_UNK}
|
||||
pad_token: ${SPECIAL_TOKEN_PAD}
|
||||
|
||||
tokens: ${TOKENS}
|
||||
|
||||
fsdp: ${FSDP}
|
||||
fsdp_config: ${FSDP_CONFIG}
|
||||
deepspeed: ${DEEPSPEED}
|
||||
|
||||
ddp_timeout: ${DDP_TIMEOUT}
|
||||
ddp_bucket_cap_mb: ${DDP_BUCKET_CAP_MB}
|
||||
ddp_broadcast_buffers: ${DDP_BROADCAST_BUFFERS}
|
||||
|
||||
torchdistx_path: ${TORCHDISTX_PATH}
|
||||
pretraining_dataset: ${PRETRAINING_DATASET}
|
||||
debug: ${DEBUG}
|
||||
seed: ${SEED}
|
||||
strict: ${STRICT}
|
||||
@@ -1,66 +0,0 @@
|
||||
"""
|
||||
Runpod serverless entrypoint handler
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
import runpod
|
||||
import yaml
|
||||
from huggingface_hub._login import login
|
||||
from train import train
|
||||
from utils import get_output_dir
|
||||
|
||||
BASE_VOLUME = os.environ.get("BASE_VOLUME", "/runpod-volume")
|
||||
if not os.path.exists(BASE_VOLUME):
|
||||
os.makedirs(BASE_VOLUME)
|
||||
|
||||
logger = runpod.RunPodLogger()
|
||||
|
||||
|
||||
async def handler(job):
|
||||
runpod_job_id = job["id"]
|
||||
inputs = job["input"]
|
||||
run_id = inputs.get("run_id", "default_run_id")
|
||||
args = inputs.get("args", {})
|
||||
|
||||
# Set output directory
|
||||
output_dir = os.path.join(BASE_VOLUME, get_output_dir(run_id))
|
||||
args["output_dir"] = output_dir
|
||||
|
||||
# First save args to a temporary config file
|
||||
config_path = "/workspace/test_config.yaml"
|
||||
|
||||
# Add run_name and job_id to args before saving
|
||||
args["run_name"] = run_id
|
||||
args["runpod_job_id"] = runpod_job_id
|
||||
|
||||
yaml_data = yaml.dump(args, default_flow_style=False)
|
||||
with open(config_path, "w", encoding="utf-8") as file:
|
||||
file.write(yaml_data)
|
||||
|
||||
# Handle credentials
|
||||
credentials = inputs.get("credentials", {})
|
||||
|
||||
if "wandb_api_key" in credentials:
|
||||
os.environ["WANDB_API_KEY"] = credentials["wandb_api_key"]
|
||||
if "hf_token" in credentials:
|
||||
os.environ["HF_TOKEN"] = credentials["hf_token"]
|
||||
|
||||
if os.environ.get("HF_TOKEN"):
|
||||
login(token=os.environ["HF_TOKEN"])
|
||||
else:
|
||||
logger.info("No HF_TOKEN provided. Skipping login.")
|
||||
|
||||
logger.info("Starting Training.")
|
||||
async for result in train(config_path): # Pass the config path instead of args
|
||||
logger.info(result)
|
||||
logger.info("Training Complete.")
|
||||
|
||||
# Cleanup
|
||||
if "WANDB_API_KEY" in os.environ:
|
||||
del os.environ["WANDB_API_KEY"]
|
||||
if "HF_TOKEN" in os.environ:
|
||||
del os.environ["HF_TOKEN"]
|
||||
|
||||
|
||||
runpod.serverless.start({"handler": handler, "return_aggregate_stream": True})
|
||||
@@ -1,61 +0,0 @@
|
||||
{
|
||||
"input": {
|
||||
"user_id": "user",
|
||||
"model_id": "llama-test",
|
||||
"run_id": "llama-test",
|
||||
"credentials": {
|
||||
"wandb_api_key": "",
|
||||
"hf_token": ""
|
||||
},
|
||||
"args": {
|
||||
"base_model": "NousResearch/Meta-Llama-3-8B",
|
||||
"model_type": "LlamaForCausalLM",
|
||||
"tokenizer_type": "AutoTokenizer",
|
||||
"load_in_8bit": true,
|
||||
"load_in_4bit": false,
|
||||
"strict": false,
|
||||
"datasets": [
|
||||
{
|
||||
"path": "mhenrichsen/alpaca_2k_test",
|
||||
"type": "alpaca"
|
||||
}
|
||||
],
|
||||
"val_set_size": 0.05,
|
||||
"output_dir": "./outputs/lora-out",
|
||||
"sequence_len": 4096,
|
||||
"sample_packing": true,
|
||||
"eval_sample_packing": false,
|
||||
"pad_to_sequence_len": true,
|
||||
"adapter": "lora",
|
||||
"lora_r": 32,
|
||||
"lora_alpha": 16,
|
||||
"lora_dropout": 0.05,
|
||||
"lora_target_linear": true,
|
||||
"lora_modules_to_save": [
|
||||
"embed_tokens",
|
||||
"lm_head"
|
||||
],
|
||||
"gradient_accumulation_steps": 4,
|
||||
"micro_batch_size": 2,
|
||||
"num_epochs": 1,
|
||||
"optimizer": "adamw_bnb_8bit",
|
||||
"lr_scheduler": "cosine",
|
||||
"learning_rate": 0.0002,
|
||||
"train_on_inputs": false,
|
||||
"group_by_length": false,
|
||||
"bf16": "auto",
|
||||
"tf32": false,
|
||||
"gradient_checkpointing": true,
|
||||
"logging_steps": 1,
|
||||
"flash_attention": true,
|
||||
"warmup_steps": 1,
|
||||
"evals_per_epoch": 1,
|
||||
"eval_max_new_tokens": 128,
|
||||
"saves_per_epoch": 1,
|
||||
"weight_decay": 0.0,
|
||||
"special_tokens": {
|
||||
"pad_token": "<|end_of_text|>"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,45 +0,0 @@
|
||||
"""
|
||||
Runpod train entrypoint
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
|
||||
|
||||
async def train(config_path: str, gpu_id: str = "0", preprocess: bool = True):
|
||||
"""
|
||||
Run preprocessing (if enabled) and training with the given config file
|
||||
:param config_path: Path to the YAML config file
|
||||
:param gpu_id: GPU ID to use (default: "0")
|
||||
:param preprocess: Whether to run preprocessing (default: True)
|
||||
|
||||
"""
|
||||
# First check if preprocessing is needed
|
||||
if preprocess:
|
||||
# Preprocess command
|
||||
preprocess_cmd = (
|
||||
f"CUDA_VISIBLE_DEVICES={gpu_id} axolotl preprocess {config_path}"
|
||||
)
|
||||
process = await asyncio.create_subprocess_shell(
|
||||
preprocess_cmd,
|
||||
stdout=asyncio.subprocess.PIPE,
|
||||
stderr=asyncio.subprocess.STDOUT,
|
||||
)
|
||||
|
||||
if process.stdout is not None:
|
||||
async for line in process.stdout:
|
||||
yield f"Preprocessing: {line.decode().strip()}"
|
||||
await process.wait()
|
||||
yield "Preprocessing completed."
|
||||
else:
|
||||
yield "Skipping preprocessing step."
|
||||
|
||||
# Training command
|
||||
train_cmd = f"axolotl train {config_path}"
|
||||
process = await asyncio.create_subprocess_shell(
|
||||
train_cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.STDOUT
|
||||
)
|
||||
|
||||
if process.stdout is not None:
|
||||
async for line in process.stdout:
|
||||
yield f"Training: {line.decode().strip()}"
|
||||
await process.wait()
|
||||
@@ -1,89 +0,0 @@
|
||||
"""
|
||||
Runpod launcher utils
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
import yaml
|
||||
|
||||
|
||||
def get_output_dir(run_id):
|
||||
path = f"fine-tuning/{run_id}"
|
||||
return path
|
||||
|
||||
|
||||
def make_valid_config(input_args):
|
||||
"""
|
||||
Creates and saves updated config file, returns the path to the new config
|
||||
:param input_args: dict of input args
|
||||
:return: str, path to the updated config file
|
||||
"""
|
||||
# Load default config
|
||||
with open("config/config.yaml", "r", encoding="utf-8") as fin:
|
||||
all_args = yaml.safe_load(fin)
|
||||
|
||||
if not input_args:
|
||||
print("No args provided, using defaults")
|
||||
else:
|
||||
all_args.update(input_args)
|
||||
|
||||
# Create updated config path
|
||||
updated_config_path = "config/updated_config.yaml"
|
||||
|
||||
# Save updated config to new file
|
||||
with open(updated_config_path, "w", encoding="utf-8") as f:
|
||||
yaml.dump(all_args, f)
|
||||
|
||||
return updated_config_path
|
||||
|
||||
|
||||
def set_config_env_vars(args: dict):
|
||||
"""
|
||||
Convert API arguments into environment variables.
|
||||
Handles nested dictionaries, lists, and special values.
|
||||
|
||||
Args:
|
||||
args (dict): The arguments dictionary from the API request
|
||||
"""
|
||||
|
||||
def process_value(value):
|
||||
"""Convert Python values to string format for environment variables"""
|
||||
if value is None:
|
||||
return ""
|
||||
if isinstance(value, bool):
|
||||
return str(value).lower()
|
||||
if isinstance(value, (list, dict)):
|
||||
return str(value)
|
||||
return str(value)
|
||||
|
||||
def set_env_vars(data, prefix=""):
|
||||
"""Recursively set environment variables from nested dictionary"""
|
||||
for key, value in data.items():
|
||||
env_key = prefix + key.upper()
|
||||
|
||||
# Handle special cases
|
||||
if isinstance(value, dict):
|
||||
# For nested dictionaries (like special_tokens)
|
||||
set_env_vars(value, f"{env_key}_")
|
||||
elif isinstance(value, list):
|
||||
# Handle list of dictionaries (like datasets)
|
||||
if value and isinstance(value[0], dict):
|
||||
for i, item in enumerate(value):
|
||||
set_env_vars(item, f"{env_key}_{i}_")
|
||||
else:
|
||||
# For simple lists (like lora_target_modules)
|
||||
os.environ[env_key] = process_value(value)
|
||||
else:
|
||||
# Handle all other cases
|
||||
os.environ[env_key] = process_value(value)
|
||||
|
||||
# Clear any existing related environment variables
|
||||
# This prevents old values from persisting
|
||||
for key in list(os.environ.keys()):
|
||||
if key.startswith(
|
||||
("BASE_MODEL", "MODEL_TYPE", "TOKENIZER_TYPE", "DATASET", "LORA_", "WANDB_")
|
||||
):
|
||||
del os.environ[key]
|
||||
|
||||
# Set new environment variables
|
||||
set_env_vars(args)
|
||||
@@ -1,86 +0,0 @@
|
||||
{
|
||||
"input": {
|
||||
"name": "quick_smoke_test_sft",
|
||||
"user_id": "user",
|
||||
"model_id": "llama-test",
|
||||
"run_id": "llama-test",
|
||||
"credentials": {
|
||||
"wandb_api_key": "",
|
||||
"hf_token": ""
|
||||
},
|
||||
"args": {
|
||||
"base_model": "HuggingFaceTB/SmolLM2-135M",
|
||||
"model_type": "AutoModelForCausalLM",
|
||||
"tokenizer_type": "AutoTokenizer",
|
||||
"load_in_4bit": true,
|
||||
"strict": false,
|
||||
"datasets": [
|
||||
{
|
||||
"path": "mhenrichsen/alpaca_2k_test",
|
||||
"type": "alpaca",
|
||||
"split": "train[:10%]"
|
||||
}
|
||||
],
|
||||
"val_set_size": 0.02,
|
||||
"output_dir": "./outputs/lora-out",
|
||||
"sequence_len": 4096,
|
||||
"sample_packing": true,
|
||||
"eval_sample_packing": false,
|
||||
"pad_to_sequence_len": true,
|
||||
"adapter": "qlora",
|
||||
"lora_r": 32,
|
||||
"lora_alpha": 64,
|
||||
"lora_dropout": 0.05,
|
||||
"lora_target_linear": true,
|
||||
"lora_modules_to_save": [
|
||||
"embed_tokens",
|
||||
"lm_head"
|
||||
],
|
||||
"gradient_accumulation_steps": 2,
|
||||
"micro_batch_size": 1,
|
||||
"num_epochs": 1,
|
||||
"optimizer": "adamw_torch_fused",
|
||||
"lr_scheduler": "cosine",
|
||||
"learning_rate": 0.0002,
|
||||
"train_on_inputs": false,
|
||||
"group_by_length": false,
|
||||
"bf16": "auto",
|
||||
"tf32": true,
|
||||
"gradient_checkpointing": true,
|
||||
"logging_steps": 1,
|
||||
"flash_attention": true,
|
||||
"warmup_steps": 1,
|
||||
"evals_per_epoch": 1,
|
||||
"eval_max_new_tokens": 128,
|
||||
"saves_per_epoch": 1,
|
||||
"weight_decay": 0.0,
|
||||
"special_tokens": {
|
||||
"pad_token": "<|endoftext|>"
|
||||
},
|
||||
"max_steps": 20
|
||||
},
|
||||
"timeout": 100000
|
||||
},
|
||||
"config": {
|
||||
"gpuTypeId": "NVIDIA GeForce RTX 4090",
|
||||
"gpuCount": 1,
|
||||
"containerDiskInGb": 200,
|
||||
"env": [
|
||||
{
|
||||
"key": "TOKENIZER",
|
||||
"value": ""
|
||||
},
|
||||
{
|
||||
"key": "DISABLE_LOG_STATS",
|
||||
"value": "true"
|
||||
}
|
||||
],
|
||||
"allowedCudaVersions": [
|
||||
"12.8",
|
||||
"12.7",
|
||||
"12.6",
|
||||
"12.5",
|
||||
"12.4"
|
||||
]
|
||||
}
|
||||
}
|
||||
@@ -1,90 +0,0 @@
|
||||
{
|
||||
"tests": [
|
||||
{
|
||||
"name": "quick_smoke_test_sft",
|
||||
"input": {
|
||||
"user_id": "user",
|
||||
"model_id": "llama-test",
|
||||
"run_id": "llama-test",
|
||||
"credentials": {
|
||||
"wandb_api_key": "",
|
||||
"hf_token": ""
|
||||
},
|
||||
"args": {
|
||||
"base_model": "HuggingFaceTB/SmolLM2-135M",
|
||||
"model_type": "AutoModelForCausalLM",
|
||||
"tokenizer_type": "AutoTokenizer",
|
||||
"load_in_4bit": true,
|
||||
"strict": false,
|
||||
"datasets": [
|
||||
{
|
||||
"path": "mhenrichsen/alpaca_2k_test",
|
||||
"type": "alpaca",
|
||||
"split": "train[:10%]"
|
||||
}
|
||||
],
|
||||
"val_set_size": 0.02,
|
||||
"output_dir": "./outputs/lora-out",
|
||||
"sequence_len": 4096,
|
||||
"sample_packing": true,
|
||||
"eval_sample_packing": false,
|
||||
"pad_to_sequence_len": true,
|
||||
"adapter": "qlora",
|
||||
"lora_r": 32,
|
||||
"lora_alpha": 64,
|
||||
"lora_dropout": 0.05,
|
||||
"lora_target_linear": true,
|
||||
"lora_modules_to_save": [
|
||||
"embed_tokens",
|
||||
"lm_head"
|
||||
],
|
||||
"gradient_accumulation_steps": 2,
|
||||
"micro_batch_size": 1,
|
||||
"num_epochs": 1,
|
||||
"optimizer": "adamw_torch_fused",
|
||||
"lr_scheduler": "cosine",
|
||||
"learning_rate": 0.0002,
|
||||
"train_on_inputs": false,
|
||||
"group_by_length": false,
|
||||
"bf16": "auto",
|
||||
"tf32": true,
|
||||
"gradient_checkpointing": true,
|
||||
"logging_steps": 1,
|
||||
"flash_attention": true,
|
||||
"warmup_steps": 1,
|
||||
"evals_per_epoch": 1,
|
||||
"eval_max_new_tokens": 128,
|
||||
"saves_per_epoch": 1,
|
||||
"weight_decay": 0.0,
|
||||
"special_tokens": {
|
||||
"pad_token": "<|endoftext|>"
|
||||
},
|
||||
"max_steps": 20
|
||||
}
|
||||
},
|
||||
"timeout": 100000
|
||||
}
|
||||
],
|
||||
"config": {
|
||||
"gpuTypeId": "NVIDIA GeForce RTX 4090",
|
||||
"gpuCount": 1,
|
||||
"containerDiskInGb": 200,
|
||||
"env": [
|
||||
{
|
||||
"key": "TOKENIZER",
|
||||
"value": ""
|
||||
},
|
||||
{
|
||||
"key": "DISABLE_LOG_STATS",
|
||||
"value": "true"
|
||||
}
|
||||
],
|
||||
"allowedCudaVersions": [
|
||||
"12.8",
|
||||
"12.7",
|
||||
"12.6",
|
||||
"12.5",
|
||||
"12.4"
|
||||
]
|
||||
}
|
||||
}
|
||||
1
.vscode/README.md
vendored
1
.vscode/README.md
vendored
@@ -1 +0,0 @@
|
||||
See [docs/debugging.md](../docs/debugging.md) for guidance on how to modify these files to debug axolotl with VSCode.
|
||||
34
.vscode/launch.json
vendored
34
.vscode/launch.json
vendored
@@ -1,34 +0,0 @@
|
||||
{
|
||||
// Use IntelliSense to learn about possible attributes.
|
||||
// Hover to view descriptions of existing attributes.
|
||||
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
||||
"version": "0.2.0",
|
||||
"configurations": [
|
||||
{
|
||||
"name": "Debug axolotl prompt - sharegpt",
|
||||
"type": "python",
|
||||
"module": "accelerate.commands.launch",
|
||||
"request": "launch",
|
||||
"args": [
|
||||
"-m", "axolotl.cli.train", "dev_sharegpt.yml",
|
||||
// The flags below simplify debugging by overriding the axolotl config
|
||||
// with the debugging tips above. Modify as needed.
|
||||
"--dataset_processes=1", // limits data preprocessing to one process
|
||||
"--max_steps=1", // limits training to just one step
|
||||
"--batch_size=1", // minimizes batch size
|
||||
"--micro_batch_size=1", // minimizes batch size
|
||||
"--val_set_size=0", // disables validation
|
||||
"--sample_packing=False", // disables sample packing which is necessary for small datasets
|
||||
"--eval_sample_packing=False",// disables sample packing on eval set
|
||||
"--dataset_prepared_path=temp_debug/axolotl_outputs/data", // send data outputs to a temp folder
|
||||
"--output_dir=temp_debug/axolotl_outputs/model" // send model outputs to a temp folder
|
||||
],
|
||||
"console": "integratedTerminal", // show output in the integrated terminal
|
||||
"cwd": "${workspaceFolder}/devtools", // set working directory to devtools from the root of the project
|
||||
"justMyCode": true, // step through only axolotl code
|
||||
"env": {"CUDA_VISIBLE_DEVICES": "0", // Since we aren't doing distributed training, we need to limit to one GPU
|
||||
"HF_HOME": "${workspaceFolder}/devtools/temp_debug/.hf-cache"}, // send HF cache to a temp folder
|
||||
"preLaunchTask": "cleanup-for-dataprep", // delete temp folders (see below)
|
||||
}
|
||||
]
|
||||
}
|
||||
27
.vscode/tasks.json
vendored
27
.vscode/tasks.json
vendored
@@ -1,27 +0,0 @@
|
||||
//this file is used by launch.json
|
||||
{
|
||||
"version": "2.0.0",
|
||||
"tasks": [
|
||||
// this task changes into the devtools directory and deletes the temp_debug/axolotl_outputs folder
|
||||
{
|
||||
"label": "delete-outputs",
|
||||
"type": "shell",
|
||||
"command": "rm -rf temp_debug/axolotl_outputs",
|
||||
"options":{ "cwd": "${workspaceFolder}/devtools"},
|
||||
"problemMatcher": []
|
||||
},
|
||||
// this task changes into the devtools directory and deletes the `temp_debug/.hf-cache/datasets` folder
|
||||
{
|
||||
"label": "delete-temp-hf-dataset-cache",
|
||||
"type": "shell",
|
||||
"command": "rm -rf temp_debug/.hf-cache/datasets",
|
||||
"options":{ "cwd": "${workspaceFolder}/devtools"},
|
||||
"problemMatcher": []
|
||||
},
|
||||
// this task combines the two tasks above
|
||||
{
|
||||
"label": "cleanup-for-dataprep",
|
||||
"dependsOn": ["delete-outputs", "delete-temp-hf-dataset-cache"],
|
||||
}
|
||||
]
|
||||
}
|
||||
99
AGENTS.md
99
AGENTS.md
@@ -1,99 +0,0 @@
|
||||
# Axolotl
|
||||
|
||||
Fine-tuning framework for LLMs. Config-driven: every training run is defined by a single YAML file.
|
||||
|
||||
## Tech Stack
|
||||
|
||||
Python, PyTorch, HuggingFace Transformers, TRL, PEFT (LoRA/QLoRA), DeepSpeed, FSDP, vLLM (for GRPO generation).
|
||||
|
||||
## Commands
|
||||
|
||||
```bash
|
||||
axolotl train config.yaml # Train (single or multi-GPU, auto-detected)
|
||||
axolotl preprocess config.yaml # Tokenize dataset and validate config
|
||||
axolotl preprocess config.yaml --debug # Inspect tokenized samples and label masking
|
||||
axolotl inference config.yaml # Interactive inference
|
||||
axolotl merge-lora config.yaml # Merge LoRA adapter into base model
|
||||
axolotl vllm-serve config.yaml # Start vLLM server for GRPO/EBFT training
|
||||
axolotl fetch examples # Download example configs
|
||||
axolotl agent-docs # Show agent-optimized docs (bundled with pip package)
|
||||
axolotl agent-docs grpo # Topic-specific agent reference
|
||||
axolotl config-schema # Dump config JSON schema
|
||||
```
|
||||
|
||||
## Training Methods
|
||||
|
||||
| Method | Config Key | When to Use |
|
||||
|--------|-----------|-------------|
|
||||
| SFT | *(default)* | Input-output pairs, instruction tuning |
|
||||
| DPO/IPO | `rl: dpo` / `rl: dpo, dpo_loss_type: ["ipo"]` | Paired preference data (chosen vs rejected) |
|
||||
| KTO | `rl: kto` | Unpaired binary preference labels |
|
||||
| ORPO | `rl: orpo` | Single-stage alignment, no ref model |
|
||||
| GRPO | `rl: grpo` | RL with verifiable reward functions (math, code) |
|
||||
| EBFT | `rl: ebft` | Feature-matching rewards from internal representations |
|
||||
|
||||
Agent-specific references:
|
||||
- [docs/agents/sft.md](docs/agents/sft.md) — supervised fine-tuning
|
||||
- [docs/agents/preference_tuning.md](docs/agents/preference_tuning.md) — DPO, IPO, KTO, ORPO, SimPO
|
||||
- [docs/agents/grpo.md](docs/agents/grpo.md) — GRPO online RL with reward functions
|
||||
- [docs/agents/reward_modelling.md](docs/agents/reward_modelling.md) — outcome and process reward models
|
||||
- [docs/agents/pretraining.md](docs/agents/pretraining.md) — continual pretraining
|
||||
- [docs/agents/model_architectures.md](docs/agents/model_architectures.md) — model-specific quirks (Gemma4, Qwen3.5 MoE, etc.)
|
||||
- [docs/agents/new_model_support.md](docs/agents/new_model_support.md) — debugging and adding support for new model architectures
|
||||
|
||||
## Config Pattern
|
||||
|
||||
All training is config-driven. A YAML file specifies model, adapter, dataset(s), and hyperparameters:
|
||||
|
||||
```yaml
|
||||
base_model: meta-llama/Llama-3.1-8B-Instruct
|
||||
adapter: lora # or qlora, or omit for full fine-tune
|
||||
datasets:
|
||||
- path: my_dataset
|
||||
type: chat_template # prompt strategy (see docs/dataset-formats/)
|
||||
output_dir: ./outputs/lora-out
|
||||
```
|
||||
|
||||
Config schema: `src/axolotl/utils/schemas/config.py` (AxolotlInputConfig).
|
||||
|
||||
## Project Structure
|
||||
|
||||
```
|
||||
src/axolotl/
|
||||
cli/ # CLI entry points (train, preprocess, inference, merge_lora, vllm_serve)
|
||||
core/
|
||||
builders/ # TrainerBuilder classes (causal.py for SFT, rl.py for RLHF)
|
||||
trainers/ # Trainer classes, mixins (optimizer, scheduler, packing)
|
||||
dpo/ # DPO trainer and config
|
||||
grpo/ # GRPO trainer and sampler
|
||||
loaders/ # Model, tokenizer, adapter, processor loading
|
||||
prompt_strategies/ # Dataset format handlers (chat_template, alpaca, dpo/, kto/, orpo/)
|
||||
utils/schemas/ # Pydantic config schemas (config, model, training, peft, trl, fsdp)
|
||||
integrations/ # Plugins (liger, cut_cross_entropy, swanlab, nemo_gym)
|
||||
monkeypatch/ # Runtime patches for HF transformers
|
||||
|
||||
examples/ # Example YAML configs by model (llama-3/, qwen2/, mistral/, ebft/)
|
||||
deepspeed_configs/ # DeepSpeed JSON configs (zero2, zero3)
|
||||
docs/ # Quarto documentation site
|
||||
```
|
||||
|
||||
## Code Conventions
|
||||
|
||||
- Config-driven: features are toggled via YAML, not code changes
|
||||
- Prompt strategies: `src/axolotl/prompt_strategies/` — each `type:` value maps to a function
|
||||
- Plugin system: `plugins:` list in config loads integration modules
|
||||
- Trainer mixins: `core/trainers/mixins/` for composable trainer behaviors
|
||||
- Schemas: all config validation via Pydantic in `utils/schemas/`
|
||||
|
||||
## Key Documentation
|
||||
|
||||
- [Getting Started](docs/getting-started.qmd) — quickstart tutorial
|
||||
- [Choosing a Method](docs/choosing_method.qmd) — SFT vs DPO vs GRPO decision guide
|
||||
- [Config Reference](docs/config-reference.qmd) — all config options
|
||||
- [Dataset Formats](docs/dataset-formats/) — chat_template, alpaca, input_output, completion
|
||||
- [RLHF](docs/rlhf.qmd) — DPO, KTO, ORPO, GRPO, EBFT configs and dataset formats
|
||||
- [GRPO Deep Dive](docs/grpo.qmd) — async training, custom rewards, scaling
|
||||
- [vLLM Serving](docs/vllm_serving.qmd) — vLLM setup for GRPO/EBFT
|
||||
- [Multi-GPU](docs/multi-gpu.qmd) — FSDP and DeepSpeed
|
||||
- [Training Stability](docs/training_stability.qmd) — debugging loss, NaN, OOM
|
||||
- [Debugging](docs/debugging.qmd) — VSCode setup, Docker debugging
|
||||
10
CITATION.cff
10
CITATION.cff
@@ -1,10 +0,0 @@
|
||||
cff-version: 1.2.0
|
||||
type: software
|
||||
title: "Axolotl: Open Source LLM Post-Training"
|
||||
message: "If you use this software, please cite it as below."
|
||||
authors:
|
||||
- name: "Axolotl maintainers and contributors"
|
||||
repository-code: "https://github.com/axolotl-ai-cloud/axolotl"
|
||||
url: "https://axolotl.ai/"
|
||||
license: Apache-2.0
|
||||
date-released: "2023-05-30"
|
||||
@@ -1,7 +0,0 @@
|
||||
include README.md
|
||||
include LICENSE
|
||||
include VERSION
|
||||
include src/axolotl/utils/chat_templates/templates/*.jinja
|
||||
include AGENTS.md
|
||||
recursive-include docs/agents *.md
|
||||
recursive-include axolotl *.py
|
||||
854
README.md
854
README.md
@@ -1,232 +1,726 @@
|
||||
<p align="center">
|
||||
<picture>
|
||||
<source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/887513285d98132142bf5db2a74eb5e0928787f1/image/axolotl_logo_digital_white.svg">
|
||||
<source media="(prefers-color-scheme: light)" srcset="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/887513285d98132142bf5db2a74eb5e0928787f1/image/axolotl_logo_digital_black.svg">
|
||||
<img alt="Axolotl" src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/887513285d98132142bf5db2a74eb5e0928787f1/image/axolotl_logo_digital_black.svg" width="400" height="104" style="max-width: 100%;">
|
||||
</picture>
|
||||
</p>
|
||||
<p align="center">
|
||||
<strong>A Free and Open Source LLM Fine-tuning Framework</strong><br>
|
||||
</p>
|
||||
# Axolotl
|
||||
|
||||
<p align="center">
|
||||
<img src="https://img.shields.io/github/license/axolotl-ai-cloud/axolotl.svg?color=blue" alt="GitHub License">
|
||||
<img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/tests.yml/badge.svg" alt="tests">
|
||||
<a href="https://codecov.io/gh/axolotl-ai-cloud/axolotl"><img src="https://codecov.io/gh/axolotl-ai-cloud/axolotl/branch/main/graph/badge.svg" alt="codecov"></a>
|
||||
<a href="https://github.com/axolotl-ai-cloud/axolotl/releases"><img src="https://img.shields.io/github/release/axolotl-ai-cloud/axolotl.svg" alt="Releases"></a>
|
||||
<br/>
|
||||
<a href="https://github.com/axolotl-ai-cloud/axolotl/graphs/contributors"><img src="https://img.shields.io/github/contributors-anon/axolotl-ai-cloud/axolotl?color=yellow&style=flat-square" alt="contributors" style="height: 20px;"></a>
|
||||
<img src="https://img.shields.io/github/stars/axolotl-ai-cloud/axolotl" alt="GitHub Repo stars">
|
||||
<br/>
|
||||
<a href="https://discord.com/invite/HhrNrHJPRb"><img src="https://img.shields.io/badge/discord-7289da.svg?style=flat-square&logo=discord" alt="discord" style="height: 20px;"></a>
|
||||
<a href="https://twitter.com/axolotl_ai"><img src="https://img.shields.io/twitter/follow/axolotl_ai?style=social" alt="twitter" style="height: 20px;"></a>
|
||||
<a href="https://colab.research.google.com/github/axolotl-ai-cloud/axolotl/blob/main/examples/colab-notebooks/colab-axolotl-example.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="google-colab" style="height: 20px;"></a>
|
||||
<br/>
|
||||
<img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/tests-nightly.yml/badge.svg" alt="tests-nightly">
|
||||
<img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/multi-gpu-e2e.yml/badge.svg" alt="multigpu-semi-weekly tests">
|
||||
</p>
|
||||
Axolotl is a tool designed to streamline the fine-tuning of various AI models, offering support for multiple configurations and architectures.
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<td>
|
||||
|
||||
## Table of Contents
|
||||
- [Introduction](#axolotl)
|
||||
- [Supported Features](#axolotl-supports)
|
||||
- [Quickstart](#quickstart-)
|
||||
- [Installation](#installation)
|
||||
- [Docker Installation](#environment)
|
||||
- [Conda/Pip venv Installation](#condapip-venv)
|
||||
- [LambdaLabs Installation](#lambdalabs)
|
||||
- [Dataset](#dataset)
|
||||
- [How to Add Custom Prompts](#how-to-add-custom-prompts)
|
||||
- [Config](#config)
|
||||
- [Train](#train)
|
||||
- [Inference](#inference)
|
||||
- [Merge LORA to Base](#merge-lora-to-base)
|
||||
- [Common Errors](#common-errors-)
|
||||
- [Need Help?](#need-help-)
|
||||
- [Badge](#badge-)
|
||||
- [Community Showcase](#community-showcase)
|
||||
- [Contributing](#contributing-)
|
||||
|
||||
</td>
|
||||
<td>
|
||||
|
||||
<div align="center">
|
||||
<img src="image/axolotl.png" alt="axolotl" width="160">
|
||||
<div>
|
||||
<p>
|
||||
<b>Axolotl provides a unified repository for fine-tuning <br />a variety of AI models with ease</b>
|
||||
</p>
|
||||
<p>
|
||||
Go ahead and axolotl questions!!
|
||||
</p>
|
||||
<img src="https://github.com/OpenAccess-AI-Collective/axolotl/actions/workflows/pre-commit.yml/badge.svg?branch=main" alt="pre-commit">
|
||||
<img alt="PyTest Status" src="https://github.com/OpenAccess-AI-Collective/axolotl/actions/workflows/tests.yml/badge.svg?branch=main">
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
## Axolotl supports
|
||||
|
||||
| | fp16/fp32 | lora | qlora | gptq | gptq w/flash attn | flash attn | xformers attn |
|
||||
|----------|:----------|:-----|-------|------|-------------------|------------|---------------|
|
||||
| llama | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| Pythia | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❓ |
|
||||
| cerebras | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❓ |
|
||||
| mpt | ✅ | ❌ | ❓ | ❌ | ❌ | ❌ | ❓ |
|
||||
| falcon | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❓ |
|
||||
| gpt-j | ✅ | ✅ | ✅ | ❌ | ❌ | ❓ | ❓ |
|
||||
| XGen | ✅ | ❓ | ✅ | ❓ | ❓ | ❓ | ✅ |
|
||||
|
||||
|
||||
## 🎉 Latest Updates
|
||||
## Quickstart ⚡
|
||||
|
||||
- 2026/04:
|
||||
- New model support has been added in Axolotl for [Mistral Medium 3.5](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/mistral-medium-3_5) and [Gemma 4](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/gemma4).
|
||||
- Axolotl is now [uv-first](https://github.com/axolotl-ai-cloud/axolotl/pull/3545) and has [SonicMoE fused LoRA](https://github.com/axolotl-ai-cloud/axolotl/pull/3519) support.
|
||||
- 2026/03:
|
||||
- New model support has been added in Axolotl for [Mistral Small 4](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/mistral4), [Qwen3.5, Qwen3.5 MoE](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/qwen3.5), [GLM-4.7-Flash](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/glm47-flash), [GLM-4.6V](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/glm46v), and [GLM-4.5-Air](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/glm45).
|
||||
- [MoE expert quantization](https://docs.axolotl.ai/docs/expert_quantization.html) support (via `quantize_moe_experts: true`) greatly reduces VRAM when training MoE models (FSDP2 compat).
|
||||
- 2026/02:
|
||||
- [ScatterMoE LoRA](https://github.com/axolotl-ai-cloud/axolotl/pull/3410) support. LoRA fine-tuning directly on MoE expert weights using custom Triton kernels.
|
||||
- Axolotl now has support for [SageAttention](https://github.com/axolotl-ai-cloud/axolotl/pull/2823) and [GDPO](https://github.com/axolotl-ai-cloud/axolotl/pull/3353) (Generalized DPO).
|
||||
- 2026/01:
|
||||
- New integration for [EAFT](https://github.com/axolotl-ai-cloud/axolotl/pull/3366) (Entropy-Aware Focal Training), weights loss by entropy of the top-k logit distribution, and [Scalable Softmax](https://github.com/axolotl-ai-cloud/axolotl/pull/3338), improves long context in attention.
|
||||
- 2025/12:
|
||||
- Axolotl now includes support for [Kimi-Linear](https://docs.axolotl.ai/docs/models/kimi-linear.html), [Plano-Orchestrator](https://docs.axolotl.ai/docs/models/plano.html), [MiMo](https://docs.axolotl.ai/docs/models/mimo.html), [InternVL 3.5](https://docs.axolotl.ai/docs/models/internvl3_5.html), [Olmo3](https://docs.axolotl.ai/docs/models/olmo3.html), [Trinity](https://docs.axolotl.ai/docs/models/trinity.html), and [Ministral3](https://docs.axolotl.ai/docs/models/ministral3.html).
|
||||
- [Distributed Muon Optimizer](https://github.com/axolotl-ai-cloud/axolotl/pull/3264) support has been added for FSDP2 pretraining.
|
||||
- 2025/10: New model support has been added in Axolotl for: [Qwen3 Next](https://docs.axolotl.ai/docs/models/qwen3-next.html), [Qwen2.5-vl, Qwen3-vl](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/qwen2_5-vl), [Qwen3, Qwen3MoE](https://docs.axolotl.ai/docs/models/qwen3.html), [Granite 4](https://docs.axolotl.ai/docs/models/granite4.html), [HunYuan](https://docs.axolotl.ai/docs/models/hunyuan.html), [Magistral 2509](https://docs.axolotl.ai/docs/models/magistral/vision.html), [Apertus](https://docs.axolotl.ai/docs/models/apertus.html), and [Seed-OSS](https://docs.axolotl.ai/docs/models/seed-oss.html).
|
||||
Get started with Axolotl in just a few steps! This quickstart guide will walk you through setting up and running a basic fine-tuning task.
|
||||
|
||||
**Requirements**: Python >=3.9 and Pytorch >=2.0.
|
||||
|
||||
```bash
|
||||
git clone https://github.com/OpenAccess-AI-Collective/axolotl
|
||||
|
||||
pip3 install -e .
|
||||
pip3 install -U git+https://github.com/huggingface/peft.git
|
||||
|
||||
# finetune lora
|
||||
accelerate launch scripts/finetune.py examples/openllama-3b/lora.yml
|
||||
|
||||
# inference
|
||||
accelerate launch scripts/finetune.py examples/openllama-3b/lora.yml \
|
||||
--inference --lora_model_dir="./lora-out"
|
||||
```
|
||||
|
||||
## Installation
|
||||
|
||||
### Environment
|
||||
|
||||
- Docker
|
||||
```bash
|
||||
docker run --gpus '"all"' --rm -it winglian/axolotl:main-py3.10-cu118-2.0.1
|
||||
```
|
||||
- `winglian/axolotl-runpod:main-py3.10-cu118-2.0.1`: for runpod
|
||||
- `winglian/axolotl-runpod:main-py3.9-cu118-2.0.1-gptq`: for gptq
|
||||
|
||||
Or run on the current files for development:
|
||||
|
||||
```sh
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
- Conda/Pip venv
|
||||
1. Install python **3.9**
|
||||
|
||||
2. Install pytorch stable https://pytorch.org/get-started/locally/
|
||||
|
||||
3. Install python dependencies with ONE of the following:
|
||||
- Recommended, supports QLoRA, NO gptq/int4 support
|
||||
```bash
|
||||
pip3 install -e .
|
||||
pip3 install -U git+https://github.com/huggingface/peft.git
|
||||
```
|
||||
- gptq/int4 support, NO QLoRA
|
||||
```bash
|
||||
pip3 install -e .[gptq]
|
||||
```
|
||||
- same as above but not recommended
|
||||
```bash
|
||||
pip3 install -e .[gptq_triton]
|
||||
```
|
||||
|
||||
- LambdaLabs
|
||||
<details>
|
||||
|
||||
<summary>Click to Expand</summary>
|
||||
|
||||
1. Install python
|
||||
```bash
|
||||
sudo apt update
|
||||
sudo apt install -y python3.9
|
||||
|
||||
sudo update-alternatives --install /usr/bin/python python /usr/bin/python3.9 1
|
||||
sudo update-alternatives --config python # pick 3.9 if given option
|
||||
python -V # should be 3.9
|
||||
|
||||
```
|
||||
|
||||
2. Install pip
|
||||
```bash
|
||||
wget https://bootstrap.pypa.io/get-pip.py
|
||||
python get-pip.py
|
||||
```
|
||||
|
||||
3. Install torch
|
||||
```bash
|
||||
pip3 install -U torch --index-url https://download.pytorch.org/whl/cu118
|
||||
```
|
||||
|
||||
4. Axolotl
|
||||
```bash
|
||||
git clone https://github.com/OpenAccess-AI-Collective/axolotl
|
||||
cd axolotl
|
||||
|
||||
pip3 install -e . # change depend on needs
|
||||
pip3 install protobuf==3.20.3
|
||||
pip3 install -U requests
|
||||
pip3 install -U --ignore-installed psutil
|
||||
pip3 install -U scipy
|
||||
pip3 install git+https://github.com/huggingface/peft.git # not for gptq
|
||||
```
|
||||
|
||||
5. Set path
|
||||
```bash
|
||||
export LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH
|
||||
```
|
||||
</details>
|
||||
|
||||
### Dataset
|
||||
|
||||
Axolotl supports a variety of dataset formats. Below are some of the formats you can use.
|
||||
Have dataset(s) in one of the following format (JSONL recommended):
|
||||
|
||||
- `alpaca`: instruction; input(optional)
|
||||
```json
|
||||
{"instruction": "...", "input": "...", "output": "..."}
|
||||
```
|
||||
- `sharegpt:chat`: conversations where `from` is `human`/`gpt`
|
||||
```json
|
||||
{"conversations": [{"from": "...", "value": "..."}]}
|
||||
```
|
||||
- `completion`: raw corpus
|
||||
```json
|
||||
{"text": "..."}
|
||||
```
|
||||
|
||||
<details>
|
||||
|
||||
<summary>Expand older updates</summary>
|
||||
<summary>See other formats</summary>
|
||||
|
||||
- 2025/09: Axolotl now has text diffusion training. Read more [here](https://github.com/axolotl-ai-cloud/axolotl/tree/main/src/axolotl/integrations/diffusion).
|
||||
- 2025/08: QAT has been updated to include NVFP4 support. See [PR](https://github.com/axolotl-ai-cloud/axolotl/pull/3107).
|
||||
- 2025/07:
|
||||
- ND Parallelism support has been added into Axolotl. Compose Context Parallelism (CP), Tensor Parallelism (TP), and Fully Sharded Data Parallelism (FSDP) within a single node and across multiple nodes. Check out the [blog post](https://huggingface.co/blog/accelerate-nd-parallel) for more info.
|
||||
- Axolotl adds more models: [GPT-OSS](https://docs.axolotl.ai/docs/models/gpt-oss.html), [Gemma 3n](https://docs.axolotl.ai/docs/models/gemma3n.html), [Liquid Foundation Model 2 (LFM2)](https://docs.axolotl.ai/docs/models/LiquidAI.html), and [Arcee Foundation Models (AFM)](https://docs.axolotl.ai/docs/models/arcee.html).
|
||||
- FP8 finetuning with fp8 gather op is now possible in Axolotl via `torchao`. Get started [here](https://docs.axolotl.ai/docs/mixed_precision.html#sec-fp8)!
|
||||
- [Voxtral](https://docs.axolotl.ai/docs/models/voxtral.html), [Magistral 1.1](https://docs.axolotl.ai/docs/models/magistral.html), and [Devstral](https://docs.axolotl.ai/docs/models/devstral.html) with mistral-common tokenizer support has been integrated in Axolotl!
|
||||
- TiledMLP support for single-GPU to multi-GPU training with DDP, DeepSpeed and FSDP support has been added to support Arctic Long Sequence Training. (ALST). See [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/alst) for using ALST with Axolotl!
|
||||
- 2025/06: Magistral with mistral-common tokenizer support has been added to Axolotl. See [docs](https://docs.axolotl.ai/docs/models/magistral.html) to start training your own Magistral models with Axolotl!
|
||||
- 2025/05: Quantization Aware Training (QAT) support has been added to Axolotl. Explore the [docs](https://docs.axolotl.ai/docs/qat.html) to learn more!
|
||||
- 2025/04: Llama 4 support has been added in Axolotl. See [docs](https://docs.axolotl.ai/docs/models/llama-4.html) to start training your own Llama 4 models with Axolotl's linearized version!
|
||||
- 2025/03: Axolotl has implemented Sequence Parallelism (SP) support. Read the [blog](https://huggingface.co/blog/axolotl-ai-co/long-context-with-sequence-parallelism-in-axolotl) and [docs](https://docs.axolotl.ai/docs/sequence_parallelism.html) to learn how to scale your context length when fine-tuning.
|
||||
- 2025/03: (Beta) Fine-tuning Multimodal models is now supported in Axolotl. Check out the [docs](https://docs.axolotl.ai/docs/multimodal.html) to fine-tune your own!
|
||||
- 2025/02: Axolotl has added LoRA optimizations to reduce memory usage and improve training speed for LoRA and QLoRA in single GPU and multi-GPU training (DDP and DeepSpeed). Jump into the [docs](https://docs.axolotl.ai/docs/lora_optims.html) to give it a try.
|
||||
- 2025/02: Axolotl has added GRPO support. Dive into our [blog](https://huggingface.co/blog/axolotl-ai-co/training-llms-w-interpreter-feedback-wasm) and [GRPO example](https://github.com/axolotl-ai-cloud/grpo_code) and have some fun!
|
||||
- 2025/01: Axolotl has added Reward Modelling / Process Reward Modelling fine-tuning support. See [docs](https://docs.axolotl.ai/docs/reward_modelling.html).
|
||||
- `jeopardy`: question and answer
|
||||
```json
|
||||
{"question": "...", "category": "...", "answer": "..."}
|
||||
```
|
||||
- `oasst`: instruction
|
||||
```json
|
||||
{"INSTRUCTION": "...", "RESPONSE": "..."}
|
||||
```
|
||||
- `gpteacher`: instruction; input(optional)
|
||||
```json
|
||||
{"instruction": "...", "input": "...", "response": "..."}
|
||||
```
|
||||
- `reflection`: instruction with reflect; input(optional)
|
||||
```json
|
||||
{"instruction": "...", "input": "...", "output": "...", "reflection": "...", "corrected": "..."}
|
||||
```
|
||||
- `explainchoice`: question, choices, (solution OR explanation)
|
||||
```json
|
||||
{"question": "...", "choices": ["..."], "solution": "...", "explanation": "..."}
|
||||
```
|
||||
- `concisechoice`: question, choices, (solution OR explanation)
|
||||
```json
|
||||
{"question": "...", "choices": ["..."], "solution": "...", "explanation": "..."}
|
||||
```
|
||||
- `summarizetldr`: article and summary
|
||||
```json
|
||||
{"article": "...", "summary": "..."}
|
||||
```
|
||||
- `alpaca_chat`: basic instruct for alpaca chat
|
||||
```json
|
||||
{"instruction": "...", "input": "...", "response": "..."}
|
||||
```
|
||||
- `alpaca_chat.load_qa`: question and answer for alpaca chat
|
||||
```json
|
||||
{"question": "...", "answer": "..."}
|
||||
```
|
||||
- `alpaca_chat.load_concise`: question and answer for alpaca chat, for concise answers
|
||||
```json
|
||||
{"instruction": "...", "input": "...", "response": "..."}
|
||||
```
|
||||
- `alpaca_chat.load_camel_ai`: question and answer for alpaca chat, for load_camel_ai
|
||||
```json
|
||||
{"message_1": "...", "message_2": "..."}
|
||||
```
|
||||
- `alpaca_w_system.load_open_orca`: support for open orca datasets with included system prompts, instruct
|
||||
```json
|
||||
{"system_prompt": "...", "question": "...", "response": "..."}
|
||||
```
|
||||
- `context_qa`: in context question answering from an article
|
||||
```json
|
||||
{"article": "...", "question": "...", "answer": "..."}
|
||||
```
|
||||
- `context_qa.load_404`: in context question answering from an article, with default response for no answer from context
|
||||
```json
|
||||
{"article": "...", "unanswerable_question": "..."}
|
||||
```
|
||||
- `creative_acr.load_answer`: instruction and revision
|
||||
```json
|
||||
{"instruction": "...", "revision": "..."}
|
||||
```
|
||||
- `creative_acr.load_critique`: critique
|
||||
```json
|
||||
{"scores": "...", "critiques": "...", "instruction": "...", "answer": "..."}
|
||||
```
|
||||
- `creative_acr.load_revise`: critique and revise
|
||||
```json
|
||||
{"scores": "...", "critiques": "...", "instruction": "...", "answer": "...", "revision": "..."}
|
||||
```
|
||||
- `pygmalion`: pygmalion
|
||||
```json
|
||||
{"conversations": [{"role": "...", "value": "..."}]}
|
||||
```
|
||||
- `sharegpt_simple.load_role`: conversations where `role` is used instead of `from`
|
||||
```json
|
||||
{"conversations": [{"role": "...", "value": "..."}]}
|
||||
```
|
||||
- `sharegpt_simple.load_guanaco`: conversations where `from` is `prompter`/`assistant` instead of default sharegpt
|
||||
```json
|
||||
{"conversations": [{"from": "...", "value": "..."}]}
|
||||
```
|
||||
- `sharegpt_jokes`: creates a chat where bot is asked to tell a joke, then explain why the joke is funny
|
||||
```json
|
||||
{"conversations": [{"title": "...", "text": "...", "explanation": "..."}]}
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
## ✨ Overview
|
||||
#### How to add custom prompts
|
||||
|
||||
Axolotl is a free and open-source tool designed to streamline post-training and fine-tuning for the latest large language models (LLMs).
|
||||
1. Add your method to a file in [prompt_strategies](src/axolotl/prompt_strategies). Please see other files as example.
|
||||
2. Use your custom file name as the dataset type `<prompt_strategies_file>.load_<load_fn>`.
|
||||
|
||||
Features:
|
||||
|
||||
- **Multiple Model Support**: Train various models like GPT-OSS, LLaMA, Mistral, Mixtral, Pythia, and many more models available on the Hugging Face Hub.
|
||||
- **Multimodal Training**: Fine-tune vision-language models (VLMs) including LLaMA-Vision, Qwen2-VL, Pixtral, LLaVA, SmolVLM2, GLM-4.6V, InternVL 3.5, Gemma 3n, and audio models like Voxtral with image, video, and audio support.
|
||||
- **Training Methods**: Full fine-tuning, LoRA, QLoRA, GPTQ, QAT, Preference Tuning (DPO, IPO, KTO, ORPO), RL (GRPO, GDPO), and Reward Modelling (RM) / Process Reward Modelling (PRM).
|
||||
- **Easy Configuration**: Re-use a single YAML configuration file across the full fine-tuning pipeline: dataset preprocessing, training, evaluation, quantization, and inference.
|
||||
- **Performance Optimizations**: [Multipacking](https://docs.axolotl.ai/docs/multipack.html), [Flash Attention 2/3/4](https://docs.axolotl.ai/docs/attention.html#flash-attention), [Xformers](https://docs.axolotl.ai/docs/attention.html#xformers), [Flex Attention](https://docs.axolotl.ai/docs/attention.html#flex-attention), [SageAttention](https://docs.axolotl.ai/docs/attention.html#sageattention), [Liger Kernel](https://docs.axolotl.ai/docs/custom_integrations.html#liger-kernels), [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy), [ScatterMoE](https://docs.axolotl.ai/docs/custom_integrations.html#kernels-integration), [Sequence Parallelism (SP)](https://docs.axolotl.ai/docs/sequence_parallelism.html), [LoRA optimizations](https://docs.axolotl.ai/docs/lora_optims.html), [Multi-GPU training (FSDP1, FSDP2, DeepSpeed)](https://docs.axolotl.ai/docs/multi-gpu.html), [Multi-node training (Torchrun, Ray)](https://docs.axolotl.ai/docs/multi-node.html), and many more!
|
||||
- **Flexible Dataset Handling**: Load from local, HuggingFace, and cloud (S3, Azure, GCP, OCI) datasets.
|
||||
- **Cloud Ready**: We ship [Docker images](https://hub.docker.com/u/axolotlai) and also [PyPI packages](https://pypi.org/project/axolotl/) for use on cloud platforms and local hardware.
|
||||
Optionally, download some datasets, see [data/README.md](data/README.md)
|
||||
|
||||
|
||||
|
||||
## 🚀 Quick Start - LLM Fine-tuning in Minutes
|
||||
### Config
|
||||
|
||||
**Requirements**:
|
||||
See [examples](examples) for quick start. It is recommended to duplicate and modify to your needs. The most important options are:
|
||||
|
||||
- NVIDIA GPU (Ampere or newer for `bf16` and Flash Attention) or AMD GPU
|
||||
- Python >=3.11 (3.12 recommended)
|
||||
- PyTorch ≥2.9.1
|
||||
- model
|
||||
```yaml
|
||||
base_model: ./llama-7b-hf # local or huggingface repo
|
||||
```
|
||||
Note: The code will load the right architecture.
|
||||
|
||||
### Google Colab
|
||||
- dataset
|
||||
```yaml
|
||||
sequence_len: 2048 # max token length for prompt
|
||||
|
||||
[](https://colab.research.google.com/github/axolotl-ai-cloud/axolotl/blob/main/examples/colab-notebooks/colab-axolotl-example.ipynb#scrollTo=msOCO4NRmRLa)
|
||||
# huggingface repo
|
||||
datasets:
|
||||
- path: vicgalle/alpaca-gpt4
|
||||
type: alpaca # format from earlier
|
||||
|
||||
### Installation
|
||||
# huggingface repo with specific configuration/subset
|
||||
datasets:
|
||||
- path: EleutherAI/pile
|
||||
name: enron_emails
|
||||
type: completion # format from earlier
|
||||
|
||||
```bash
|
||||
# install uv if you don't already have it installed (restart shell after)
|
||||
curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||
# local
|
||||
datasets:
|
||||
- path: json
|
||||
data_files: data.jsonl # or json
|
||||
type: alpaca # format from earlier
|
||||
```
|
||||
|
||||
# change depending on system
|
||||
export UV_TORCH_BACKEND=cu128
|
||||
- loading
|
||||
```yaml
|
||||
load_in_4bit: true
|
||||
load_in_8bit: true
|
||||
bf16: true # require >=ampere
|
||||
fp16: true
|
||||
tf32: true # require >=ampere
|
||||
bfloat16: true # require >=ampere, use instead of bf16 when you don't want AMP (automatic mixed precision)
|
||||
float16: true # use instead of fp16 when you don't want AMP
|
||||
```
|
||||
Note: Repo does not do 4-bit quantization.
|
||||
|
||||
# create a new virtual environment
|
||||
uv venv --python 3.12
|
||||
source .venv/bin/activate
|
||||
|
||||
uv pip install torch==2.10.0 torchvision
|
||||
uv pip install --no-build-isolation axolotl[deepspeed]
|
||||
|
||||
# Download example axolotl configs, deepspeed configs
|
||||
axolotl fetch examples
|
||||
axolotl fetch deepspeed_configs # OPTIONAL
|
||||
```
|
||||
|
||||
#### Using Docker
|
||||
|
||||
Installing with Docker can be less error prone than installing in your own environment.
|
||||
```bash
|
||||
docker run --gpus '"all"' --ipc=host --rm -it axolotlai/axolotl:main-latest
|
||||
```
|
||||
|
||||
Other installation approaches are described [here](https://docs.axolotl.ai/docs/installation.html).
|
||||
|
||||
#### Cloud Providers
|
||||
- lora
|
||||
```yaml
|
||||
adapter: lora # qlora or leave blank for full finetune
|
||||
lora_r: 8
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_modules:
|
||||
- q_proj
|
||||
- v_proj
|
||||
```
|
||||
|
||||
<details>
|
||||
|
||||
- [RunPod](https://runpod.io/gsc?template=v2ickqhz9s&ref=6i7fkpdz)
|
||||
- [Vast.ai](https://cloud.vast.ai?ref_id=62897&template_id=bdd4a49fa8bce926defc99471864cace&utm_source=github&utm_medium=developer_community&utm_campaign=template_launch_axolotl&utm_content=readme)
|
||||
- [PRIME Intellect](https://app.primeintellect.ai/dashboard/create-cluster?image=axolotl&location=Cheapest&security=Cheapest&show_spot=true)
|
||||
- [Modal](https://www.modal.com?utm_source=github&utm_medium=github&utm_campaign=axolotl)
|
||||
- [Novita](https://novita.ai/gpus-console?templateId=311)
|
||||
- [JarvisLabs.ai](https://jarvislabs.ai/templates/axolotl)
|
||||
- [Latitude.sh](https://latitude.sh/blueprint/989e0e79-3bf6-41ea-a46b-1f246e309d5c)
|
||||
<summary>All yaml options</summary>
|
||||
|
||||
```yaml
|
||||
# this is the huggingface model that contains *.pt, *.safetensors, or *.bin files
|
||||
# this can also be a relative path to a model on disk
|
||||
base_model: ./llama-7b-hf
|
||||
# you can specify an ignore pattern if the model repo contains more than 1 model type (*.pt, etc)
|
||||
base_model_ignore_patterns:
|
||||
# if the base_model repo on hf hub doesn't include configuration .json files,
|
||||
# you can set that here, or leave this empty to default to base_model
|
||||
base_model_config: ./llama-7b-hf
|
||||
# you can specify to choose a specific model revision from huggingface hub
|
||||
model_revision:
|
||||
# Optional tokenizer configuration override in case you want to use a different tokenizer
|
||||
# than the one defined in the base model
|
||||
tokenizer_config:
|
||||
# If you want to specify the type of model to load, AutoModelForCausalLM is a good choice too
|
||||
model_type: AutoModelForCausalLM
|
||||
# Corresponding tokenizer for the model AutoTokenizer is a good choice
|
||||
tokenizer_type: AutoTokenizer
|
||||
# Trust remote code for untrusted source
|
||||
trust_remote_code:
|
||||
# use_fast option for tokenizer loading from_pretrained, default to True
|
||||
tokenizer_use_fast:
|
||||
# Whether to use the legacy tokenizer setting, defaults to True
|
||||
tokenizer_legacy:
|
||||
# resize the model embeddings when new tokens are added to multiples of 32
|
||||
# this is reported to improve training speed on some models
|
||||
resize_token_embeddings_to_32x:
|
||||
|
||||
# whether you are training a 4-bit GPTQ quantized model
|
||||
gptq: true
|
||||
gptq_groupsize: 128 # group size
|
||||
gptq_model_v1: false # v1 or v2
|
||||
|
||||
# this will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer
|
||||
load_in_8bit: true
|
||||
# use bitsandbytes 4 bit
|
||||
load_in_4bit:
|
||||
|
||||
# Use CUDA bf16
|
||||
bf16: true # bool or 'full' for `bf16_full_eval`. require >=ampere
|
||||
# Use CUDA fp16
|
||||
fp16: true
|
||||
# Use CUDA tf32
|
||||
tf32: true # require >=ampere
|
||||
|
||||
# a list of one or more datasets to finetune the model with
|
||||
datasets:
|
||||
# hf dataset repo | "json" for local dataset, make sure to fill data_files
|
||||
- path: vicgalle/alpaca-gpt4
|
||||
# The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
|
||||
type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
|
||||
data_files: # path to source data files
|
||||
shards: # number of shards to split data into
|
||||
name: # name of dataset configuration to load
|
||||
|
||||
# axolotl attempts to save the dataset as an arrow after packing the data together so
|
||||
# subsequent training attempts load faster, relative path
|
||||
dataset_prepared_path: data/last_run_prepared
|
||||
# push prepared dataset to hub
|
||||
push_dataset_to_hub: # repo path
|
||||
# push checkpoints to hub
|
||||
hub_model_id: # repo path to push finetuned model
|
||||
# how to push checkpoints to hub
|
||||
# https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/trainer#transformers.TrainingArguments.hub_strategy
|
||||
hub_strategy:
|
||||
# whether to use hf `use_auth_token` for loading datasets. Useful for fetching private datasets
|
||||
# required to be true when used in combination with `push_dataset_to_hub`
|
||||
hf_use_auth_token: # boolean
|
||||
# How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 0 for no eval.
|
||||
val_set_size: 0.04
|
||||
# Num shards for whole dataset
|
||||
dataset_shard_num:
|
||||
# Index of shard to use for whole dataset
|
||||
dataset_shard_idx:
|
||||
|
||||
# the maximum length of an input to train with, this should typically be less than 2048
|
||||
# as most models have a token/context limit of 2048
|
||||
sequence_len: 2048
|
||||
# max sequence length to concatenate training samples together up to
|
||||
# inspired by StackLLaMA. see https://huggingface.co/blog/stackllama#supervised-fine-tuning
|
||||
# FutureWarning: This will soon be DEPRECATED
|
||||
max_packed_sequence_len: 1024
|
||||
# use efficient multi-packing with block diagonal attention and per sequence position_ids. Recommend set to 'true'
|
||||
sample_packing:
|
||||
# you can set these packing optimizations AFTER starting a training at least once.
|
||||
# The trainer will provide recommended values for these values.
|
||||
sample_packing_eff_est:
|
||||
total_num_tokens:
|
||||
|
||||
# if you want to use 'lora' or 'qlora' or leave blank to train all parameters in original model
|
||||
adapter: lora
|
||||
# if you already have a lora model trained that you want to load, put that here
|
||||
# lora hyperparameters
|
||||
lora_model_dir:
|
||||
lora_r: 8
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_modules:
|
||||
- q_proj
|
||||
- v_proj
|
||||
# - k_proj
|
||||
# - o_proj
|
||||
# - gate_proj
|
||||
# - down_proj
|
||||
# - up_proj
|
||||
lora_target_linear: # if true, will target all linear layers
|
||||
lora_modules_to_save:
|
||||
# - embed_tokens
|
||||
# - lm_head
|
||||
lora_out_dir:
|
||||
lora_fan_in_fan_out: false
|
||||
|
||||
# wandb configuration if you're using it
|
||||
wandb_mode: # "offline" to save run metadata locally and not sync to the server, "disabled" to turn off wandb
|
||||
wandb_project: # your wandb project name
|
||||
wandb_entity: # a wandb Team name if using a Team
|
||||
wandb_watch:
|
||||
wandb_run_id: # set the name of your wandb run
|
||||
wandb_log_model: # "checkpoint" to log model to wandb Artifacts every `save_steps` or "end" to log only at the end of training
|
||||
|
||||
# where to save the finished model to
|
||||
output_dir: ./completed-model
|
||||
|
||||
# training hyperparameters
|
||||
gradient_accumulation_steps: 1
|
||||
micro_batch_size: 2
|
||||
eval_batch_size: 2
|
||||
num_epochs: 3
|
||||
warmup_steps: 100
|
||||
learning_rate: 0.00003
|
||||
lr_quadratic_warmup:
|
||||
logging_steps:
|
||||
save_steps: # leave empty to save at each epoch
|
||||
eval_steps:
|
||||
save_total_limit: # checkpoints saved at a time
|
||||
max_steps:
|
||||
|
||||
# save model as safetensors (require safetensors package)
|
||||
save_safetensors:
|
||||
|
||||
# whether to mask out or include the human's prompt from the training labels
|
||||
train_on_inputs: false
|
||||
# group similarly sized data to minimize padding
|
||||
# may be slower to start, as it must download and sort the entire dataset
|
||||
# note that training loss may have an oscillating pattern with this enabled
|
||||
group_by_length: false
|
||||
|
||||
# Whether to use gradient checkpointing https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing
|
||||
gradient_checkpointing: false
|
||||
|
||||
# stop training after this many evaluation losses have increased in a row
|
||||
# https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
|
||||
early_stopping_patience: 3
|
||||
|
||||
# specify a scheduler and kwargs to use with the optimizer
|
||||
lr_scheduler: # 'one_cycle' | 'log_sweep' | empty for cosine
|
||||
lr_scheduler_kwargs:
|
||||
|
||||
# for one_cycle optim
|
||||
lr_div_factor: # learning rate div factor
|
||||
|
||||
# for log_sweep optim
|
||||
log_sweep_min_lr:
|
||||
log_sweep_max_lr:
|
||||
|
||||
# specify optimizer
|
||||
optimizer:
|
||||
# specify weight decay
|
||||
weight_decay:
|
||||
# adamw hyperparams
|
||||
adam_beta1:
|
||||
adam_beta2:
|
||||
adam_epsilon:
|
||||
# Gradient clipping max norm
|
||||
max_grad_norm:
|
||||
|
||||
# whether to bettertransformers
|
||||
flash_optimum:
|
||||
# whether to use xformers attention patch https://github.com/facebookresearch/xformers:
|
||||
xformers_attention:
|
||||
# whether to use flash attention patch https://github.com/Dao-AILab/flash-attention:
|
||||
flash_attention:
|
||||
# whether to use scaled-dot-product attention
|
||||
# https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
|
||||
sdp_attention:
|
||||
# Landmark attention (only llama)
|
||||
landmark_attention:
|
||||
# xpos RoPE see https://github.com/kaiokendev/cutoff-len-is-context-len/blob/main/util/xpos_rope_llama_monkey_patch.py
|
||||
# llama only
|
||||
xpos_rope:
|
||||
# RoPE Scaling https://github.com/huggingface/transformers/pull/24653
|
||||
rope_scaling:
|
||||
type: # linear | dynamic
|
||||
factor: # float
|
||||
|
||||
# resume from a specific checkpoint dir
|
||||
resume_from_checkpoint:
|
||||
# if resume_from_checkpoint isn't set and you simply want it to start where it left off
|
||||
# be careful with this being turned on between different models
|
||||
auto_resume_from_checkpoints: false
|
||||
|
||||
# don't mess with this, it's here for accelerate and torchrun
|
||||
local_rank:
|
||||
|
||||
# add or change special tokens
|
||||
special_tokens:
|
||||
# bos_token: "<s>"
|
||||
# eos_token: "</s>"
|
||||
# unk_token: "<unk>"
|
||||
# add extra tokens
|
||||
tokens:
|
||||
|
||||
# FSDP
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
|
||||
# Deepspeed config path
|
||||
deepspeed:
|
||||
|
||||
# Path to torch distx for optim 'adamw_anyprecision'
|
||||
torchdistx_path:
|
||||
|
||||
# Set padding for data collator to 'longest'
|
||||
collator_pad_to_longest:
|
||||
|
||||
# Set to HF dataset for type: 'completion' for streaming instead of pre-tokenize
|
||||
pretraining_dataset:
|
||||
|
||||
# Debug mode
|
||||
debug:
|
||||
|
||||
# Seed
|
||||
seed:
|
||||
|
||||
# Allow overwrite yml config using from cli
|
||||
strict:
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
### Your First Fine-tune
|
||||
### Train
|
||||
|
||||
Run
|
||||
```bash
|
||||
accelerate launch scripts/finetune.py configs/your_config.yml
|
||||
```
|
||||
|
||||
#### Multi-GPU
|
||||
|
||||
You can optionally pre-tokenize dataset with the following before finetuning:
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES="" accelerate ... --prepare_ds_only
|
||||
```
|
||||
|
||||
##### Config
|
||||
|
||||
- llama FSDP
|
||||
```yaml
|
||||
fsdp:
|
||||
- full_shard
|
||||
- auto_wrap
|
||||
fsdp_config:
|
||||
fsdp_offload_params: true
|
||||
fsdp_state_dict_type: FULL_STATE_DICT
|
||||
fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
|
||||
```
|
||||
|
||||
- llama Deepspeed
|
||||
```yaml
|
||||
deepspeed: deepspeed/zero3.json
|
||||
```
|
||||
|
||||
##### Weights & Biases Logging
|
||||
|
||||
- wandb options
|
||||
```yaml
|
||||
wandb_mode:
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
wandb_run_id:
|
||||
wandb_log_model:
|
||||
```
|
||||
|
||||
### Inference
|
||||
|
||||
Pass the appropriate flag to the train command:
|
||||
|
||||
- Pretrained LORA:
|
||||
```bash
|
||||
--inference --lora_model_dir="./lora-output-dir"
|
||||
```
|
||||
- Full weights finetune:
|
||||
```bash
|
||||
--inference --base_model="./completed-model"
|
||||
```
|
||||
- Full weights finetune w/ a prompt from a text file:
|
||||
```bash
|
||||
cat /tmp/prompt.txt | python scripts/finetune.py configs/your_config.yml \
|
||||
--base_model="./completed-model" --inference --prompter=None --load_in_8bit=True
|
||||
```
|
||||
|
||||
### Merge LORA to base
|
||||
|
||||
Add below flag to train command above
|
||||
|
||||
```bash
|
||||
# Fetch axolotl examples
|
||||
axolotl fetch examples
|
||||
|
||||
# Or, specify a custom path
|
||||
axolotl fetch examples --dest path/to/folder
|
||||
|
||||
# Train a model using LoRA
|
||||
axolotl train examples/llama-3/lora-1b.yml
|
||||
--merge_lora --lora_model_dir="./completed-model" --load_in_8bit=False --load_in_4bit=False
|
||||
```
|
||||
|
||||
That's it! Check out our [Getting Started Guide](https://docs.axolotl.ai/docs/getting-started.html) for a more detailed walkthrough.
|
||||
|
||||
|
||||
## 📚 Documentation
|
||||
|
||||
- [Installation Options](https://docs.axolotl.ai/docs/installation.html) - Detailed setup instructions for different environments
|
||||
- [Configuration Guide](https://docs.axolotl.ai/docs/config-reference.html) - Full configuration options and examples
|
||||
- [Dataset Loading](https://docs.axolotl.ai/docs/dataset_loading.html) - Loading datasets from various sources
|
||||
- [Dataset Guide](https://docs.axolotl.ai/docs/dataset-formats/) - Supported formats and how to use them
|
||||
- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
|
||||
- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
|
||||
- [Multipacking](https://docs.axolotl.ai/docs/multipack.html)
|
||||
- [API Reference](https://docs.axolotl.ai/docs/api/) - Auto-generated code documentation
|
||||
- [FAQ](https://docs.axolotl.ai/docs/faq.html) - Frequently asked questions
|
||||
|
||||
## AI Agent Support
|
||||
|
||||
Axolotl ships with built-in documentation optimized for AI coding agents (Claude Code, Cursor, Copilot, etc.). These docs are bundled with the pip package — no repo clone needed.
|
||||
If you run out of CUDA memory, you can try to merge in system RAM with
|
||||
|
||||
```bash
|
||||
# Show overview and available training methods
|
||||
axolotl agent-docs
|
||||
|
||||
# Topic-specific references
|
||||
axolotl agent-docs sft # supervised fine-tuning
|
||||
axolotl agent-docs grpo # GRPO online RL
|
||||
axolotl agent-docs preference_tuning # DPO, KTO, ORPO, SimPO
|
||||
axolotl agent-docs reward_modelling # outcome and process reward models
|
||||
axolotl agent-docs pretraining # continual pretraining
|
||||
axolotl agent-docs --list # list all topics
|
||||
|
||||
# Dump config schema for programmatic use
|
||||
axolotl config-schema
|
||||
axolotl config-schema --field adapter
|
||||
CUDA_VISIBLE_DEVICES="" python3 scripts/finetune.py ...
|
||||
```
|
||||
|
||||
If you're working with the source repo, agent docs are also available at `docs/agents/` and the project overview is in `AGENTS.md`.
|
||||
## Common Errors 🧰
|
||||
|
||||
## 🤝 Getting Help
|
||||
> If you encounter a 'Cuda out of memory' error, it means your GPU ran out of memory during the training process. Here's how to resolve it:
|
||||
|
||||
- Join our [Discord community](https://discord.gg/HhrNrHJPRb) for support
|
||||
- Check out our [Examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/) directory
|
||||
- Read our [Debugging Guide](https://docs.axolotl.ai/docs/debugging.html)
|
||||
- Need dedicated support? Please contact [✉️wing@axolotl.ai](mailto:wing@axolotl.ai) for options
|
||||
Please reduce any below
|
||||
- `micro_batch_size`
|
||||
- `eval_batch_size`
|
||||
- `gradient_accumulation_steps`
|
||||
- `sequence_len`
|
||||
|
||||
## 🌟 Contributing
|
||||
> `failed (exitcode: -9)` usually means your system has run out of system memory.
|
||||
Similarly, you should consider reducing the same settings as when you run out of VRAM.
|
||||
Additionally, look into upgrading your system RAM which should be simpler than GPU upgrades.
|
||||
|
||||
Contributions are welcome! Please see our [Contributing Guide](https://github.com/axolotl-ai-cloud/axolotl/blob/main/.github/CONTRIBUTING.md) for details.
|
||||
> RuntimeError: expected scalar type Float but found Half
|
||||
|
||||
## 📈 Telemetry
|
||||
Try set `fp16: true`
|
||||
|
||||
Axolotl has opt-out telemetry that helps us understand how the project is being used
|
||||
and prioritize improvements. We collect basic system information, model types, and
|
||||
error rates—never personal data or file paths. Telemetry is enabled by default. To
|
||||
disable it, set AXOLOTL_DO_NOT_TRACK=1. For more details, see our [telemetry documentation](https://docs.axolotl.ai/docs/telemetry.html).
|
||||
> NotImplementedError: No operator found for `memory_efficient_attention_forward` ...
|
||||
|
||||
## ❤️ Sponsors
|
||||
Try to turn off xformers.
|
||||
|
||||
Interested in sponsoring? Contact us at [wing@axolotl.ai](mailto:wing@axolotl.ai)
|
||||
> accelerate config missing
|
||||
|
||||
## 📝 Citing Axolotl
|
||||
It's safe to ignore it.
|
||||
|
||||
If you use Axolotl in your research or projects, please cite it as follows:
|
||||
## Need help? 🙋♂️
|
||||
|
||||
```bibtex
|
||||
@software{axolotl,
|
||||
title = {Axolotl: Open Source LLM Post-Training},
|
||||
author = {{Axolotl maintainers and contributors}},
|
||||
url = {https://github.com/axolotl-ai-cloud/axolotl},
|
||||
license = {Apache-2.0},
|
||||
year = {2023}
|
||||
}
|
||||
Join our [Discord server](https://discord.gg/HhrNrHJPRb) where we can help you
|
||||
|
||||
## Badge ❤🏷️
|
||||
|
||||
Building something cool with Axolotl? Consider adding a badge to your model card.
|
||||
|
||||
```markdown
|
||||
[<img src="https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/OpenAccess-AI-Collective/axolotl)
|
||||
```
|
||||
|
||||
## 📜 License
|
||||
[<img src="https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/OpenAccess-AI-Collective/axolotl)
|
||||
|
||||
This project is licensed under the Apache 2.0 License - see the [LICENSE](LICENSE) file for details.
|
||||
## Community Showcase
|
||||
|
||||
Check out some of the projects and models that have been built using Axolotl! Have a model you'd like to add to our Community Showcase? Open a PR with your model.
|
||||
|
||||
Open Access AI Collective
|
||||
- [Minotaur 13b](https://huggingface.co/openaccess-ai-collective/minotaur-13b)
|
||||
- [Manticore 13b](https://huggingface.co/openaccess-ai-collective/manticore-13b)
|
||||
- [Hippogriff 30b](https://huggingface.co/openaccess-ai-collective/hippogriff-30b-chat)
|
||||
|
||||
PocketDoc Labs
|
||||
- [Dan's PersonalityEngine 13b LoRA](https://huggingface.co/PocketDoc/Dans-PersonalityEngine-13b-LoRA)
|
||||
|
||||
## Contributing 🤝
|
||||
|
||||
Please read the [contributing guide](./.github/CONTRIBUTING.md)
|
||||
|
||||
Bugs? Please check the [open issues](https://github.com/OpenAccess-AI-Collective/axolotl/issues/bug) else create a new Issue.
|
||||
|
||||
PRs are **greatly welcome**!
|
||||
|
||||
Please run below to setup env
|
||||
```bash
|
||||
pip3 install -r requirements-dev.txt -r requirements-tests.txt
|
||||
pre-commit install
|
||||
|
||||
# test
|
||||
pytest tests/
|
||||
```
|
||||
|
||||
273
SETUP_MIAAI.md
273
SETUP_MIAAI.md
@@ -1,273 +0,0 @@
|
||||
# Axolotl Setup — miaai (RTX 5080, CUDA 13.2)
|
||||
|
||||
## System Info
|
||||
- GPU: NVIDIA RTX 5080 (16GB VRAM, sm_120 / Blackwell)
|
||||
- Driver: 580.126.09 — max CUDA 13.0 shown by nvidia-smi, but nvcc from conda is 13.2
|
||||
- OS: Ubuntu 25.10 (Python 3.13 system — do NOT use system Python for ML)
|
||||
- Axolotl repo: `/home/tocmo0nlord/axolotl` (branch: `activeblue/main`)
|
||||
- Conda env: `axolotl` at `/opt/miniconda3/envs/axolotl`
|
||||
|
||||
---
|
||||
|
||||
## Starting from Bare Ubuntu 25.10
|
||||
|
||||
If rebuilding from scratch, complete these steps first before anything else.
|
||||
|
||||
### A. System packages
|
||||
```bash
|
||||
sudo apt update && sudo apt upgrade -y
|
||||
sudo apt install -y \
|
||||
build-essential cmake git curl wget \
|
||||
python3-dev libssl-dev zlib1g-dev \
|
||||
ca-certificates gnupg lsb-release
|
||||
```
|
||||
|
||||
### B. NVIDIA driver (580.xx)
|
||||
Ubuntu 25.10 is too new for NVIDIA's apt repo. Install via ubuntu-drivers:
|
||||
```bash
|
||||
sudo ubuntu-drivers autoinstall
|
||||
sudo reboot
|
||||
```
|
||||
|
||||
After reboot, verify:
|
||||
```bash
|
||||
nvidia-smi
|
||||
# Must show: NVIDIA GeForce RTX 5080, Driver Version: 580.x
|
||||
```
|
||||
|
||||
If ubuntu-drivers installs the wrong version, force the right one:
|
||||
```bash
|
||||
sudo apt install -y nvidia-driver-580
|
||||
sudo reboot
|
||||
```
|
||||
|
||||
### C. Install Ollama
|
||||
```bash
|
||||
curl -fsSL https://ollama.com/install.sh | sh
|
||||
|
||||
# Verify it's running
|
||||
systemctl status ollama
|
||||
```
|
||||
|
||||
### D. HuggingFace CLI
|
||||
```bash
|
||||
pip3 install huggingface_hub
|
||||
huggingface-cli login
|
||||
# Paste your HF token — required for gated models like meta-llama
|
||||
```
|
||||
|
||||
Once steps A–D are done, continue with the One-time Setup below.
|
||||
|
||||
---
|
||||
|
||||
## Pre-Training Checklist (every session)
|
||||
|
||||
```bash
|
||||
# 1. Stop Ollama — if it receives a request mid-training it will compete for VRAM
|
||||
sudo systemctl stop ollama
|
||||
|
||||
# 2. Activate conda env
|
||||
export PATH="/opt/miniconda3/bin:$PATH"
|
||||
conda activate axolotl
|
||||
|
||||
# 3. Set env vars
|
||||
export CUDA_HOME=$CONDA_PREFIX
|
||||
export PATH=$CUDA_HOME/bin:$PATH
|
||||
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
|
||||
|
||||
# 4. Confirm GPU is clear (should show no processes before training)
|
||||
nvidia-smi --query-compute-apps=pid,process_name,used_memory --format=csv
|
||||
|
||||
# 5. Go to axolotl directory
|
||||
cd /home/tocmo0nlord/axolotl
|
||||
```
|
||||
|
||||
## Run Training
|
||||
```bash
|
||||
axolotl train ~/human_chat_qlora.yml
|
||||
```
|
||||
|
||||
## After Training
|
||||
```bash
|
||||
# Restart Ollama
|
||||
sudo systemctl start ollama
|
||||
|
||||
# Test the adapter interactively
|
||||
axolotl inference ~/human_chat_qlora.yml \
|
||||
--lora-model-dir ~/outputs/llama31-8b-humanchat \
|
||||
--prompter chat
|
||||
|
||||
# (Optional) Merge adapter into base model for standalone deployment
|
||||
axolotl merge-lora ~/human_chat_qlora.yml
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## One-time Setup (fresh machine — after bare Ubuntu steps above)
|
||||
|
||||
### 1. Install Miniconda
|
||||
```bash
|
||||
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
|
||||
bash miniconda.sh -b -p /opt/miniconda3
|
||||
/opt/miniconda3/bin/conda init bash
|
||||
source ~/.bashrc
|
||||
```
|
||||
|
||||
### 2. Create Python 3.11 environment
|
||||
```bash
|
||||
conda create -n axolotl python=3.11 -y
|
||||
conda activate axolotl
|
||||
```
|
||||
|
||||
### 3. Clone axolotl repo
|
||||
```bash
|
||||
git clone https://git.activeblue.net/tocmo0nlord/axolotl.git /home/tocmo0nlord/axolotl
|
||||
cd /home/tocmo0nlord/axolotl
|
||||
git remote add upstream https://github.com/axolotl-ai-cloud/axolotl.git
|
||||
git fetch upstream
|
||||
git rebase upstream/main # keeps activeblue patches on top
|
||||
```
|
||||
|
||||
### 4. Install CUDA toolkit (needed to compile flash-attn and bitsandbytes)
|
||||
```bash
|
||||
conda install -y -c "nvidia/label/cuda-12.8.0" cuda-toolkit
|
||||
export CUDA_HOME=$CONDA_PREFIX
|
||||
export PATH=$CUDA_HOME/bin:$PATH
|
||||
```
|
||||
|
||||
> NOTE: Despite installing from the cuda-12.8.0 channel, conda resolves nvcc to **13.2.78**.
|
||||
> This is fine — use cu132 everywhere to match.
|
||||
|
||||
### 5. Install PyTorch — use cu132 (matches nvcc from conda)
|
||||
```bash
|
||||
# torchaudio has no cu132 wheel — skip it, not needed for LLM training
|
||||
pip install torch torchvision --index-url https://download.pytorch.org/whl/cu132
|
||||
python -c "import torch; print('CUDA:', torch.version.cuda); print('GPU:', torch.cuda.get_device_name(0))"
|
||||
```
|
||||
|
||||
### 6. Install Axolotl
|
||||
```bash
|
||||
cd /home/tocmo0nlord/axolotl
|
||||
pip install -e "."
|
||||
```
|
||||
|
||||
### 7. Install flash-attn
|
||||
> Compiles CUDA kernels from source — takes 15–25 min on 10 cores of i7-14700K.
|
||||
```bash
|
||||
MAX_JOBS=10 pip install flash-attn --no-build-isolation
|
||||
```
|
||||
|
||||
### 8. Compile bitsandbytes from source for sm_120 (RTX 5080 / Blackwell)
|
||||
|
||||
Prebuilt wheels do not include sm_120. CUDA 13.2 also dropped sm_50–53.
|
||||
Must compile from source with a patched CMakeLists.txt.
|
||||
|
||||
```bash
|
||||
# Clone bitsandbytes v0.49.1
|
||||
git clone --branch v0.49.1 --depth 1 \
|
||||
https://github.com/bitsandbytes-foundation/bitsandbytes.git /tmp/bnb_0491
|
||||
|
||||
# Patch CMakeLists.txt: insert sm_120 override before the foreach loop
|
||||
# (cmake >= 3.23.0 uses its own built-in arch list which does not include sm_120)
|
||||
sed -i '/ foreach(capability \${CMAKE_CUDA_ARCHITECTURES_ALL})/i\ # RTX 5080 sm_120 patch\n set(CMAKE_CUDA_ARCHITECTURES_ALL 120)' /tmp/bnb_0491/CMakeLists.txt
|
||||
|
||||
# Verify patch landed correctly — set() line must appear immediately before foreach
|
||||
grep -n "ARCHITECTURES_ALL\|foreach" /tmp/bnb_0491/CMakeLists.txt | tail -5
|
||||
|
||||
# Configure — must point cmake at conda's nvcc explicitly
|
||||
cmake \
|
||||
-DCMAKE_CUDA_COMPILER=/opt/miniconda3/envs/axolotl/bin/nvcc \
|
||||
-DCOMPUTE_BACKEND=cuda \
|
||||
-S /tmp/bnb_0491 \
|
||||
-B /tmp/bnb_0491/build 2>&1 | grep -E "(Capabilit|CUDA Ver|Error)"
|
||||
# Must show: CUDA Capabilities Selected: 120
|
||||
|
||||
# Build (adjust -j to your CPU core count)
|
||||
cmake --build /tmp/bnb_0491/build -j10
|
||||
|
||||
# Install into conda site-packages
|
||||
cp -r /tmp/bnb_0491/bitsandbytes \
|
||||
/opt/miniconda3/envs/axolotl/lib/python3.11/site-packages/
|
||||
|
||||
# Verify CUDA works
|
||||
python3 -c "
|
||||
import torch, bitsandbytes as bnb
|
||||
x = torch.randn(64, 64, device='cuda')
|
||||
l = bnb.nn.Linear8bitLt(64, 64).cuda()
|
||||
print('bitsandbytes CUDA OK:', l(x).shape)
|
||||
"
|
||||
```
|
||||
|
||||
### 9. Copy training config to home
|
||||
```bash
|
||||
cp /home/tocmo0nlord/axolotl/human_chat_qlora.yml ~/human_chat_qlora.yml
|
||||
```
|
||||
|
||||
### 10. Verify the full stack
|
||||
```bash
|
||||
python3 -c "
|
||||
import torch, bitsandbytes as bnb, flash_attn, transformers
|
||||
print('torch :', torch.__version__, '| CUDA:', torch.version.cuda)
|
||||
print('bitsandbytes:', bnb.__version__)
|
||||
print('flash_attn :', flash_attn.__version__)
|
||||
print('transformers:', transformers.__version__)
|
||||
print('GPU :', torch.cuda.get_device_name(0))
|
||||
print('VRAM :', round(torch.cuda.get_device_properties(0).total_memory/1e9, 1), 'GB')
|
||||
"
|
||||
```
|
||||
|
||||
Expected output:
|
||||
```
|
||||
torch : 2.x.x | CUDA: 13.2
|
||||
bitsandbytes: 0.50.0.dev0
|
||||
flash_attn : 2.x.x
|
||||
transformers: 5.x.x
|
||||
GPU : NVIDIA GeForce RTX 5080
|
||||
VRAM : 16.3 GB
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Training Config — human_chat_qlora.yml
|
||||
|
||||
Key settings tuned for RTX 5080 (16GB):
|
||||
|
||||
| Setting | Value | Notes |
|
||||
|---|---|---|
|
||||
| `sequence_len` | `2048` | 4096 OOMs during loss computation (logits x 128k vocab) |
|
||||
| `micro_batch_size` | `1` | Effective batch = micro x grad_accum = 8 |
|
||||
| `gradient_accumulation_steps` | `8` | Keeps effective batch size at 8 |
|
||||
| `adapter` | `qlora` | 4-bit via bitsandbytes compiled from source |
|
||||
| `attn_implementation` | `flash_attention_2` | Not the deprecated `flash_attention: true` |
|
||||
| `type` (datasets) | `chat_template` | Not the deprecated `sharegpt` |
|
||||
|
||||
Expected training metrics (RTX 5080, ~65k samples, 2 epochs):
|
||||
- VRAM: ~10–11 GB active, ~11 GB allocated
|
||||
- Training duration: ~3.5 hours
|
||||
- Initial eval loss: ~0.81, perplexity ~2.25
|
||||
- Final loss target: ~0.55–0.60
|
||||
|
||||
To push VRAM to ~14GB and improve training: set `micro_batch_size: 2` and `gradient_accumulation_steps: 4`.
|
||||
|
||||
---
|
||||
|
||||
## Common Pitfalls
|
||||
|
||||
| Problem | Cause | Fix |
|
||||
|---|---|---|
|
||||
| `externally-managed-environment` | System Python 3.13 blocks pip | Use conda env, never system pip |
|
||||
| `No module named torch` (flash-attn) | pip builds in isolated env | Use `--no-build-isolation` |
|
||||
| `CUDA_HOME not set` | CUDA toolkit not installed | `conda install cuda-toolkit` from nvidia channel |
|
||||
| `CUDA version mismatch 13.2 vs 12.8` | Conda nvcc is 13.2, torch was cu128 | Reinstall torch with `--index-url .../cu132` |
|
||||
| `torchaudio` not found for cu132 | No cu132 wheel exists | Skip torchaudio — not needed |
|
||||
| flash-attn compile is slow | Single-threaded by default | Set `MAX_JOBS=<cpu_count>` before pip install |
|
||||
| `nvcc fatal: Unsupported gpu architecture 'compute_50'` | bitsandbytes CMakeLists.txt hardcodes sm_50; CUDA 13.2 dropped it | Patch CMakeLists.txt (see step 8 above) |
|
||||
| `CUDA Capabilities Selected: 50;52;...` ignores -D flag | cmake >= 3.23 built-in arch list lacks sm_120; CMakeLists.txt overrides -D | Insert `set(CMAKE_CUDA_ARCHITECTURES_ALL 120)` before foreach loop |
|
||||
| `BackendUnavailable: scikit_build_core` | pip install of bnb triggers cmake rebuild | Copy .so directly to site-packages instead |
|
||||
| `torch.OutOfMemoryError` during eval | logits tensor (batch x 4096 x 128k vocab) too large | Set `sequence_len: 2048`, `micro_batch_size: 1` |
|
||||
| `type: sharegpt` deprecation warning | axolotl removed sharegpt type | Use `type: chat_template` with field mappings |
|
||||
| `flash_attention: true` deprecation | Old config key removed | Use `attn_implementation: flash_attention_2` |
|
||||
| Capybara dataset `field_messages null` | Capybara uses input/output format, not conversations | Switch to SlimOrca or OpenHermes-2.5 |
|
||||
| Ollama loads model mid-training | Ollama is enabled and receives a request | `sudo systemctl stop ollama` before training |
|
||||
| Training much slower than eval speed | The fast it/s on screen is the eval loop (forward only) | Normal — training includes backward pass and optimizer (~3.5h total) |
|
||||
| ubuntu-drivers installs wrong NVIDIA version | Multiple driver candidates available | Force with `apt install nvidia-driver-580` |
|
||||
10
TODO.md
Normal file
10
TODO.md
Normal file
@@ -0,0 +1,10 @@
|
||||
# todo list
|
||||
|
||||
- [] Validation of parameters for combinations that won't work
|
||||
|
||||
|
||||
|
||||
## things that are known not to work
|
||||
|
||||
- FSDP offload and gradient_checkpointing - https://github.com/pytorch/pytorch/issues/82203
|
||||
- adamw_bnb_8bit doesn't play well with FSDP offload
|
||||
367
_quarto.yml
367
_quarto.yml
@@ -1,367 +0,0 @@
|
||||
project:
|
||||
type: website
|
||||
pre-render:
|
||||
- docs/scripts/generate_config_docs.py
|
||||
- docs/scripts/generate_examples_docs.py
|
||||
|
||||
quartodoc:
|
||||
dir: docs/api
|
||||
package: axolotl
|
||||
title: API Reference
|
||||
parser: google
|
||||
|
||||
sections:
|
||||
- title: Core
|
||||
desc: Core functionality for training
|
||||
contents:
|
||||
- train
|
||||
- evaluate
|
||||
- datasets
|
||||
- convert
|
||||
- prompt_tokenizers
|
||||
- logging_config
|
||||
- core.builders.base
|
||||
- core.builders.causal
|
||||
- core.builders.rl
|
||||
- core.training_args
|
||||
- core.chat.messages
|
||||
- core.chat.format.chatml
|
||||
- core.chat.format.llama3x
|
||||
- core.chat.format.shared
|
||||
- core.datasets.chat
|
||||
- core.datasets.transforms.chat_builder
|
||||
- title: CLI
|
||||
desc: Command-line interface
|
||||
contents:
|
||||
- cli.main
|
||||
- cli.train
|
||||
- cli.evaluate
|
||||
- cli.args
|
||||
- cli.art
|
||||
- cli.checks
|
||||
- cli.config
|
||||
- cli.delinearize_llama4
|
||||
- cli.inference
|
||||
- cli.merge_lora
|
||||
- cli.merge_sharded_fsdp_weights
|
||||
- cli.preprocess
|
||||
- cli.quantize
|
||||
- cli.vllm_serve
|
||||
- cli.cloud.base
|
||||
- cli.cloud.modal_
|
||||
- cli.utils
|
||||
- cli.utils.args
|
||||
- cli.utils.fetch
|
||||
- cli.utils.load
|
||||
- cli.utils.sweeps
|
||||
- cli.utils.train
|
||||
- title: Trainers
|
||||
desc: Training implementations
|
||||
contents:
|
||||
- core.trainers.base
|
||||
- core.trainers.trl
|
||||
- core.trainers.mamba
|
||||
- core.trainers.dpo.trainer
|
||||
- core.trainers.grpo.trainer
|
||||
- core.trainers.grpo.sampler
|
||||
- core.trainers.utils
|
||||
- title: Model Loading
|
||||
desc: Functionality for loading and patching models, tokenizers, etc.
|
||||
contents:
|
||||
- loaders.model
|
||||
- loaders.tokenizer
|
||||
- loaders.processor
|
||||
- loaders.adapter
|
||||
- loaders.patch_manager
|
||||
- loaders.constants
|
||||
- title: Mixins
|
||||
desc: Mixin classes for augmenting trainers
|
||||
contents:
|
||||
- core.trainers.mixins.optimizer
|
||||
- core.trainers.mixins.rng_state_loader
|
||||
- core.trainers.mixins.scheduler
|
||||
- title: Context Managers
|
||||
desc: Context managers for altering trainer behaviors
|
||||
contents:
|
||||
- utils.ctx_managers.sequence_parallel
|
||||
- title: Prompt Strategies
|
||||
desc: Prompt formatting strategies
|
||||
contents:
|
||||
- prompt_strategies.base
|
||||
- prompt_strategies.chat_template
|
||||
- prompt_strategies.alpaca_chat
|
||||
- prompt_strategies.alpaca_instruct
|
||||
- prompt_strategies.alpaca_w_system
|
||||
- prompt_strategies.user_defined
|
||||
- prompt_strategies.llama2_chat
|
||||
- prompt_strategies.completion
|
||||
- prompt_strategies.input_output
|
||||
- prompt_strategies.stepwise_supervised
|
||||
- prompt_strategies.metharme
|
||||
- prompt_strategies.orcamini
|
||||
- prompt_strategies.pygmalion
|
||||
- prompt_strategies.messages.chat
|
||||
- prompt_strategies.dpo.chat_template
|
||||
- prompt_strategies.dpo.llama3
|
||||
- prompt_strategies.dpo.chatml
|
||||
- prompt_strategies.dpo.zephyr
|
||||
- prompt_strategies.dpo.user_defined
|
||||
- prompt_strategies.dpo.passthrough
|
||||
- prompt_strategies.kto.llama3
|
||||
- prompt_strategies.kto.chatml
|
||||
- prompt_strategies.kto.user_defined
|
||||
- prompt_strategies.orpo.chat_template
|
||||
- prompt_strategies.bradley_terry.llama3
|
||||
- title: Kernels
|
||||
desc: Low-level performance optimizations
|
||||
contents:
|
||||
- kernels.lora
|
||||
- kernels.geglu
|
||||
- kernels.swiglu
|
||||
- kernels.quantize
|
||||
- kernels.utils
|
||||
- title: Monkey Patches
|
||||
desc: Runtime patches for model optimizations
|
||||
contents:
|
||||
- monkeypatch.llama_attn_hijack_flash
|
||||
- monkeypatch.llama_attn_hijack_xformers
|
||||
- monkeypatch.mistral_attn_hijack_flash
|
||||
- monkeypatch.multipack
|
||||
- monkeypatch.relora
|
||||
- monkeypatch.lora_kernels
|
||||
- monkeypatch.utils
|
||||
- monkeypatch.btlm_attn_hijack_flash
|
||||
- monkeypatch.stablelm_attn_hijack_flash
|
||||
- monkeypatch.trainer_fsdp_optim
|
||||
- monkeypatch.transformers_fa_utils
|
||||
- monkeypatch.data.batch_dataset_fetcher
|
||||
- monkeypatch.mixtral
|
||||
- monkeypatch.gradient_checkpointing.offload_cpu
|
||||
- monkeypatch.gradient_checkpointing.offload_disk
|
||||
- title: Utils
|
||||
desc: Utility functions
|
||||
contents:
|
||||
- utils.tokenization
|
||||
- utils.chat_templates
|
||||
- utils.lora
|
||||
- utils.model_shard_quant
|
||||
- utils.bench
|
||||
- utils.freeze
|
||||
- utils.trainer
|
||||
- utils.schedulers
|
||||
- utils.distributed
|
||||
- utils.dict
|
||||
- utils.optimizers.adopt
|
||||
- utils.data.streaming
|
||||
- utils.data.sft
|
||||
- utils.quantization
|
||||
- title: Schemas
|
||||
desc: Pydantic data models for Axolotl config
|
||||
contents:
|
||||
- utils.schemas.config
|
||||
- utils.schemas.model
|
||||
- utils.schemas.training
|
||||
- utils.schemas.datasets
|
||||
- utils.schemas.peft
|
||||
- utils.schemas.trl
|
||||
- utils.schemas.multimodal
|
||||
- utils.schemas.integrations
|
||||
- utils.schemas.enums
|
||||
- utils.schemas.utils
|
||||
- title: Integrations
|
||||
desc: Third-party integrations and extensions
|
||||
contents:
|
||||
- integrations.base
|
||||
- integrations.cut_cross_entropy.args
|
||||
- integrations.grokfast.optimizer
|
||||
- integrations.kd.trainer
|
||||
- integrations.liger.args
|
||||
- integrations.lm_eval.args
|
||||
- integrations.spectrum.args
|
||||
- title: Common
|
||||
desc: Common utilities and shared functionality
|
||||
contents:
|
||||
- common.architectures
|
||||
- common.const
|
||||
- common.datasets
|
||||
- title: Models
|
||||
desc: Custom model implementations
|
||||
contents:
|
||||
- models.mamba.modeling_mamba
|
||||
- title: Data Processing
|
||||
desc: Data processing utilities
|
||||
contents:
|
||||
- utils.collators.core
|
||||
- utils.collators.batching
|
||||
- utils.collators.mamba
|
||||
- utils.collators.mm_chat
|
||||
- utils.samplers.multipack
|
||||
- title: Callbacks
|
||||
desc: Training callbacks
|
||||
contents:
|
||||
- utils.callbacks.perplexity
|
||||
- utils.callbacks.profiler
|
||||
- utils.callbacks.lisa
|
||||
- utils.callbacks.mlflow_
|
||||
- utils.callbacks.comet_
|
||||
- utils.callbacks.qat
|
||||
website:
|
||||
title: "Axolotl"
|
||||
description: "We make fine-tuning accessible, scalable, and fun"
|
||||
favicon: favicon.jpg
|
||||
|
||||
google-analytics: "G-9KYCVJBNMQ"
|
||||
|
||||
navbar:
|
||||
logo: image/axolotl_logo_digital_white.svg
|
||||
title: false
|
||||
background: dark
|
||||
pinned: false
|
||||
collapse: false
|
||||
tools:
|
||||
- icon: twitter
|
||||
href: https://twitter.com/axolotl_ai
|
||||
- icon: github
|
||||
href: https://github.com/axolotl-ai-cloud/axolotl/
|
||||
- icon: discord
|
||||
href: https://discord.gg/7m9sfhzaf3
|
||||
|
||||
sidebar:
|
||||
pinned: true
|
||||
collapse-level: 2
|
||||
style: docked
|
||||
contents:
|
||||
- text: Home
|
||||
href: index.qmd
|
||||
|
||||
- section: "Getting Started"
|
||||
contents:
|
||||
- docs/getting-started.qmd
|
||||
- docs/choosing_method.qmd
|
||||
- docs/installation.qmd
|
||||
- docs/inference.qmd
|
||||
- section: "Model Guides"
|
||||
contents:
|
||||
- docs/models/kimi-linear.qmd
|
||||
- docs/models/plano.qmd
|
||||
- docs/models/mimo.qmd
|
||||
- docs/models/internvl3_5.qmd
|
||||
- docs/models/olmo3.qmd
|
||||
- docs/models/trinity.qmd
|
||||
- docs/models/arcee.qmd
|
||||
- section: "Ministral3"
|
||||
contents:
|
||||
- docs/models/ministral3.qmd
|
||||
- docs/models/ministral3/think.qmd
|
||||
- docs/models/ministral3/vision.qmd
|
||||
- section: "Magistral"
|
||||
contents:
|
||||
- docs/models/magistral.qmd
|
||||
- docs/models/magistral/think.qmd
|
||||
- docs/models/magistral/vision.qmd
|
||||
- docs/models/ministral.qmd
|
||||
- docs/models/mistral-small.qmd
|
||||
- docs/models/voxtral.qmd
|
||||
- docs/models/devstral.qmd
|
||||
- docs/models/mistral.qmd
|
||||
- docs/models/llama-4.qmd
|
||||
- docs/models/llama-2.qmd
|
||||
- docs/models/qwen3-next.qmd
|
||||
- docs/models/qwen3.qmd
|
||||
- docs/models/gemma3n.qmd
|
||||
- docs/models/apertus.qmd
|
||||
- docs/models/gpt-oss.qmd
|
||||
- docs/models/seed-oss.qmd
|
||||
- docs/models/phi.qmd
|
||||
- docs/models/smolvlm2.qmd
|
||||
- docs/models/granite4.qmd
|
||||
- docs/models/LiquidAI.qmd
|
||||
- docs/models/hunyuan.qmd
|
||||
- docs/models/jamba.qmd
|
||||
- docs/models/orpheus.qmd
|
||||
|
||||
- docs/cli.qmd
|
||||
- docs/telemetry.qmd
|
||||
- docs/config-reference.qmd
|
||||
- text: "API Reference"
|
||||
href: docs/api
|
||||
|
||||
- section: "Dataset Formats"
|
||||
contents: docs/dataset-formats/*
|
||||
|
||||
- section: "Deployments"
|
||||
contents:
|
||||
- docs/docker.qmd
|
||||
- docs/multi-gpu.qmd
|
||||
- docs/multi-node.qmd
|
||||
- docs/ray-integration.qmd
|
||||
- docs/amd_hpc.qmd
|
||||
- docs/mac.qmd
|
||||
|
||||
- section: "How To Guides"
|
||||
contents:
|
||||
- docs/multimodal.qmd
|
||||
- docs/rlhf.qmd
|
||||
- docs/grpo.qmd
|
||||
- docs/ebft.qmd
|
||||
- docs/vllm_serving.qmd
|
||||
- docs/reward_modelling.qmd
|
||||
- docs/lr_groups.qmd
|
||||
- docs/lora_optims.qmd
|
||||
- docs/dataset_loading.qmd
|
||||
- docs/qat.qmd
|
||||
- docs/quantize.qmd
|
||||
- docs/1_58bit_finetuning.qmd
|
||||
- docs/optimizations.qmd
|
||||
|
||||
- section: "Core Concepts"
|
||||
contents:
|
||||
- docs/batch_vs_grad.qmd
|
||||
- docs/dataset_preprocessing.qmd
|
||||
- docs/streaming.qmd
|
||||
- docs/multipack.qmd
|
||||
- docs/mixed_precision.qmd
|
||||
- docs/optimizers.qmd
|
||||
- docs/attention.qmd
|
||||
|
||||
- section: "Advanced Features"
|
||||
contents:
|
||||
- docs/fsdp_qlora.qmd
|
||||
- docs/torchao.qmd
|
||||
- docs/custom_integrations.qmd
|
||||
- docs/sequence_parallelism.qmd
|
||||
- docs/gradient_checkpointing.qmd
|
||||
- docs/nd_parallelism.qmd
|
||||
- docs/expert_quantization.qmd
|
||||
|
||||
- section: "Troubleshooting"
|
||||
contents:
|
||||
- docs/faq.qmd
|
||||
- docs/training_stability.qmd
|
||||
- docs/debugging.qmd
|
||||
- docs/nccl.qmd
|
||||
|
||||
format:
|
||||
html:
|
||||
theme: darkly
|
||||
css: styles.css
|
||||
toc: true
|
||||
# Enable better handling of line breaks in markdown
|
||||
preserve-tabs: true
|
||||
html-math-method: mathjax
|
||||
# Improved markdown processing options
|
||||
md-extensions:
|
||||
- markdown_it
|
||||
- def_list
|
||||
- attr_list
|
||||
- fenced_divs
|
||||
- tables
|
||||
- html_admonition
|
||||
- lineblocks
|
||||
- fancy_lists
|
||||
# Control whitespace handling
|
||||
whitespace: preserve
|
||||
# Process newlines in paragraphs
|
||||
wrap: preserve
|
||||
# Better line break handling
|
||||
preserve-linebreaks: true
|
||||
@@ -1,208 +0,0 @@
|
||||
"""Benchmark for entropy_from_logits Triton kernel vs original chunked implementation.
|
||||
|
||||
Usage: CUDA_VISIBLE_DEVICES=0 python benchmarks/bench_entropy.py
|
||||
"""
|
||||
|
||||
import gc
|
||||
import statistics
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
|
||||
from axolotl.monkeypatch.trainer.utils import entropy_from_logits
|
||||
|
||||
V = 151936 # Qwen vocab
|
||||
WARMUP = 5
|
||||
BENCH_ITERS = 20
|
||||
MEM_ITERS = 10
|
||||
|
||||
|
||||
def entropy_from_logits_original(logits: torch.Tensor, chunk_size: int = 128):
|
||||
"""Original chunked implementation (reference)."""
|
||||
original_shape = logits.shape[:-1]
|
||||
num_classes = logits.shape[-1]
|
||||
flat_logits = logits.reshape(-1, num_classes)
|
||||
entropies = []
|
||||
for chunk in flat_logits.split(chunk_size, dim=0):
|
||||
logps = F.log_softmax(chunk, dim=-1)
|
||||
chunk_entropy = -(torch.exp(logps) * logps).sum(-1)
|
||||
entropies.append(chunk_entropy)
|
||||
return torch.cat(entropies, dim=0).reshape(original_shape)
|
||||
|
||||
|
||||
def _clean_gpu():
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
torch.cuda.reset_peak_memory_stats()
|
||||
torch.cuda.reset_accumulated_memory_stats()
|
||||
torch.cuda.synchronize()
|
||||
|
||||
|
||||
def profile_time(fn, logits, n_iters=BENCH_ITERS):
|
||||
for _ in range(WARMUP):
|
||||
out = fn(logits, chunk_size=128)
|
||||
del out
|
||||
torch.cuda.synchronize()
|
||||
|
||||
times = []
|
||||
for _ in range(n_iters):
|
||||
s = torch.cuda.Event(enable_timing=True)
|
||||
e = torch.cuda.Event(enable_timing=True)
|
||||
s.record()
|
||||
out = fn(logits, chunk_size=128)
|
||||
e.record()
|
||||
torch.cuda.synchronize()
|
||||
times.append(s.elapsed_time(e))
|
||||
del out
|
||||
return times
|
||||
|
||||
|
||||
def profile_memory(fn, logits, n_iters=MEM_ITERS):
|
||||
for _ in range(WARMUP):
|
||||
out = fn(logits, chunk_size=128)
|
||||
del out
|
||||
torch.cuda.synchronize()
|
||||
|
||||
peaks = []
|
||||
for _ in range(n_iters):
|
||||
_clean_gpu()
|
||||
base = torch.cuda.max_memory_allocated()
|
||||
out = fn(logits, chunk_size=128)
|
||||
torch.cuda.synchronize()
|
||||
peaks.append(torch.cuda.max_memory_allocated() - base)
|
||||
del out
|
||||
return [p / 1e6 for p in peaks]
|
||||
|
||||
|
||||
def fmt(values, unit=""):
|
||||
mean = statistics.mean(values)
|
||||
std = statistics.stdev(values) if len(values) > 1 else 0.0
|
||||
return f"{mean:8.2f} ± {std:5.2f} {unit} [min={min(values):.2f}, max={max(values):.2f}]"
|
||||
|
||||
|
||||
def benchmark_contiguous():
|
||||
print("=" * 60)
|
||||
print(
|
||||
f"CONTIGUOUS BENCHMARK (warmup={WARMUP}, time={BENCH_ITERS}, mem={MEM_ITERS})"
|
||||
)
|
||||
print("=" * 60)
|
||||
|
||||
configs = [
|
||||
(1, 2048),
|
||||
(1, 8192),
|
||||
(1, 16384),
|
||||
(4, 4096),
|
||||
(8, 2048),
|
||||
(16, 2048),
|
||||
(16, 4096),
|
||||
]
|
||||
|
||||
for B, L in configs:
|
||||
mem_gb = B * L * V * 2 / 1e9
|
||||
if mem_gb > 28:
|
||||
print(f"\n skip B={B}, L={L} ({mem_gb:.1f} GB)")
|
||||
continue
|
||||
|
||||
N = B * L
|
||||
print(f"\n{'─' * 60}")
|
||||
print(f"B={B:2d}, L={L:5d} ({N:6d} rows, logits {mem_gb:.2f} GB)")
|
||||
print(f"{'─' * 60}")
|
||||
|
||||
torch.manual_seed(42)
|
||||
logits = torch.randn(B, L, V, device="cuda", dtype=torch.bfloat16)
|
||||
|
||||
t_orig = profile_time(entropy_from_logits_original, logits)
|
||||
t_triton = profile_time(entropy_from_logits, logits)
|
||||
orig_mean = statistics.mean(t_orig)
|
||||
triton_mean = statistics.mean(t_triton)
|
||||
|
||||
print(" TIME (ms):")
|
||||
print(f" original: {fmt(t_orig, 'ms')}")
|
||||
print(f" triton: {fmt(t_triton, 'ms')}")
|
||||
print(f" speedup: {orig_mean / triton_mean:.2f}x")
|
||||
|
||||
m_orig = profile_memory(entropy_from_logits_original, logits)
|
||||
m_triton = profile_memory(entropy_from_logits, logits)
|
||||
orig_peak = statistics.mean(m_orig)
|
||||
triton_peak = statistics.mean(m_triton)
|
||||
|
||||
print(" MEMORY (peak overhead):")
|
||||
print(f" original: {fmt(m_orig, 'MB')}")
|
||||
print(f" triton: {fmt(m_triton, 'MB')}")
|
||||
print(f" saved: {orig_peak - triton_peak:.1f} MB")
|
||||
|
||||
del logits
|
||||
_clean_gpu()
|
||||
|
||||
|
||||
def benchmark_noncontiguous():
|
||||
print("\n" + "=" * 60)
|
||||
print(
|
||||
f"NON-CONTIGUOUS BENCHMARK (warmup={WARMUP}, time={BENCH_ITERS}, mem={MEM_ITERS})"
|
||||
)
|
||||
print("=" * 60)
|
||||
|
||||
configs = [
|
||||
(4, 2048, "transpose"),
|
||||
(4, 8192, "transpose"),
|
||||
(8, 2048, "transpose"),
|
||||
(4, 4096, "slice_batch"),
|
||||
]
|
||||
|
||||
for B, L, method in configs:
|
||||
torch.manual_seed(42)
|
||||
|
||||
if method == "transpose":
|
||||
raw = torch.randn(L, B, V, device="cuda", dtype=torch.bfloat16)
|
||||
logits_nc = raw.transpose(0, 1)
|
||||
raw_gb = L * B * V * 2 / 1e9
|
||||
elif method == "slice_batch":
|
||||
raw = torch.randn(B * 2, L, V, device="cuda", dtype=torch.bfloat16)
|
||||
logits_nc = raw[::2]
|
||||
raw_gb = B * 2 * L * V * 2 / 1e9
|
||||
else:
|
||||
continue
|
||||
|
||||
if raw_gb > 28:
|
||||
print(f"\n skip B={B}, L={L}, {method} ({raw_gb:.1f} GB)")
|
||||
del raw, logits_nc
|
||||
torch.cuda.empty_cache()
|
||||
continue
|
||||
|
||||
N = B * L
|
||||
print(f"\n{'─' * 60}")
|
||||
print(f"B={B}, L={L} {method} ({N} rows, raw {raw_gb:.2f} GB)")
|
||||
print(f"{'─' * 60}")
|
||||
|
||||
def original_with_copy(logits, chunk_size=128):
|
||||
return entropy_from_logits_original(
|
||||
logits.contiguous(), chunk_size=chunk_size
|
||||
)
|
||||
|
||||
t_orig = profile_time(original_with_copy, logits_nc)
|
||||
t_triton = profile_time(entropy_from_logits, logits_nc)
|
||||
orig_mean = statistics.mean(t_orig)
|
||||
triton_mean = statistics.mean(t_triton)
|
||||
|
||||
print(" TIME (ms):")
|
||||
print(f" orig+copy: {fmt(t_orig, 'ms')}")
|
||||
print(f" triton-strided:{fmt(t_triton, 'ms')}")
|
||||
print(f" speedup: {orig_mean / triton_mean:.2f}x")
|
||||
|
||||
m_orig = profile_memory(original_with_copy, logits_nc)
|
||||
m_triton = profile_memory(entropy_from_logits, logits_nc)
|
||||
orig_peak = statistics.mean(m_orig)
|
||||
triton_peak = statistics.mean(m_triton)
|
||||
|
||||
print(" MEMORY (peak overhead):")
|
||||
print(f" orig+copy: {fmt(m_orig, 'MB')}")
|
||||
print(f" triton-strided:{fmt(m_triton, 'MB')}")
|
||||
print(f" saved: {orig_peak - triton_peak:.1f} MB")
|
||||
|
||||
del raw, logits_nc
|
||||
_clean_gpu()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
benchmark_contiguous()
|
||||
benchmark_noncontiguous()
|
||||
@@ -1,284 +0,0 @@
|
||||
"""Benchmark for ScatterMoE LoRA Triton kernels.
|
||||
|
||||
Measures forward, backward dX, and backward dA/dB kernels at common MoE
|
||||
model shapes. Reports per-kernel timings, LoRA overhead vs base scatter2scatter,
|
||||
and full fwd+bwd autograd throughput.
|
||||
|
||||
Usage:
|
||||
CUDA_VISIBLE_DEVICES=0 python benchmarks/bench_scattermoe_lora.py
|
||||
CUDA_VISIBLE_DEVICES=0 python benchmarks/bench_scattermoe_lora.py --ranks 16 64
|
||||
CUDA_VISIBLE_DEVICES=0 python benchmarks/bench_scattermoe_lora.py --models Qwen/Qwen3.5-35B-A3B
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import gc
|
||||
import time
|
||||
from functools import partial
|
||||
|
||||
import torch
|
||||
|
||||
from axolotl.integrations.kernels.libs.scattermoe_lora.kernels import (
|
||||
lora_ops,
|
||||
ops as base_ops,
|
||||
)
|
||||
from axolotl.integrations.kernels.libs.scattermoe_lora.parallel_experts import (
|
||||
flatten_sort_count,
|
||||
)
|
||||
from axolotl.integrations.kernels.libs.scattermoe_lora.parallel_linear_lora import (
|
||||
ScatterMoELoRA,
|
||||
)
|
||||
|
||||
DEVICE = "cuda"
|
||||
DTYPE = torch.bfloat16
|
||||
WARMUP = 5
|
||||
ITERS = 20
|
||||
|
||||
# ─── Model configs ──────────────────────────────────────────────────────────
|
||||
|
||||
BUILTIN_CONFIGS = {
|
||||
"Qwen3.5-35B-A3B": (256, 2048, 512, 8), # E, H, I, k
|
||||
"Qwen3-30B-A3B": (128, 2048, 768, 8),
|
||||
"OLMoE-1B-7B": (64, 2048, 1024, 8),
|
||||
"Mixtral-8x7B": (8, 4096, 14336, 2),
|
||||
}
|
||||
|
||||
|
||||
def _resolve_config(spec):
|
||||
"""Resolve a model spec to (E, H, I, k). Accepts builtin names or HF IDs."""
|
||||
key = spec.lower().replace("/", "-")
|
||||
for name, cfg in BUILTIN_CONFIGS.items():
|
||||
if key in name.lower() or name.lower() in key:
|
||||
return name, cfg
|
||||
|
||||
from transformers import AutoConfig
|
||||
|
||||
hf_cfg = AutoConfig.from_pretrained(spec, trust_remote_code=True)
|
||||
if callable(getattr(hf_cfg, "get_text_config", None)):
|
||||
tc = hf_cfg.get_text_config()
|
||||
if hasattr(tc, "model_type") and tc.model_type != hf_cfg.model_type:
|
||||
hf_cfg = tc
|
||||
hidden = hf_cfg.hidden_size
|
||||
inter = getattr(hf_cfg, "moe_intermediate_size", None) or hf_cfg.intermediate_size
|
||||
experts = (
|
||||
getattr(hf_cfg, "num_experts", None)
|
||||
or getattr(hf_cfg, "num_local_experts", None)
|
||||
or getattr(hf_cfg, "n_routed_experts", None)
|
||||
)
|
||||
top_k = (
|
||||
getattr(hf_cfg, "num_experts_per_tok", None)
|
||||
or getattr(hf_cfg, "num_experts_per_token", None)
|
||||
or 2
|
||||
)
|
||||
name = spec.split("/")[-1]
|
||||
return name, (experts, hidden, inter, top_k)
|
||||
|
||||
|
||||
# ─── Benchmark helpers ──────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _clean():
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
torch.cuda.synchronize()
|
||||
|
||||
|
||||
def _bench(fn, warmup=WARMUP, iters=ITERS):
|
||||
for _ in range(warmup):
|
||||
fn()
|
||||
torch.cuda.synchronize()
|
||||
times = []
|
||||
for _ in range(iters):
|
||||
torch.cuda.synchronize()
|
||||
t0 = time.perf_counter()
|
||||
fn()
|
||||
torch.cuda.synchronize()
|
||||
times.append((time.perf_counter() - t0) * 1000)
|
||||
times.sort()
|
||||
return times[len(times) // 2]
|
||||
|
||||
|
||||
def _setup(num_experts, K, N, T, top_k, R):
|
||||
torch.manual_seed(42)
|
||||
x = torch.randn(T, K, device=DEVICE, dtype=DTYPE)
|
||||
W = torch.randn(num_experts, K, N, device=DEVICE, dtype=DTYPE) * 0.02
|
||||
lora_A = torch.randn(R * num_experts, K, device=DEVICE, dtype=DTYPE) * 0.01
|
||||
lora_B = torch.randn(N, R * num_experts, device=DEVICE, dtype=DTYPE) * 0.01
|
||||
logits = torch.randn(T, num_experts, device=DEVICE)
|
||||
_, top_idx = torch.topk(torch.softmax(logits, dim=-1), top_k, dim=-1)
|
||||
sei, ssi, eo = flatten_sort_count(top_idx, num_experts)
|
||||
gx = base_ops.group(x, ssi, fan_out=top_k)
|
||||
dy = torch.randn(gx.size(0), N, device=DEVICE, dtype=DTYPE)
|
||||
return x, W, lora_A, lora_B, sei, ssi, eo, gx, dy
|
||||
|
||||
|
||||
# ─── Kernel wrappers (avoid B023 loop-variable capture) ──────────────────────
|
||||
|
||||
|
||||
def _call_fwd(x, W, sei, ssi, top_k, lA, lB):
|
||||
return lora_ops.scatter2scatter_lora(
|
||||
X=x,
|
||||
W=W,
|
||||
sorted_expert_idxs=sei,
|
||||
sorted_scattered_idxs=ssi,
|
||||
k=top_k,
|
||||
lora_A=lA,
|
||||
lora_B=lB,
|
||||
scaling=2.0,
|
||||
)
|
||||
|
||||
|
||||
def _call_base(x, W, sei, ssi, top_k):
|
||||
return base_ops.scatter2scatter(
|
||||
X=x,
|
||||
W=W,
|
||||
sorted_expert_idxs=sei,
|
||||
sorted_scattered_idxs=ssi,
|
||||
k=top_k,
|
||||
)
|
||||
|
||||
|
||||
def _call_dx(dy, W, sei, ssi, lA, lB):
|
||||
return lora_ops.scatter2scatter_lora_dX(
|
||||
DY=dy,
|
||||
W=W,
|
||||
sorted_expert_idxs=sei,
|
||||
sorted_scattered_idxs=ssi,
|
||||
k=1,
|
||||
lora_A=lA,
|
||||
lora_B=lB,
|
||||
scaling=2.0,
|
||||
dy_grouped=True,
|
||||
dx_grouped=False,
|
||||
)
|
||||
|
||||
|
||||
def _call_bwd(dy, gx, lA, lB, eo, num_experts):
|
||||
return lora_ops.group_bwd_lora(
|
||||
DY=dy,
|
||||
X=gx,
|
||||
lora_A=lA,
|
||||
lora_B=lB,
|
||||
expert_offsets=eo,
|
||||
E=num_experts,
|
||||
scaling=2.0,
|
||||
)
|
||||
|
||||
|
||||
# ─── Main ────────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="ScatterMoE LoRA kernel benchmark")
|
||||
parser.add_argument(
|
||||
"--models",
|
||||
"-m",
|
||||
nargs="+",
|
||||
help="Model names or HF IDs (default: all builtins)",
|
||||
)
|
||||
parser.add_argument("--ranks", "-r", nargs="+", type=int, default=[16, 32, 64])
|
||||
parser.add_argument("--seq-len", "-T", type=int, default=2048)
|
||||
args = parser.parse_args()
|
||||
|
||||
T = args.seq_len
|
||||
print(f"GPU: {torch.cuda.get_device_name()}")
|
||||
print(f"T={T}, ranks={args.ranks}\n")
|
||||
|
||||
if args.models:
|
||||
configs = [_resolve_config(m) for m in args.models]
|
||||
else:
|
||||
configs = list(BUILTIN_CONFIGS.items())
|
||||
|
||||
for model_name, (num_experts, hidden, inter, top_k) in configs:
|
||||
print(f"{'=' * 70}")
|
||||
print(f" {model_name}: E={num_experts}, H={hidden}, I={inter}, k={top_k}")
|
||||
print(f"{'=' * 70}")
|
||||
|
||||
for R in args.ranks:
|
||||
for proj, K, N in [("gate_up", hidden, 2 * inter), ("down", inter, hidden)]:
|
||||
_clean()
|
||||
x, W, lA, lB, sei, ssi, eo, gx, dy = _setup(
|
||||
num_experts, K, N, T, top_k, R
|
||||
)
|
||||
|
||||
# Forward with LoRA (auto-dispatched: fused or split)
|
||||
dispatch = (
|
||||
"split"
|
||||
if (
|
||||
num_experts <= lora_ops._SPLIT_LORA_FWD_MAX_EXPERTS
|
||||
and K * N >= lora_ops._SPLIT_LORA_FWD_THRESHOLD
|
||||
)
|
||||
else "fused"
|
||||
)
|
||||
t_fwd = _bench(partial(_call_fwd, x, W, sei, ssi, top_k, lA, lB))
|
||||
t_base = _bench(partial(_call_base, x, W, sei, ssi, top_k))
|
||||
t_dx = _bench(partial(_call_dx, dy, W, sei, ssi, lA, lB))
|
||||
t_bwd = _bench(partial(_call_bwd, dy, gx, lA, lB, eo, num_experts))
|
||||
|
||||
total = t_fwd + t_dx + t_bwd
|
||||
overhead = t_fwd / t_base - 1 if t_base > 0 else 0
|
||||
|
||||
print(
|
||||
f" R={R:>2} {proj:<8} "
|
||||
f"fwd={t_fwd:>6.2f}ms [{dispatch}] "
|
||||
f"base={t_base:>6.2f}ms "
|
||||
f"(+{overhead * 100:.0f}%) "
|
||||
f"dx={t_dx:>6.2f}ms bwd={t_bwd:>6.2f}ms "
|
||||
f"total={total:>6.2f}ms"
|
||||
)
|
||||
|
||||
# Full autograd fwd+bwd with memory measurement
|
||||
x_ag = x.clone().requires_grad_(True)
|
||||
lA_ag = lA.clone().requires_grad_(True)
|
||||
lB_ag = lB.clone().requires_grad_(True)
|
||||
|
||||
def _run_autograd(
|
||||
_x=x_ag,
|
||||
_W=W,
|
||||
_k=top_k,
|
||||
_sei=sei,
|
||||
_ssi=ssi,
|
||||
_eo=eo,
|
||||
_lA=lA_ag,
|
||||
_lB=lB_ag,
|
||||
):
|
||||
out = ScatterMoELoRA.apply(
|
||||
_x,
|
||||
_W,
|
||||
_k,
|
||||
_sei,
|
||||
_ssi,
|
||||
_eo,
|
||||
_lA,
|
||||
_lB,
|
||||
2.0,
|
||||
None,
|
||||
None,
|
||||
False,
|
||||
False,
|
||||
True,
|
||||
False,
|
||||
)
|
||||
out.sum().backward()
|
||||
_x.grad = None
|
||||
_lA.grad = None
|
||||
_lB.grad = None
|
||||
|
||||
t_full = _bench(_run_autograd)
|
||||
|
||||
_clean()
|
||||
torch.cuda.reset_peak_memory_stats()
|
||||
mem_before = torch.cuda.memory_allocated()
|
||||
_run_autograd()
|
||||
torch.cuda.synchronize()
|
||||
mem_peak = torch.cuda.max_memory_allocated() - mem_before
|
||||
|
||||
print(
|
||||
f" full_fwd_bwd={t_full:>6.2f}ms "
|
||||
f"peak_delta={mem_peak / 1e6:>6.1f}MB"
|
||||
)
|
||||
|
||||
print()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1,191 +0,0 @@
|
||||
"""Benchmark for selective_log_softmax Triton kernel vs original implementation.
|
||||
|
||||
Usage: CUDA_VISIBLE_DEVICES=0 python benchmarks/bench_selective_logsoftmax.py
|
||||
"""
|
||||
|
||||
import gc
|
||||
import statistics
|
||||
|
||||
import torch
|
||||
|
||||
from axolotl.monkeypatch.trainer.utils import (
|
||||
selective_log_softmax,
|
||||
selective_log_softmax_original,
|
||||
)
|
||||
|
||||
V = 151936 # Qwen vocab
|
||||
WARMUP = 5
|
||||
BENCH_ITERS = 20
|
||||
MEM_ITERS = 10
|
||||
|
||||
|
||||
def _clean_gpu():
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
torch.cuda.reset_peak_memory_stats()
|
||||
torch.cuda.reset_accumulated_memory_stats()
|
||||
torch.cuda.synchronize()
|
||||
|
||||
|
||||
def profile_time(fn, args, n_iters=BENCH_ITERS):
|
||||
for _ in range(WARMUP):
|
||||
fn(*args)
|
||||
torch.cuda.synchronize()
|
||||
|
||||
times = []
|
||||
for _ in range(n_iters):
|
||||
s = torch.cuda.Event(enable_timing=True)
|
||||
e = torch.cuda.Event(enable_timing=True)
|
||||
s.record()
|
||||
fn(*args)
|
||||
e.record()
|
||||
torch.cuda.synchronize()
|
||||
times.append(s.elapsed_time(e))
|
||||
return times
|
||||
|
||||
|
||||
def profile_memory(fn, args, n_iters=MEM_ITERS):
|
||||
for _ in range(WARMUP):
|
||||
out = fn(*args)
|
||||
del out
|
||||
torch.cuda.synchronize()
|
||||
|
||||
peaks = []
|
||||
for _ in range(n_iters):
|
||||
_clean_gpu()
|
||||
base = torch.cuda.max_memory_allocated()
|
||||
out = fn(*args)
|
||||
torch.cuda.synchronize()
|
||||
peaks.append(torch.cuda.max_memory_allocated() - base)
|
||||
del out
|
||||
return [p / 1e6 for p in peaks]
|
||||
|
||||
|
||||
def fmt(values, unit=""):
|
||||
mean = statistics.mean(values)
|
||||
std = statistics.stdev(values) if len(values) > 1 else 0.0
|
||||
return f"{mean:8.2f} ± {std:5.2f} {unit} [min={min(values):.2f}, max={max(values):.2f}]"
|
||||
|
||||
|
||||
def benchmark_forward():
|
||||
print("=" * 60)
|
||||
print(f"FORWARD BENCHMARK (warmup={WARMUP}, time={BENCH_ITERS}, mem={MEM_ITERS})")
|
||||
print("=" * 60)
|
||||
|
||||
configs = [
|
||||
(1, 2048),
|
||||
(1, 8192),
|
||||
(4, 4096),
|
||||
(8, 2048),
|
||||
(16, 2048),
|
||||
(16, 4096),
|
||||
]
|
||||
|
||||
for B, L in configs:
|
||||
mem_gb = B * L * V * 2 / 1e9
|
||||
if mem_gb > 28:
|
||||
print(f"\n skip B={B}, L={L} ({mem_gb:.1f} GB)")
|
||||
continue
|
||||
|
||||
N = B * L
|
||||
print(f"\n{'─' * 60}")
|
||||
print(f"B={B:2d}, L={L:5d} ({N:6d} rows, logits {mem_gb:.2f} GB)")
|
||||
print(f"{'─' * 60}")
|
||||
|
||||
torch.manual_seed(42)
|
||||
logits = torch.randn(B, L, V, device="cuda", dtype=torch.bfloat16)
|
||||
index = torch.randint(0, V, (B, L), device="cuda")
|
||||
|
||||
t_orig = profile_time(selective_log_softmax_original, (logits, index))
|
||||
t_triton = profile_time(selective_log_softmax, (logits, index))
|
||||
orig_mean = statistics.mean(t_orig)
|
||||
triton_mean = statistics.mean(t_triton)
|
||||
|
||||
print(" TIME (ms):")
|
||||
print(f" original: {fmt(t_orig, 'ms')}")
|
||||
print(f" triton: {fmt(t_triton, 'ms')}")
|
||||
print(f" speedup: {orig_mean / triton_mean:.2f}x")
|
||||
|
||||
m_orig = profile_memory(selective_log_softmax_original, (logits, index))
|
||||
m_triton = profile_memory(selective_log_softmax, (logits, index))
|
||||
orig_peak = statistics.mean(m_orig)
|
||||
triton_peak = statistics.mean(m_triton)
|
||||
|
||||
print(" MEMORY (peak overhead):")
|
||||
print(f" original: {fmt(m_orig, 'MB')}")
|
||||
print(f" triton: {fmt(m_triton, 'MB')}")
|
||||
print(f" saved: {orig_peak - triton_peak:.1f} MB")
|
||||
|
||||
del logits, index
|
||||
_clean_gpu()
|
||||
|
||||
|
||||
def benchmark_backward():
|
||||
print("\n" + "=" * 60)
|
||||
print(f"FWD+BWD BENCHMARK (warmup={WARMUP}, time={BENCH_ITERS}, mem={MEM_ITERS})")
|
||||
print("=" * 60)
|
||||
|
||||
configs = [
|
||||
(1, 2048),
|
||||
(1, 8192),
|
||||
(4, 4096),
|
||||
(8, 2048),
|
||||
(16, 2048),
|
||||
(16, 4096),
|
||||
]
|
||||
|
||||
def fwd_bwd_original(logits, index):
|
||||
logits.grad = None
|
||||
out = selective_log_softmax_original(logits, index)
|
||||
out.sum().backward()
|
||||
|
||||
def fwd_bwd_triton(logits, index):
|
||||
logits.grad = None
|
||||
out = selective_log_softmax(logits, index)
|
||||
out.sum().backward()
|
||||
|
||||
for B, L in configs:
|
||||
mem_gb = B * L * V * 2 / 1e9
|
||||
if mem_gb > 20:
|
||||
print(f"\n skip B={B}, L={L} ({mem_gb:.1f} GB, need room for grads)")
|
||||
continue
|
||||
|
||||
N = B * L
|
||||
print(f"\n{'─' * 60}")
|
||||
print(f"B={B:2d}, L={L:5d} ({N:6d} rows, logits {mem_gb:.2f} GB)")
|
||||
print(f"{'─' * 60}")
|
||||
|
||||
torch.manual_seed(42)
|
||||
logits_orig = torch.randn(
|
||||
B, L, V, device="cuda", dtype=torch.bfloat16, requires_grad=True
|
||||
)
|
||||
logits_tri = logits_orig.detach().clone().requires_grad_(True)
|
||||
index = torch.randint(0, V, (B, L), device="cuda")
|
||||
|
||||
t_orig = profile_time(fwd_bwd_original, (logits_orig, index))
|
||||
t_triton = profile_time(fwd_bwd_triton, (logits_tri, index))
|
||||
orig_mean = statistics.mean(t_orig)
|
||||
triton_mean = statistics.mean(t_triton)
|
||||
|
||||
print(" FWD+BWD TIME (ms):")
|
||||
print(f" original: {fmt(t_orig, 'ms')}")
|
||||
print(f" triton: {fmt(t_triton, 'ms')}")
|
||||
print(f" speedup: {orig_mean / triton_mean:.2f}x")
|
||||
|
||||
m_orig = profile_memory(fwd_bwd_original, (logits_orig, index))
|
||||
m_triton = profile_memory(fwd_bwd_triton, (logits_tri, index))
|
||||
orig_peak = statistics.mean(m_orig)
|
||||
triton_peak = statistics.mean(m_triton)
|
||||
|
||||
print(" FWD+BWD MEMORY (peak overhead):")
|
||||
print(f" original: {fmt(m_orig, 'MB')}")
|
||||
print(f" triton: {fmt(m_triton, 'MB')}")
|
||||
print(f" saved: {orig_peak - triton_peak:.1f} MB")
|
||||
|
||||
del logits_orig, logits_tri, index
|
||||
_clean_gpu()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
benchmark_forward()
|
||||
benchmark_backward()
|
||||
@@ -1,55 +0,0 @@
|
||||
FROM axolotlai/axolotl-base-uv:{{ BASE_TAG }}
|
||||
|
||||
ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
|
||||
ENV AXOLOTL_EXTRAS="{{ AXOLOTL_EXTRAS }}"
|
||||
ENV AXOLOTL_ARGS="{{ AXOLOTL_ARGS }}"
|
||||
ENV CUDA="{{ CUDA }}"
|
||||
ENV PYTORCH_VERSION="{{ PYTORCH_VERSION }}"
|
||||
ENV GITHUB_REF="{{ GITHUB_REF }}"
|
||||
ENV GITHUB_SHA="{{ GITHUB_SHA }}"
|
||||
ENV NIGHTLY_BUILD="{{ NIGHTLY_BUILD }}"
|
||||
ENV HF_HOME="{{ HF_HOME }}"
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --allow-change-held-packages vim curl nano zstd libnccl2 libnccl-dev ibverbs-providers ibverbs-utils infiniband-diags librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm
|
||||
|
||||
WORKDIR /workspace
|
||||
|
||||
RUN git clone --depth=1 https://github.com/axolotl-ai-cloud/axolotl.git
|
||||
|
||||
WORKDIR /workspace/axolotl
|
||||
|
||||
RUN git fetch origin +$GITHUB_REF && \
|
||||
git checkout FETCH_HEAD
|
||||
|
||||
RUN uv pip install packaging==26.0 setuptools==78.1.1
|
||||
RUN uv pip install torchvision
|
||||
RUN uv pip uninstall causal_conv1d
|
||||
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
|
||||
uv pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
|
||||
else \
|
||||
uv pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray] $AXOLOTL_ARGS; \
|
||||
fi
|
||||
|
||||
# Override with nightly HF packages for nightly builds
|
||||
RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
|
||||
uv pip install --no-deps \
|
||||
"transformers @ git+https://github.com/huggingface/transformers.git@main" \
|
||||
"peft @ git+https://github.com/huggingface/peft.git@main" \
|
||||
"accelerate @ git+https://github.com/huggingface/accelerate.git@main" \
|
||||
"trl @ git+https://github.com/huggingface/trl.git@main" \
|
||||
"datasets @ git+https://github.com/huggingface/datasets.git@main"; \
|
||||
fi
|
||||
|
||||
RUN python scripts/cutcrossentropy_install.py --uv | sh
|
||||
|
||||
# So we can test the Docker image
|
||||
RUN uv pip install black mypy pre-commit types-requests quartodoc jupyter blobfile tiktoken \
|
||||
codecov codecov-cli pytest pytest-cov pytest-retry pytest-sugar pytest-xdist tbparse
|
||||
|
||||
# fix so that git fetch/pull from remote works
|
||||
RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
|
||||
git config --get remote.origin.fetch
|
||||
|
||||
# helper for huggingface-login cli
|
||||
RUN git config --global credential.helper store
|
||||
73
cicd/cicd.sh
73
cicd/cicd.sh
@@ -1,73 +0,0 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
python -c "import torch; assert '$PYTORCH_VERSION' in torch.__version__, f'Expected torch $PYTORCH_VERSION but got {torch.__version__}'"
|
||||
|
||||
set -o pipefail
|
||||
for i in 1 2 3; do
|
||||
if curl --silent --show-error --fail -L \
|
||||
https://axolotl-ci.b-cdn.net/hf-cache.tar.zst \
|
||||
| tar -xpf - -C "${HF_HOME}/hub/" --use-compress-program unzstd --strip-components=1; then
|
||||
echo "HF cache extracted successfully"
|
||||
break
|
||||
fi
|
||||
echo "Attempt $i failed, cleaning up and retrying in 15s..."
|
||||
rm -rf "${HF_HOME}/hub/"*
|
||||
sleep 15
|
||||
done
|
||||
# hf download "NousResearch/Meta-Llama-3-8B"
|
||||
# hf download "NousResearch/Meta-Llama-3-8B-Instruct"
|
||||
# hf download "microsoft/Phi-4-reasoning"
|
||||
# hf download "microsoft/Phi-3.5-mini-instruct"
|
||||
# hf download "microsoft/Phi-3-medium-128k-instruct"
|
||||
|
||||
# Run unit tests with initial coverage report
|
||||
pytest -v --durations=10 -n8 \
|
||||
--ignore=tests/e2e/ \
|
||||
--ignore=tests/patched/ \
|
||||
--ignore=tests/cli \
|
||||
/workspace/axolotl/tests/ \
|
||||
--cov=axolotl
|
||||
|
||||
# Run lora kernels tests with coverage append
|
||||
pytest -v --durations=10 \
|
||||
/workspace/axolotl/tests/e2e/patched/lora_kernels \
|
||||
--cov=axolotl \
|
||||
--cov-append
|
||||
|
||||
# Run patched tests excluding lora kernels with coverage append
|
||||
pytest --full-trace -vvv --durations=10 \
|
||||
--ignore=tests/e2e/patched/lora_kernels \
|
||||
/workspace/axolotl/tests/e2e/patched \
|
||||
--cov=axolotl \
|
||||
--cov-append
|
||||
|
||||
# Run solo tests with coverage append
|
||||
pytest -v --durations=10 -n1 \
|
||||
/workspace/axolotl/tests/e2e/solo/ \
|
||||
--cov=axolotl \
|
||||
--cov-append
|
||||
|
||||
# Run integration tests with coverage append
|
||||
pytest -v --durations=10 \
|
||||
/workspace/axolotl/tests/e2e/integrations/ \
|
||||
--cov=axolotl \
|
||||
--cov-append
|
||||
|
||||
pytest -v --durations=10 /workspace/axolotl/tests/cli \
|
||||
--cov=axolotl \
|
||||
--cov-append
|
||||
|
||||
# Run remaining e2e tests with coverage append and final report
|
||||
pytest -v --durations=10 \
|
||||
--ignore=tests/e2e/solo/ \
|
||||
--ignore=tests/e2e/patched/ \
|
||||
--ignore=tests/e2e/multigpu/ \
|
||||
--ignore=tests/e2e/integrations/ \
|
||||
--ignore=tests/cli \
|
||||
/workspace/axolotl/tests/e2e/ \
|
||||
--cov=axolotl \
|
||||
--cov-append \
|
||||
--cov-report=xml:e2e-coverage.xml
|
||||
|
||||
codecov upload-process -t $CODECOV_TOKEN -f e2e-coverage.xml -F e2e,pytorch-${PYTORCH_VERSION} || true
|
||||
@@ -1,19 +0,0 @@
|
||||
"""Modal app to run axolotl GPU cleanup"""
|
||||
|
||||
from .single_gpu import VOLUME_CONFIG, app, cicd_image, run_cmd
|
||||
|
||||
|
||||
@app.function(
|
||||
image=cicd_image,
|
||||
timeout=60 * 60,
|
||||
cpu=8.0,
|
||||
memory=131072,
|
||||
volumes=VOLUME_CONFIG,
|
||||
)
|
||||
def cleanup():
|
||||
run_cmd("./cicd/cleanup.sh", "/workspace/axolotl")
|
||||
|
||||
|
||||
@app.local_entrypoint()
|
||||
def main():
|
||||
cleanup.remote()
|
||||
@@ -1,6 +0,0 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
# cleanup old cache files for datasets processing and intermediate mappings
|
||||
find /workspace/data/huggingface-cache/hub/datasets -name "cache-*" -type f -mtime +1 -exec rm {} \;
|
||||
find /workspace/data/huggingface-cache/hub/datasets -name "*.lock" -type f -mtime +1 -exec rm {} \;
|
||||
@@ -1,20 +0,0 @@
|
||||
"""Modal app to run axolotl GPU tests"""
|
||||
|
||||
from .single_gpu import GPU_CONFIG, VOLUME_CONFIG, app, cicd_image, run_cmd
|
||||
|
||||
|
||||
@app.function(
|
||||
image=cicd_image,
|
||||
gpu=GPU_CONFIG,
|
||||
timeout=120 * 60, # 90 min
|
||||
cpu=8.0,
|
||||
memory=131072,
|
||||
volumes=VOLUME_CONFIG,
|
||||
)
|
||||
def cicd_pytest():
|
||||
run_cmd("./cicd/cicd.sh", "/workspace/axolotl")
|
||||
|
||||
|
||||
@app.local_entrypoint()
|
||||
def main():
|
||||
cicd_pytest.remote()
|
||||
@@ -1,85 +0,0 @@
|
||||
"""
|
||||
modal application to run axolotl gpu tests in Modal
|
||||
"""
|
||||
|
||||
import os
|
||||
import pathlib
|
||||
import tempfile
|
||||
|
||||
import jinja2
|
||||
import modal
|
||||
from jinja2 import select_autoescape
|
||||
from modal import App, Image
|
||||
|
||||
cicd_path = pathlib.Path(__file__).parent.resolve()
|
||||
|
||||
template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
|
||||
template_env = jinja2.Environment(
|
||||
loader=template_loader, autoescape=select_autoescape()
|
||||
)
|
||||
dockerfile = os.environ.get("E2E_DOCKERFILE", "Dockerfile-uv.jinja")
|
||||
df_template = template_env.get_template(dockerfile)
|
||||
|
||||
df_args = {
|
||||
"AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
|
||||
"AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
|
||||
"PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.6.0"),
|
||||
"BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu126-2.6.0"),
|
||||
"CUDA": os.environ.get("CUDA", "126"),
|
||||
"GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
|
||||
"GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
|
||||
"NIGHTLY_BUILD": os.environ.get("NIGHTLY_BUILD", ""),
|
||||
"CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""),
|
||||
"HF_HOME": "/workspace/data/huggingface-cache/hub",
|
||||
"PYTHONUNBUFFERED": os.environ.get("PYTHONUNBUFFERED", "1"),
|
||||
"DEEPSPEED_LOG_LEVEL": os.environ.get("DEEPSPEED_LOG_LEVEL", "WARNING"),
|
||||
}
|
||||
|
||||
dockerfile_contents = df_template.render(**df_args)
|
||||
|
||||
temp_dir = tempfile.mkdtemp()
|
||||
with open(pathlib.Path(temp_dir) / "Dockerfile", "w", encoding="utf-8") as f:
|
||||
f.write(dockerfile_contents)
|
||||
|
||||
cicd_image = Image.from_dockerfile(
|
||||
pathlib.Path(temp_dir) / "Dockerfile",
|
||||
force_build=True,
|
||||
gpu="A10G",
|
||||
).env(df_args)
|
||||
|
||||
app = App("Axolotl CI/CD", secrets=[])
|
||||
|
||||
hf_cache_volume = modal.Volume.from_name(
|
||||
"axolotl-ci-hf-hub-cache", create_if_missing=True
|
||||
)
|
||||
VOLUME_CONFIG = {
|
||||
"/workspace/data/huggingface-cache/hub": hf_cache_volume,
|
||||
}
|
||||
|
||||
N_GPUS = int(os.environ.get("N_GPUS", 2))
|
||||
GPU_CONFIG = f"H100:{N_GPUS}"
|
||||
|
||||
|
||||
def run_cmd(cmd: str, run_folder: str):
|
||||
import subprocess # nosec
|
||||
|
||||
# Propagate errors from subprocess.
|
||||
if exit_code := subprocess.call(cmd.split(), cwd=run_folder): # nosec
|
||||
exit(exit_code)
|
||||
|
||||
|
||||
@app.function(
|
||||
image=cicd_image,
|
||||
gpu=GPU_CONFIG,
|
||||
timeout=120 * 60,
|
||||
cpu=16.0,
|
||||
memory=131072 * N_GPUS,
|
||||
volumes=VOLUME_CONFIG,
|
||||
)
|
||||
def cicd_pytest():
|
||||
run_cmd("./cicd/multigpu.sh", "/workspace/axolotl")
|
||||
|
||||
|
||||
@app.local_entrypoint()
|
||||
def main():
|
||||
cicd_pytest.remote()
|
||||
@@ -1,25 +0,0 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
# Only run two tests at a time to avoid OOM on GPU (with coverage collection)
|
||||
pytest -v --durations=10 -n2 --maxfail=3 \
|
||||
--ignore=/workspace/axolotl/tests/e2e/multigpu/solo/ \
|
||||
--ignore=/workspace/axolotl/tests/e2e/multigpu/patched/ \
|
||||
/workspace/axolotl/tests/e2e/multigpu/ \
|
||||
--cov=axolotl
|
||||
|
||||
# Run solo tests with coverage append
|
||||
pytest -v --durations=10 -n1 \
|
||||
/workspace/axolotl/tests/e2e/multigpu/solo/ \
|
||||
--cov=axolotl \
|
||||
--cov-append
|
||||
|
||||
pytest -v --durations=10 -n1 /workspace/axolotl/tests/e2e/multigpu/patched/ \
|
||||
--cov=axolotl \
|
||||
--cov-append \
|
||||
--cov-report=xml:multigpu-coverage.xml
|
||||
|
||||
# Upload coverage to Codecov if CODECOV_TOKEN is available
|
||||
if [ -n "$CODECOV_TOKEN" ]; then
|
||||
codecov upload-process -t "${CODECOV_TOKEN}" -f multigpu-coverage.xml -F multigpu,docker-tests,pytorch-${PYTORCH_VERSION} || true
|
||||
fi
|
||||
@@ -1,73 +0,0 @@
|
||||
"""Modal app to run axolotl GPU tests"""
|
||||
|
||||
import os
|
||||
import pathlib
|
||||
import tempfile
|
||||
|
||||
import jinja2
|
||||
import modal
|
||||
import modal.experimental
|
||||
from jinja2 import select_autoescape
|
||||
from modal import App
|
||||
|
||||
cicd_path = pathlib.Path(__file__).parent.resolve()
|
||||
|
||||
template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
|
||||
template_env = jinja2.Environment(
|
||||
loader=template_loader, autoescape=select_autoescape()
|
||||
)
|
||||
dockerfile = os.environ.get("E2E_DOCKERFILE", "Dockerfile-uv.jinja")
|
||||
df_template = template_env.get_template(dockerfile)
|
||||
|
||||
df_args = {
|
||||
"AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
|
||||
"AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
|
||||
"PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.6.0"),
|
||||
"BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu126-2.6.0"),
|
||||
"CUDA": os.environ.get("CUDA", "126"),
|
||||
"GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
|
||||
"GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
|
||||
"NIGHTLY_BUILD": os.environ.get("NIGHTLY_BUILD", ""),
|
||||
"CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""),
|
||||
"HF_HOME": "/workspace/data/huggingface-cache/hub",
|
||||
"PYTHONUNBUFFERED": os.environ.get("PYTHONUNBUFFERED", "1"),
|
||||
"DEEPSPEED_LOG_LEVEL": os.environ.get("DEEPSPEED_LOG_LEVEL", "WARNING"),
|
||||
}
|
||||
|
||||
dockerfile_contents = df_template.render(**df_args)
|
||||
|
||||
temp_dir = tempfile.mkdtemp()
|
||||
with open(pathlib.Path(temp_dir) / "Dockerfile", "w", encoding="utf-8") as f:
|
||||
f.write(dockerfile_contents)
|
||||
|
||||
cicd_image = modal.experimental.raw_dockerfile_image(
|
||||
pathlib.Path(temp_dir) / "Dockerfile",
|
||||
# context_mount=None,
|
||||
force_build=True,
|
||||
# gpu="A10G",
|
||||
).env(df_args)
|
||||
|
||||
app = App("Axolotl CI/CD", secrets=[])
|
||||
|
||||
hf_cache_volume = modal.Volume.from_name(
|
||||
"axolotl-ci-hf-hub-cache", create_if_missing=True
|
||||
)
|
||||
VOLUME_CONFIG = {
|
||||
"/workspace/data/huggingface-cache/hub": hf_cache_volume,
|
||||
}
|
||||
|
||||
N_GPUS = int(os.environ.get("N_GPUS", 1))
|
||||
GPU_TYPE = os.environ.get("GPU_TYPE", "L40S")
|
||||
GPU_CONFIG = f"{GPU_TYPE}:{N_GPUS}"
|
||||
|
||||
|
||||
def run_cmd(cmd: str, run_folder: str):
|
||||
import subprocess # nosec
|
||||
|
||||
sp_env = os.environ.copy()
|
||||
sp_env["AXOLOTL_DATASET_NUM_PROC"] = "8"
|
||||
|
||||
# Propagate errors from subprocess.
|
||||
exit_code = subprocess.call(cmd.split(), cwd=run_folder, env=sp_env) # nosec
|
||||
if exit_code:
|
||||
raise RuntimeError(f"Command '{cmd}' failed with exit code {exit_code}")
|
||||
58
codecov.yml
58
codecov.yml
@@ -1,58 +0,0 @@
|
||||
codecov:
|
||||
require_ci_to_pass: yes
|
||||
notify:
|
||||
wait_for_ci: true
|
||||
|
||||
coverage:
|
||||
precision: 2
|
||||
round: down
|
||||
range: "70...100"
|
||||
status:
|
||||
project:
|
||||
default:
|
||||
# basic
|
||||
target: auto
|
||||
threshold: 1%
|
||||
base: auto
|
||||
# advanced
|
||||
branches: null
|
||||
if_no_uploads: error
|
||||
if_not_found: success
|
||||
if_ci_failed: error
|
||||
only_pulls: true
|
||||
flags: null
|
||||
paths: null
|
||||
informational: true
|
||||
patch:
|
||||
default:
|
||||
# basic
|
||||
target: auto
|
||||
threshold: 1%
|
||||
base: auto
|
||||
# advanced
|
||||
branches: null
|
||||
if_no_uploads: error
|
||||
if_not_found: success
|
||||
if_ci_failed: error
|
||||
only_pulls: false
|
||||
flags: null
|
||||
paths: null
|
||||
informational: true
|
||||
|
||||
parsers:
|
||||
gcov:
|
||||
branch_detection:
|
||||
conditional: yes
|
||||
loop: yes
|
||||
method: no
|
||||
macro: no
|
||||
|
||||
comment:
|
||||
layout: "reach,diff,flags,files,footer"
|
||||
behavior: default
|
||||
require_changes: no
|
||||
require_base: no
|
||||
require_head: yes
|
||||
|
||||
github_checks:
|
||||
annotations: false
|
||||
24
data/README.md
Normal file
24
data/README.md
Normal file
@@ -0,0 +1,24 @@
|
||||
|
||||
## Download some datasets
|
||||
```shell
|
||||
curl https://raw.githubusercontent.com/tloen/alpaca-lora/main/alpaca_data_gpt4.json -o data/raw/alpaca_data_gpt4.json
|
||||
curl https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json -L -o data/raw/vicuna_cleaned.json
|
||||
curl https://github.com/teknium1/GPTeacher/blob/main/Instruct/gpt4-instruct-similarity-0.6-dataset.json?raw=true -L -o data/raw/gpt4-instruct-similarity-0.6-dataset.json
|
||||
curl https://github.com/teknium1/GPTeacher/blob/main/Roleplay/roleplay-similarity_0.6-instruct-dataset.json?raw=true -L -o data/raw/roleplay-similarity_0.6-instruct-dataset.json
|
||||
```
|
||||
|
||||
## Convert the JSON data files to JSONL.
|
||||
|
||||
```shell
|
||||
python3 ./scripts/alpaca_json_to_jsonl.py --file data/alpaca_data_gpt4.json --output data/alpaca_data_gpt4.jsonl
|
||||
python3 ./scripts/alpaca_json_to_jsonl.py --file data/raw/vicuna_cleaned.json --output data/vicuna_cleaned.jsonl
|
||||
python3 ./scripts/alpaca_json_to_jsonl.py --file data/raw/roleplay-similarity_0.6-instruct-dataset.json --output data/roleplay-similarity_0.6-instruct-dataset.jsonl
|
||||
python3 ./scripts/alpaca_json_to_jsonl.py --file data/raw/gpt4-instruct-similarity-0.6-dataset.json --output data/gpt4-instruct-similarity-0.6-dataset.jsonl
|
||||
```
|
||||
---
|
||||
|
||||
Using JSONL makes it easier to subset the data if you want a smaller training set, i.e get 2000 random examples.
|
||||
|
||||
```shell
|
||||
shuf -n2000 data/vicuna_cleaned.jsonl > data/vicuna_cleaned.subset0.jsonl
|
||||
```
|
||||
1
data/raw/.gitignore
vendored
Normal file
1
data/raw/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
**
|
||||
57
deepspeed/zero3.json
Normal file
57
deepspeed/zero3.json
Normal file
@@ -0,0 +1,57 @@
|
||||
{
|
||||
"zero_optimization": {
|
||||
"stage": 3,
|
||||
"offload_optimizer": {
|
||||
"device": "cpu",
|
||||
"pin_memory": true
|
||||
},
|
||||
"offload_param": {
|
||||
"device": "cpu",
|
||||
"pin_memory": true
|
||||
},
|
||||
"overlap_comm": true,
|
||||
"contiguous_gradients": true,
|
||||
"sub_group_size": 0,
|
||||
"reduce_bucket_size": "auto",
|
||||
"stage3_prefetch_bucket_size": "auto",
|
||||
"stage3_param_persistence_threshold": "auto",
|
||||
"stage3_max_live_parameters": 0,
|
||||
"stage3_max_reuse_distance": 0,
|
||||
"stage3_gather_16bit_weights_on_model_save": true
|
||||
},
|
||||
"bf16": {
|
||||
"enabled": "auto"
|
||||
},
|
||||
"fp16": {
|
||||
"enabled": "auto",
|
||||
"auto_cast": false,
|
||||
"loss_scale": 0,
|
||||
"initial_scale_power": 32,
|
||||
"loss_scale_window": 1000,
|
||||
"hysteresis": 2,
|
||||
"min_loss_scale": 1
|
||||
},
|
||||
"optimizer": {
|
||||
"type": "AdamW",
|
||||
"params": {
|
||||
"lr": "auto",
|
||||
"betas": [
|
||||
0.9,
|
||||
0.95
|
||||
],
|
||||
"eps": 1e-8,
|
||||
"weight_decay": "auto"
|
||||
}
|
||||
},
|
||||
"scheduler": {
|
||||
"type": "WarmupLR",
|
||||
"params": {
|
||||
"warmup_min_lr": "auto",
|
||||
"warmup_max_lr": "auto",
|
||||
"warmup_num_steps": "auto"
|
||||
}
|
||||
},
|
||||
"train_batch_size": "auto",
|
||||
"train_micro_batch_size_per_gpu": "auto",
|
||||
"wall_clock_breakdown": false
|
||||
}
|
||||
@@ -1,23 +0,0 @@
|
||||
{
|
||||
"zero_optimization": {
|
||||
"stage": 1,
|
||||
"overlap_comm": true
|
||||
},
|
||||
"bf16": {
|
||||
"enabled": "auto"
|
||||
},
|
||||
"fp16": {
|
||||
"enabled": "auto",
|
||||
"auto_cast": false,
|
||||
"loss_scale": 0,
|
||||
"initial_scale_power": 32,
|
||||
"loss_scale_window": 1000,
|
||||
"hysteresis": 2,
|
||||
"min_loss_scale": 1
|
||||
},
|
||||
"gradient_accumulation_steps": "auto",
|
||||
"gradient_clipping": "auto",
|
||||
"train_batch_size": "auto",
|
||||
"train_micro_batch_size_per_gpu": "auto",
|
||||
"wall_clock_breakdown": false
|
||||
}
|
||||
@@ -1,27 +0,0 @@
|
||||
{
|
||||
"zero_optimization": {
|
||||
"stage": 1,
|
||||
"overlap_comm": true
|
||||
},
|
||||
"bf16": {
|
||||
"enabled": "auto"
|
||||
},
|
||||
"fp16": {
|
||||
"enabled": "auto",
|
||||
"auto_cast": false,
|
||||
"loss_scale": 0,
|
||||
"initial_scale_power": 32,
|
||||
"loss_scale_window": 1000,
|
||||
"hysteresis": 2,
|
||||
"min_loss_scale": 1
|
||||
},
|
||||
"compile": {
|
||||
"disable": false,
|
||||
"backend": "inductor"
|
||||
},
|
||||
"gradient_accumulation_steps": "auto",
|
||||
"gradient_clipping": "auto",
|
||||
"train_batch_size": "auto",
|
||||
"train_micro_batch_size_per_gpu": "auto",
|
||||
"wall_clock_breakdown": false
|
||||
}
|
||||
@@ -1,27 +0,0 @@
|
||||
{
|
||||
"zero_optimization": {
|
||||
"stage": 2,
|
||||
"offload_optimizer": {
|
||||
"device": "cpu"
|
||||
},
|
||||
"contiguous_gradients": true,
|
||||
"overlap_comm": true
|
||||
},
|
||||
"bf16": {
|
||||
"enabled": "auto"
|
||||
},
|
||||
"fp16": {
|
||||
"enabled": "auto",
|
||||
"auto_cast": false,
|
||||
"loss_scale": 0,
|
||||
"initial_scale_power": 32,
|
||||
"loss_scale_window": 1000,
|
||||
"hysteresis": 2,
|
||||
"min_loss_scale": 1
|
||||
},
|
||||
"gradient_accumulation_steps": "auto",
|
||||
"gradient_clipping": "auto",
|
||||
"train_batch_size": "auto",
|
||||
"train_micro_batch_size_per_gpu": "auto",
|
||||
"wall_clock_breakdown": false
|
||||
}
|
||||
@@ -1,31 +0,0 @@
|
||||
{
|
||||
"compile": {
|
||||
"disable": false,
|
||||
"backend": "inductor"
|
||||
},
|
||||
"zero_optimization": {
|
||||
"stage": 2,
|
||||
"offload_optimizer": {
|
||||
"device": "cpu"
|
||||
},
|
||||
"contiguous_gradients": true,
|
||||
"overlap_comm": true
|
||||
},
|
||||
"bf16": {
|
||||
"enabled": "auto"
|
||||
},
|
||||
"fp16": {
|
||||
"enabled": "auto",
|
||||
"auto_cast": false,
|
||||
"loss_scale": 0,
|
||||
"initial_scale_power": 32,
|
||||
"loss_scale_window": 1000,
|
||||
"hysteresis": 2,
|
||||
"min_loss_scale": 1
|
||||
},
|
||||
"gradient_accumulation_steps": "auto",
|
||||
"gradient_clipping": "auto",
|
||||
"train_batch_size": "auto",
|
||||
"train_micro_batch_size_per_gpu": "auto",
|
||||
"wall_clock_breakdown": false
|
||||
}
|
||||
@@ -1,31 +0,0 @@
|
||||
{
|
||||
"zero_optimization": {
|
||||
"stage": 3,
|
||||
"overlap_comm": true,
|
||||
"contiguous_gradients": true,
|
||||
"sub_group_size": 0,
|
||||
"reduce_bucket_size": "auto",
|
||||
"stage3_prefetch_bucket_size": "auto",
|
||||
"stage3_param_persistence_threshold": "auto",
|
||||
"max_live_parameters": 0,
|
||||
"max_reuse_distance": 0,
|
||||
"gather_16bit_weights_on_model_save": true
|
||||
},
|
||||
"bf16": {
|
||||
"enabled": "auto"
|
||||
},
|
||||
"fp16": {
|
||||
"enabled": "auto",
|
||||
"auto_cast": false,
|
||||
"loss_scale": 0,
|
||||
"initial_scale_power": 32,
|
||||
"loss_scale_window": 1000,
|
||||
"hysteresis": 2,
|
||||
"min_loss_scale": 1
|
||||
},
|
||||
"gradient_accumulation_steps": "auto",
|
||||
"gradient_clipping": "auto",
|
||||
"train_batch_size": "auto",
|
||||
"train_micro_batch_size_per_gpu": "auto",
|
||||
"wall_clock_breakdown": false
|
||||
}
|
||||
@@ -1,22 +0,0 @@
|
||||
{
|
||||
"zero_optimization": {
|
||||
"stage": 3,
|
||||
"overlap_comm": true,
|
||||
"contiguous_gradients": true,
|
||||
"sub_group_size": 0,
|
||||
"reduce_bucket_size": "auto",
|
||||
"stage3_prefetch_bucket_size": "auto",
|
||||
"stage3_param_persistence_threshold": "auto",
|
||||
"max_live_parameters": 0,
|
||||
"max_reuse_distance": 0,
|
||||
"gather_16bit_weights_on_model_save": true
|
||||
},
|
||||
"bf16": {
|
||||
"enabled": true
|
||||
},
|
||||
"gradient_accumulation_steps": "auto",
|
||||
"gradient_clipping": "auto",
|
||||
"train_batch_size": "auto",
|
||||
"train_micro_batch_size_per_gpu": "auto",
|
||||
"wall_clock_breakdown": false
|
||||
}
|
||||
@@ -1,32 +0,0 @@
|
||||
{
|
||||
"zero_force_ds_cpu_optimizer": false,
|
||||
"zero_allow_untested_optimizer": true,
|
||||
"zero_optimization": {
|
||||
"stage": 3,
|
||||
"offload_optimizer": {
|
||||
"device": "cpu",
|
||||
"pin_memory": true
|
||||
},
|
||||
"offload_param": {
|
||||
"device": "cpu",
|
||||
"pin_memory": true
|
||||
},
|
||||
"overlap_comm": true,
|
||||
"contiguous_gradients": true,
|
||||
"sub_group_size": 0,
|
||||
"reduce_bucket_size": "auto",
|
||||
"stage3_prefetch_bucket_size": "auto",
|
||||
"stage3_param_persistence_threshold": "auto",
|
||||
"max_live_parameters": 0,
|
||||
"max_reuse_distance": 0,
|
||||
"gather_16bit_weights_on_model_save": true
|
||||
},
|
||||
"bf16": {
|
||||
"enabled": true
|
||||
},
|
||||
"gradient_accumulation_steps": "auto",
|
||||
"gradient_clipping": "auto",
|
||||
"train_batch_size": "auto",
|
||||
"train_micro_batch_size_per_gpu": "auto",
|
||||
"wall_clock_breakdown": false
|
||||
}
|
||||
@@ -1,28 +0,0 @@
|
||||
{
|
||||
"zero_force_ds_cpu_optimizer": false,
|
||||
"zero_allow_untested_optimizer": true,
|
||||
"zero_optimization": {
|
||||
"stage": 3,
|
||||
"offload_param": {
|
||||
"device": "cpu",
|
||||
"pin_memory": true
|
||||
},
|
||||
"overlap_comm": true,
|
||||
"contiguous_gradients": true,
|
||||
"sub_group_size": 0,
|
||||
"reduce_bucket_size": "auto",
|
||||
"stage3_prefetch_bucket_size": "auto",
|
||||
"stage3_param_persistence_threshold": "auto",
|
||||
"max_live_parameters": 0,
|
||||
"max_reuse_distance": 0,
|
||||
"gather_16bit_weights_on_model_save": true
|
||||
},
|
||||
"bf16": {
|
||||
"enabled": true
|
||||
},
|
||||
"gradient_accumulation_steps": "auto",
|
||||
"gradient_clipping": "auto",
|
||||
"train_batch_size": "auto",
|
||||
"train_micro_batch_size_per_gpu": "auto",
|
||||
"wall_clock_breakdown": false
|
||||
}
|
||||
@@ -1 +0,0 @@
|
||||
This directory contains example config files that might be useful for debugging. Please see [docs/debugging.qmd](../docs/debugging.qmd) for more information.
|
||||
@@ -1,48 +0,0 @@
|
||||
# Example config for debugging the chat_template prompt format
|
||||
base_model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
|
||||
model_type: LlamaForCausalLM
|
||||
tokenizer_type: LlamaTokenizer
|
||||
|
||||
load_in_8bit: true
|
||||
load_in_4bit: false
|
||||
|
||||
datasets:
|
||||
- path: fozziethebeat/alpaca_messages_2k_test
|
||||
type: chat_template
|
||||
shards: 10
|
||||
val_set_size: 0
|
||||
output_dir: temp_debug/axolotl_outputs/model
|
||||
dataset_prepared_path: temp_debug/axolotl_outputs/data
|
||||
dataset_num_proc: 1
|
||||
|
||||
sequence_len: 4096
|
||||
sample_packing: false
|
||||
pad_to_sequence_len: true
|
||||
|
||||
adapter: lora
|
||||
lora_model_dir:
|
||||
lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_linear: true
|
||||
lora_fan_in_fan_out:
|
||||
|
||||
micro_batch_size: 1
|
||||
num_epochs: 1
|
||||
max_steps: 10
|
||||
optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: false
|
||||
fp16: true
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 10
|
||||
weight_decay: 0.0
|
||||
@@ -9,11 +9,6 @@ services:
|
||||
- ~/.cache/huggingface/:/root/.cache/huggingface/
|
||||
# set environment variables
|
||||
environment:
|
||||
# Set environment variables
|
||||
- GIT_AUTHOR_NAME=${GIT_AUTHOR_NAME}
|
||||
- GIT_AUTHOR_EMAIL=${GIT_AUTHOR_EMAIL}
|
||||
- GIT_COMMITTER_NAME=${GIT_COMMITTER_NAME}
|
||||
- GIT_COMMITTER_EMAIL=${GIT_COMMITTER_EMAIL}
|
||||
- WANDB_API_KEY=${WANDB_API_KEY}
|
||||
deploy:
|
||||
resources:
|
||||
|
||||
@@ -1,47 +1,30 @@
|
||||
ARG BASE_TAG=main-base
|
||||
FROM axolotlai/axolotl-base:$BASE_TAG
|
||||
FROM winglian/axolotl-base:$BASE_TAG
|
||||
|
||||
ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
|
||||
ARG AXOLOTL_EXTRAS=""
|
||||
ARG AXOLOTL_ARGS=""
|
||||
ARG CUDA="118"
|
||||
ARG PYTORCH_VERSION="2.1.2"
|
||||
ARG TARGETARCH
|
||||
|
||||
ENV PYTORCH_VERSION=$PYTORCH_VERSION
|
||||
ENV BNB_CUDA_VERSION=$CUDA
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev rsync s3fs && \
|
||||
rm -rf /var/cache/apt/archives && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
apt-get install -y vim curl
|
||||
|
||||
WORKDIR /workspace
|
||||
|
||||
RUN git clone --depth=1 https://github.com/axolotl-ai-cloud/axolotl.git
|
||||
|
||||
WORKDIR /workspace/axolotl
|
||||
|
||||
# If AXOLOTL_EXTRAS is set, append it in brackets; don't install deepspeed with arm64
|
||||
RUN pip uninstall -y causal_conv1d
|
||||
RUN if [ "$TARGETARCH" = "arm64" ]; then \
|
||||
BASE_EXTRAS="optimizers,ray"; \
|
||||
RUN pip3 install --force-reinstall "peft @ git+https://github.com/huggingface/peft.git@main"
|
||||
RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git
|
||||
# If AXOLOTL_EXTRAS is set, append it in brackets
|
||||
RUN cd axolotl && \
|
||||
if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
|
||||
pip install -e .[$AXOLOTL_EXTRAS]; \
|
||||
else \
|
||||
BASE_EXTRAS="deepspeed,optimizers,ray"; \
|
||||
fi && \
|
||||
if [ "$AXOLOTL_EXTRAS" != "" ]; then \
|
||||
pip install --no-build-isolation -e .[$BASE_EXTRAS,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
|
||||
else \
|
||||
pip install --no-build-isolation -e .[$BASE_EXTRAS] $AXOLOTL_ARGS; \
|
||||
fi && \
|
||||
python scripts/cutcrossentropy_install.py | sh && \
|
||||
pip install pytest && \
|
||||
pip cache purge
|
||||
pip install -e .; \
|
||||
fi
|
||||
|
||||
# fix so that git fetch/pull from remote works with shallow clone
|
||||
RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
|
||||
git config --get remote.origin.fetch && \
|
||||
git config --global credential.helper store
|
||||
# fix so that git fetch/pull from remote works
|
||||
RUN cd axolotl && \
|
||||
git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
|
||||
git config --get remote.origin.fetch
|
||||
|
||||
COPY .axolotl-complete.bash /root/.axolotl-complete.bash
|
||||
RUN chmod +x /root/.axolotl-complete.bash && \
|
||||
echo 'source /root/.axolotl-complete.bash' >> ~/.bashrc
|
||||
# helper for huggingface-login cli
|
||||
RUN git config --global credential.helper store
|
||||
|
||||
@@ -2,59 +2,103 @@ ARG CUDA_VERSION="11.8.0"
|
||||
ARG CUDNN_VERSION="8"
|
||||
ARG UBUNTU_VERSION="22.04"
|
||||
ARG MAX_JOBS=4
|
||||
ARG TARGETARCH
|
||||
|
||||
FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION AS base-builder
|
||||
FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION as base-builder
|
||||
|
||||
ENV PATH="/root/miniconda3/bin:${PATH}"
|
||||
|
||||
ARG TARGETARCH
|
||||
ARG PYTHON_VERSION="3.11"
|
||||
ARG PYTORCH_VERSION="2.1.2"
|
||||
ARG CUDA="128"
|
||||
ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
|
||||
ARG PYTHON_VERSION="3.9"
|
||||
ARG PYTORCH_VERSION="2.0.1"
|
||||
ARG CUDA="118"
|
||||
|
||||
ENV PYTHON_VERSION=$PYTHON_VERSION
|
||||
ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y --no-install-recommends \
|
||||
wget git build-essential ninja-build git-lfs libaio-dev pkg-config \
|
||||
ibverbs-providers ibverbs-utils infiniband-diags \
|
||||
librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm \
|
||||
&& rm -rf /var/cache/apt/archives \
|
||||
&& rm -rf /var/lib/apt/lists/* \
|
||||
&& if [ "$TARGETARCH" = "amd64" ]; then \
|
||||
MINICONDA_ARCH="x86_64"; \
|
||||
elif [ "$TARGETARCH" = "arm64" ]; then \
|
||||
MINICONDA_ARCH="aarch64"; \
|
||||
else \
|
||||
echo "Unsupported architecture: $TARGETARCH"; exit 1; \
|
||||
fi \
|
||||
&& wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${MINICONDA_ARCH}.sh \
|
||||
RUN apt-get update
|
||||
RUN apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
RUN wget \
|
||||
https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
|
||||
&& mkdir /root/.conda \
|
||||
&& bash Miniconda3-latest-Linux-${MINICONDA_ARCH}.sh -b \
|
||||
&& rm -f Miniconda3-latest-Linux-${MINICONDA_ARCH}.sh \
|
||||
&& conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main \
|
||||
&& conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r \
|
||||
&& conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}"
|
||||
&& bash Miniconda3-latest-Linux-x86_64.sh -b \
|
||||
&& rm -f Miniconda3-latest-Linux-x86_64.sh
|
||||
|
||||
RUN conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}"
|
||||
|
||||
ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
|
||||
|
||||
WORKDIR /workspace
|
||||
|
||||
RUN python3 -m pip install --upgrade pip && pip3 install -U packaging==26.0 setuptools==75.8.0 wheel psutil && \
|
||||
python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} torchvision --extra-index-url https://download.pytorch.org/whl/cu$CUDA && \
|
||||
python3 -m pip cache purge
|
||||
RUN python3 -m pip install --upgrade pip && pip3 install packaging && \
|
||||
python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} --extra-index-url https://download.pytorch.org/whl/cu$CUDA
|
||||
|
||||
RUN if [ "$CUDA" != "130" ] ; then \
|
||||
CAUSAL_CONV1D_FORCE_CXX11_ABI=TRUE CAUSAL_CONV1D_FORCE_BUILD=TRUE python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@v1.5.4"; \
|
||||
python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main"; \
|
||||
python3 -m pip cache purge; \
|
||||
fi
|
||||
|
||||
RUN git lfs install --skip-repo && \
|
||||
pip3 install awscli && \
|
||||
FROM base-builder AS flash-attn-builder
|
||||
|
||||
WORKDIR /workspace
|
||||
|
||||
ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
|
||||
|
||||
RUN git clone https://github.com/Dao-AILab/flash-attention.git && \
|
||||
cd flash-attention && \
|
||||
git checkout v2.0.4 && \
|
||||
python3 setup.py bdist_wheel && \
|
||||
cd csrc/fused_dense_lib && \
|
||||
python3 setup.py bdist_wheel && \
|
||||
cd ../xentropy && \
|
||||
python3 setup.py bdist_wheel && \
|
||||
cd ../rotary && \
|
||||
python3 setup.py bdist_wheel && \
|
||||
cd ../layer_norm && \
|
||||
python3 setup.py bdist_wheel
|
||||
|
||||
FROM base-builder AS deepspeed-builder
|
||||
|
||||
ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
|
||||
|
||||
WORKDIR /workspace
|
||||
|
||||
RUN git clone https://github.com/microsoft/DeepSpeed.git && \
|
||||
cd DeepSpeed && \
|
||||
MAX_CONCURRENCY=8 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_OPS=1 python3 setup.py bdist_wheel
|
||||
|
||||
FROM base-builder AS bnb-builder
|
||||
|
||||
WORKDIR /workspace
|
||||
ARG CUDA="118"
|
||||
ENV CUDA=$CUDA
|
||||
|
||||
RUN git clone https://github.com/TimDettmers/bitsandbytes.git && \
|
||||
cd bitsandbytes && \
|
||||
CUDA_VERSION=$CUDA make cuda11x && \
|
||||
python setup.py bdist_wheel
|
||||
|
||||
FROM base-builder
|
||||
|
||||
ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
|
||||
ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
|
||||
|
||||
# recompile apex
|
||||
RUN python3 -m pip uninstall -y apex
|
||||
RUN git clone https://github.com/NVIDIA/apex
|
||||
# `MAX_JOBS=1` disables parallel building to avoid cpu memory OOM when building image on GitHub Action (standard) runners
|
||||
RUN cd apex && MAX_JOBS=1 python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
|
||||
|
||||
RUN mkdir -p /workspace/builds
|
||||
COPY --from=bnb-builder /workspace/bitsandbytes /workspace/builds/bitsandbytes
|
||||
|
||||
RUN mkdir -p /workspace/wheels/bitsandbytes
|
||||
COPY --from=deepspeed-builder /workspace/DeepSpeed/dist/deepspeed-*.whl wheels
|
||||
COPY --from=bnb-builder /workspace/bitsandbytes/dist/bitsandbytes-*.whl wheels
|
||||
COPY --from=bnb-builder /workspace/bitsandbytes/bitsandbytes/libbitsandbytes*.so wheels/bitsandbytes
|
||||
COPY --from=flash-attn-builder /workspace/flash-attention/dist/flash_attn-*.whl wheels
|
||||
COPY --from=flash-attn-builder /workspace/flash-attention/csrc/fused_dense_lib/dist/fused_dense_lib-*.whl wheels
|
||||
COPY --from=flash-attn-builder /workspace/flash-attention/csrc/xentropy/dist/xentropy_cuda_lib-*.whl wheels
|
||||
COPY --from=flash-attn-builder /workspace/flash-attention/csrc/rotary/dist/rotary_emb-*.whl wheels
|
||||
COPY --from=flash-attn-builder /workspace/flash-attention/csrc/layer_norm/dist/dropout_layer_norm-*.whl wheels
|
||||
|
||||
RUN pip3 install wheels/deepspeed-*.whl wheels/flash_attn-*.whl wheels/fused_dense_lib-*.whl wheels/xentropy_cuda_lib-*.whl wheels/rotary_emb-*.whl wheels/dropout_layer_norm-*.whl
|
||||
RUN cd /workspace/builds/bitsandbytes && python3 setup.py install
|
||||
RUN git lfs install --skip-repo
|
||||
RUN pip3 install awscli && \
|
||||
# The base image ships with `pydantic==1.8.2` which is not working
|
||||
pip3 install -U --no-cache-dir pydantic==1.10.10 && \
|
||||
pip3 cache purge
|
||||
pip3 install -U --no-cache-dir pydantic==1.10.10
|
||||
|
||||
@@ -1,37 +0,0 @@
|
||||
ARG CUDA_VERSION="12.8.2"
|
||||
ARG UBUNTU_VERSION="22.04"
|
||||
ARG MAX_JOBS=4
|
||||
|
||||
FROM nvidia/cuda:12.8.2-devel-ubuntu22.04 AS base-builder
|
||||
|
||||
ENV PATH="/root/miniforge3/bin:${PATH}"
|
||||
|
||||
ARG PYTHON_VERSION="3.11"
|
||||
ARG PYTORCH_VERSION="next"
|
||||
ARG CUDA="128"
|
||||
ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0 12.0+PTX"
|
||||
|
||||
ENV PYTHON_VERSION=$PYTHON_VERSION
|
||||
ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev pkg-config && rm -rf /var/lib/apt/lists/* \
|
||||
&& wget \
|
||||
https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh \
|
||||
&& mkdir /root/.conda \
|
||||
&& bash Miniforge3-Linux-x86_64.sh -b \
|
||||
&& rm -f Miniforge3-Linux-x86_64.sh \
|
||||
&& /root/miniforge3/bin/conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}"
|
||||
|
||||
ENV PATH="/root/miniforge3/envs/py${PYTHON_VERSION}/bin:${PATH}"
|
||||
|
||||
WORKDIR /workspace
|
||||
|
||||
RUN python3 -m pip install --upgrade pip && pip3 install packaging && \
|
||||
python3 -m pip install --no-cache-dir -U torch==2.7.1 --extra-index-url https://download.pytorch.org/whl/test/cu$CUDA && \
|
||||
python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \
|
||||
python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main"
|
||||
|
||||
RUN git lfs install --skip-repo && \
|
||||
pip3 install awscli && \
|
||||
pip3 install -U --no-cache-dir pydantic==2.10.6
|
||||
@@ -1,43 +0,0 @@
|
||||
ARG CUDA_VERSION="12.8.1"
|
||||
ARG CUDNN_VERSION="8"
|
||||
ARG UBUNTU_VERSION="22.04"
|
||||
ARG MAX_JOBS=4
|
||||
|
||||
FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION AS base-builder
|
||||
|
||||
ENV PATH="/root/miniconda3/bin:${PATH}"
|
||||
|
||||
ARG PYTHON_VERSION="3.11"
|
||||
ARG PYTORCH_VERSION="nightly"
|
||||
ARG CUDA="128"
|
||||
ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
|
||||
|
||||
ENV PYTHON_VERSION=$PYTHON_VERSION
|
||||
ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev pkg-config && rm -rf /var/lib/apt/lists/* \
|
||||
&& wget \
|
||||
https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
|
||||
&& mkdir /root/.conda \
|
||||
&& bash Miniconda3-latest-Linux-x86_64.sh -b \
|
||||
&& rm -f Miniconda3-latest-Linux-x86_64.sh \
|
||||
&& conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main \
|
||||
&& conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r \
|
||||
&& conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}"
|
||||
|
||||
ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
|
||||
|
||||
WORKDIR /workspace
|
||||
|
||||
RUN python3 -m pip install --upgrade pip && pip3 install -U packaging==26.0 setuptools==75.8.0 wheel && \
|
||||
python3 -m pip install --no-cache-dir -U torch --extra-index-url https://download.pytorch.org/whl/nightly/cu$CUDA && \
|
||||
python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \
|
||||
python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" && \
|
||||
python3 -m pip cache purge
|
||||
|
||||
RUN git lfs install --skip-repo && \
|
||||
pip3 install awscli && \
|
||||
# The base image ships with `pydantic==1.8.2` which is not working
|
||||
pip3 install -U --no-cache-dir pydantic==1.10.10 && \
|
||||
pip3 cache purge
|
||||
@@ -1,30 +0,0 @@
|
||||
ARG BASE_TAG=main
|
||||
FROM axolotlai/axolotl:$BASE_TAG
|
||||
|
||||
ENV HF_DATASETS_CACHE="/workspace/data/huggingface-cache/datasets"
|
||||
ENV HF_HUB_CACHE="/workspace/data/huggingface-cache/hub"
|
||||
ENV HF_HOME="/workspace/data/huggingface-cache/hub"
|
||||
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
|
||||
|
||||
EXPOSE 8888
|
||||
EXPOSE 22
|
||||
|
||||
COPY scripts/cloud-entrypoint.sh /root/cloud-entrypoint.sh
|
||||
COPY scripts/motd /etc/motd
|
||||
|
||||
RUN pip install jupyterlab notebook ipywidgets && \
|
||||
jupyter lab clean
|
||||
RUN apt update && \
|
||||
apt install --yes --no-install-recommends openssh-server tmux iproute2 nvtop && \
|
||||
rm -rf /var/cache/apt/archives && \
|
||||
rm -rf /var/lib/apt/lists/* && \
|
||||
mkdir -p ~/.ssh && \
|
||||
chmod 700 ~/.ssh && \
|
||||
printf "\n[[ -z \"\$TMUX\" ]] && { tmux attach-session -t ssh_tmux || tmux new-session -s ssh_tmux; exit; }\n" >> ~/.bashrc && \
|
||||
printf "[ ! -z \"\$TERM\" -a -r /etc/motd ] && cat /etc/motd\n" >> ~/.bashrc && \
|
||||
chmod +x /workspace/axolotl/scripts/cloud-entrypoint.sh && \
|
||||
chmod +x /root/cloud-entrypoint.sh && \
|
||||
echo 'set-option -g history-limit 5000' >> ~/.tmux.conf
|
||||
|
||||
ENTRYPOINT ["/root/cloud-entrypoint.sh"]
|
||||
CMD ["sleep", "infinity"]
|
||||
@@ -1,28 +0,0 @@
|
||||
ARG BASE_TAG=main
|
||||
FROM axolotlai/axolotl:$BASE_TAG
|
||||
|
||||
ENV HF_DATASETS_CACHE="/workspace/data/huggingface-cache/datasets"
|
||||
ENV HF_HUB_CACHE="/workspace/data/huggingface-cache/hub"
|
||||
ENV HF_HOME="/workspace/data/huggingface-cache/hub"
|
||||
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
|
||||
|
||||
EXPOSE 8888
|
||||
EXPOSE 22
|
||||
|
||||
COPY scripts/cloud-entrypoint.sh /root/cloud-entrypoint.sh
|
||||
COPY scripts/motd /etc/motd
|
||||
|
||||
RUN pip install jupyterlab notebook ipywidgets && \
|
||||
jupyter lab clean
|
||||
RUN apt update && \
|
||||
apt install --yes --no-install-recommends openssh-server tmux iproute2 nvtop ibverbs-providers ibverbs-utils infiniband-diags librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm && \
|
||||
rm -rf /var/cache/apt/archives && \
|
||||
rm -rf /var/lib/apt/lists/* && \
|
||||
mkdir -p ~/.ssh && \
|
||||
chmod 700 ~/.ssh && \
|
||||
printf "[ ! -z \"\$TERM\" -a -r /etc/motd ] && cat /etc/motd\n" >> ~/.bashrc && \
|
||||
chmod +x /workspace/axolotl/scripts/cloud-entrypoint.sh && \
|
||||
chmod +x /root/cloud-entrypoint.sh
|
||||
|
||||
ENTRYPOINT ["/root/cloud-entrypoint.sh"]
|
||||
CMD ["sleep", "infinity"]
|
||||
@@ -1,31 +0,0 @@
|
||||
ARG BASE_TAG=main
|
||||
FROM axolotlai/axolotl-uv:$BASE_TAG
|
||||
|
||||
ENV HF_DATASETS_CACHE="/workspace/data/huggingface-cache/datasets"
|
||||
ENV HF_HUB_CACHE="/workspace/data/huggingface-cache/hub"
|
||||
ENV HF_HOME="/workspace/data/huggingface-cache/hub"
|
||||
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
|
||||
|
||||
EXPOSE 8888
|
||||
EXPOSE 22
|
||||
|
||||
COPY scripts/cloud-entrypoint.sh /root/cloud-entrypoint.sh
|
||||
COPY scripts/motd /etc/motd
|
||||
|
||||
RUN uv pip install jupyterlab notebook ipywidgets && \
|
||||
jupyter lab clean
|
||||
RUN apt update && \
|
||||
apt install --yes --no-install-recommends openssh-server tmux iproute2 nvtop && \
|
||||
rm -rf /var/cache/apt/archives && \
|
||||
rm -rf /var/lib/apt/lists/* && \
|
||||
mkdir -p ~/.ssh && \
|
||||
chmod 700 ~/.ssh && \
|
||||
printf "\n[[ -z \"\$TMUX\" ]] && { tmux attach-session -t ssh_tmux || tmux new-session -s ssh_tmux; exit; }\n" >> ~/.bashrc && \
|
||||
printf "[ ! -z \"\$TERM\" -a -r /etc/motd ] && cat /etc/motd\n" >> ~/.bashrc && \
|
||||
printf "source /workspace/axolotl-venv/bin/activate\n" >> ~/.bashrc && \
|
||||
chmod +x /workspace/axolotl/scripts/cloud-entrypoint.sh && \
|
||||
chmod +x /root/cloud-entrypoint.sh && \
|
||||
echo 'set-option -g history-limit 5000' >> ~/.tmux.conf
|
||||
|
||||
ENTRYPOINT ["/root/cloud-entrypoint.sh"]
|
||||
CMD ["sleep", "infinity"]
|
||||
18
docker/Dockerfile-runpod
Normal file
18
docker/Dockerfile-runpod
Normal file
@@ -0,0 +1,18 @@
|
||||
ARG BASE_TAG=main
|
||||
FROM winglian/axolotl:$BASE_TAG
|
||||
|
||||
ENV HF_DATASETS_CACHE="/workspace/data/huggingface-cache/datasets"
|
||||
ENV HUGGINGFACE_HUB_CACHE="/workspace/data/huggingface-cache/hub"
|
||||
ENV TRANSFORMERS_CACHE="/workspace/data/huggingface-cache/hub"
|
||||
|
||||
COPY scripts/runpod-entrypoint.sh /root/runpod-entrypoint.sh
|
||||
|
||||
RUN apt install --yes --no-install-recommends openssh-server tmux && \
|
||||
mkdir -p ~/.ssh && \
|
||||
chmod 700 ~/.ssh && \
|
||||
printf "\n[[ -z \"\$TMUX\" ]] && { tmux attach-session -t ssh_tmux || tmux new-session -s ssh_tmux; exit; }\n" >> ~/.bashrc && \
|
||||
chmod +x /workspace/axolotl/scripts/runpod-entrypoint.sh && \
|
||||
chmod +x /root/runpod-entrypoint.sh
|
||||
|
||||
ENTRYPOINT ["/root/runpod-entrypoint.sh"]
|
||||
CMD ["sleep", "infinity"]
|
||||
@@ -1,40 +0,0 @@
|
||||
ARG BASE_TAG=main-base
|
||||
FROM axolotlai/axolotl-base:$BASE_TAG
|
||||
|
||||
ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
|
||||
ARG AXOLOTL_EXTRAS=""
|
||||
ARG AXOLOTL_ARGS=""
|
||||
ARG CUDA="118"
|
||||
ARG PYTORCH_VERSION="2.1.2"
|
||||
ARG GITHUB_REF="main"
|
||||
|
||||
ENV PYTORCH_VERSION=$PYTORCH_VERSION
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev
|
||||
|
||||
WORKDIR /workspace
|
||||
|
||||
RUN git clone --depth=1 https://github.com/axolotl-ai-cloud/axolotl.git
|
||||
|
||||
WORKDIR /workspace/axolotl
|
||||
|
||||
RUN git fetch origin +$GITHUB_REF && \
|
||||
git checkout FETCH_HEAD
|
||||
|
||||
# If AXOLOTL_EXTRAS is set, append it in brackets
|
||||
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
|
||||
pip install --no-build-isolation -e .[deepspeed,mamba-ssm,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
|
||||
else \
|
||||
pip install --no-build-isolation -e .[deepspeed,mamba-ssm] $AXOLOTL_ARGS; \
|
||||
fi
|
||||
|
||||
# So we can test the Docker image
|
||||
RUN pip install pytest
|
||||
|
||||
# fix so that git fetch/pull from remote works
|
||||
RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
|
||||
git config --get remote.origin.fetch
|
||||
|
||||
# helper for huggingface-login cli
|
||||
RUN git config --global credential.helper store
|
||||
@@ -1,47 +0,0 @@
|
||||
ARG BASE_TAG=main-base
|
||||
FROM axolotlai/axolotl-base-uv:$BASE_TAG
|
||||
|
||||
ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
|
||||
ARG AXOLOTL_EXTRAS=""
|
||||
ARG AXOLOTL_ARGS=""
|
||||
ARG CUDA="118"
|
||||
ARG PYTORCH_VERSION="2.1.2"
|
||||
ARG TARGETARCH
|
||||
|
||||
ENV PYTORCH_VERSION=$PYTORCH_VERSION
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev rsync s3fs && \
|
||||
rm -rf /var/cache/apt/archives && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
WORKDIR /workspace
|
||||
|
||||
RUN git clone --depth=1 https://github.com/axolotl-ai-cloud/axolotl.git
|
||||
|
||||
WORKDIR /workspace/axolotl
|
||||
|
||||
# If AXOLOTL_EXTRAS is set, append it in brackets; don't install deepspeed with arm64
|
||||
RUN uv pip uninstall causal_conv1d
|
||||
RUN if [ "$TARGETARCH" = "arm64" ]; then \
|
||||
BASE_EXTRAS="optimizers,ray"; \
|
||||
else \
|
||||
BASE_EXTRAS="deepspeed,optimizers,ray"; \
|
||||
fi && \
|
||||
if [ "$AXOLOTL_EXTRAS" != "" ]; then \
|
||||
uv pip install --no-build-isolation -e .[$BASE_EXTRAS,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
|
||||
else \
|
||||
uv pip install --no-build-isolation -e .[$BASE_EXTRAS] $AXOLOTL_ARGS; \
|
||||
fi && \
|
||||
python scripts/cutcrossentropy_install.py --uv | sh && \
|
||||
uv pip install pytest && \
|
||||
uv cache clean
|
||||
|
||||
# fix so that git fetch/pull from remote works with shallow clone
|
||||
RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
|
||||
git config --get remote.origin.fetch && \
|
||||
git config --global credential.helper store
|
||||
|
||||
COPY .axolotl-complete.bash /root/.axolotl-complete.bash
|
||||
RUN chmod +x /root/.axolotl-complete.bash && \
|
||||
echo 'source /root/.axolotl-complete.bash' >> ~/.bashrc
|
||||
@@ -1,40 +0,0 @@
|
||||
ARG CUDA_VERSION="12.6.3"
|
||||
ARG CUDNN_VERSION=""
|
||||
ARG UBUNTU_VERSION="22.04"
|
||||
ARG MAX_JOBS=4
|
||||
ARG TARGETARCH
|
||||
|
||||
FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION AS base-builder
|
||||
|
||||
ARG TARGETARCH
|
||||
ARG PYTHON_VERSION="3.11"
|
||||
ARG PYTORCH_VERSION="2.6.0"
|
||||
ARG CUDA="126"
|
||||
ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
|
||||
|
||||
ENV PYTHON_VERSION=$PYTHON_VERSION
|
||||
ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
|
||||
ENV UV_TORCH_BACKEND="cu${CUDA}"
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev pkg-config curl && rm -rf /var/lib/apt/lists/* \
|
||||
&& git lfs install --skip-repo \
|
||||
&& curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||
|
||||
ENV PATH="/root/.local/bin:${PATH}"
|
||||
|
||||
RUN uv python install ${PYTHON_VERSION}
|
||||
|
||||
WORKDIR /workspace
|
||||
|
||||
RUN uv venv --no-project --relocatable axolotl-venv
|
||||
|
||||
ENV PATH="/workspace/axolotl-venv/bin:${PATH}"
|
||||
|
||||
RUN uv pip install packaging setuptools wheel psutil \
|
||||
&& uv pip install torch==${PYTORCH_VERSION} torchvision \
|
||||
&& uv pip install awscli pydantic
|
||||
|
||||
RUN if [ "$TARGETARCH" = "amd64" ]; then \
|
||||
MAMBA_SKIP_CUDA_BUILD=TRUE CAUSAL_CONV1D_SKIP_CUDA_BUILD=TRUE uv pip install --no-build-isolation mamba_ssm causal_conv1d; \
|
||||
fi
|
||||
7
docs/.gitignore
vendored
7
docs/.gitignore
vendored
@@ -1,7 +0,0 @@
|
||||
/.quarto/
|
||||
_site/
|
||||
/api/*.qmd
|
||||
/api/*.html
|
||||
config-reference.qmd
|
||||
models/**/*.qmd
|
||||
models/**/*.html
|
||||
@@ -1,70 +0,0 @@
|
||||
---
|
||||
title: "1.58-bit Finetuning"
|
||||
back-to-top-navigation: true
|
||||
toc: true
|
||||
toc-expand: 2
|
||||
toc-depth: 4
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
1.58-bit finetuning allows you to finetune BitNet models when their prequantized weights are provided. In theory, it will be possible to fine-tune any LLM in 1.58bit format but the performance degradation will be dramatic.
|
||||
|
||||
Axolotl supports 1.58-bit finetuning via the [`onebitllms`](https://github.com/tiiuae/onebitllms) library, which replaces standard linear layers with BitNet-compatible counterparts ready to use for training.
|
||||
|
||||
::: {.callout-note}
|
||||
LoRA is not supported for BitNet models
|
||||
:::
|
||||
|
||||
## Installation
|
||||
|
||||
Install the `onebitllms` package before using this feature:
|
||||
|
||||
```bash
|
||||
uv pip install onebitllms
|
||||
```
|
||||
|
||||
Or from source:
|
||||
|
||||
```bash
|
||||
uv pip install git+https://github.com/tiiuae/onebitllms
|
||||
```
|
||||
|
||||
## Supported models
|
||||
|
||||
For now, only `Falcon-E` series of models are supported. Make sure to use their `-prequantized` version:
|
||||
|
||||
```bash
|
||||
tiiuae/Falcon-E-3B-Base-prequantized
|
||||
tiiuae/Falcon-E-1B-Base-prequantized
|
||||
```
|
||||
|
||||
In theory, any other model would 'work' but the performance degradation will be huge. This remains an area of exploration.
|
||||
|
||||
## Configuration
|
||||
|
||||
To enable 1.58-bit finetuning, set the following in your configuration file:
|
||||
|
||||
```yaml
|
||||
base_model: tiiuae/Falcon-E-3B-Base-prequantized # A BitNet-compatible model
|
||||
|
||||
use_onebitllms: true
|
||||
```
|
||||
|
||||
::: {.callout-note}
|
||||
For BitNet models, it is recommended to use a higher learning rate than classic models (usually in the order of magnitude of 10x).
|
||||
:::
|
||||
|
||||
## Considerations after training
|
||||
|
||||
Once your model has been trained with 1.58bit fine-tuning, you can convert the trained model in ternary format using the `onebitllms` CLI:
|
||||
|
||||
```bash
|
||||
onebitllms quantize_to_1bit INPUT_PATH OUTPUT_PATH
|
||||
```
|
||||
|
||||
After that, you can use supported packages such as `llama.cpp` or Apple MLX package to run the trained model.
|
||||
|
||||
## Example Configuration
|
||||
|
||||
You can find example configurations in `examples/falcon-e` which contain one configuration for SFT and one configuration for DPO.
|
||||
@@ -1,71 +0,0 @@
|
||||
# GRPO — Agent Reference
|
||||
|
||||
Online RL with verifiable reward functions. For full config reference, async features, and scaling, see [grpo.qmd](../grpo.qmd). For vLLM setup, see [vllm_serving.qmd](../vllm_serving.qmd).
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
Terminal 1 (GPU 0) Terminal 2 (GPU 1)
|
||||
┌──────────────────────┐ ┌──────────────────────────────────┐
|
||||
│ vLLM Server │ HTTP │ Trainer │
|
||||
│ Serves base model │◄────────────►│ 1. Send prompts to vLLM │
|
||||
│ + LoRA adapter │ /generate │ 2. Score completions (rewards) │
|
||||
│ │ /set_lora │ 3. Compute advantages │
|
||||
│ Punica kernels for │ │ 4. PPO-clip gradient update │
|
||||
│ LoRA inference │ │ 5. Sync LoRA weights to vLLM │
|
||||
└──────────────────────┘ └──────────────────────────────────┘
|
||||
```
|
||||
|
||||
## Components Required
|
||||
|
||||
1. A YAML config with `rl: grpo`
|
||||
2. A reward module (Python file with reward functions)
|
||||
3. A running vLLM server (`axolotl vllm-serve config.yaml`)
|
||||
|
||||
## Reward Function Signature
|
||||
|
||||
```python
|
||||
def my_reward(completions, **kwargs) -> list[float]:
|
||||
# completions[i][0]["content"] = text of i-th completion
|
||||
# **kwargs contains dataset columns not removed by transform
|
||||
return [score_for_each_completion]
|
||||
```
|
||||
|
||||
Multiple rewards: `reward_funcs: [r1, r2]` with `reward_weights: [1.0, 0.5]`.
|
||||
|
||||
## Key Async Features
|
||||
|
||||
| Feature | Config | Purpose |
|
||||
|---------|--------|---------|
|
||||
| Async prefetch | `async_prefetch: true` | Overlap generation with training |
|
||||
| LoRA sync | `vllm_lora_sync: true` | Fast adapter sync via filesystem |
|
||||
| Streaming scoring | `streaming_partial_batch: true` | Score one group at a time |
|
||||
| Zero-adv skip | `skip_zero_advantage_batches: true` | Skip batches with no learning signal |
|
||||
| Replay buffer | `replay_buffer_size: 100` | Cache high-signal groups |
|
||||
| IS correction | `vllm_importance_sampling_correction: true` | Fix off-policy distribution shift |
|
||||
|
||||
## Health Checks
|
||||
|
||||
- `rewards/*/mean` > 0.15 within 20 steps (else: test reward function standalone)
|
||||
- `reward_std` > 0 on most steps (else: no learning signal)
|
||||
- `entropy` 0.05-0.5 (< 0.01 = mode collapse)
|
||||
- `grad_norm` 0.001-1.0 (> 10 = unstable, 0.0 = zero-advantage skip)
|
||||
|
||||
See [training_stability.qmd](../training_stability.qmd) for detailed diagnostics.
|
||||
|
||||
## File Map
|
||||
|
||||
```
|
||||
src/axolotl/
|
||||
cli/train.py # Entry point
|
||||
cli/vllm_serve.py # Entry point for vLLM server
|
||||
core/trainers/grpo/
|
||||
trainer.py # AxolotlGRPOTrainer
|
||||
sampler.py # Sampling utilities
|
||||
core/builders/rl.py # HFRLTrainerBuilder — routes rl type → trainer
|
||||
scripts/vllm_serve_lora.py # vLLM serve script with LoRA sync support
|
||||
utils/schemas/trl.py # TRL config schema (all trl: options)
|
||||
|
||||
docs/grpo.qmd # Full user docs: async, rewards, scaling, config reference
|
||||
docs/vllm_serving.qmd # vLLM server modes, LoRA sync, weight sync
|
||||
```
|
||||
@@ -1,198 +0,0 @@
|
||||
# Model Architectures — Agent Reference
|
||||
|
||||
Model-specific quirks, required settings, and known issues. Check this before debugging training failures on specific model families.
|
||||
|
||||
## VLM (Vision Language Model) Quick Start
|
||||
|
||||
All VLM configs require these four lines:
|
||||
```yaml
|
||||
processor_type: AutoProcessor
|
||||
skip_prepare_dataset: true
|
||||
remove_unused_columns: false
|
||||
sample_packing: false
|
||||
```
|
||||
|
||||
Decision tree for VLM config:
|
||||
```text
|
||||
Is the model multimodal (has vision/audio encoder)?
|
||||
├─ YES: Add `freeze_mm_modules: true` if training text only
|
||||
│ Add `chat_template: <model_template>` (e.g. gemma4, qwen3_5, gemma3)
|
||||
│ LoRA: use regex `lora_target_modules` to restrict to language model
|
||||
└─ NO: Train as a regular text model
|
||||
|
||||
Is the model MoE (e.g. Gemma4 26B-A4B, Qwen3.5 35B-A3B)?
|
||||
├─ YES: Add `lora_target_parameters` for expert LoRA
|
||||
│ Consider ScatterMoE kernels (see Plugins section)
|
||||
└─ NO: Standard LoRA config
|
||||
```
|
||||
|
||||
## Plugins & Optimizations
|
||||
|
||||
### Cut Cross Entropy (CCE)
|
||||
|
||||
Computes loss from hidden states + lm_head weight without materializing the full logits tensor, saving significant VRAM. Install if not already present:
|
||||
|
||||
```bash
|
||||
uv pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@main"
|
||||
```
|
||||
|
||||
```yaml
|
||||
plugins:
|
||||
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
|
||||
```
|
||||
|
||||
### ScatterMoE Kernels
|
||||
|
||||
Fuses expert + LoRA computation into a single kernel for MoE models. Significant speedup for models with many experts.
|
||||
|
||||
```yaml
|
||||
plugins:
|
||||
- axolotl.integrations.kernels.KernelsPlugin
|
||||
use_kernels: true
|
||||
use_scattermoe: true
|
||||
experts_implementation: scattermoe
|
||||
|
||||
# Expert LoRA targets (3D parameter tensors, not nn.Linear):
|
||||
lora_target_parameters:
|
||||
- experts.gate_up_proj
|
||||
- experts.down_proj
|
||||
```
|
||||
|
||||
Supported: Gemma4 (`gemma4_text`), Mixtral, Qwen MoE variants. The plugin auto-detects model type and routing function. Without ScatterMoE, expert LoRA still works but runs base expert matmul and LoRA as separate operations.
|
||||
|
||||
## Gemma 4
|
||||
|
||||
**Models**: `google/gemma-4-26B-A4B` (MoE), `google/gemma-4-31B` (dense), `google/gemma-4-E2B`, `google/gemma-4-E4B`
|
||||
|
||||
**Architecture**: Multimodal wrapper (`Gemma4ForConditionalGeneration`) over a text backbone (`Gemma4TextModel`), with optional vision/audio encoders. All Gemma4 HF repos have `model_type: "gemma4"` — even text-only variants load as multimodal with a vision tower.
|
||||
|
||||
### Required settings
|
||||
|
||||
```yaml
|
||||
# Always needed for Gemma4:
|
||||
freeze_mm_modules: true # Freeze vision/audio encoders for text-only training
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false # Shared per-layer norms cause "marked ready twice" with reentrant
|
||||
|
||||
# LoRA target — restrict to language model only (DO NOT use lora_target_linear: true):
|
||||
lora_target_modules: 'model.language_model.layers.[\d]+.(_checkpoint_wrapped_module.)?(mlp|self_attn).(up|down|gate|q|k|v|o)_proj'
|
||||
```
|
||||
|
||||
### Auto-detection
|
||||
|
||||
Axolotl auto-detects Gemma4 and applies:
|
||||
- `use_reentrant: false` for gradient checkpointing
|
||||
- `ddp_find_unused_parameters: true` for DDP (skipped when `activation_offloading: true`)
|
||||
|
||||
### Multi-GPU
|
||||
|
||||
| Strategy | Works? | Notes |
|
||||
|----------|--------|-------|
|
||||
| DDP | Yes | Auto-sets `ddp_find_unused_parameters=True` |
|
||||
| DDP + activation_offloading | Yes | `find_unused_parameters` is skipped (conflicts with checkpoint wrappers) |
|
||||
| FSDP1 | No | OOM during dequantization/sharding with QLoRA |
|
||||
| FSDP2 | Yes | Use `Gemma4TextDecoderLayer` (not `Gemma4DecoderLayer`) as wrap class |
|
||||
| FSDP2 + activation_offloading | Yes | Lowest VRAM (~26 GiB/GPU for 26B-A4B) |
|
||||
|
||||
FSDP2 config:
|
||||
```yaml
|
||||
fsdp:
|
||||
- full_shard
|
||||
- auto_wrap
|
||||
fsdp_config:
|
||||
fsdp_version: 2
|
||||
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||
fsdp_transformer_layer_cls_to_wrap: Gemma4TextDecoderLayer
|
||||
```
|
||||
|
||||
### MoE (26B-A4B)
|
||||
|
||||
- `enable_moe_block: true`, 256 experts, top-k routing
|
||||
- No separate `SparseMoeBlock` — MoE is embedded in each decoder layer
|
||||
- Expert LoRA targets 3D parameter tensors:
|
||||
```yaml
|
||||
lora_target_parameters:
|
||||
- experts.gate_up_proj
|
||||
- experts.down_proj
|
||||
```
|
||||
- ScatterMoE kernel acceleration:
|
||||
```yaml
|
||||
plugins:
|
||||
- axolotl.integrations.kernels.KernelsPlugin
|
||||
use_kernels: true
|
||||
use_scattermoe: true
|
||||
experts_implementation: scattermoe
|
||||
```
|
||||
|
||||
### VLM (Vision) Training
|
||||
|
||||
All Gemma4 models load as `Gemma4ForConditionalGeneration` with a vision tower. No custom `ProcessingStrategy` needed — the base class auto-detects the image token.
|
||||
|
||||
```yaml
|
||||
base_model: google/gemma-4-E2B-it # or E4B-it, 26B-A4B
|
||||
processor_type: AutoProcessor
|
||||
freeze_mm_modules: true
|
||||
chat_template: gemma4
|
||||
|
||||
skip_prepare_dataset: true
|
||||
remove_unused_columns: false
|
||||
sample_packing: false
|
||||
```
|
||||
|
||||
A starting VLM loss of ~8-15 is typical. In most runs, loss converges below 1.0 within ~30-50 steps, though results may vary across configurations.
|
||||
|
||||
For the 26B-A4B MoE variant with ScatterMoE + expert LoRA + CCE, add:
|
||||
```yaml
|
||||
plugins:
|
||||
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
|
||||
- axolotl.integrations.kernels.KernelsPlugin
|
||||
use_kernels: true
|
||||
use_scattermoe: true
|
||||
experts_implementation: scattermoe
|
||||
lora_target_parameters:
|
||||
- experts.gate_up_proj
|
||||
- experts.down_proj
|
||||
```
|
||||
|
||||
### Common issues
|
||||
|
||||
| Symptom | Cause | Fix |
|
||||
|---------|-------|-----|
|
||||
| `mm_token_type_ids is required` in DDP | `model.config` not accessible through DDP wrapper | Already fixed — `unwrap_model()` in `compute_loss` and `prediction_step` |
|
||||
| `marked a variable ready twice` in DDP | `ddp_find_unused_parameters=True` + activation_offloading checkpoint wrappers | Auto-handled — `find_unused_parameters` is skipped when `activation_offloading: true` |
|
||||
| Loss ~12 instead of ~0.5 | Using `lora_target_linear: true` (applies LoRA to vision/audio modules) | Use the regex `lora_target_modules` pattern instead |
|
||||
| FSDP2 `Could not find Gemma4AudioLayer` | Auto-wrap detects `_no_split_modules` including audio layers that don't exist | Explicitly set `fsdp_transformer_layer_cls_to_wrap: Gemma4TextDecoderLayer` |
|
||||
| `Gemma4ClippableLinear not supported` by PEFT | Vision tower uses a non-standard linear wrapper | Axolotl patches this automatically via `_patch_peft_clippable_linear()` |
|
||||
|
||||
### E2B/E4B dense models
|
||||
|
||||
These have `hidden_size_per_layer_input: 256` (per-layer input embeddings) and `attention_k_eq_v: False`. Known issue: loss starts higher than expected (~12 vs ~0.5 for 26B). Root cause under investigation — may be related to the per-layer input mechanism or the `Gemma4ForConditionalGeneration` loss computation.
|
||||
|
||||
## Gemma 3
|
||||
|
||||
**Models**: `google/gemma-3-*`
|
||||
|
||||
- `ddp_find_unused_parameters: true` needed (multimodal unused params)
|
||||
- `use_reentrant: false` recommended
|
||||
- Attention mask must be dropped for sample packing (handled automatically)
|
||||
- Multi-GPU test currently skipped (`tests/e2e/multigpu/test_gemma3.py`)
|
||||
|
||||
## Qwen 3.5 MoE
|
||||
|
||||
**Models**: `Qwen/Qwen3.5-35B-A3B`
|
||||
|
||||
- Hybrid architecture: DeltaNet linear attention (30 layers) + full attention (10 layers)
|
||||
- 256 experts, 8 active per token
|
||||
- Known weight scale drift in late DeltaNet layers (36-38) due to AdamW + rare expert interaction
|
||||
- Fix: `normalize_weight_scales` config to detect and rescale outliers:
|
||||
```yaml
|
||||
normalize_weight_scales:
|
||||
- name_pattern: 'linear_attn\.conv1d\.weight'
|
||||
threshold: 1.3
|
||||
```
|
||||
|
||||
## General MoE Notes
|
||||
|
||||
- `lora_target_linear: true` with multimodal MoE models will apply LoRA to ALL linear modules including vision/audio encoders — use regex `lora_target_modules` to restrict to language model only
|
||||
- Rare experts get larger effective learning rate from AdamW (small second-moment estimates) — can cause weight drift in recurrent/SSM components. Use `normalize_weight_scales` with `dry_run: true` to detect.
|
||||
- For ScatterMoE kernel support, set `experts_implementation: scattermoe` and add the KernelsPlugin
|
||||
@@ -1,181 +0,0 @@
|
||||
# New Model Support — Agent Reference
|
||||
|
||||
Guide for debugging and adding support for new model architectures in axolotl. Based on lessons learned from Gemma4, Gemma3, Qwen2-VL, and other multimodal/MoE models.
|
||||
|
||||
## Quick Validation Checklist
|
||||
|
||||
When testing a new model, run through these checks in order:
|
||||
|
||||
1. **Does the model load?** `axolotl preprocess config.yaml` — catches config schema errors
|
||||
2. **Does LoRA apply?** Check for "Unsupported layer type" warnings from PEFT
|
||||
3. **Is the initial loss sane?** First-step loss for a pretrained model should be 0.5–2.0 for SFT
|
||||
4. **Does sample packing work?** Compare loss with `sample_packing: true` vs `false` — should be similar
|
||||
5. **Is CCE active?** Check for "Applying Cut Cross Entropy" log and verify peak VRAM is lower
|
||||
|
||||
## Loss Debugging
|
||||
|
||||
### Expected initial loss
|
||||
A pretrained model doing SFT should start with loss roughly in the 0.5–2.0 range. If loss starts above 3.0, something is wrong. If it's near `log(vocab_size)` (≈ 12 for 262K vocab), the model is predicting at random — attention masking or model weights are broken.
|
||||
|
||||
### Direct comparison technique
|
||||
The fastest way to isolate a loss issue — bypass the trainer entirely:
|
||||
|
||||
```python
|
||||
# Load model via axolotl's pipeline (applies all patches)
|
||||
from axolotl.cli.config import load_cfg
|
||||
from axolotl.utils.config import normalize_config, prepare_plugins
|
||||
from axolotl.loaders.tokenizer import load_tokenizer
|
||||
from axolotl.loaders.model import ModelLoader
|
||||
|
||||
cfg = load_cfg("your_config.yaml")
|
||||
normalize_config(cfg)
|
||||
prepare_plugins(cfg)
|
||||
tokenizer = load_tokenizer(cfg)
|
||||
model, _ = ModelLoader(cfg, tokenizer).load()
|
||||
|
||||
# Forward pass on preprocessed data
|
||||
model.train()
|
||||
out = model(input_ids, labels=labels)
|
||||
print(f"Direct loss: {out.loss.item()}") # Compare to trainer's reported loss
|
||||
```
|
||||
|
||||
If direct loss is correct (~1.0) but trainer reports 3–4x higher, check `model_accepts_loss_kwargs` (see below).
|
||||
|
||||
### `model_accepts_loss_kwargs` inflation
|
||||
HF Trainer checks if the model's `forward()` has `**kwargs` and sets `model_accepts_loss_kwargs=True`. This changes loss normalization: the trainer does NOT divide loss by `gradient_accumulation_steps` before logging. The gradient is correct — only the logged loss is inflated.
|
||||
|
||||
**Symptom**: Logged loss ≈ actual_loss × gradient_accumulation_steps.
|
||||
|
||||
**Which models are affected**: Any model with `**kwargs` in forward (common in multimodal models for extra inputs like `mm_token_type_ids`, `pixel_values`, etc.).
|
||||
|
||||
**Fix location**: `src/axolotl/core/trainers/base.py` `__init__()` — after `super().__init__()`, check if the unwrapped model actually has `num_items_in_batch` in its forward signature. If not, set `self.model_accepts_loss_kwargs = False`.
|
||||
|
||||
## Multimodal Models (ForConditionalGeneration)
|
||||
|
||||
Many recent models use `ForConditionalGeneration` as the top-level class, not `ForCausalLM`:
|
||||
- Gemma3 → `Gemma3ForConditionalGeneration`
|
||||
- Gemma4 → `Gemma4ForConditionalGeneration`
|
||||
- Qwen2-VL → `Qwen2VLForConditionalGeneration`
|
||||
- LLaVA → `LlavaForConditionalGeneration`
|
||||
|
||||
### Why this matters
|
||||
|
||||
| Component | Targets `ForCausalLM` | Needs `ForConditionalGeneration` |
|
||||
|-----------|----------------------|--------------------------------|
|
||||
| CCE patches | ✅ (default) | ❌ silently inactive if not patched |
|
||||
| PEFT LoRA | ✅ | May fail on custom layer types |
|
||||
| HF Trainer label handling | ✅ | May need extra inputs |
|
||||
|
||||
### Required extra inputs
|
||||
Multimodal models require special inputs during training even for text-only data:
|
||||
|
||||
| Model | Required Input | Value for Text-Only |
|
||||
|-------|---------------|-------------------|
|
||||
| Gemma4 | `mm_token_type_ids` | `torch.zeros_like(input_ids)` |
|
||||
| Gemma3 | `token_type_ids` | `torch.zeros_like(input_ids)` |
|
||||
|
||||
Auto-inject in `compute_loss()` when not provided by the data collator. See `core/trainers/base.py`.
|
||||
|
||||
### Custom layer types and PEFT
|
||||
Vision towers often use custom module wrappers that PEFT doesn't support:
|
||||
|
||||
| Model | Custom Layer | Wraps | Fix |
|
||||
|-------|-------------|-------|-----|
|
||||
| Gemma4 | `Gemma4ClippableLinear` | `nn.Linear` | Redirect to `.linear` child |
|
||||
|
||||
Fix location: `src/axolotl/loaders/adapter.py` `_patch_peft_clippable_linear()`.
|
||||
|
||||
## Sample Packing
|
||||
|
||||
### How packed sequence detection works (transformers ≥ 5.x)
|
||||
`transformers.masking_utils._preprocess_mask_arguments()` detects packed sequences from `position_ids` resets. But **only when `attention_mask is None`**:
|
||||
|
||||
```python
|
||||
# From masking_utils.py:
|
||||
if position_ids is not None and attention_mask is None and past_key_values is None:
|
||||
packed_sequence_mask = find_packed_sequence_indices(position_ids)
|
||||
```
|
||||
|
||||
If the collator provides an all-ones `attention_mask`, packing detection is **skipped** and the model builds a single causal mask spanning all packed sequences → cross-sequence attention leakage → very high loss.
|
||||
|
||||
### Fix for models using `create_causal_mask_mapping`
|
||||
For Gemma3, Gemma4, and similar models that use the new transformers masking system, remove `attention_mask` from inputs when sample packing is active:
|
||||
|
||||
```python
|
||||
# In compute_loss():
|
||||
if (
|
||||
self.args.sample_packing
|
||||
and model_type in ("gemma4", "gemma3")
|
||||
and "attention_mask" in inputs
|
||||
and "position_ids" in inputs
|
||||
):
|
||||
del inputs["attention_mask"]
|
||||
```
|
||||
|
||||
Fix location: `src/axolotl/core/trainers/base.py` `compute_loss()`.
|
||||
|
||||
### Models that DON'T need this fix
|
||||
Older models that use `_prepare_4d_causal_attention_mask` (Llama, Mistral, Qwen2, etc.) handle sample packing via axolotl's multipack attention monkeypatch instead. Only models using the new `create_causal_mask_mapping` / `create_causal_mask` masking system need the `attention_mask` removal.
|
||||
|
||||
## Attention Backend Selection
|
||||
|
||||
| Backend | Config | head_dim limit | torch_compile | Notes |
|
||||
|---------|--------|---------------|---------------|-------|
|
||||
| FA2 | `attn_implementation: flash_attention_2` | 256 | ✅ | Fastest when supported |
|
||||
| FA4 | auto with `attn_implementation: flash_attention_2` | 256 (SM90+) | ✅ | Auto-detected on H100+ |
|
||||
| SDPA | `attn_implementation: sdpa` | None | ✅ | Universal fallback |
|
||||
| flex | `attn_implementation: flex_attention` | None | ⚠️ Triton OOM for large head_dim | Good for variable head dims |
|
||||
| eager | `attn_implementation: eager` | None | ✅ | Slowest, always works |
|
||||
|
||||
**Check model support**: Look at `_supports_flash_attn_2`, `_supports_flex_attn`, `_supports_sdpa` attributes on the model class.
|
||||
|
||||
**head_dim gotcha**: The 256 limit is specific to flash-attn CUDA kernels, NOT PyTorch-level. SDPA and flex_attention both handle arbitrary head_dim. Models with `global_head_dim > 256` (Gemma4: 512) must use SDPA or flex.
|
||||
|
||||
**flex + compile gotcha**: `torch_compile` with flex_attention can hit Triton shared memory OOM for large head_dim. Falls back to eager per-function (not a crash, but slower). Unsloth disables flex for Gemma4 for this reason.
|
||||
|
||||
## Cut Cross Entropy (CCE)
|
||||
|
||||
### How CCE patches work
|
||||
CCE replaces the model's `forward()` with a fused version that computes loss from hidden states + lm_head weight without materializing the full logits tensor. This saves ~`batch × seq_len × vocab_size × dtype_bytes` of VRAM.
|
||||
|
||||
### Adding CCE for a new model
|
||||
1. Check if the model type is in `cut_cross_entropy.transformers.patch.PATCH_FNS`
|
||||
2. If not, axolotl's generic fallback (`integrations/cut_cross_entropy/__init__.py` `patch_llama_like()`) patches `{Prefix}ForCausalLM.forward` with `cce_forward`
|
||||
3. For multimodal models (`ForConditionalGeneration`), a model-specific patch is needed in `ml-cross-entropy` repo
|
||||
4. The multimodal `cce_forward` must accept all extra kwargs (pixel_values, mm_token_type_ids, etc.) and pop any that would conflict before calling `self.model()`
|
||||
|
||||
### Common CCE pitfall
|
||||
If CCE appears active (log says "Applying Cut Cross Entropy") but peak VRAM doesn't decrease, check which class was patched. If the model loads as `ForConditionalGeneration` but CCE patched `ForCausalLM`, the patch is silently inactive.
|
||||
|
||||
## MoE Models
|
||||
|
||||
### Dense MLP vs MoE experts
|
||||
Some MoE models (e.g., Gemma4) have BOTH dense MLP layers and MoE expert layers at every decoder layer:
|
||||
- `gate_proj/up_proj/down_proj` → targets the **dense MLP** (`Gemma4TextMLP`)
|
||||
- `experts.gate_up_proj/experts.down_proj` → targets the **MoE experts** (`Gemma4TextExperts`)
|
||||
|
||||
LoRA on the dense MLP works normally. Expert LoRA via `lora_target_parameters` requires PEFT support for the specific expert module type (may warn "Unsupported layer type").
|
||||
|
||||
### ScatterMoE kernels
|
||||
`use_scattermoe: true` with `experts_implementation: scattermoe` registers fused expert kernels via transformers' `ExpertsInterface`. Significant speedup for MoE models. Requires the kernels plugin:
|
||||
```yaml
|
||||
plugins:
|
||||
- axolotl.integrations.kernels.KernelsPlugin
|
||||
use_kernels: true
|
||||
use_scattermoe: true
|
||||
experts_implementation: scattermoe
|
||||
```
|
||||
|
||||
## Where to Add Model-Specific Fixes
|
||||
|
||||
| What | Where | Example |
|
||||
|------|-------|---------|
|
||||
| Missing forward inputs | `core/trainers/base.py` `compute_loss()` | mm_token_type_ids injection |
|
||||
| Attention mask fixes | `core/trainers/base.py` `compute_loss()` | Sample packing mask removal |
|
||||
| Loss logging fixes | `core/trainers/base.py` `__init__()` | model_accepts_loss_kwargs override |
|
||||
| PEFT/LoRA patches | `loaders/adapter.py` | ClippableLinear redirect |
|
||||
| Attention patches | `monkeypatch/attention/` | FA4 tuple fix |
|
||||
| Model-specific patches | `loaders/patch_manager.py` `_apply_model_specific_patches()` | Llama4, Kimi, NemotronH |
|
||||
| CCE patches | `ml-cross-entropy` repo `transformers/` | Per-model cce_forward |
|
||||
| Example configs | `examples/<model>/` | Validated YAML |
|
||||
| Config validation | `utils/schemas/validation.py` | Compatibility checks |
|
||||
@@ -1,121 +0,0 @@
|
||||
# Preference Learning (RLHF) — Agent Reference
|
||||
|
||||
Reference for DPO, IPO, KTO, ORPO, and SimPO. For config templates and dataset format examples, see [rlhf.qmd](../rlhf.qmd). For GRPO, see [grpo.qmd](../grpo.qmd). For EBFT, see [ebft.qmd](../ebft.qmd).
|
||||
|
||||
## Method Overview
|
||||
|
||||
| Method | Data Requirement | Key Idea | Best For |
|
||||
|--------|-----------------|----------|----------|
|
||||
| **DPO** | Paired (chosen + rejected) | Implicit reward via preference pairs | General alignment, most common |
|
||||
| **IPO** | Paired (chosen + rejected) | DPO with different loss (avoids overfitting) | When DPO overfits |
|
||||
| **KTO** | Unpaired (completion + binary label) | Kahneman-Tversky loss, no pairs needed | When you only have thumbs-up/down |
|
||||
| **ORPO** | Paired (chosen + rejected) | Combined SFT + preference, no ref model | Single-stage alignment, saves VRAM |
|
||||
| **SimPO** | Paired (chosen + rejected) | Length-normalized, no ref model | Simple setup, length-robust |
|
||||
|
||||
Default: start with DPO. All methods require `sample_packing: false`.
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
┌──────────────┐ ┌───────────────┐ ┌───────────────┐
|
||||
│ Policy Model │ │ Reference │ │ Preference │
|
||||
│ (trainable) │ │ Model (frozen)│ │ Dataset │
|
||||
└──────┬───────┘ └──────┬────────┘ └──────┬────────┘
|
||||
└──────────┬───────┘ │
|
||||
v │
|
||||
Forward pass on chosen + rejected <─────┘
|
||||
│
|
||||
Preference Loss (DPO/IPO/KTO/...)
|
||||
│
|
||||
Backprop + Update
|
||||
|
||||
Exception: ORPO and SimPO do NOT use a reference model (~50% less VRAM).
|
||||
```
|
||||
|
||||
No vLLM server needed (unlike GRPO). Offline RL with pre-collected preference data.
|
||||
|
||||
## Method Selection
|
||||
|
||||
1. Paired preference data (chosen + rejected)?
|
||||
- Default → `rl: dpo`
|
||||
- Overfitting → `rl: dpo, dpo_loss_type: ["ipo"]`
|
||||
- VRAM-limited → `rl: orpo` (no ref model)
|
||||
- Length-sensitive → `rl: simpo` (no ref model)
|
||||
2. Only binary labels (good/bad)? → `rl: kto`
|
||||
3. Single-stage training (no separate SFT)? → `rl: orpo`
|
||||
|
||||
| | DPO | IPO | KTO | ORPO | SimPO |
|
||||
|---|---|---|---|---|---|
|
||||
| **Reference model** | Yes | Yes | Yes | No | No |
|
||||
| **VRAM overhead** | ~2x model | ~2x model | ~2x model | ~1x model | ~1x model |
|
||||
| **TRL trainer class** | DPOTrainer | DPOTrainer | KTOTrainer | ORPOTrainer | CPOTrainer |
|
||||
|
||||
## Prompt Strategy Resolution
|
||||
|
||||
The `type` field resolves to a Python function:
|
||||
|
||||
```
|
||||
type: "chatml.intel"
|
||||
→ axolotl.prompt_strategies.dpo.chatml.intel(cfg, **kwargs)
|
||||
→ returns transform_fn(sample) → {"prompt", "chosen", "rejected"}
|
||||
|
||||
type: "chat_template.default"
|
||||
→ axolotl.prompt_strategies.dpo.chat_template.default(cfg, dataset_idx, **kwargs)
|
||||
|
||||
type: {"field_prompt": "prompt", ...} (dict)
|
||||
→ axolotl.prompt_strategies.dpo.user_defined.default(...)
|
||||
```
|
||||
|
||||
Module base: `axolotl.prompt_strategies.{rl_method}` — replace `dpo` with `kto` or `orpo`.
|
||||
|
||||
## Healthy Training Indicators
|
||||
|
||||
| Metric | Healthy Range | Problem |
|
||||
|--------|--------------|---------|
|
||||
| `train/loss` | Decreasing, 0.3-0.7 | Flat or increasing = broken data or too high LR |
|
||||
| `rewards/chosen` | Increasing | Flat = model not learning preferences |
|
||||
| `rewards/rejected` | Decreasing | Increasing = model prefers wrong responses |
|
||||
| `rewards/margins` | Positive and increasing | Negative = prefers rejected over chosen |
|
||||
| `rewards/accuracies` | > 0.5, toward 0.7+ | < 0.5 = worse than random |
|
||||
| `logps/rejected` | Decreasing | Increasing = reward hacking |
|
||||
| `grad_norm` | 0.01 - 10.0 | > 100 = exploding gradients |
|
||||
|
||||
Method-specific: DPO/IPO watch `rewards/margins`; KTO loss is noisier; ORPO monitor SFT + odds ratio components; SimPO check length-normalized reward separation.
|
||||
|
||||
## Known Issues
|
||||
|
||||
| Issue | Fix |
|
||||
|-------|-----|
|
||||
| Sample packing crash | Set `sample_packing: false` (required for all preference methods) |
|
||||
| KTO `KeyError: 'label'` | Ensure dataset has boolean `label` column |
|
||||
| ORPO/KTO `KeyError` during tokenization | Add `remove_unused_columns: false` |
|
||||
| ORPO template not applied | ORPO requires explicit `chat_template` setting |
|
||||
| OOM with ref model (DPO/IPO/KTO) | Use LoRA/QLoRA, or switch to ORPO/SimPO (no ref model) |
|
||||
| IPO + label_smoothing | Do not set `dpo_label_smoothing` when `rl: ipo` |
|
||||
|
||||
Full troubleshooting: [training_stability.qmd](../training_stability.qmd)
|
||||
|
||||
## File Map
|
||||
|
||||
```
|
||||
src/axolotl/
|
||||
core/trainers/dpo/ # DPO trainer, args, strategy
|
||||
core/builders/rl.py # HFRLTrainerBuilder — routes rl type → trainer class
|
||||
core/training_args.py # AxolotlKTOConfig, AxolotlORPOConfig, AxolotlCPOConfig
|
||||
prompt_strategies/
|
||||
dpo/ # DPO/IPO/SimPO dataset strategies
|
||||
chat_template.py # chat_template.default, chat_template.argilla_chat
|
||||
chatml.py # chatml.default/intel/icr/argilla_chat/prompt_pairs/ultra
|
||||
llama3.py # llama3 variants (same subtypes as chatml)
|
||||
user_defined.py # Custom field mapping
|
||||
passthrough.py # No transform
|
||||
kto/ # KTO dataset strategies (chatml, llama3, user_defined)
|
||||
orpo/ # ORPO dataset strategies (chat_template.argilla)
|
||||
utils/schemas/enums.py # RLType enum (dpo, ipo, kto, orpo, simpo, grpo, gdpo, ebft)
|
||||
utils/schemas/config.py # All rl/dpo/kto/orpo/simpo config fields
|
||||
|
||||
docs/rlhf.qmd # Full user docs: all dataset formats, config templates
|
||||
docs/choosing_method.qmd # SFT vs DPO vs GRPO decision guide
|
||||
examples/qwen2/dpo.yaml # DPO example
|
||||
examples/llama-3/qlora-1b-kto.yaml # KTO example
|
||||
```
|
||||
@@ -1,75 +0,0 @@
|
||||
# Pretraining / Continual Pretraining — Agent Reference
|
||||
|
||||
Train on raw text with no input masking. Two approaches depending on dataset size.
|
||||
|
||||
## When to Use
|
||||
|
||||
- Continual pretraining on domain-specific corpora
|
||||
- Adapting a base model to a new language or domain before fine-tuning
|
||||
- Pretraining-style data where the entire text is the training signal
|
||||
|
||||
## Choosing an Approach
|
||||
|
||||
| | Non-streaming (`type: completion`) | Streaming (`pretraining_dataset`) |
|
||||
|---|---|---|
|
||||
| **Dataset size** | Fits in memory | Too large to fit in memory |
|
||||
| **Tokenization** | Pre-tokenized before training | On-demand during training |
|
||||
| **Config key** | `datasets:` | `pretraining_dataset:` |
|
||||
| **Long text handling** | Splits texts exceeding `sequence_len` | Concatenates into fixed-length sequences |
|
||||
| **Benefit** | Can preprocess on CPU, transfer to GPU | Start training immediately, no preprocessing |
|
||||
|
||||
## Non-Streaming: `type: completion`
|
||||
|
||||
For smaller datasets that fit in memory. Pre-tokenizes the entire dataset.
|
||||
|
||||
```yaml
|
||||
datasets:
|
||||
- path: my_corpus
|
||||
type: completion
|
||||
# field: text # Column name (default: "text")
|
||||
```
|
||||
|
||||
## Streaming: `pretraining_dataset`
|
||||
|
||||
For large corpora. Streams data on-demand without loading everything into memory.
|
||||
|
||||
```yaml
|
||||
pretraining_dataset:
|
||||
- path: HuggingFaceFW/fineweb-edu
|
||||
type: pretrain
|
||||
text_column: text
|
||||
split: train
|
||||
|
||||
max_steps: 1000 # Required — axolotl can't infer dataset size
|
||||
streaming_multipack_buffer_size: 10000 # Buffer for sample packing
|
||||
pretrain_multipack_attn: true # Prevent cross-attention between packed samples
|
||||
```
|
||||
|
||||
`max_steps` is required for streaming — one step = `sequence_len * micro_batch_size * gradient_accumulation_steps * num_gpus` tokens.
|
||||
|
||||
Full streaming docs: [streaming.qmd](../streaming.qmd)
|
||||
|
||||
## Dataset Format
|
||||
|
||||
```json
|
||||
{"text": "The complete document text goes here."}
|
||||
```
|
||||
|
||||
## Key Settings
|
||||
|
||||
- `sample_packing: true` + `pad_to_sequence_len: true` — pack documents into fixed-length sequences
|
||||
- `flash_attention: true` — required for sample packing
|
||||
- No adapter — typically full fine-tune for pretraining
|
||||
- `train_on_inputs: true` — default for completion (all tokens trained on)
|
||||
|
||||
## File Map
|
||||
|
||||
```
|
||||
src/axolotl/
|
||||
prompt_strategies/completion.py # Non-streaming: completion prompt strategy (no masking)
|
||||
utils/data/sft.py # Non-streaming: dataset loading and processing
|
||||
utils/data/streaming.py # Streaming: encode_streaming(), wrap_streaming_dataset()
|
||||
utils/schemas/config.py # Config fields: pretraining_dataset, pretrain_multipack_attn, etc.
|
||||
|
||||
examples/streaming/pretrain.yaml # Full streaming pretraining example config
|
||||
```
|
||||
@@ -1,48 +0,0 @@
|
||||
# Reward Modelling — Agent Reference
|
||||
|
||||
Train models to score responses for use as reward signals in RL. For full docs, see [reward_modelling.qmd](../reward_modelling.qmd).
|
||||
|
||||
## Types
|
||||
|
||||
### Outcome Reward Models (ORM)
|
||||
|
||||
Train a classifier to predict preference over entire interactions. Uses `AutoModelForSequenceClassification`.
|
||||
|
||||
```yaml
|
||||
base_model: google/gemma-2-2b
|
||||
model_type: AutoModelForSequenceClassification
|
||||
num_labels: 1
|
||||
reward_model: true
|
||||
chat_template: gemma
|
||||
datasets:
|
||||
- path: argilla/distilabel-intel-orca-dpo-pairs
|
||||
type: bradley_terry.chat_template
|
||||
```
|
||||
|
||||
Dataset format: `{"system": "...", "input": "...", "chosen": "...", "rejected": "..."}`
|
||||
|
||||
### Process Reward Models (PRM)
|
||||
|
||||
Train a token classifier to score each reasoning step. Uses `AutoModelForTokenClassification`.
|
||||
|
||||
```yaml
|
||||
base_model: Qwen/Qwen2.5-3B
|
||||
model_type: AutoModelForTokenClassification
|
||||
num_labels: 2
|
||||
process_reward_model: true
|
||||
datasets:
|
||||
- path: trl-lib/math_shepherd
|
||||
type: stepwise_supervised
|
||||
```
|
||||
|
||||
Dataset format: see [stepwise_supervised.qmd](../dataset-formats/stepwise_supervised.qmd).
|
||||
|
||||
## File Map
|
||||
|
||||
```
|
||||
src/axolotl/
|
||||
core/builders/causal.py # Handles reward_model flag in trainer builder
|
||||
prompt_strategies/bradley_terry/ # Bradley-Terry prompt strategies
|
||||
prompt_strategies/stepwise_supervised.py # PRM dataset strategy
|
||||
utils/schemas/config.py # reward_model, process_reward_model config fields
|
||||
```
|
||||
@@ -1,139 +0,0 @@
|
||||
# SFT — Agent Reference
|
||||
|
||||
Supervised fine-tuning pipeline reference. For config templates and dataset format examples, see [getting-started.qmd](../getting-started.qmd) and [dataset-formats/](../dataset-formats/).
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
YAML Config → axolotl train config.yaml
|
||||
|
||||
1. Load base model (+ quantization if QLoRA/8-bit)
|
||||
2. Apply adapter layers (LoRA/QLoRA) if configured
|
||||
3. Load + tokenize dataset(s)
|
||||
- Apply prompt template (chat_template / alpaca / custom)
|
||||
- Mask inputs (train_on_inputs: false)
|
||||
- Pack samples into sequences (sample_packing: true)
|
||||
4. Training loop (HuggingFace Trainer)
|
||||
- forward → loss → backward → optimizer step → lr scheduler step
|
||||
5. Save model / adapter weights + tokenizer
|
||||
|
||||
Multi-GPU: FSDP or DeepSpeed shards model across GPUs automatically.
|
||||
```
|
||||
|
||||
## Components Required
|
||||
|
||||
1. A YAML config — model, dataset(s), adapter settings, hyperparameters
|
||||
2. A dataset — HuggingFace Hub, local JSONL/JSON/Parquet, or S3/GCS path
|
||||
3. (Optional) A custom prompt strategy — for non-standard dataset formats
|
||||
|
||||
No external server processes needed (unlike GRPO which requires vLLM).
|
||||
|
||||
## Dataset Format Decision Tree
|
||||
|
||||
```
|
||||
Is your data in chat/message format?
|
||||
├─ YES: OpenAI message format (role/content)?
|
||||
│ ├─ YES ──────────────────────> type: chat_template (recommended)
|
||||
│ └─ NO (custom field names) ──> type: chat_template + message_property_mappings
|
||||
└─ NO: Instruction/response pairs?
|
||||
├─ YES ──> type: alpaca (instruction, input, output)
|
||||
└─ NO: Raw text?
|
||||
├─ YES with segments ─────> type: input_output (template-free masking)
|
||||
└─ YES continuous ────────> type: completion (pretraining-style)
|
||||
```
|
||||
|
||||
Full format specs: [dataset-formats/](../dataset-formats/)
|
||||
|
||||
## Model Size to Adapter Choice
|
||||
|
||||
| Model Size | LoRA | QLoRA (4-bit) | Full Fine-Tune | VRAM (approx) |
|
||||
|-----------|------|---------------|----------------|---------------|
|
||||
| 1-3B | Preferred | Low-budget option | Single GPU OK | 8-16 GB (LoRA) |
|
||||
| 7-8B | Preferred | Good balance | Needs multi-GPU | 16-24 GB (LoRA) |
|
||||
| 13-14B | Preferred | Good balance | Multi-GPU required | 24-40 GB (LoRA) |
|
||||
| 30-70B | LoRA or QLoRA | Preferred for single GPU | Multi-node | 40-80 GB (QLoRA) |
|
||||
|
||||
## Hyperparameter Ranges
|
||||
|
||||
| Parameter | LoRA | QLoRA | Full FT |
|
||||
|-----------|------|-------|---------|
|
||||
| `learning_rate` | 1e-4 to 3e-4 | 1e-4 to 3e-4 | 1e-5 to 5e-5 |
|
||||
| `lora_r` | 16-64 | 16-64 | N/A |
|
||||
| `lora_alpha` | 1-2x `lora_r` | 1-2x `lora_r` | N/A |
|
||||
| `micro_batch_size` | 2-8 | 2-4 | 1-2 |
|
||||
| `gradient_accumulation_steps` | 2-8 | 4-16 | 4-16 |
|
||||
| `num_epochs` | 1-3 | 1-3 | 1-3 |
|
||||
| `optimizer` | `adamw_8bit` | `adamw_bnb_8bit` | `adamw_torch_fused` |
|
||||
|
||||
Effective batch = micro_batch * grad_accum * num_gpus. Lower LR for larger models.
|
||||
|
||||
## Healthy Training Indicators
|
||||
|
||||
| Metric | Healthy | Problem |
|
||||
|--------|---------|---------|
|
||||
| `train_loss` | Decreasing, starting ~2-4 for chat models | Flat or increasing from step 1 — data or LR issue |
|
||||
| `eval_loss` | Decreasing, tracks train_loss | Increasing while train_loss decreases — overfitting |
|
||||
| `grad_norm` | 0.1-10, relatively stable | Spikes >100 — instability. 0.0 — frozen weights |
|
||||
| `learning_rate` | Follows scheduler curve | Flat or NaN — config issue |
|
||||
|
||||
Watch for: loss never decreasing (check `train_on_inputs`, dataset, LR), loss goes to 0 quickly (overfitting), eval_loss diverging (reduce epochs, add regularization). See [training_stability.qmd](../training_stability.qmd).
|
||||
|
||||
## Known Issues
|
||||
|
||||
| Issue | Fix |
|
||||
|-------|-----|
|
||||
| OOM during training | Reduce `micro_batch_size`, enable `gradient_checkpointing`, reduce `sequence_len` |
|
||||
| `sample_packing` + SDPA + bf16 = 0.0 loss | Use `attn_implementation: flash_attention_2` or disable `sample_packing` |
|
||||
| Missing chat template error | Set `chat_template: chatml` explicitly |
|
||||
| Label masking wrong | Run `axolotl preprocess config.yaml --debug` and inspect labels |
|
||||
| Loss NaN | Use `bf16: auto`, lower LR, check data for empty samples |
|
||||
| Tokenizer pad token / infinite loss | Set `special_tokens: pad_token: "<\|end_of_text\|>"` |
|
||||
| FSDP save hangs | Use `fsdp_state_dict_type: FULL_STATE_DICT` |
|
||||
| DeepSpeed CheckpointError | Set `use_reentrant: true` in `gradient_checkpointing_kwargs` |
|
||||
|
||||
## Profiling
|
||||
|
||||
To profile training and identify optimization opportunities:
|
||||
|
||||
```yaml
|
||||
# Profile steps 3-7 (after warmup/autotuning settles)
|
||||
profiler_steps_start: 3
|
||||
profiler_steps: 5
|
||||
```
|
||||
|
||||
This produces `profiler_trace.json` (Chrome trace) and `snapshot.pickle` (memory snapshot) in `output_dir`.
|
||||
View the Chrome trace at `chrome://tracing`.
|
||||
|
||||
To programmatically inspect the trace:
|
||||
```bash
|
||||
python scripts/analyze_profile.py output_dir/
|
||||
```
|
||||
|
||||
The trace shows per-kernel CUDA times, memory allocations, and operator-level breakdown. Look for:
|
||||
- **Large matmul kernels**: candidates for fusion or quantization
|
||||
- **Memory copies (H2D/D2H)**: unnecessary data movement
|
||||
- **Small frequent kernels**: candidates for kernel fusion
|
||||
- **Gaps between kernels**: pipeline bubbles from CPU overhead
|
||||
|
||||
Full troubleshooting: [training_stability.qmd](../training_stability.qmd), [debugging.qmd](../debugging.qmd)
|
||||
|
||||
## File Map
|
||||
|
||||
```
|
||||
src/axolotl/
|
||||
cli/train.py # Entry point for `axolotl train`
|
||||
cli/preprocess.py # Entry point for `axolotl preprocess`
|
||||
core/builders/causal.py # HFCausalTrainerBuilder — wires config → SFT trainer
|
||||
core/trainers/base.py # AxolotlTrainer — base trainer class
|
||||
core/trainers/mixins/ # Packing, optimizer, scheduler, checkpoints
|
||||
prompt_strategies/ # Format handlers: chat_template, alpaca, completion, input_output
|
||||
utils/schemas/config.py # AxolotlInputConfig — main config schema
|
||||
utils/schemas/datasets.py # SFTDataset, DatasetConfig
|
||||
utils/schemas/peft.py # LoraConfig — LoRA parameters
|
||||
integrations/liger/ # Liger kernel plugin
|
||||
|
||||
examples/llama-3/ # LoRA, QLoRA, full FT example configs
|
||||
docs/getting-started.qmd # Quickstart with config templates
|
||||
docs/optimizations.qmd # Flash attention, gradient checkpointing, sample packing
|
||||
docs/multi-gpu.qmd # FSDP and DeepSpeed setup
|
||||
```
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user