Compare commits
4 Commits
sdpa-cp
...
attention_
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ef883b6960 | ||
|
|
d0c4930dd5 | ||
|
|
6ee7cb30fa | ||
|
|
ba47adc24b |
76
.github/workflows/base.yml
vendored
76
.github/workflows/base.yml
vendored
@@ -17,7 +17,7 @@ jobs:
|
|||||||
build-base:
|
build-base:
|
||||||
if: github.repository_owner == 'axolotl-ai-cloud'
|
if: github.repository_owner == 'axolotl-ai-cloud'
|
||||||
# this job needs to be run on self-hosted GPU runners...
|
# this job needs to be run on self-hosted GPU runners...
|
||||||
runs-on: ubuntu-latest-m
|
runs-on: axolotl-gpu-runner
|
||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
@@ -28,50 +28,42 @@ jobs:
|
|||||||
python_version: "3.11"
|
python_version: "3.11"
|
||||||
pytorch: 2.5.1
|
pytorch: 2.5.1
|
||||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||||
dockerfile: "Dockerfile-base"
|
|
||||||
- cuda: "124"
|
- cuda: "124"
|
||||||
cuda_version: 12.4.1
|
cuda_version: 12.4.1
|
||||||
cudnn_version: ""
|
cudnn_version: ""
|
||||||
python_version: "3.11"
|
python_version: "3.11"
|
||||||
pytorch: 2.6.0
|
pytorch: 2.6.0
|
||||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||||
dockerfile: "Dockerfile-base"
|
|
||||||
- cuda: "126"
|
- cuda: "126"
|
||||||
cuda_version: 12.6.3
|
cuda_version: 12.6.3
|
||||||
cudnn_version: ""
|
cudnn_version: ""
|
||||||
python_version: "3.11"
|
python_version: "3.11"
|
||||||
pytorch: 2.6.0
|
pytorch: 2.6.0
|
||||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||||
dockerfile: "Dockerfile-base"
|
|
||||||
- cuda: "126"
|
- cuda: "126"
|
||||||
cuda_version: 12.6.3
|
cuda_version: 12.6.3
|
||||||
cudnn_version: ""
|
cudnn_version: ""
|
||||||
python_version: "3.11"
|
python_version: "3.11"
|
||||||
pytorch: 2.7.0
|
pytorch: 2.7.0
|
||||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||||
dockerfile: "Dockerfile-base"
|
|
||||||
- cuda: "128"
|
- cuda: "128"
|
||||||
cuda_version: 12.6.3
|
cuda_version: 12.6.3
|
||||||
cudnn_version: ""
|
cudnn_version: ""
|
||||||
python_version: "3.11"
|
python_version: "3.11"
|
||||||
pytorch: 2.7.0
|
pytorch: 2.7.0
|
||||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||||
dockerfile: "Dockerfile-base"
|
|
||||||
- cuda: "128"
|
- cuda: "128"
|
||||||
cuda_version: 12.8.1
|
cuda_version: 12.8.1
|
||||||
cudnn_version: ""
|
cudnn_version: ""
|
||||||
python_version: "3.11"
|
python_version: "3.11"
|
||||||
pytorch: nightly
|
pytorch: nightly
|
||||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||||
dockerfile: "Dockerfile-base-nightly"
|
- cuda: "128"
|
||||||
# # "next" is for release candidates of pytorch
|
cuda_version: 12.8.1
|
||||||
# - cuda: "128"
|
cudnn_version: ""
|
||||||
# cuda_version: 12.8.1
|
python_version: "3.11"
|
||||||
# cudnn_version: ""
|
pytorch: next
|
||||||
# python_version: "3.11"
|
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||||
# pytorch: next
|
|
||||||
# torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
|
||||||
# dockerfile: "Dockerfile-base-next"
|
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
@@ -93,59 +85,7 @@ jobs:
|
|||||||
uses: docker/build-push-action@v4
|
uses: docker/build-push-action@v4
|
||||||
with:
|
with:
|
||||||
context: .
|
context: .
|
||||||
file: ./docker/${{ matrix.dockerfile }}
|
file: ${{ matrix.pytorch == 'nightly' && './docker/Dockerfile-base-nightly' || matrix.pytorch == 'next' && './docker/Dockerfile-base-next' || './docker/Dockerfile-base' }}
|
||||||
push: ${{ github.event_name != 'pull_request' }}
|
|
||||||
tags: ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
|
|
||||||
labels: ${{ steps.metadata.outputs.labels }}
|
|
||||||
build-args: |
|
|
||||||
CUDA_VERSION=${{ matrix.cuda_version }}
|
|
||||||
CUDNN_VERSION=${{ matrix.cudnn_version }}
|
|
||||||
CUDA=${{ matrix.cuda }}
|
|
||||||
PYTHON_VERSION=${{ matrix.python_version }}
|
|
||||||
PYTORCH_VERSION=${{ matrix.pytorch }}
|
|
||||||
TORCH_CUDA_ARCH_LIST=${{ matrix.torch_cuda_arch_list }}
|
|
||||||
build-base-uv:
|
|
||||||
if: github.repository_owner == 'axolotl-ai-cloud'
|
|
||||||
runs-on: ubuntu-latest-m
|
|
||||||
strategy:
|
|
||||||
fail-fast: false
|
|
||||||
matrix:
|
|
||||||
include:
|
|
||||||
- cuda: "126"
|
|
||||||
cuda_version: 12.6.3
|
|
||||||
cudnn_version: ""
|
|
||||||
python_version: "3.11"
|
|
||||||
pytorch: 2.6.0
|
|
||||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
|
||||||
dockerfile: "Dockerfile-uv-base"
|
|
||||||
- cuda: "128"
|
|
||||||
cuda_version: 12.8.1
|
|
||||||
cudnn_version: ""
|
|
||||||
python_version: "3.11"
|
|
||||||
pytorch: 2.7.0
|
|
||||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
|
||||||
dockerfile: "Dockerfile-uv-base"
|
|
||||||
steps:
|
|
||||||
- name: Checkout
|
|
||||||
uses: actions/checkout@v4
|
|
||||||
- name: Docker metadata
|
|
||||||
id: metadata
|
|
||||||
uses: docker/metadata-action@v5
|
|
||||||
with:
|
|
||||||
images: |
|
|
||||||
axolotlai/axolotl-base-uv
|
|
||||||
- name: Login to Docker Hub
|
|
||||||
uses: docker/login-action@v2
|
|
||||||
with:
|
|
||||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
|
||||||
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
|
||||||
- name: Set up Docker Buildx
|
|
||||||
uses: docker/setup-buildx-action@v3
|
|
||||||
- name: Build
|
|
||||||
uses: docker/build-push-action@v4
|
|
||||||
with:
|
|
||||||
context: .
|
|
||||||
file: ./docker/${{ matrix.dockerfile }}
|
|
||||||
push: ${{ github.event_name != 'pull_request' }}
|
push: ${{ github.event_name != 'pull_request' }}
|
||||||
tags: ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
|
tags: ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
|
||||||
labels: ${{ steps.metadata.outputs.labels }}
|
labels: ${{ steps.metadata.outputs.labels }}
|
||||||
|
|||||||
1
.github/workflows/lint.yml
vendored
1
.github/workflows/lint.yml
vendored
@@ -9,7 +9,6 @@ on:
|
|||||||
- '.github/workflows/*.yml'
|
- '.github/workflows/*.yml'
|
||||||
- "*.[q]md"
|
- "*.[q]md"
|
||||||
- "examples/**/*.y[a]?ml"
|
- "examples/**/*.y[a]?ml"
|
||||||
- ".pre-commit-config.yaml"
|
|
||||||
workflow_dispatch:
|
workflow_dispatch:
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
|
|||||||
10
.github/workflows/main.yml
vendored
10
.github/workflows/main.yml
vendored
@@ -31,11 +31,6 @@ jobs:
|
|||||||
python_version: "3.11"
|
python_version: "3.11"
|
||||||
pytorch: 2.7.0
|
pytorch: 2.7.0
|
||||||
axolotl_extras:
|
axolotl_extras:
|
||||||
- cuda: 128
|
|
||||||
cuda_version: 12.8.1
|
|
||||||
python_version: "3.11"
|
|
||||||
pytorch: 2.7.0
|
|
||||||
axolotl_extras:
|
|
||||||
runs-on: axolotl-gpu-runner
|
runs-on: axolotl-gpu-runner
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
@@ -99,11 +94,6 @@ jobs:
|
|||||||
python_version: "3.11"
|
python_version: "3.11"
|
||||||
pytorch: 2.7.0
|
pytorch: 2.7.0
|
||||||
axolotl_extras:
|
axolotl_extras:
|
||||||
- cuda: 128
|
|
||||||
cuda_version: 12.8.1
|
|
||||||
python_version: "3.11"
|
|
||||||
pytorch: 2.7.0
|
|
||||||
axolotl_extras:
|
|
||||||
runs-on: axolotl-gpu-runner
|
runs-on: axolotl-gpu-runner
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
|
|||||||
6
.github/workflows/multi-gpu-e2e.yml
vendored
6
.github/workflows/multi-gpu-e2e.yml
vendored
@@ -3,12 +3,12 @@ name: docker-multigpu-tests-biweekly
|
|||||||
on:
|
on:
|
||||||
pull_request:
|
pull_request:
|
||||||
paths:
|
paths:
|
||||||
- 'tests/e2e/multigpu/**.py'
|
- 'tests/e2e/multigpu/*.py'
|
||||||
- 'requirements.txt'
|
- 'requirements.txt'
|
||||||
- 'setup.py'
|
- 'setup.py'
|
||||||
- 'pyproject.toml'
|
- 'pyproject.toml'
|
||||||
- '.github/workflows/multi-gpu-e2e.yml'
|
- '.github/workflows/multi-gpu-e2e.yml'
|
||||||
- 'src/axolotl/core/trainers/mixins/context_parallel.py'
|
- 'src/axolotl/core/trainers/mixins/sequence_parallel.py'
|
||||||
- 'src/axolotl/utils/distributed.py'
|
- 'src/axolotl/utils/distributed.py'
|
||||||
workflow_dispatch:
|
workflow_dispatch:
|
||||||
schedule:
|
schedule:
|
||||||
@@ -59,7 +59,7 @@ jobs:
|
|||||||
- name: Install Modal
|
- name: Install Modal
|
||||||
run: |
|
run: |
|
||||||
python -m pip install --upgrade pip
|
python -m pip install --upgrade pip
|
||||||
pip install modal==1.0.2 jinja2
|
pip install modal==0.71.8 jinja2
|
||||||
- name: Update env vars
|
- name: Update env vars
|
||||||
run: |
|
run: |
|
||||||
echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
|
echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
|
||||||
|
|||||||
9
.github/workflows/precommit-autoupdate.yml
vendored
9
.github/workflows/precommit-autoupdate.yml
vendored
@@ -25,6 +25,7 @@ jobs:
|
|||||||
pre-commit autoupdate
|
pre-commit autoupdate
|
||||||
if [[ -n $(git status --porcelain) ]]; then
|
if [[ -n $(git status --porcelain) ]]; then
|
||||||
echo "changes=true" >> $GITHUB_OUTPUT
|
echo "changes=true" >> $GITHUB_OUTPUT
|
||||||
|
git diff .pre-commit-config.yaml > pre-commit-update.diff
|
||||||
fi
|
fi
|
||||||
|
|
||||||
- name: Create Pull Request
|
- name: Create Pull Request
|
||||||
@@ -38,3 +39,11 @@ jobs:
|
|||||||
commit-message: "chore: update pre-commit hooks"
|
commit-message: "chore: update pre-commit hooks"
|
||||||
body: |
|
body: |
|
||||||
Automated PR to update pre-commit hooks to their latest versions.
|
Automated PR to update pre-commit hooks to their latest versions.
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>Changes:</summary>
|
||||||
|
|
||||||
|
```diff
|
||||||
|
${{ steps.update.outputs.diff }}
|
||||||
|
```
|
||||||
|
</details>
|
||||||
|
|||||||
87
.github/workflows/tests-nightly.yml
vendored
87
.github/workflows/tests-nightly.yml
vendored
@@ -18,96 +18,9 @@ jobs:
|
|||||||
env:
|
env:
|
||||||
SKIP: no-commit-to-branch
|
SKIP: no-commit-to-branch
|
||||||
|
|
||||||
preload-cache:
|
|
||||||
name: Preload HF cache
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
strategy:
|
|
||||||
fail-fast: false
|
|
||||||
matrix:
|
|
||||||
python_version: ["3.11"]
|
|
||||||
pytorch_version: ["2.6.0"]
|
|
||||||
timeout-minutes: 20
|
|
||||||
|
|
||||||
env:
|
|
||||||
AXOLOTL_IS_CI_CACHE_PRELOAD: "1"
|
|
||||||
|
|
||||||
steps:
|
|
||||||
- name: Check out repository code
|
|
||||||
uses: actions/checkout@v4
|
|
||||||
|
|
||||||
- name: Restore HF cache
|
|
||||||
id: hf-cache-restore
|
|
||||||
uses: actions/cache/restore@v4
|
|
||||||
with:
|
|
||||||
path: |
|
|
||||||
/home/runner/.cache/huggingface/hub/datasets--*
|
|
||||||
/home/runner/.cache/huggingface/hub/models--*
|
|
||||||
key: ${{ runner.os }}-hf-hub-cache-v2
|
|
||||||
|
|
||||||
- name: Setup Python
|
|
||||||
uses: actions/setup-python@v5
|
|
||||||
with:
|
|
||||||
python-version: ${{ matrix.python_version }}
|
|
||||||
cache: 'pip' # caching pip dependencies
|
|
||||||
|
|
||||||
- name: upgrade pip
|
|
||||||
run: |
|
|
||||||
pip3 install --upgrade pip
|
|
||||||
pip3 install --upgrade packaging==23.2 setuptools==75.8.0 wheel
|
|
||||||
|
|
||||||
- name: Install PyTorch
|
|
||||||
run: |
|
|
||||||
pip3 install torch==${{ matrix.pytorch_version }}
|
|
||||||
|
|
||||||
- name: Install dependencies
|
|
||||||
run: |
|
|
||||||
pip3 show torch
|
|
||||||
pip3 install --no-build-isolation -U -e .
|
|
||||||
python scripts/unsloth_install.py | sh
|
|
||||||
python scripts/cutcrossentropy_install.py | sh
|
|
||||||
pip3 install -r requirements-dev.txt -r requirements-tests.txt
|
|
||||||
|
|
||||||
- name: Make sure PyTorch version wasn't clobbered
|
|
||||||
run: |
|
|
||||||
python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
|
|
||||||
|
|
||||||
- name: Ensure axolotl CLI was installed
|
|
||||||
run: |
|
|
||||||
axolotl --help
|
|
||||||
|
|
||||||
- name: Pre-Download dataset fixture
|
|
||||||
run: |
|
|
||||||
huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
|
|
||||||
|
|
||||||
- name: Run tests
|
|
||||||
run: |
|
|
||||||
pytest -v tests/conftest.py
|
|
||||||
|
|
||||||
- name: Upload coverage to Codecov
|
|
||||||
uses: codecov/codecov-action@v5
|
|
||||||
with:
|
|
||||||
token: ${{ secrets.CODECOV_TOKEN }}
|
|
||||||
files: ./coverage.xml
|
|
||||||
flags: unittests,pytorch-${{ matrix.pytorch_version }}
|
|
||||||
fail_ci_if_error: false
|
|
||||||
|
|
||||||
- name: cleanup pip cache
|
|
||||||
run: |
|
|
||||||
find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
|
|
||||||
|
|
||||||
- name: Save HF cache
|
|
||||||
id: hf-cache
|
|
||||||
uses: actions/cache/save@v4
|
|
||||||
with:
|
|
||||||
path: |
|
|
||||||
/home/runner/.cache/huggingface/hub/datasets--*
|
|
||||||
/home/runner/.cache/huggingface/hub/models--*
|
|
||||||
key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
|
|
||||||
|
|
||||||
pytest:
|
pytest:
|
||||||
name: PyTest
|
name: PyTest
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
needs: [preload-cache]
|
|
||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
max-parallel: 2
|
max-parallel: 2
|
||||||
|
|||||||
183
.github/workflows/tests.yml
vendored
183
.github/workflows/tests.yml
vendored
@@ -44,10 +44,96 @@ jobs:
|
|||||||
env:
|
env:
|
||||||
SKIP: no-commit-to-branch
|
SKIP: no-commit-to-branch
|
||||||
|
|
||||||
|
preload-cache:
|
||||||
|
name: Preload HF cache
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
strategy:
|
||||||
|
fail-fast: false
|
||||||
|
matrix:
|
||||||
|
python_version: ["3.11"]
|
||||||
|
pytorch_version: ["2.6.0"]
|
||||||
|
timeout-minutes: 20
|
||||||
|
|
||||||
|
env:
|
||||||
|
AXOLOTL_IS_CI_CACHE_PRELOAD: "1"
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Check out repository code
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Restore HF cache
|
||||||
|
id: hf-cache-restore
|
||||||
|
uses: actions/cache/restore@v4
|
||||||
|
with:
|
||||||
|
path: |
|
||||||
|
/home/runner/.cache/huggingface/hub/datasets--*
|
||||||
|
/home/runner/.cache/huggingface/hub/models--*
|
||||||
|
key: ${{ runner.os }}-hf-hub-cache-v2
|
||||||
|
|
||||||
|
- name: Setup Python
|
||||||
|
uses: actions/setup-python@v5
|
||||||
|
with:
|
||||||
|
python-version: ${{ matrix.python_version }}
|
||||||
|
cache: 'pip' # caching pip dependencies
|
||||||
|
|
||||||
|
- name: upgrade pip
|
||||||
|
run: |
|
||||||
|
pip3 install --upgrade pip
|
||||||
|
pip3 install --upgrade packaging==23.2 setuptools==75.8.0 wheel
|
||||||
|
|
||||||
|
- name: Install PyTorch
|
||||||
|
run: |
|
||||||
|
pip3 install torch==${{ matrix.pytorch_version }}
|
||||||
|
|
||||||
|
- name: Install dependencies
|
||||||
|
run: |
|
||||||
|
pip3 show torch
|
||||||
|
pip3 install --no-build-isolation -U -e .
|
||||||
|
python scripts/unsloth_install.py | sh
|
||||||
|
python scripts/cutcrossentropy_install.py | sh
|
||||||
|
pip3 install -r requirements-dev.txt -r requirements-tests.txt
|
||||||
|
|
||||||
|
- name: Make sure PyTorch version wasn't clobbered
|
||||||
|
run: |
|
||||||
|
python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
|
||||||
|
|
||||||
|
- name: Ensure axolotl CLI was installed
|
||||||
|
run: |
|
||||||
|
axolotl --help
|
||||||
|
|
||||||
|
- name: Pre-Download dataset fixture
|
||||||
|
run: |
|
||||||
|
huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
|
||||||
|
|
||||||
|
- name: Run tests
|
||||||
|
run: |
|
||||||
|
pytest -v tests/conftest.py
|
||||||
|
|
||||||
|
- name: Upload coverage to Codecov
|
||||||
|
uses: codecov/codecov-action@v5
|
||||||
|
with:
|
||||||
|
token: ${{ secrets.CODECOV_TOKEN }}
|
||||||
|
files: ./coverage.xml
|
||||||
|
flags: unittests,pytorch-${{ matrix.pytorch_version }}
|
||||||
|
fail_ci_if_error: false
|
||||||
|
|
||||||
|
- name: cleanup pip cache
|
||||||
|
run: |
|
||||||
|
find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
|
||||||
|
|
||||||
|
- name: Save HF cache
|
||||||
|
id: hf-cache
|
||||||
|
uses: actions/cache/save@v4
|
||||||
|
with:
|
||||||
|
path: |
|
||||||
|
/home/runner/.cache/huggingface/hub/datasets--*
|
||||||
|
/home/runner/.cache/huggingface/hub/models--*
|
||||||
|
key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
|
||||||
|
|
||||||
pytest:
|
pytest:
|
||||||
name: PyTest
|
name: PyTest
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
# needs: [preload-cache]
|
needs: [preload-cache]
|
||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
@@ -59,11 +145,14 @@ jobs:
|
|||||||
- name: Check out repository code
|
- name: Check out repository code
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
- name: Restore Cache from S3
|
- name: Restore HF cache
|
||||||
id: hf-cache-restore-s3
|
id: hf-cache-restore
|
||||||
run: |
|
uses: actions/cache/restore@v4
|
||||||
mkdir -p /home/runner/.cache/huggingface/hub
|
with:
|
||||||
curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xf - -C /home/runner/.cache/huggingface/hub/ --use-compress-program unzstd
|
path: |
|
||||||
|
/home/runner/.cache/huggingface/hub/datasets--*
|
||||||
|
/home/runner/.cache/huggingface/hub/models--*
|
||||||
|
key: ${{ runner.os }}-hf-hub-cache-v2
|
||||||
|
|
||||||
- name: Setup Python
|
- name: Setup Python
|
||||||
uses: actions/setup-python@v5
|
uses: actions/setup-python@v5
|
||||||
@@ -121,6 +210,7 @@ jobs:
|
|||||||
pytest-sdist:
|
pytest-sdist:
|
||||||
name: PyTest from Source Dist
|
name: PyTest from Source Dist
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
|
needs: [preload-cache]
|
||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
@@ -132,11 +222,14 @@ jobs:
|
|||||||
- name: Check out repository code
|
- name: Check out repository code
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
- name: Restore Cache from S3
|
- name: Restore HF cache
|
||||||
id: hf-cache-restore-s3
|
id: hf-cache-restore
|
||||||
run: |
|
uses: actions/cache/restore@v4
|
||||||
mkdir -p /home/runner/.cache/huggingface/hub
|
with:
|
||||||
curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xf - -C /home/runner/.cache/huggingface/hub/ --use-compress-program unzstd
|
path: |
|
||||||
|
/home/runner/.cache/huggingface/hub/datasets--*
|
||||||
|
/home/runner/.cache/huggingface/hub/models--*
|
||||||
|
key: ${{ runner.os }}-hf-hub-cache-v2
|
||||||
|
|
||||||
- name: Setup Python
|
- name: Setup Python
|
||||||
uses: actions/setup-python@v5
|
uses: actions/setup-python@v5
|
||||||
@@ -184,7 +277,6 @@ jobs:
|
|||||||
find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
|
find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
|
||||||
|
|
||||||
docker-e2e-tests-1st:
|
docker-e2e-tests-1st:
|
||||||
# Run this job first as a gate for running the remainder of the test matrix
|
|
||||||
if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' }}
|
if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' }}
|
||||||
# this job needs to be run on self-hosted GPU runners...
|
# this job needs to be run on self-hosted GPU runners...
|
||||||
runs-on: [self-hosted, modal]
|
runs-on: [self-hosted, modal]
|
||||||
@@ -201,13 +293,6 @@ jobs:
|
|||||||
pytorch: 2.6.0
|
pytorch: 2.6.0
|
||||||
num_gpus: 1
|
num_gpus: 1
|
||||||
axolotl_extras: vllm
|
axolotl_extras: vllm
|
||||||
- cuda: 126
|
|
||||||
cuda_version: 12.6.3
|
|
||||||
python_version: "3.11"
|
|
||||||
pytorch: 2.6.0
|
|
||||||
num_gpus: 1
|
|
||||||
axolotl_extras:
|
|
||||||
dockerfile: "Dockerfile-uv.jinja"
|
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
@@ -218,7 +303,7 @@ jobs:
|
|||||||
- name: Install Modal
|
- name: Install Modal
|
||||||
run: |
|
run: |
|
||||||
python -m pip install --upgrade pip
|
python -m pip install --upgrade pip
|
||||||
pip install modal==1.0.2 jinja2
|
pip install modal==0.71.8 jinja2
|
||||||
- name: Update env vars
|
- name: Update env vars
|
||||||
run: |
|
run: |
|
||||||
echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
|
echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
|
||||||
@@ -229,7 +314,6 @@ jobs:
|
|||||||
echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
|
echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
|
||||||
echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
|
echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
|
||||||
echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
|
echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
|
||||||
echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
|
|
||||||
- name: Run tests job on Modal
|
- name: Run tests job on Modal
|
||||||
run: |
|
run: |
|
||||||
modal run cicd.e2e_tests
|
modal run cicd.e2e_tests
|
||||||
@@ -239,8 +323,6 @@ jobs:
|
|||||||
# this job needs to be run on self-hosted GPU runners...
|
# this job needs to be run on self-hosted GPU runners...
|
||||||
runs-on: [self-hosted, modal]
|
runs-on: [self-hosted, modal]
|
||||||
timeout-minutes: 90
|
timeout-minutes: 90
|
||||||
# Only run the remainder of the matrix if the first e2e check passed;
|
|
||||||
# this is to save on wasted compute costs for known failures that get caught in the first run
|
|
||||||
needs: [pre-commit, pytest, docker-e2e-tests-1st]
|
needs: [pre-commit, pytest, docker-e2e-tests-1st]
|
||||||
|
|
||||||
strategy:
|
strategy:
|
||||||
@@ -253,6 +335,12 @@ jobs:
|
|||||||
pytorch: 2.6.0
|
pytorch: 2.6.0
|
||||||
num_gpus: 1
|
num_gpus: 1
|
||||||
axolotl_extras: llmcompressor
|
axolotl_extras: llmcompressor
|
||||||
|
- cuda: 124
|
||||||
|
cuda_version: 12.4.1
|
||||||
|
python_version: "3.11"
|
||||||
|
pytorch: 2.4.1
|
||||||
|
num_gpus: 1
|
||||||
|
axolotl_extras:
|
||||||
- cuda: 124
|
- cuda: 124
|
||||||
cuda_version: 12.4.1
|
cuda_version: 12.4.1
|
||||||
python_version: "3.11"
|
python_version: "3.11"
|
||||||
@@ -265,12 +353,6 @@ jobs:
|
|||||||
pytorch: 2.7.0
|
pytorch: 2.7.0
|
||||||
num_gpus: 1
|
num_gpus: 1
|
||||||
axolotl_extras:
|
axolotl_extras:
|
||||||
- cuda: 128
|
|
||||||
cuda_version: 12.8.1
|
|
||||||
python_version: "3.11"
|
|
||||||
pytorch: 2.7.0
|
|
||||||
num_gpus: 1
|
|
||||||
axolotl_extras:
|
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
@@ -281,7 +363,7 @@ jobs:
|
|||||||
- name: Install Modal
|
- name: Install Modal
|
||||||
run: |
|
run: |
|
||||||
python -m pip install --upgrade pip
|
python -m pip install --upgrade pip
|
||||||
pip install modal==1.0.2 jinja2
|
pip install modal==0.71.8 jinja2
|
||||||
- name: Update env vars
|
- name: Update env vars
|
||||||
run: |
|
run: |
|
||||||
echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
|
echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
|
||||||
@@ -292,47 +374,6 @@ jobs:
|
|||||||
echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
|
echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
|
||||||
echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
|
echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
|
||||||
echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
|
echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
|
||||||
echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
|
|
||||||
- name: Run tests job on Modal
|
- name: Run tests job on Modal
|
||||||
run: |
|
run: |
|
||||||
modal run cicd.e2e_tests
|
modal run cicd.e2e_tests
|
||||||
|
|
||||||
docker-e2e-cleanup:
|
|
||||||
runs-on: [self-hosted, modal]
|
|
||||||
timeout-minutes: 90
|
|
||||||
needs: [docker-e2e-tests]
|
|
||||||
|
|
||||||
strategy:
|
|
||||||
fail-fast: false
|
|
||||||
matrix:
|
|
||||||
include:
|
|
||||||
- cuda: 124
|
|
||||||
cuda_version: 12.4.1
|
|
||||||
python_version: "3.11"
|
|
||||||
pytorch: 2.6.0
|
|
||||||
num_gpus: 1
|
|
||||||
axolotl_extras: vllm
|
|
||||||
steps:
|
|
||||||
- name: Checkout
|
|
||||||
uses: actions/checkout@v4
|
|
||||||
- name: Install Python
|
|
||||||
uses: actions/setup-python@v5
|
|
||||||
with:
|
|
||||||
python-version: "3.11"
|
|
||||||
- name: Install Modal
|
|
||||||
run: |
|
|
||||||
python -m pip install --upgrade pip
|
|
||||||
pip install modal==1.0.2 jinja2
|
|
||||||
- name: Update env vars
|
|
||||||
run: |
|
|
||||||
echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
|
|
||||||
echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
|
|
||||||
echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
|
|
||||||
echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
|
|
||||||
echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
|
|
||||||
echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
|
|
||||||
echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
|
|
||||||
echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
|
|
||||||
- name: Run tests job on Modal
|
|
||||||
run: |
|
|
||||||
modal run cicd.cleanup
|
|
||||||
|
|||||||
@@ -19,15 +19,15 @@ repos:
|
|||||||
hooks:
|
hooks:
|
||||||
- id: isort
|
- id: isort
|
||||||
- repo: https://github.com/PyCQA/flake8
|
- repo: https://github.com/PyCQA/flake8
|
||||||
rev: 7.2.0
|
rev: 7.1.2
|
||||||
hooks:
|
hooks:
|
||||||
- id: flake8
|
- id: flake8
|
||||||
- repo: https://github.com/pylint-dev/pylint
|
- repo: https://github.com/pylint-dev/pylint
|
||||||
rev: v3.3.7
|
rev: v3.3.6
|
||||||
hooks:
|
hooks:
|
||||||
- id: pylint
|
- id: pylint
|
||||||
- repo: https://github.com/pre-commit/mirrors-mypy
|
- repo: https://github.com/pre-commit/mirrors-mypy
|
||||||
rev: v1.16.0
|
rev: v1.15.0
|
||||||
hooks:
|
hooks:
|
||||||
- id: mypy
|
- id: mypy
|
||||||
additional_dependencies:
|
additional_dependencies:
|
||||||
|
|||||||
@@ -242,12 +242,16 @@
|
|||||||
# early_stopping_patience: 3
|
# early_stopping_patience: 3
|
||||||
|
|
||||||
# # Specify a scheduler and kwargs to use with the optimizer
|
# # Specify a scheduler and kwargs to use with the optimizer
|
||||||
# lr_scheduler: # 'one_cycle' | empty for cosine
|
# lr_scheduler: # 'one_cycle' | 'log_sweep' | empty for cosine
|
||||||
# lr_scheduler_kwargs:
|
# lr_scheduler_kwargs:
|
||||||
|
|
||||||
# # For one_cycle optim
|
# # For one_cycle optim
|
||||||
# lr_div_factor: # Learning rate div factor
|
# lr_div_factor: # Learning rate div factor
|
||||||
|
|
||||||
|
# # For log_sweep optim
|
||||||
|
# log_sweep_min_lr:
|
||||||
|
# log_sweep_max_lr:
|
||||||
|
|
||||||
# # Specify optimizer
|
# # Specify optimizer
|
||||||
# # Valid values are driven by the Transformers OptimizerNames class, see:
|
# # Valid values are driven by the Transformers OptimizerNames class, see:
|
||||||
# # https://github.com/huggingface/transformers/blob/95b374952dc27d8511541d6f5a4e22c9ec11fb24/src/transformers/training_args.py#L134
|
# # https://github.com/huggingface/transformers/blob/95b374952dc27d8511541d6f5a4e22c9ec11fb24/src/transformers/training_args.py#L134
|
||||||
|
|||||||
@@ -57,10 +57,8 @@ async def handler(job):
|
|||||||
logger.info("Training Complete.")
|
logger.info("Training Complete.")
|
||||||
|
|
||||||
# Cleanup
|
# Cleanup
|
||||||
if "WANDB_API_KEY" in os.environ:
|
del os.environ["WANDB_API_KEY"]
|
||||||
del os.environ["WANDB_API_KEY"]
|
del os.environ["HF_TOKEN"]
|
||||||
if "HF_TOKEN" in os.environ:
|
|
||||||
del os.environ["HF_TOKEN"]
|
|
||||||
|
|
||||||
|
|
||||||
runpod.serverless.start({"handler": handler, "return_aggregate_stream": True})
|
runpod.serverless.start({"handler": handler, "return_aggregate_stream": True})
|
||||||
|
|||||||
@@ -51,7 +51,7 @@ Features:
|
|||||||
|
|
||||||
- NVIDIA GPU (Ampere or newer for `bf16` and Flash Attention) or AMD GPU
|
- NVIDIA GPU (Ampere or newer for `bf16` and Flash Attention) or AMD GPU
|
||||||
- Python 3.11
|
- Python 3.11
|
||||||
- PyTorch ≥2.5.1
|
- PyTorch ≥2.4.1
|
||||||
|
|
||||||
### Installation
|
### Installation
|
||||||
|
|
||||||
|
|||||||
45
_quarto.yml
45
_quarto.yml
@@ -17,9 +17,7 @@ quartodoc:
|
|||||||
- convert
|
- convert
|
||||||
- prompt_tokenizers
|
- prompt_tokenizers
|
||||||
- logging_config
|
- logging_config
|
||||||
- core.builders.base
|
- core.trainer_builder
|
||||||
- core.builders.causal
|
|
||||||
- core.builders.rl
|
|
||||||
- core.training_args
|
- core.training_args
|
||||||
- core.chat.messages
|
- core.chat.messages
|
||||||
- core.chat.format.chatml
|
- core.chat.format.chatml
|
||||||
@@ -45,37 +43,13 @@ quartodoc:
|
|||||||
- cli.vllm_serve
|
- cli.vllm_serve
|
||||||
- cli.cloud.base
|
- cli.cloud.base
|
||||||
- cli.cloud.modal_
|
- cli.cloud.modal_
|
||||||
- cli.quantize
|
|
||||||
- title: Trainers
|
- title: Trainers
|
||||||
desc: Training implementations
|
desc: Training implementations
|
||||||
contents:
|
contents:
|
||||||
- core.trainers.base
|
- core.trainers.base
|
||||||
- core.trainers.trl
|
- core.trainers.trl
|
||||||
- core.trainers.mamba
|
|
||||||
- core.trainers.relora
|
|
||||||
- core.trainers.dpo.trainer
|
- core.trainers.dpo.trainer
|
||||||
- core.trainers.grpo.trainer
|
- core.trainers.grpo.trainer
|
||||||
- core.trainers.grpo.sampler
|
|
||||||
- core.trainers.utils
|
|
||||||
- title: Model Loading
|
|
||||||
desc: Functionality for loading and patching models, tokenizers, etc.
|
|
||||||
contents:
|
|
||||||
- loaders.model
|
|
||||||
- loaders.tokenizer
|
|
||||||
- loaders.processor
|
|
||||||
- loaders.adapter
|
|
||||||
- loaders.patch_manager
|
|
||||||
- loaders.constants
|
|
||||||
- title: Mixins
|
|
||||||
desc: Mixin classes for augmenting trainers
|
|
||||||
contents:
|
|
||||||
- core.trainers.mixins.optimizer
|
|
||||||
- core.trainers.mixins.rng_state_loader
|
|
||||||
- core.trainers.mixins.scheduler
|
|
||||||
- title: Context Managers
|
|
||||||
desc: Context managers for altering trainer behaviors
|
|
||||||
contents:
|
|
||||||
- utils.ctx_managers.context_parallel
|
|
||||||
- title: Prompt Strategies
|
- title: Prompt Strategies
|
||||||
desc: Prompt formatting strategies
|
desc: Prompt formatting strategies
|
||||||
contents:
|
contents:
|
||||||
@@ -112,7 +86,7 @@ quartodoc:
|
|||||||
- kernels.swiglu
|
- kernels.swiglu
|
||||||
- kernels.quantize
|
- kernels.quantize
|
||||||
- kernels.utils
|
- kernels.utils
|
||||||
- title: Monkey Patches
|
- title: MonkeyPatches
|
||||||
desc: Runtime patches for model optimizations
|
desc: Runtime patches for model optimizations
|
||||||
contents:
|
contents:
|
||||||
- monkeypatch.llama_attn_hijack_flash
|
- monkeypatch.llama_attn_hijack_flash
|
||||||
@@ -129,16 +103,17 @@ quartodoc:
|
|||||||
- monkeypatch.trainer_fsdp_optim
|
- monkeypatch.trainer_fsdp_optim
|
||||||
- monkeypatch.transformers_fa_utils
|
- monkeypatch.transformers_fa_utils
|
||||||
- monkeypatch.unsloth_
|
- monkeypatch.unsloth_
|
||||||
|
- monkeypatch.attention.mllama
|
||||||
- monkeypatch.data.batch_dataset_fetcher
|
- monkeypatch.data.batch_dataset_fetcher
|
||||||
- monkeypatch.mixtral
|
- monkeypatch.mixtral
|
||||||
- monkeypatch.gradient_checkpointing.offload_cpu
|
|
||||||
- monkeypatch.gradient_checkpointing.offload_disk
|
|
||||||
- title: Utils
|
- title: Utils
|
||||||
desc: Utility functions
|
desc: Utility functions
|
||||||
contents:
|
contents:
|
||||||
|
- utils.models
|
||||||
- utils.tokenization
|
- utils.tokenization
|
||||||
- utils.chat_templates
|
- utils.chat_templates
|
||||||
- utils.lora
|
- utils.lora
|
||||||
|
- utils.lora_embeddings
|
||||||
- utils.model_shard_quant
|
- utils.model_shard_quant
|
||||||
- utils.bench
|
- utils.bench
|
||||||
- utils.freeze
|
- utils.freeze
|
||||||
@@ -149,7 +124,7 @@ quartodoc:
|
|||||||
- utils.optimizers.adopt
|
- utils.optimizers.adopt
|
||||||
- utils.data.pretraining
|
- utils.data.pretraining
|
||||||
- utils.data.sft
|
- utils.data.sft
|
||||||
- utils.quantization
|
- utils.gradient_checkpointing.unsloth
|
||||||
- title: Schemas
|
- title: Schemas
|
||||||
desc: Pydantic data models for Axolotl config
|
desc: Pydantic data models for Axolotl config
|
||||||
contents:
|
contents:
|
||||||
@@ -199,14 +174,12 @@ quartodoc:
|
|||||||
- utils.callbacks.lisa
|
- utils.callbacks.lisa
|
||||||
- utils.callbacks.mlflow_
|
- utils.callbacks.mlflow_
|
||||||
- utils.callbacks.comet_
|
- utils.callbacks.comet_
|
||||||
- utils.callbacks.qat
|
|
||||||
website:
|
website:
|
||||||
title: "Axolotl"
|
title: "Axolotl"
|
||||||
description: "We make fine-tuning accessible, scalable, and fun"
|
description: "We make fine-tuning accessible, scalable, and fun"
|
||||||
favicon: favicon.jpg
|
favicon: favicon.jpg
|
||||||
|
|
||||||
google-analytics: "G-9KYCVJBNMQ"
|
|
||||||
|
|
||||||
navbar:
|
navbar:
|
||||||
logo: image/axolotl_logo_digital_white.svg
|
logo: image/axolotl_logo_digital_white.svg
|
||||||
title: false
|
title: false
|
||||||
@@ -259,8 +232,6 @@ website:
|
|||||||
- docs/lr_groups.qmd
|
- docs/lr_groups.qmd
|
||||||
- docs/lora_optims.qmd
|
- docs/lora_optims.qmd
|
||||||
- docs/dataset_loading.qmd
|
- docs/dataset_loading.qmd
|
||||||
- docs/qat.qmd
|
|
||||||
- docs/quantize.qmd
|
|
||||||
|
|
||||||
- section: "Core Concepts"
|
- section: "Core Concepts"
|
||||||
contents:
|
contents:
|
||||||
@@ -274,7 +245,7 @@ website:
|
|||||||
- docs/unsloth.qmd
|
- docs/unsloth.qmd
|
||||||
- docs/torchao.qmd
|
- docs/torchao.qmd
|
||||||
- docs/custom_integrations.qmd
|
- docs/custom_integrations.qmd
|
||||||
- docs/context_parallelism.qmd
|
- docs/sequence_parallelism.qmd
|
||||||
|
|
||||||
- section: "Troubleshooting"
|
- section: "Troubleshooting"
|
||||||
contents:
|
contents:
|
||||||
|
|||||||
@@ -1,52 +0,0 @@
|
|||||||
FROM axolotlai/axolotl-base-uv:{{ BASE_TAG }}
|
|
||||||
|
|
||||||
ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
|
|
||||||
ENV AXOLOTL_EXTRAS="{{ AXOLOTL_EXTRAS }}"
|
|
||||||
ENV AXOLOTL_ARGS="{{ AXOLOTL_ARGS }}"
|
|
||||||
ENV CUDA="{{ CUDA }}"
|
|
||||||
ENV PYTORCH_VERSION="{{ PYTORCH_VERSION }}"
|
|
||||||
ENV GITHUB_REF="{{ GITHUB_REF }}"
|
|
||||||
ENV GITHUB_SHA="{{ GITHUB_SHA }}"
|
|
||||||
ENV NIGHTLY_BUILD="{{ NIGHTLY_BUILD }}"
|
|
||||||
ENV HF_HOME="{{ HF_HOME }}"
|
|
||||||
|
|
||||||
RUN apt-get update && \
|
|
||||||
apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev
|
|
||||||
|
|
||||||
WORKDIR /workspace
|
|
||||||
|
|
||||||
RUN git clone --depth=1 https://github.com/axolotl-ai-cloud/axolotl.git
|
|
||||||
|
|
||||||
WORKDIR /workspace/axolotl
|
|
||||||
|
|
||||||
RUN git fetch origin +$GITHUB_REF && \
|
|
||||||
git checkout FETCH_HEAD
|
|
||||||
|
|
||||||
# If AXOLOTL_EXTRAS is set, append it in brackets
|
|
||||||
RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
|
|
||||||
sed -i 's#^transformers.*#transformers @ git+https://github.com/huggingface/transformers.git@main#' requirements.txt; \
|
|
||||||
sed -i 's#^peft.*#peft @ git+https://github.com/huggingface/peft.git@main#' requirements.txt; \
|
|
||||||
sed -i 's#^accelerate.*#accelerate @ git+https://github.com/huggingface/accelerate.git@main#' requirements.txt; \
|
|
||||||
sed -i 's#^trl.*#trl @ git+https://github.com/huggingface/trl.git@main#' requirements.txt; \
|
|
||||||
sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
|
|
||||||
fi
|
|
||||||
|
|
||||||
RUN uv pip install packaging==23.2 setuptools==75.8.0
|
|
||||||
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
|
|
||||||
uv pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
|
|
||||||
else \
|
|
||||||
uv pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray] $AXOLOTL_ARGS; \
|
|
||||||
fi
|
|
||||||
|
|
||||||
RUN python scripts/unsloth_install.py --uv | sh
|
|
||||||
RUN python scripts/cutcrossentropy_install.py --uv | sh
|
|
||||||
|
|
||||||
# So we can test the Docker image
|
|
||||||
RUN uv pip install -r requirements-dev.txt -r requirements-tests.txt
|
|
||||||
|
|
||||||
# fix so that git fetch/pull from remote works
|
|
||||||
RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
|
|
||||||
git config --get remote.origin.fetch
|
|
||||||
|
|
||||||
# helper for huggingface-login cli
|
|
||||||
RUN git config --global credential.helper store
|
|
||||||
@@ -18,7 +18,7 @@ pytest -v --durations=10 \
|
|||||||
--cov-append
|
--cov-append
|
||||||
|
|
||||||
# Run patched tests excluding lora kernels with coverage append
|
# Run patched tests excluding lora kernels with coverage append
|
||||||
pytest --full-trace -vvv --durations=10 \
|
pytest -v --durations=10 \
|
||||||
--ignore=tests/e2e/patched/lora_kernels \
|
--ignore=tests/e2e/patched/lora_kernels \
|
||||||
/workspace/axolotl/tests/e2e/patched \
|
/workspace/axolotl/tests/e2e/patched \
|
||||||
--cov=axolotl \
|
--cov=axolotl \
|
||||||
|
|||||||
@@ -1,19 +0,0 @@
|
|||||||
"""Modal app to run axolotl GPU cleanup"""
|
|
||||||
|
|
||||||
from .single_gpu import VOLUME_CONFIG, app, cicd_image, run_cmd
|
|
||||||
|
|
||||||
|
|
||||||
@app.function(
|
|
||||||
image=cicd_image,
|
|
||||||
timeout=60 * 60,
|
|
||||||
cpu=8.0,
|
|
||||||
memory=131072,
|
|
||||||
volumes=VOLUME_CONFIG,
|
|
||||||
)
|
|
||||||
def cleanup():
|
|
||||||
run_cmd("./cicd/cleanup.sh", "/workspace/axolotl")
|
|
||||||
|
|
||||||
|
|
||||||
@app.local_entrypoint()
|
|
||||||
def main():
|
|
||||||
cleanup.remote()
|
|
||||||
@@ -1,6 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
set -e
|
|
||||||
|
|
||||||
# cleanup old cache files for datasets processing and intermediate mappings
|
|
||||||
find /workspace/data/huggingface-cache/hub/datasets -name "cache-*" -type f -mtime +1 -exec rm {} \;
|
|
||||||
find /workspace/data/huggingface-cache/hub/datasets -name "*.lock" -type f -mtime +1 -exec rm {} \;
|
|
||||||
@@ -1,12 +1,75 @@
|
|||||||
"""Modal app to run axolotl GPU tests"""
|
"""Modal app to run axolotl GPU tests"""
|
||||||
|
|
||||||
from .single_gpu import GPU_CONFIG, VOLUME_CONFIG, app, cicd_image, run_cmd
|
# pylint: disable=duplicate-code
|
||||||
|
|
||||||
|
import os
|
||||||
|
import pathlib
|
||||||
|
import tempfile
|
||||||
|
|
||||||
|
import jinja2
|
||||||
|
import modal
|
||||||
|
from jinja2 import select_autoescape
|
||||||
|
from modal import App, Image
|
||||||
|
|
||||||
|
cicd_path = pathlib.Path(__file__).parent.resolve()
|
||||||
|
|
||||||
|
template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
|
||||||
|
template_env = jinja2.Environment(
|
||||||
|
loader=template_loader, autoescape=select_autoescape()
|
||||||
|
)
|
||||||
|
df_template = template_env.get_template("Dockerfile.jinja")
|
||||||
|
|
||||||
|
df_args = {
|
||||||
|
"AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
|
||||||
|
"AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
|
||||||
|
"PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.4.1"),
|
||||||
|
"BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu121-2.4.1"),
|
||||||
|
"CUDA": os.environ.get("CUDA", "121"),
|
||||||
|
"GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
|
||||||
|
"GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
|
||||||
|
"NIGHTLY_BUILD": os.environ.get("NIGHTLY_BUILD", ""),
|
||||||
|
"CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""),
|
||||||
|
"HF_HOME": "/workspace/data/huggingface-cache/hub",
|
||||||
|
}
|
||||||
|
|
||||||
|
dockerfile_contents = df_template.render(**df_args)
|
||||||
|
|
||||||
|
temp_dir = tempfile.mkdtemp()
|
||||||
|
with open(pathlib.Path(temp_dir) / "Dockerfile", "w", encoding="utf-8") as f:
|
||||||
|
f.write(dockerfile_contents)
|
||||||
|
|
||||||
|
cicd_image = Image.from_dockerfile(
|
||||||
|
pathlib.Path(temp_dir) / "Dockerfile",
|
||||||
|
context_mount=None,
|
||||||
|
force_build=True,
|
||||||
|
gpu="A10G",
|
||||||
|
).env(df_args)
|
||||||
|
|
||||||
|
app = App("Axolotl CI/CD", secrets=[])
|
||||||
|
|
||||||
|
hf_cache_volume = modal.Volume.from_name(
|
||||||
|
"axolotl-ci-hf-hub-cache", create_if_missing=True
|
||||||
|
)
|
||||||
|
VOLUME_CONFIG = {
|
||||||
|
"/workspace/data/huggingface-cache/hub": hf_cache_volume,
|
||||||
|
}
|
||||||
|
|
||||||
|
N_GPUS = int(os.environ.get("N_GPUS", 1))
|
||||||
|
GPU_CONFIG = modal.gpu.L40S(count=N_GPUS)
|
||||||
|
|
||||||
|
|
||||||
|
def run_cmd(cmd: str, run_folder: str):
|
||||||
|
import subprocess # nosec
|
||||||
|
|
||||||
|
# Propagate errors from subprocess.
|
||||||
|
if exit_code := subprocess.call(cmd.split(), cwd=run_folder): # nosec
|
||||||
|
exit(exit_code) # pylint: disable=consider-using-sys-exit
|
||||||
|
|
||||||
|
|
||||||
@app.function(
|
@app.function(
|
||||||
image=cicd_image,
|
image=cicd_image,
|
||||||
gpu=GPU_CONFIG,
|
gpu=GPU_CONFIG,
|
||||||
timeout=90 * 60, # 90 min
|
timeout=60 * 60,
|
||||||
cpu=8.0,
|
cpu=8.0,
|
||||||
memory=131072,
|
memory=131072,
|
||||||
volumes=VOLUME_CONFIG,
|
volumes=VOLUME_CONFIG,
|
||||||
|
|||||||
@@ -24,9 +24,9 @@ df_template = template_env.get_template("Dockerfile.jinja")
|
|||||||
df_args = {
|
df_args = {
|
||||||
"AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
|
"AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
|
||||||
"AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
|
"AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
|
||||||
"PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.5.1"),
|
"PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.4.1"),
|
||||||
"BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu124-2.5.1"),
|
"BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu121-2.4.1"),
|
||||||
"CUDA": os.environ.get("CUDA", "124"),
|
"CUDA": os.environ.get("CUDA", "121"),
|
||||||
"GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
|
"GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
|
||||||
"GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
|
"GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
|
||||||
"CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""),
|
"CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""),
|
||||||
@@ -55,7 +55,7 @@ VOLUME_CONFIG = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
N_GPUS = int(os.environ.get("N_GPUS", 2))
|
N_GPUS = int(os.environ.get("N_GPUS", 2))
|
||||||
GPU_CONFIG = f"H100:{N_GPUS}"
|
GPU_CONFIG = modal.gpu.H100(count=N_GPUS)
|
||||||
|
|
||||||
|
|
||||||
def run_cmd(cmd: str, run_folder: str):
|
def run_cmd(cmd: str, run_folder: str):
|
||||||
@@ -70,7 +70,7 @@ def run_cmd(cmd: str, run_folder: str):
|
|||||||
image=cicd_image,
|
image=cicd_image,
|
||||||
gpu=GPU_CONFIG,
|
gpu=GPU_CONFIG,
|
||||||
timeout=90 * 60,
|
timeout=90 * 60,
|
||||||
cpu=16.0,
|
cpu=8.0,
|
||||||
memory=131072 * N_GPUS,
|
memory=131072 * N_GPUS,
|
||||||
volumes=VOLUME_CONFIG,
|
volumes=VOLUME_CONFIG,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -1,68 +0,0 @@
|
|||||||
"""Modal app to run axolotl GPU tests"""
|
|
||||||
|
|
||||||
# pylint: disable=duplicate-code
|
|
||||||
|
|
||||||
import os
|
|
||||||
import pathlib
|
|
||||||
import tempfile
|
|
||||||
|
|
||||||
import jinja2
|
|
||||||
import modal
|
|
||||||
import modal.experimental
|
|
||||||
from jinja2 import select_autoescape
|
|
||||||
from modal import App
|
|
||||||
|
|
||||||
cicd_path = pathlib.Path(__file__).parent.resolve()
|
|
||||||
|
|
||||||
template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
|
|
||||||
template_env = jinja2.Environment(
|
|
||||||
loader=template_loader, autoescape=select_autoescape()
|
|
||||||
)
|
|
||||||
dockerfile = os.environ.get("E2E_DOCKERFILE", "Dockerfile.jinja")
|
|
||||||
df_template = template_env.get_template(dockerfile)
|
|
||||||
|
|
||||||
df_args = {
|
|
||||||
"AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
|
|
||||||
"AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
|
|
||||||
"PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.5.1"),
|
|
||||||
"BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu124-2.5.1"),
|
|
||||||
"CUDA": os.environ.get("CUDA", "124"),
|
|
||||||
"GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
|
|
||||||
"GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
|
|
||||||
"NIGHTLY_BUILD": os.environ.get("NIGHTLY_BUILD", ""),
|
|
||||||
"CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""),
|
|
||||||
"HF_HOME": "/workspace/data/huggingface-cache/hub",
|
|
||||||
}
|
|
||||||
|
|
||||||
dockerfile_contents = df_template.render(**df_args)
|
|
||||||
|
|
||||||
temp_dir = tempfile.mkdtemp()
|
|
||||||
with open(pathlib.Path(temp_dir) / "Dockerfile", "w", encoding="utf-8") as f:
|
|
||||||
f.write(dockerfile_contents)
|
|
||||||
|
|
||||||
cicd_image = modal.experimental.raw_dockerfile_image(
|
|
||||||
pathlib.Path(temp_dir) / "Dockerfile",
|
|
||||||
# context_mount=None,
|
|
||||||
force_build=True,
|
|
||||||
# gpu="A10G",
|
|
||||||
).env(df_args)
|
|
||||||
|
|
||||||
app = App("Axolotl CI/CD", secrets=[])
|
|
||||||
|
|
||||||
hf_cache_volume = modal.Volume.from_name(
|
|
||||||
"axolotl-ci-hf-hub-cache", create_if_missing=True
|
|
||||||
)
|
|
||||||
VOLUME_CONFIG = {
|
|
||||||
"/workspace/data/huggingface-cache/hub": hf_cache_volume,
|
|
||||||
}
|
|
||||||
|
|
||||||
N_GPUS = int(os.environ.get("N_GPUS", 1))
|
|
||||||
GPU_CONFIG = f"L40S:{N_GPUS}"
|
|
||||||
|
|
||||||
|
|
||||||
def run_cmd(cmd: str, run_folder: str):
|
|
||||||
import subprocess # nosec
|
|
||||||
|
|
||||||
# Propagate errors from subprocess.
|
|
||||||
if exit_code := subprocess.call(cmd.split(), cwd=run_folder): # nosec
|
|
||||||
exit(exit_code) # pylint: disable=consider-using-sys-exit
|
|
||||||
@@ -19,7 +19,7 @@ coverage:
|
|||||||
if_no_uploads: error
|
if_no_uploads: error
|
||||||
if_not_found: success
|
if_not_found: success
|
||||||
if_ci_failed: error
|
if_ci_failed: error
|
||||||
only_pulls: true
|
only_pulls: false
|
||||||
flags: null
|
flags: null
|
||||||
paths: null
|
paths: null
|
||||||
patch:
|
patch:
|
||||||
|
|||||||
@@ -1,36 +0,0 @@
|
|||||||
ARG CUDA_VERSION="12.6.3"
|
|
||||||
ARG CUDNN_VERSION=""
|
|
||||||
ARG UBUNTU_VERSION="22.04"
|
|
||||||
ARG MAX_JOBS=4
|
|
||||||
|
|
||||||
FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION AS base-builder
|
|
||||||
|
|
||||||
ARG PYTHON_VERSION="3.11"
|
|
||||||
ARG PYTORCH_VERSION="2.6.0"
|
|
||||||
ARG CUDA="126"
|
|
||||||
ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
|
|
||||||
|
|
||||||
ENV PYTHON_VERSION=$PYTHON_VERSION
|
|
||||||
ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
|
|
||||||
ENV UV_TORCH_BACKEND="cu${CUDA}"
|
|
||||||
|
|
||||||
RUN apt-get update \
|
|
||||||
&& apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev pkg-config curl && rm -rf /var/lib/apt/lists/* \
|
|
||||||
&& git lfs install --skip-repo \
|
|
||||||
&& curl -LsSf https://astral.sh/uv/install.sh | sh
|
|
||||||
|
|
||||||
ENV PATH="/root/.local/bin:${PATH}"
|
|
||||||
|
|
||||||
RUN uv python install ${PYTHON_VERSION}
|
|
||||||
|
|
||||||
WORKDIR /workspace
|
|
||||||
|
|
||||||
RUN uv venv --no-project --relocatable axolotl-venv
|
|
||||||
|
|
||||||
ENV PATH="/workspace/axolotl-venv/bin:${PATH}"
|
|
||||||
|
|
||||||
RUN uv pip install packaging setuptools wheel \
|
|
||||||
&& uv pip install torch==${PYTORCH_VERSION} \
|
|
||||||
&& uv pip install --no-build-isolation "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" \
|
|
||||||
&& uv pip install "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" \
|
|
||||||
&& uv pip install awscli pydantic
|
|
||||||
10
docs/cli.qmd
10
docs/cli.qmd
@@ -209,16 +209,6 @@ axolotl delinearize-llama4 --model path/to/model_dir --output path/to/output_dir
|
|||||||
|
|
||||||
This would be necessary to use with other frameworks. If you have an adapter, merge it with the non-quantized linearized model before delinearizing.
|
This would be necessary to use with other frameworks. If you have an adapter, merge it with the non-quantized linearized model before delinearizing.
|
||||||
|
|
||||||
### quantize
|
|
||||||
|
|
||||||
Quantizes a model using the quantization configuration specified in your YAML file.
|
|
||||||
|
|
||||||
```bash
|
|
||||||
axolotl quantize config.yml
|
|
||||||
```
|
|
||||||
|
|
||||||
See [Quantization](./quantize.qmd) for more details.
|
|
||||||
|
|
||||||
|
|
||||||
## Legacy CLI Usage
|
## Legacy CLI Usage
|
||||||
|
|
||||||
|
|||||||
@@ -65,20 +65,6 @@ bnb_config_kwargs:
|
|||||||
bnb_4bit_quant_type: nf4
|
bnb_4bit_quant_type: nf4
|
||||||
bnb_4bit_use_double_quant: true
|
bnb_4bit_use_double_quant: true
|
||||||
|
|
||||||
# quantization aware training
|
|
||||||
qat:
|
|
||||||
activation_dtype: # Optional[str] = "int8". Fake quantization layout to use for activation quantization. Valid options are "int4" and "int8"
|
|
||||||
weight_dtype: # Optional[str] = "int8". Fake quantization layout to use for weight quantization. Valid options are "int4" and "int8"
|
|
||||||
group_size: # Optional[int] = 32. The number of elements in each group for per-group fake quantization
|
|
||||||
fake_quant_after_n_steps: # Optional[int] = None. The number of steps to apply fake quantization after
|
|
||||||
|
|
||||||
# post-training quantization
|
|
||||||
quantization:
|
|
||||||
weight_dtype: # Optional[str] = "int8". Fake quantization layout to use for weight quantization. Valid options are uintX for X in [1, 2, 3, 4, 5, 6, 7], or int4, or int8
|
|
||||||
activation_dtype: # Optional[str] = "int8". Fake quantization layout to use for activation quantization. Valid options are "int4" and "int8"
|
|
||||||
group_size: # Optional[int] = 32. The number of elements in each group for per-group fake quantization
|
|
||||||
quantize_embedding: # Optional[bool] = False. Whether to quantize the embedding layer.
|
|
||||||
|
|
||||||
|
|
||||||
# Whether you are training a 4-bit GPTQ quantized model
|
# Whether you are training a 4-bit GPTQ quantized model
|
||||||
gptq: true
|
gptq: true
|
||||||
@@ -112,10 +98,8 @@ plugins:
|
|||||||
# - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
|
# - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
|
||||||
|
|
||||||
# A list of one or more datasets to finetune the model with
|
# A list of one or more datasets to finetune the model with
|
||||||
# See https://docs.axolotl.ai/docs/dataset_loading.html for guide on loading datasets
|
|
||||||
# See https://docs.axolotl.ai/docs/dataset-formats/ for guide on dataset formats
|
|
||||||
datasets:
|
datasets:
|
||||||
# HuggingFace dataset repo | s3:// | gs:// | path to local file or directory
|
# HuggingFace dataset repo | s3://,gs:// path | "json" for local dataset, make sure to fill data_files
|
||||||
- path: vicgalle/alpaca-gpt4
|
- path: vicgalle/alpaca-gpt4
|
||||||
# The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]
|
# The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]
|
||||||
type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
|
type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
|
||||||
@@ -237,7 +221,7 @@ datasets:
|
|||||||
# The same applies to the `test_datasets` option and the `pretraining_dataset` option. Default is true.
|
# The same applies to the `test_datasets` option and the `pretraining_dataset` option. Default is true.
|
||||||
shuffle_merged_datasets: true
|
shuffle_merged_datasets: true
|
||||||
|
|
||||||
# Deduplicates datasets and test_datasets with identical entries.
|
Deduplicates datasets and test_datasets with identical entries.
|
||||||
dataset_exact_deduplication: true
|
dataset_exact_deduplication: true
|
||||||
|
|
||||||
# A list of one or more datasets to eval the model with.
|
# A list of one or more datasets to eval the model with.
|
||||||
@@ -286,25 +270,10 @@ trl:
|
|||||||
|
|
||||||
num_generations: # Optional[int]. Number of generations to sample.
|
num_generations: # Optional[int]. Number of generations to sample.
|
||||||
log_completions: # Optional[bool]. Whether to log completions.
|
log_completions: # Optional[bool]. Whether to log completions.
|
||||||
num_completions_to_print: # Optional[int]. Number of completions to print when log_completions is True.
|
|
||||||
|
|
||||||
sync_ref_model: # Optional[bool]. Whether to sync the reference model.
|
sync_ref_model: # Optional[bool]. Whether to sync the reference model.
|
||||||
ref_model_mixup_alpha: # Optional[float]. Mixup alpha for the reference model.
|
ref_model_mixup_alpha: # Optional[float]. Mixup alpha for the reference model.
|
||||||
ref_model_sync_steps: # Optional[int]. Sync steps for the reference model.
|
ref_model_sync_steps: # Optional[int]. Sync steps for the reference model.
|
||||||
scale_rewards: # Optional[bool]. Whether to scale rewards by their standard deviation.
|
|
||||||
|
|
||||||
temperature: # Optional[float]. Sampling temperature for the GRPO policy.
|
|
||||||
top_p: # Optional[float]. Top-p sampling probability for the generation policy.
|
|
||||||
top_k: # Optional[int]. Top-k sampling for the generation policy.
|
|
||||||
min_p: # Optional[float]. Minimum probability for the generation policy.
|
|
||||||
repetition_penalty: # Optional[float]. Penalty for tokens that appear in prompt and generated text.
|
|
||||||
|
|
||||||
num_iterations: # Optional[int]. Number of iterations per batch (μ) for GRPO.
|
|
||||||
epsilon: # Optional[float]. Epsilon value for clipping in the GRPO algorithm.
|
|
||||||
epsilon_high: # Optional[float]. Upper-bound epsilon value for clipping in the GRPO algorithm.
|
|
||||||
use_liger_loss: # Optional[bool]. Whether to use Liger loss for GRPO.
|
|
||||||
loss_type: # Optional[str]. Loss formulation to use. Supported values: grpo, bnpo, dr_grpo.
|
|
||||||
mask_truncated_completions: # Optional[bool]. Whether to exclude truncated completions from loss calculation.
|
|
||||||
|
|
||||||
|
|
||||||
# reward modelling: `True` or `False`
|
# reward modelling: `True` or `False`
|
||||||
@@ -514,7 +483,6 @@ output_dir: ./completed-model
|
|||||||
# setting to `auto` will enable torch compile when torch>=2.5.1
|
# setting to `auto` will enable torch compile when torch>=2.5.1
|
||||||
torch_compile: # Optional[Union[Literal["auto"], bool]]
|
torch_compile: # Optional[Union[Literal["auto"], bool]]
|
||||||
torch_compile_backend: # Optional[str]
|
torch_compile_backend: # Optional[str]
|
||||||
torch_compile_mode: # 'default' | 'reduce-overhead' | 'max-autotune'
|
|
||||||
|
|
||||||
# Training hyperparameters
|
# Training hyperparameters
|
||||||
|
|
||||||
@@ -537,7 +505,6 @@ save_strategy: # Set to `"no"` to skip checkpoint saves, `"epoch"` at end of eac
|
|||||||
save_steps: # Leave empty to save at each epoch, integer for every N steps. float for fraction of total steps
|
save_steps: # Leave empty to save at each epoch, integer for every N steps. float for fraction of total steps
|
||||||
saves_per_epoch: # number of times per epoch to save a checkpoint, mutually exclusive with save_steps
|
saves_per_epoch: # number of times per epoch to save a checkpoint, mutually exclusive with save_steps
|
||||||
save_total_limit: # Checkpoints saved at a time
|
save_total_limit: # Checkpoints saved at a time
|
||||||
save_only_model: # Save only the model weights, skipping the optimizer. Using this means you can't resume from checkpoints.
|
|
||||||
# Maximum number of iterations to train for. It precedes num_epochs which means that
|
# Maximum number of iterations to train for. It precedes num_epochs which means that
|
||||||
# if both are set, num_epochs will not be guaranteed.
|
# if both are set, num_epochs will not be guaranteed.
|
||||||
# e.g., when 1 epoch is 1000 steps => `num_epochs: 2` and `max_steps: 100` will train for 100 steps
|
# e.g., when 1 epoch is 1000 steps => `num_epochs: 2` and `max_steps: 100` will train for 100 steps
|
||||||
@@ -561,7 +528,7 @@ profiler_steps: # enable the pytorch profiler to capture the first N steps of tr
|
|||||||
loss_watchdog_threshold: # High loss value, indicating the learning has broken down (a good estimate is ~2 times the loss at the start of training)
|
loss_watchdog_threshold: # High loss value, indicating the learning has broken down (a good estimate is ~2 times the loss at the start of training)
|
||||||
loss_watchdog_patience: # Number of high-loss steps in a row before the trainer aborts (default: 3)
|
loss_watchdog_patience: # Number of high-loss steps in a row before the trainer aborts (default: 3)
|
||||||
|
|
||||||
# Save model as safetensors (require safetensors package). Default True
|
# Save model as safetensors (require safetensors package)
|
||||||
save_safetensors:
|
save_safetensors:
|
||||||
|
|
||||||
# Whether to mask out or include the human's prompt from the training labels
|
# Whether to mask out or include the human's prompt from the training labels
|
||||||
@@ -571,7 +538,7 @@ train_on_inputs: false
|
|||||||
# Note that training loss may have an oscillating pattern with this enabled.
|
# Note that training loss may have an oscillating pattern with this enabled.
|
||||||
group_by_length: false
|
group_by_length: false
|
||||||
|
|
||||||
# Whether to use gradient checkpointing. Available options are: true, false, "offload", "offload_disk".
|
# Whether to use gradient checkpointing. Available options are: true, false, "offload".
|
||||||
# https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing
|
# https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing
|
||||||
gradient_checkpointing: false
|
gradient_checkpointing: false
|
||||||
# additional kwargs to pass to the trainer for gradient checkpointing
|
# additional kwargs to pass to the trainer for gradient checkpointing
|
||||||
@@ -583,24 +550,7 @@ gradient_checkpointing: false
|
|||||||
early_stopping_patience: 3
|
early_stopping_patience: 3
|
||||||
|
|
||||||
# Specify a scheduler and kwargs to use with the optimizer
|
# Specify a scheduler and kwargs to use with the optimizer
|
||||||
# Valid values are driven by the Transformers SchedulerType class, see:
|
lr_scheduler: # 'one_cycle' | 'rex' | 'log_sweep' | 'linear' | 'cosine_with_restarts' | 'polynomial' | 'constant' | 'constant_with_warmup' | 'inverse_sqrt' | 'reduce_lr_on_plateau' | 'cosine_with_min_lr' | 'warmup_stable_decay' | empty for cosine
|
||||||
# https://github.com/huggingface/transformers/blob/5f4ecf2d9f867a1255131d2461d75793c0cf1db2/src/transformers/trainer_utils.py#L420
|
|
||||||
# Valid values include
|
|
||||||
# - 'linear'
|
|
||||||
# - 'cosine' (default)
|
|
||||||
# - 'cosine_with_restarts'
|
|
||||||
# - 'polynomial'
|
|
||||||
# - 'constant'
|
|
||||||
# - 'constant_with_warmup'
|
|
||||||
# - 'inverse_sqrt'
|
|
||||||
# - 'reduce_lr_on_plateau'
|
|
||||||
# - 'cosine_with_min_lr'
|
|
||||||
# - 'warmup_stable_decay'
|
|
||||||
|
|
||||||
# Additional schedulers include:
|
|
||||||
# - 'one_cycle'
|
|
||||||
# - 'rex'
|
|
||||||
lr_scheduler:
|
|
||||||
lr_scheduler_kwargs:
|
lr_scheduler_kwargs:
|
||||||
cosine_min_lr_ratio: # decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of peak lr
|
cosine_min_lr_ratio: # decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of peak lr
|
||||||
cosine_constant_lr_ratio: # freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means start cosine_min_lr at 80% of training step (https://arxiv.org/pdf/2308.04014.pdf)
|
cosine_constant_lr_ratio: # freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means start cosine_min_lr at 80% of training step (https://arxiv.org/pdf/2308.04014.pdf)
|
||||||
@@ -618,7 +568,7 @@ lr_div_factor: # Learning rate div factor
|
|||||||
#
|
#
|
||||||
# Valid values for 'optimizer' include:
|
# Valid values for 'optimizer' include:
|
||||||
# - adamw_torch
|
# - adamw_torch
|
||||||
# - adamw_torch_fused (default)
|
# - adamw_torch_fused
|
||||||
# - adamw_torch_xla
|
# - adamw_torch_xla
|
||||||
# - adamw_torch_npu_fused
|
# - adamw_torch_npu_fused
|
||||||
# - adamw_apex_fused
|
# - adamw_apex_fused
|
||||||
@@ -662,7 +612,6 @@ lr_div_factor: # Learning rate div factor
|
|||||||
# - optimi_adamw
|
# - optimi_adamw
|
||||||
# - ao_adamw_8bit
|
# - ao_adamw_8bit
|
||||||
# - ao_adamw_fp8
|
# - ao_adamw_fp8
|
||||||
# - came_pytorch
|
|
||||||
optimizer:
|
optimizer:
|
||||||
# Dictionary of arguments to pass to the optimizer
|
# Dictionary of arguments to pass to the optimizer
|
||||||
optim_args:
|
optim_args:
|
||||||
@@ -682,9 +631,7 @@ weight_decay:
|
|||||||
# adamw hyperparams
|
# adamw hyperparams
|
||||||
adam_beta1:
|
adam_beta1:
|
||||||
adam_beta2:
|
adam_beta2:
|
||||||
adam_beta3: # only used for CAME Optimizer
|
|
||||||
adam_epsilon:
|
adam_epsilon:
|
||||||
adam_epsilon2: # only used for CAME Optimizer
|
|
||||||
# Gradient clipping max norm
|
# Gradient clipping max norm
|
||||||
max_grad_norm:
|
max_grad_norm:
|
||||||
|
|
||||||
@@ -764,13 +711,13 @@ ddp_timeout:
|
|||||||
ddp_bucket_cap_mb:
|
ddp_bucket_cap_mb:
|
||||||
ddp_broadcast_buffers:
|
ddp_broadcast_buffers:
|
||||||
|
|
||||||
# Context parallelism
|
# Sequence parallelism
|
||||||
# Set to a divisor of the number of GPUs available to split sequences into chunks of equal size.
|
# Set to a divisor of the number of GPUs available to split sequences into chunks of equal size.
|
||||||
# Use in long context training to prevent OOM when sequences cannot fit into a single GPU's VRAM.
|
# Use in long context training to prevent OOM when sequences cannot fit into a single GPU's VRAM.
|
||||||
# E.g., if 4 GPUs are available, set this value to 2 to split each sequence into two equal-sized
|
# E.g., if 4 GPUs are available, set this value to 2 to split each sequence into two equal-sized
|
||||||
# subsequences, or set to 4 to split into four equal-sized subsequences.
|
# subsequences, or set to 4 to split into four equal-sized subsequences.
|
||||||
# See https://docs.axolotl.ai/docs/context_parallelism.html for more details.
|
# See https://docs.axolotl.ai/docs/sequence_parallelism.html for more details.
|
||||||
context_parallel_degree:
|
sequence_parallel_degree:
|
||||||
# Optional; strides across the key dimension. Larger values use more memory but should make training faster.
|
# Optional; strides across the key dimension. Larger values use more memory but should make training faster.
|
||||||
# Must evenly divide the number of KV heads in your model.
|
# Must evenly divide the number of KV heads in your model.
|
||||||
heads_k_stride: 1
|
heads_k_stride: 1
|
||||||
|
|||||||
@@ -36,6 +36,10 @@ It is typically recommended to save your dataset as `.jsonl` due to its flexibil
|
|||||||
|
|
||||||
Axolotl supports loading from a Hugging Face hub repo or from local files.
|
Axolotl supports loading from a Hugging Face hub repo or from local files.
|
||||||
|
|
||||||
|
::: {.callout-important}
|
||||||
|
For pre-training only, Axolotl splits texts that exceed the context length into multiple smaller prompts.
|
||||||
|
:::
|
||||||
|
|
||||||
### Pre-training from Hugging Face hub datasets
|
### Pre-training from Hugging Face hub datasets
|
||||||
|
|
||||||
As an example, to train using a Hugging Face dataset `hf_org/name`, you can pass the following config:
|
As an example, to train using a Hugging Face dataset `hf_org/name`, you can pass the following config:
|
||||||
@@ -73,21 +77,18 @@ datasets:
|
|||||||
type: completion
|
type: completion
|
||||||
```
|
```
|
||||||
|
|
||||||
From local files:
|
From local files (either example works):
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
datasets:
|
datasets:
|
||||||
- path: A.jsonl
|
- path: A.jsonl
|
||||||
type: completion
|
type: completion
|
||||||
|
|
||||||
- path: B.jsonl
|
- path: json
|
||||||
|
data_files: ["A.jsonl", "B.jsonl", "C.jsonl"]
|
||||||
type: completion
|
type: completion
|
||||||
```
|
```
|
||||||
|
|
||||||
::: {.callout-important}
|
|
||||||
For `completion` only, Axolotl splits texts that exceed the context length into multiple smaller prompts. If you are interested in having this for `pretraining_dataset` too, please let us know or help make a PR!
|
|
||||||
:::
|
|
||||||
|
|
||||||
### Pre-training dataset configuration tips
|
### Pre-training dataset configuration tips
|
||||||
|
|
||||||
#### Setting max_steps
|
#### Setting max_steps
|
||||||
|
|||||||
@@ -54,7 +54,7 @@ datasets:
|
|||||||
|
|
||||||
#### Files
|
#### Files
|
||||||
|
|
||||||
To load a JSON file, you would do something like this:
|
Usually, to load a JSON file, you would do something like this:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from datasets import load_dataset
|
from datasets import load_dataset
|
||||||
@@ -66,11 +66,19 @@ Which translates to the following config:
|
|||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
datasets:
|
datasets:
|
||||||
- path: data.json
|
- path: json
|
||||||
ds_type: json
|
data_files: /path/to/your/file.jsonl
|
||||||
```
|
```
|
||||||
|
|
||||||
In the example above, it can be seen that we can just point the `path` to the file or directory along with the `ds_type` to load the dataset.
|
However, to make things easier, we have added a few shortcuts for loading local dataset files.
|
||||||
|
|
||||||
|
You can just point the `path` to the file or directory along with the `ds_type` to load the dataset. The example below shows this for a JSON file:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
datasets:
|
||||||
|
- path: /path/to/your/file.jsonl
|
||||||
|
ds_type: json
|
||||||
|
```
|
||||||
|
|
||||||
This works for CSV, JSON, Parquet, and Arrow files.
|
This works for CSV, JSON, Parquet, and Arrow files.
|
||||||
|
|
||||||
|
|||||||
@@ -8,10 +8,6 @@ format:
|
|||||||
|
|
||||||
This section describes the different Docker images that are released by AxolotlAI at [Docker Hub](https://hub.docker.com/u/axolotlai).
|
This section describes the different Docker images that are released by AxolotlAI at [Docker Hub](https://hub.docker.com/u/axolotlai).
|
||||||
|
|
||||||
::: {.callout-important}
|
|
||||||
For Blackwell GPUs, please use the tags with Pytorch 2.7.0 and CUDA 12.8.
|
|
||||||
:::
|
|
||||||
|
|
||||||
## Base
|
## Base
|
||||||
|
|
||||||
The base image is the most minimal image that can install Axolotl. It is based on the `nvidia/cuda` image. It includes python, torch, git, git-lfs, awscli, pydantic, and more.
|
The base image is the most minimal image that can install Axolotl. It is based on the `nvidia/cuda` image. It includes python, torch, git, git-lfs, awscli, pydantic, and more.
|
||||||
@@ -36,6 +32,7 @@ Tags examples:
|
|||||||
- `main-base-py3.11-cu126-2.7.0`
|
- `main-base-py3.11-cu126-2.7.0`
|
||||||
- `main-base-py3.11-cu124-2.6.0`
|
- `main-base-py3.11-cu124-2.6.0`
|
||||||
- `main-base-py3.11-cu124-2.5.1`
|
- `main-base-py3.11-cu124-2.5.1`
|
||||||
|
- `main-base-py3.11-cu124-2.4.1`
|
||||||
|
|
||||||
## Main
|
## Main
|
||||||
|
|
||||||
@@ -76,10 +73,12 @@ Tags examples:
|
|||||||
- `main-py3.11-cu126-2.7.0`
|
- `main-py3.11-cu126-2.7.0`
|
||||||
- `main-py3.11-cu124-2.6.0`
|
- `main-py3.11-cu124-2.6.0`
|
||||||
- `main-py3.11-cu124-2.5.1`
|
- `main-py3.11-cu124-2.5.1`
|
||||||
|
- `main-py3.11-cu124-2.4.1`
|
||||||
- `main-latest`
|
- `main-latest`
|
||||||
- `main-20250303-py3.11-cu124-2.6.0`
|
- `main-20250303-py3.11-cu124-2.6.0`
|
||||||
- `main-20250303-py3.11-cu124-2.5.1`
|
- `main-20250303-py3.11-cu124-2.5.1`
|
||||||
- `0.9.2`
|
- `main-20250303-py3.11-cu124-2.4.1`
|
||||||
|
- `0.7.1`
|
||||||
|
|
||||||
## Cloud
|
## Cloud
|
||||||
|
|
||||||
|
|||||||
14
docs/faq.qmd
14
docs/faq.qmd
@@ -110,17 +110,3 @@ description: Frequently asked questions
|
|||||||
> A: If `eot_tokens: ` is not provided, the default behavior is the same as before. EOS tokens used to delimit turns are masked/unmasked depending on whether the turn is trainable.
|
> A: If `eot_tokens: ` is not provided, the default behavior is the same as before. EOS tokens used to delimit turns are masked/unmasked depending on whether the turn is trainable.
|
||||||
|
|
||||||
> Internally, `eot_tokens: tokenizer.eos_token` and `train_on_eot: train_on_eos` (which defaults to `turn`). This transition helps clarify the naming and behavior of EOT/EOS tokens.
|
> Internally, `eot_tokens: tokenizer.eos_token` and `train_on_eot: train_on_eos` (which defaults to `turn`). This transition helps clarify the naming and behavior of EOT/EOS tokens.
|
||||||
|
|
||||||
**Q: `Data processing error: CAS service error`**
|
|
||||||
|
|
||||||
> A: Try disabling XET with `export HF_HUB_DISABLE_XET=1`
|
|
||||||
|
|
||||||
**Q: `torch._inductor.exc.LoweringException: NoValidChoicesError: No choices to select, please consider adding ATEN into max_autotune_gemm_backends config (defined in torch/_inductor/config.py) to allow at least one choice. `**
|
|
||||||
|
|
||||||
> A: Depending on the version of torch, you may need to include this in your YAML:
|
|
||||||
|
|
||||||
> ```yaml
|
|
||||||
> flex_attn_compile_kwargs:
|
|
||||||
> dynamic: false
|
|
||||||
> mode: max-autotune-no-cudagraphs
|
|
||||||
> ```
|
|
||||||
|
|||||||
@@ -104,7 +104,7 @@ the `alpaca` dataset format, which has the following format:
|
|||||||
Please see our [Dataset Formats](dataset-formats) for more dataset formats and how to
|
Please see our [Dataset Formats](dataset-formats) for more dataset formats and how to
|
||||||
format them.
|
format them.
|
||||||
|
|
||||||
2. Prepare your JSONL data in the specified format (in this case, the expected `alpaca`
|
2. Prepare your JSONL data in the specified format (in this case, the expected `alpaca`
|
||||||
format):
|
format):
|
||||||
|
|
||||||
```json
|
```json
|
||||||
@@ -120,12 +120,6 @@ axolotl train my_training.yml
|
|||||||
|
|
||||||
## Common Tasks {#sec-common-tasks}
|
## Common Tasks {#sec-common-tasks}
|
||||||
|
|
||||||
::: {.callout-tip}
|
|
||||||
|
|
||||||
The same yaml file is used for training, inference, and merging.
|
|
||||||
|
|
||||||
:::
|
|
||||||
|
|
||||||
### Testing Your Model {#sec-testing}
|
### Testing Your Model {#sec-testing}
|
||||||
|
|
||||||
After training, test your model:
|
After training, test your model:
|
||||||
@@ -134,16 +128,6 @@ After training, test your model:
|
|||||||
axolotl inference my_training.yml --lora-model-dir="./outputs/lora-out"
|
axolotl inference my_training.yml --lora-model-dir="./outputs/lora-out"
|
||||||
```
|
```
|
||||||
|
|
||||||
More details can be found in [Inference](inference.qmd).
|
|
||||||
|
|
||||||
### Using a UI {#sec-ui}
|
|
||||||
|
|
||||||
Launch a Gradio interface:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
axolotl inference my_training.yml --lora-model-dir="./outputs/lora-out" --gradio
|
|
||||||
```
|
|
||||||
|
|
||||||
### Preprocessing Data {#sec-preprocessing}
|
### Preprocessing Data {#sec-preprocessing}
|
||||||
|
|
||||||
For large datasets, preprocess first:
|
For large datasets, preprocess first:
|
||||||
@@ -152,22 +136,14 @@ For large datasets, preprocess first:
|
|||||||
axolotl preprocess my_training.yml
|
axolotl preprocess my_training.yml
|
||||||
```
|
```
|
||||||
|
|
||||||
Please make sure to set `dataset_prepared_path: ` in your config to set the path to save the prepared dataset.
|
### Using a UI {#sec-ui}
|
||||||
|
|
||||||
More details can be found in [Dataset Preprocessing](dataset_preprocessing.qmd).
|
Launch a Gradio interface:
|
||||||
|
|
||||||
### Merging LoRA weights {#sec-merging-lora}
|
|
||||||
|
|
||||||
To merge the LoRA weights back into the base model, run:
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
axolotl merge-lora my_training.yml --lora-model-dir="./outputs/lora-out"
|
axolotl inference my_training.yml --lora-model-dir="./outputs/lora-out" --gradio
|
||||||
```
|
```
|
||||||
|
|
||||||
The merged model will be saved in the `{output_dir}/merged` directory.
|
|
||||||
|
|
||||||
More details can be found in [Merging LoRA weights](inference.qmd#sec-merging).
|
|
||||||
|
|
||||||
## Next Steps {#sec-next-steps}
|
## Next Steps {#sec-next-steps}
|
||||||
|
|
||||||
Now that you have the basics, you might want to:
|
Now that you have the basics, you might want to:
|
||||||
@@ -180,7 +156,6 @@ Now that you have the basics, you might want to:
|
|||||||
Check our other guides for details on these topics:
|
Check our other guides for details on these topics:
|
||||||
|
|
||||||
- [Configuration Guide](config.qmd) - Full configuration options
|
- [Configuration Guide](config.qmd) - Full configuration options
|
||||||
- [Dataset Loading](dataset_loading.qmd) - Loading datasets from various sources
|
|
||||||
- [Dataset Formats](dataset-formats) - Working with different data formats
|
- [Dataset Formats](dataset-formats) - Working with different data formats
|
||||||
- [Multi-GPU Training](multi-gpu.qmd)
|
- [Multi-GPU Training](multi-gpu.qmd)
|
||||||
- [Multi-Node Training](multi-node.qmd)
|
- [Multi-Node Training](multi-node.qmd)
|
||||||
|
|||||||
@@ -15,7 +15,7 @@ This guide covers all the ways you can install and set up Axolotl for your envir
|
|||||||
|
|
||||||
- NVIDIA GPU (Ampere architecture or newer for `bf16` and Flash Attention) or AMD GPU
|
- NVIDIA GPU (Ampere architecture or newer for `bf16` and Flash Attention) or AMD GPU
|
||||||
- Python ≥3.10
|
- Python ≥3.10
|
||||||
- PyTorch ≥2.5.1
|
- PyTorch ≥2.4.1
|
||||||
|
|
||||||
## Installation Methods {#sec-installation-methods}
|
## Installation Methods {#sec-installation-methods}
|
||||||
|
|
||||||
@@ -25,10 +25,6 @@ Please make sure to have Pytorch installed before installing Axolotl in your loc
|
|||||||
Follow the instructions at: [https://pytorch.org/get-started/locally/](https://pytorch.org/get-started/locally/)
|
Follow the instructions at: [https://pytorch.org/get-started/locally/](https://pytorch.org/get-started/locally/)
|
||||||
:::
|
:::
|
||||||
|
|
||||||
::: {.callout-important}
|
|
||||||
For Blackwell GPUs, please use Pytorch 2.7.0 and CUDA 12.8.
|
|
||||||
:::
|
|
||||||
|
|
||||||
### PyPI Installation (Recommended) {#sec-pypi}
|
### PyPI Installation (Recommended) {#sec-pypi}
|
||||||
|
|
||||||
```{.bash}
|
```{.bash}
|
||||||
@@ -41,40 +37,6 @@ installed) in order not to clobber it, and so that we set the correct version of
|
|||||||
dependencies that are specific to the PyTorch version or other installed
|
dependencies that are specific to the PyTorch version or other installed
|
||||||
co-dependencies.
|
co-dependencies.
|
||||||
|
|
||||||
### uv Installation {#sec-uv}
|
|
||||||
|
|
||||||
uv is a fast, reliable Python package installer and resolver built in Rust. It offers significant performance improvements over pip and provides better dependency resolution, making it an excellent choice for complex environments.
|
|
||||||
|
|
||||||
Install uv if not already installed
|
|
||||||
```{.bash}
|
|
||||||
curl -LsSf https://astral.sh/uv/install.sh | sh
|
|
||||||
source $HOME/.local/bin/env
|
|
||||||
```
|
|
||||||
|
|
||||||
Choose your CUDA version to use with PyTorch; e.g. `cu124`, `cu126`, `cu128`,
|
|
||||||
then create the venv and activate
|
|
||||||
```{.bash}
|
|
||||||
export UV_TORCH_BACKEND=cu126
|
|
||||||
uv venv --no-project --relocatable
|
|
||||||
source .venv/bin/activate
|
|
||||||
```
|
|
||||||
|
|
||||||
Install PyTorch
|
|
||||||
- PyTorch 2.6.0 recommended
|
|
||||||
```{.bash}
|
|
||||||
uv pip install packaging setuptools wheel
|
|
||||||
uv pip install torch==2.6.0
|
|
||||||
uv pip install awscli pydantic
|
|
||||||
```
|
|
||||||
|
|
||||||
Install axolotl from PyPi
|
|
||||||
```{.bash}
|
|
||||||
uv pip install --no-build-isolation axolotl[deepspeed,flash-attn]
|
|
||||||
|
|
||||||
# optionally install with vLLM if you're using torch==2.6.0 and want to train w/ GRPO
|
|
||||||
uv pip install --no-build-isolation axolotl[deepspeed,flash-attn,vllm]
|
|
||||||
```
|
|
||||||
|
|
||||||
### Edge/Development Build {#sec-edge-build}
|
### Edge/Development Build {#sec-edge-build}
|
||||||
|
|
||||||
For the latest features between releases:
|
For the latest features between releases:
|
||||||
@@ -110,10 +72,6 @@ docker run --privileged --gpus '"all"' --shm-size 10g --rm -it \
|
|||||||
```
|
```
|
||||||
:::
|
:::
|
||||||
|
|
||||||
::: {.callout-important}
|
|
||||||
For Blackwell GPUs, please use `axolotlai/axolotl:main-py3.11-cu128-2.7.0` or the cloud variant `axolotlai/axolotl-cloud:main-py3.11-cu128-2.7.0`.
|
|
||||||
:::
|
|
||||||
|
|
||||||
Please refer to the [Docker documentation](docker.qmd) for more information on the different Docker images that are available.
|
Please refer to the [Docker documentation](docker.qmd) for more information on the different Docker images that are available.
|
||||||
|
|
||||||
## Cloud Environments {#sec-cloud}
|
## Cloud Environments {#sec-cloud}
|
||||||
|
|||||||
@@ -84,10 +84,6 @@ lora_qkv_kernel: true
|
|||||||
lora_o_kernel: true
|
lora_o_kernel: true
|
||||||
```
|
```
|
||||||
|
|
||||||
::: {.callout-note}
|
|
||||||
Currently, LoRA kernels are not supported for RLHF training, only SFT.
|
|
||||||
:::
|
|
||||||
|
|
||||||
## Requirements
|
## Requirements
|
||||||
|
|
||||||
- One or more NVIDIA or AMD GPUs (in order to use the Triton kernels)
|
- One or more NVIDIA or AMD GPUs (in order to use the Triton kernels)
|
||||||
|
|||||||
@@ -18,7 +18,7 @@ Axolotl supports several methods for multi-GPU training:
|
|||||||
|
|
||||||
- DeepSpeed (recommended)
|
- DeepSpeed (recommended)
|
||||||
- FSDP (Fully Sharded Data Parallel)
|
- FSDP (Fully Sharded Data Parallel)
|
||||||
- Context parallelism
|
- Sequence parallelism
|
||||||
- FSDP + QLoRA
|
- FSDP + QLoRA
|
||||||
|
|
||||||
## DeepSpeed {#sec-deepspeed}
|
## DeepSpeed {#sec-deepspeed}
|
||||||
@@ -80,14 +80,27 @@ fsdp_config:
|
|||||||
fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
|
fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
|
||||||
```
|
```
|
||||||
|
|
||||||
## Context parallelism {#sec-sequence-parallelism}
|
## Sequence parallelism {#sec-sequence-parallelism}
|
||||||
|
|
||||||
We support context parallelism (CP) via the
|
We support sequence parallelism (SP) via the
|
||||||
[ring-flash-attention](https://github.com/zhuzilin/ring-flash-attention) project. This
|
[ring-flash-attention](https://github.com/zhuzilin/ring-flash-attention) project. This
|
||||||
allows one to split up sequences across GPUs, which is useful in the event that a
|
allows one to split up sequences across GPUs, which is useful in the event that a
|
||||||
single sequence causes OOM errors during model training.
|
single sequence causes OOM errors during model training.
|
||||||
|
|
||||||
See our [dedicated guide](context_parallelism.qmd) for more information.
|
First, install `ring-flash-attn`, recommended via `pip install axolotl[ring-flash-attn]`,
|
||||||
|
or from source with `pip install .[ring-flash-attn]`.
|
||||||
|
|
||||||
|
Your Axolotl YAML config should contain the following lines:
|
||||||
|
|
||||||
|
```{.yaml}
|
||||||
|
sequence_parallel_degree: 4 # Split each sequence into 4 parts, one per GPU
|
||||||
|
flash_attention: true # Required with sequence parallelism
|
||||||
|
|
||||||
|
# Optional; strides across the key dimension. Larger values use more memory but will make training faster.
|
||||||
|
heads_k_stride: 1
|
||||||
|
```
|
||||||
|
|
||||||
|
See our [dedicated guide](sequence_parallelism.qmd) for more details.
|
||||||
|
|
||||||
### FSDP + QLoRA {#sec-fsdp-qlora}
|
### FSDP + QLoRA {#sec-fsdp-qlora}
|
||||||
|
|
||||||
|
|||||||
@@ -43,7 +43,7 @@ datasets:
|
|||||||
# leave the vision model and vision tower frozen
|
# leave the vision model and vision tower frozen
|
||||||
# load_in_8bit: true
|
# load_in_8bit: true
|
||||||
adapter: lora
|
adapter: lora
|
||||||
lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
|
lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
|
||||||
|
|
||||||
# (optional) if you want to resize images to a set size
|
# (optional) if you want to resize images to a set size
|
||||||
image_size: 512
|
image_size: 512
|
||||||
|
|||||||
32
docs/qat.qmd
32
docs/qat.qmd
@@ -1,32 +0,0 @@
|
|||||||
---
|
|
||||||
title: "Quantization Aware Training (QAT)"
|
|
||||||
back-to-top-navigation: true
|
|
||||||
toc: true
|
|
||||||
toc-expand: 2
|
|
||||||
toc-depth: 4
|
|
||||||
---
|
|
||||||
|
|
||||||
## Overview
|
|
||||||
|
|
||||||
[Quantization Aware Training](https://pytorch.org/blog/introduction-to-quantization-on-pytorch/#quantization-aware-training) (QAT) is a technique for improving the accuracy of models which are quantized
|
|
||||||
by applying "fake" quantizations to the model's weights (and optionally, activations) during training. This fake
|
|
||||||
quantization allows for the model to adjust for noise introduced by the quantization, so when the model is eventually
|
|
||||||
quantized, the accuracy loss is minimized. We use the quantization techniques implemented in [torchao](https://github.com/pytorch/ao) to provide
|
|
||||||
support for QAT and post-training quantization (PTQ) in axolotl.
|
|
||||||
|
|
||||||
We recommend reviewing the excellent QAT tutorial in the [torchtune library](https://pytorch.org/torchtune/main/tutorials/qat_finetune.html#quantizing-the-qat-model),
|
|
||||||
and the QAT documentation in the [torchao library](https://github.com/pytorch/ao/tree/main/torchao/quantization/qat), for more details.
|
|
||||||
|
|
||||||
## Configuring QAT in Axolotl
|
|
||||||
|
|
||||||
To enable QAT in axolotl, add the following to your configuration file:
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
qat:
|
|
||||||
activation_dtype: # Optional[str] = "int8". Fake quantization layout to use for activation quantization. Valid options are "int4" and "int8"
|
|
||||||
weight_dtype: # Optional[str] = "int8". Fake quantization layout to use for weight quantization. Valid options are "int4" and "int8"
|
|
||||||
group_size: # Optional[int] = 32. The number of elements in each group for per-group fake quantization
|
|
||||||
fake_quant_after_n_steps: # Optional[int] = None. The number of steps to apply fake quantization after
|
|
||||||
```
|
|
||||||
|
|
||||||
Once you have finished training, you must quantize your model by using the same quantization configuration which you used to train the model with. You can use the [`quantize` command](./quantize.md) to do this.
|
|
||||||
@@ -1,53 +0,0 @@
|
|||||||
---
|
|
||||||
title: "Quantization with torchao"
|
|
||||||
back-to-top-navigation: true
|
|
||||||
toc: true
|
|
||||||
toc-expand: 2
|
|
||||||
toc-depth: 4
|
|
||||||
---
|
|
||||||
|
|
||||||
Quantization is a technique to lower the memory footprint of your model, potentially at the cost of accuracy or model performance. We support quantizing your model using the [torchao](https://github.com/pytorch/ao) library. Quantization is supported for both post-training quantization (PTQ) and quantization-aware training (QAT).
|
|
||||||
|
|
||||||
|
|
||||||
::: {.callout-note}
|
|
||||||
|
|
||||||
We do not currently support quantization techniques such as GGUF, GPTQ, or EXL2.
|
|
||||||
|
|
||||||
:::
|
|
||||||
|
|
||||||
## Configuring Quantization in Axolotl
|
|
||||||
|
|
||||||
Quantization is configured using the `quantization` key in your configuration file.
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
base_model: # The path to the model to quantize.
|
|
||||||
quantization:
|
|
||||||
weight_dtype: # Optional[str] = "int8". Fake quantization layout to use for weight quantization. Valid options are uintX for X in [1, 2, 3, 4, 5, 6, 7], or int4, or int8
|
|
||||||
activation_dtype: # Optional[str] = "int8". Fake quantization layout to use for activation quantization. Valid options are "int4" and "int8"
|
|
||||||
group_size: # Optional[int] = 32. The number of elements in each group for per-group fake quantization
|
|
||||||
quantize_embedding: # Optional[bool] = False. Whether to quantize the embedding layer.
|
|
||||||
|
|
||||||
output_dir: # The path to the output directory.
|
|
||||||
```
|
|
||||||
|
|
||||||
Once quantization is complete, your quantized model will be saved in the `{output_dir}/quantized` directory.
|
|
||||||
|
|
||||||
You may also use the `quantize` command to quantize a model which has been trained with [QAT](./qat.md) - you can do this by using the existing QAT configuration file which
|
|
||||||
you used to train the model:
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
# qat.yml
|
|
||||||
qat:
|
|
||||||
activation_dtype: int8
|
|
||||||
weight_dtype: int8
|
|
||||||
group_size: 256
|
|
||||||
quantize_embedding: true
|
|
||||||
|
|
||||||
output_dir: # The path to the output directory used during training where the final checkpoint has been saved.
|
|
||||||
```
|
|
||||||
|
|
||||||
```bash
|
|
||||||
axolotl quantize qat.yml
|
|
||||||
```
|
|
||||||
|
|
||||||
This ensures that an identical quantization configuration is used to quantize the model as was used to train it.
|
|
||||||
@@ -16,8 +16,7 @@ feedback. Various methods include, but not limited to:
|
|||||||
- [Identity Preference Optimization (IPO)](#ipo)
|
- [Identity Preference Optimization (IPO)](#ipo)
|
||||||
- [Kahneman-Tversky Optimization (KTO)](#kto)
|
- [Kahneman-Tversky Optimization (KTO)](#kto)
|
||||||
- [Odds Ratio Preference Optimization (ORPO)](#orpo)
|
- [Odds Ratio Preference Optimization (ORPO)](#orpo)
|
||||||
- [Group Relative Policy Optimization (GRPO)](#grpo)
|
- Proximal Policy Optimization (PPO) (not yet supported in axolotl)
|
||||||
- Proximal Policy Optimization (PPO) (not yet supported in axolotl, if you're interested in contributing, please reach out!)
|
|
||||||
|
|
||||||
|
|
||||||
## RLHF using Axolotl
|
## RLHF using Axolotl
|
||||||
@@ -583,20 +582,7 @@ datasets:
|
|||||||
|
|
||||||
To see other examples of custom reward functions, please see [TRL GRPO Docs](https://github.com/huggingface/trl/blob/main/docs/source/grpo_trainer.md#using-a-custom-reward-function).
|
To see other examples of custom reward functions, please see [TRL GRPO Docs](https://github.com/huggingface/trl/blob/main/docs/source/grpo_trainer.md#using-a-custom-reward-function).
|
||||||
|
|
||||||
To see all configs, please see [TRLConfig](https://github.com/axolotl-ai-cloud/axolotl/blob/v0.9.2/src/axolotl/utils/schemas/trl.py).
|
To see a description of the configs, please see [TRLConfig](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/utils/config/models/input/v0_4_1/trl.py).
|
||||||
|
|
||||||
#### GRPO with DAPO/Dr. GRPO loss
|
|
||||||
|
|
||||||
The DAPO paper and subsequently Dr. GRPO paper proposed an alternative loss function for GRPO to remediate the penalty in longer responses.
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
trl:
|
|
||||||
loss_type: dr_grpo
|
|
||||||
# Normalizes loss based on max completion length (default: 256)
|
|
||||||
max_completion_length:
|
|
||||||
```
|
|
||||||
|
|
||||||
For more information, see [GRPO docs](https://huggingface.co/docs/trl/v0.17.0/en/grpo_trainer#loss-types).
|
|
||||||
|
|
||||||
### SimPO
|
### SimPO
|
||||||
|
|
||||||
|
|||||||
@@ -1,16 +1,18 @@
|
|||||||
---
|
---
|
||||||
title: Context Parallelism
|
title: Sequence Parallelism
|
||||||
description: Train with long sequences split across multiple GPUs.
|
description: Train with long sequences split across multiple GPUs.
|
||||||
---
|
---
|
||||||
|
|
||||||
Context parallelism is a technique that splits sequences across multiple GPUs,
|
# Sequence Parallelism
|
||||||
|
|
||||||
|
Sequence parallelism is a technique that splits sequences across multiple GPUs,
|
||||||
allowing you to train with very long sequences that wouldn't fit on a single GPU. Each
|
allowing you to train with very long sequences that wouldn't fit on a single GPU. Each
|
||||||
GPU processes a different portion of the sequence, and the results are aggregated
|
GPU processes a different portion of the sequence, and the results are aggregated
|
||||||
through a ring communication pattern.
|
through a ring communication pattern.
|
||||||
|
|
||||||
## When to Use Context Parallelism
|
## When to Use Sequence Parallelism
|
||||||
|
|
||||||
Use context parallelism when:
|
Use sequence parallelism when:
|
||||||
|
|
||||||
- You need to train with sequence lengths that don't fit into a single GPU's memory
|
- You need to train with sequence lengths that don't fit into a single GPU's memory
|
||||||
- You have multiple GPUs available
|
- You have multiple GPUs available
|
||||||
@@ -18,35 +20,35 @@ Use context parallelism when:
|
|||||||
|
|
||||||
## Configuration
|
## Configuration
|
||||||
|
|
||||||
To enable context parallelism, add the following to your configuration file:
|
To enable sequence parallelism, add the following to your configuration file:
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
# Set to a divisor (> 1) of the number of GPUs available
|
# Set to a divisor (> 1) of the number of GPUs available
|
||||||
context_parallel_degree: 4 # Split sequences across 4 GPUs
|
sequence_parallel_degree: 4 # Split sequences across 4 GPUs
|
||||||
# Optional; strides across the key dimension. Larger values use more memory but should make training faster.
|
# Optional; strides across the key dimension. Larger values use more memory but should make training faster.
|
||||||
heads_k_stride: 1
|
heads_k_stride: 1
|
||||||
# Optional; one of "varlen_llama3" or "batch_ring". Defaults to
|
# Optional; one of "varlen_llama3", "batch_ring", "batch_zigzag", "batch_stripe". Defaults to
|
||||||
# "varlen_llama3" when `sample_packing: true`, and "batch_ring" otherwise.
|
# "varlen_llama3" when `sample_packing: true`, and "batch_ring" otherwise.
|
||||||
ring_attn_func:
|
ring_attn_func:
|
||||||
```
|
```
|
||||||
|
|
||||||
The `context_parallel_degree` should be a divisor of the total number of GPUs. For example:
|
The `sequence_parallel_degree` should be a divisor of the total number of GPUs. For example:
|
||||||
|
|
||||||
- With 8 GPUs, valid values would be 2, 4, or 8
|
- With 8 GPUs, valid values would be 2, 4, or 8
|
||||||
- With 4 GPUs, valid values would be 2 or 4
|
- With 4 GPUs, valid values would be 2 or 4
|
||||||
|
|
||||||
## Implementation Details
|
## Implementation Details
|
||||||
|
|
||||||
When context parallelism is enabled:
|
When sequence parallelism is enabled:
|
||||||
|
|
||||||
1. Each sequence is divided into equal chunks across the GPUs in a context parallel group
|
1. Each sequence is divided into equal chunks across the GPUs in a sequence parallel group
|
||||||
2. The data collator handles the chunking of input_ids, attention_mask, labels, and position_ids
|
2. The data collator handles the chunking of input_ids, attention_mask, labels, and position_ids
|
||||||
3. Position IDs are adjusted to maintain proper relative positions
|
3. Position IDs are adjusted to maintain proper relative positions, especially for packed sequences
|
||||||
4. The trainer uses special ring communication patterns for attention operations
|
4. The trainer uses special ring communication patterns for attention operations
|
||||||
|
|
||||||
## Requirements
|
## Requirements
|
||||||
|
|
||||||
To use context parallelism, you need:
|
To use sequence parallelism, you need:
|
||||||
|
|
||||||
- Multiple GPUs (at least 2)
|
- Multiple GPUs (at least 2)
|
||||||
- The `ring-flash-attn` package. Install with:
|
- The `ring-flash-attn` package. Install with:
|
||||||
@@ -66,12 +68,10 @@ sequence_len: 8192
|
|||||||
|
|
||||||
...
|
...
|
||||||
|
|
||||||
context_parallel_degree: 4 # Split each sequence into 4 parts, one per GPU
|
sequence_parallel_degree: 4 # Split each sequence into 4 parts, one per GPU
|
||||||
|
flash_attention: true # Required with sequence parallelism
|
||||||
# Optional; strides across the key dimension. Larger values use more memory but should make training faster.
|
# Optional; strides across the key dimension. Larger values use more memory but should make training faster.
|
||||||
heads_k_stride: 1
|
heads_k_stride: 1
|
||||||
# Optional; one of "varlen_llama3" or "batch_ring". Defaults to
|
|
||||||
# "varlen_llama3" when `sample_packing: true`, and "batch_ring" otherwise.
|
|
||||||
ring_attn_func:
|
|
||||||
|
|
||||||
...
|
...
|
||||||
```
|
```
|
||||||
@@ -79,22 +79,22 @@ ring_attn_func:
|
|||||||
This will train the Llama 3 8B model with 8K context length, with each sequence split
|
This will train the Llama 3 8B model with 8K context length, with each sequence split
|
||||||
into 4 subsequences of length 2048 across 4 GPUs.
|
into 4 subsequences of length 2048 across 4 GPUs.
|
||||||
|
|
||||||
## Sample Packing with Context Parallelism
|
## Sample Packing with Sequence Parallelism
|
||||||
|
|
||||||
Context parallelism is compatible with Axolotl's sample packing functionality. When using both features together:
|
Sequence parallelism is compatible with Axolotl's sample packing functionality. When using both features together:
|
||||||
|
|
||||||
1. Samples are first packed together
|
1. Samples are first packed together
|
||||||
2. The packed sequences are then divided across GPUs in the context parallel group
|
2. The packed sequences are then divided across GPUs in the sequence parallel group
|
||||||
3. Position IDs are automatically adjusted to maintain proper relative positions
|
3. Position IDs are automatically adjusted to maintain proper relative positions
|
||||||
|
|
||||||
## Effect on Batch Size
|
## Effect on Batch Size
|
||||||
|
|
||||||
When using context parallelism, your effective global batch size is **divided** by the `context_parallel_degree`. This happens because:
|
When using sequence parallelism, your effective global batch size is **divided** by the `sequence_parallel_degree`. This happens because:
|
||||||
|
|
||||||
- Each group of `context_parallel_degree` GPUs works on the same batch (just different parts of each sequence)
|
- Each group of `sequence_parallel_degree` GPUs works on the same batch (just different parts of each sequence)
|
||||||
- The number of batches processed per step decreases
|
- The number of batches processed per step decreases
|
||||||
|
|
||||||
For example:
|
For example:
|
||||||
- With 8 GPUs and no context parallelism: 8 different batches processed per step
|
- With 8 GPUs and no sequence parallelism: 8 different batches processed per step
|
||||||
- With 8 GPUs and `context_parallel_degree=4`: Only 2 different batches processed per step (each split across 4 GPUs)
|
- With 8 GPUs and `sequence_parallel_degree=4`: Only 2 different batches processed per step (each split across 4 GPUs)
|
||||||
- If your per-GPU `micro_batch_size` is 2, the global batch size decreases from 16 to 4
|
- If your per-GPU `micro_batch_size` is 2, the global batch size decreases from 16 to 4
|
||||||
|
|||||||
@@ -59,9 +59,7 @@ gradient_checkpointing: false
|
|||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
|
|
||||||
flash_attention: true
|
attention: flash
|
||||||
sdp_attention:
|
|
||||||
flash_optimum:
|
|
||||||
|
|
||||||
gptq_groupsize:
|
gptq_groupsize:
|
||||||
gptq_model_v1:
|
gptq_model_v1:
|
||||||
|
|||||||
@@ -39,8 +39,7 @@ tf32: true
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention: true
|
attention: xformers
|
||||||
flash_attention:
|
|
||||||
gptq_groupsize:
|
gptq_groupsize:
|
||||||
gptq_model_v1:
|
gptq_model_v1:
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
|
|||||||
@@ -45,7 +45,8 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attention: flash
|
||||||
|
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
|
|||||||
@@ -46,7 +46,8 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attention: flash
|
||||||
|
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
|
|||||||
@@ -45,7 +45,8 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attention: flash
|
||||||
|
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
|
|||||||
@@ -46,7 +46,8 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attention: flash
|
||||||
|
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
|
|||||||
@@ -45,7 +45,8 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attention: flash
|
||||||
|
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
|
|||||||
@@ -46,7 +46,8 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attention: flash
|
||||||
|
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
|
|||||||
@@ -49,7 +49,8 @@ tf32: true
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attention: flash
|
||||||
|
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch:
|
evals_per_epoch:
|
||||||
|
|||||||
@@ -112,9 +112,7 @@
|
|||||||
"early_stopping_patience:\n",
|
"early_stopping_patience:\n",
|
||||||
"resume_from_checkpoint:\n",
|
"resume_from_checkpoint:\n",
|
||||||
"logging_steps: 1\n",
|
"logging_steps: 1\n",
|
||||||
"xformers_attention:\n",
|
"attention: sdpa\n",
|
||||||
"flash_attention: false\n",
|
|
||||||
"sdp_attention: true\n",
|
|
||||||
"\n",
|
"\n",
|
||||||
"warmup_steps: 1\n",
|
"warmup_steps: 1\n",
|
||||||
"max_steps: 25\n",
|
"max_steps: 25\n",
|
||||||
|
|||||||
@@ -52,7 +52,8 @@ gradient_checkpointing_kwargs:
|
|||||||
use_reentrant: false
|
use_reentrant: false
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attention: flash
|
||||||
|
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch:
|
evals_per_epoch:
|
||||||
|
|||||||
@@ -55,7 +55,8 @@ gradient_checkpointing_kwargs:
|
|||||||
use_reentrant: false
|
use_reentrant: false
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attention: flash
|
||||||
|
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch:
|
evals_per_epoch:
|
||||||
|
|||||||
@@ -39,7 +39,8 @@ gradient_checkpointing_kwargs:
|
|||||||
use_reentrant: false
|
use_reentrant: false
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attention: flash
|
||||||
|
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch:
|
evals_per_epoch:
|
||||||
|
|||||||
@@ -35,7 +35,8 @@ gradient_checkpointing_kwargs:
|
|||||||
use_reentrant: false
|
use_reentrant: false
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attention: flash
|
||||||
|
|
||||||
|
|
||||||
warmup_steps: 100
|
warmup_steps: 100
|
||||||
evals_per_epoch: 2
|
evals_per_epoch: 2
|
||||||
|
|||||||
@@ -59,7 +59,8 @@ gradient_checkpointing_kwargs:
|
|||||||
use_reentrant: false
|
use_reentrant: false
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attention: flash
|
||||||
|
|
||||||
|
|
||||||
warmup_steps: 100
|
warmup_steps: 100
|
||||||
evals_per_epoch: 2
|
evals_per_epoch: 2
|
||||||
|
|||||||
@@ -43,8 +43,7 @@ tf32: true
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention: true
|
attention: xformers
|
||||||
flash_attention:
|
|
||||||
gptq_groupsize:
|
gptq_groupsize:
|
||||||
gptq_model_v1:
|
gptq_model_v1:
|
||||||
warmup_steps: 40
|
warmup_steps: 40
|
||||||
|
|||||||
@@ -73,8 +73,7 @@ early_stopping_patience: 3
|
|||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
auto_resume_from_checkpoints: true
|
auto_resume_from_checkpoints: true
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention: true
|
attention: xformers
|
||||||
flash_attention:
|
|
||||||
gptq_groupsize:
|
gptq_groupsize:
|
||||||
gptq_model_v1:
|
gptq_model_v1:
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
|
|||||||
@@ -40,8 +40,7 @@ tf32: true
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention: true
|
attention: xformers
|
||||||
flash_attention:
|
|
||||||
gptq_groupsize:
|
gptq_groupsize:
|
||||||
gptq_model_v1:
|
gptq_model_v1:
|
||||||
warmup_steps: 40
|
warmup_steps: 40
|
||||||
|
|||||||
@@ -47,7 +47,8 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attention: flash
|
||||||
|
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
|
|||||||
@@ -53,7 +53,8 @@ tf32: true
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attention: flash
|
||||||
|
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch:
|
evals_per_epoch:
|
||||||
|
|||||||
@@ -43,7 +43,8 @@ gradient_checkpointing_kwargs:
|
|||||||
use_reentrant: false
|
use_reentrant: false
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attention: flash
|
||||||
|
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch:
|
evals_per_epoch:
|
||||||
|
|||||||
@@ -57,7 +57,8 @@ gradient_checkpointing_kwargs:
|
|||||||
use_reentrant: false
|
use_reentrant: false
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attention: flash
|
||||||
|
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch:
|
evals_per_epoch:
|
||||||
|
|||||||
@@ -28,7 +28,7 @@ pad_to_sequence_len: true
|
|||||||
lora_r: 32
|
lora_r: 32
|
||||||
lora_alpha: 16
|
lora_alpha: 16
|
||||||
lora_dropout: 0.05
|
lora_dropout: 0.05
|
||||||
lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
|
lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
|
||||||
|
|
||||||
wandb_project:
|
wandb_project:
|
||||||
wandb_entity:
|
wandb_entity:
|
||||||
@@ -51,8 +51,7 @@ gradient_checkpointing: true
|
|||||||
gradient_checkpointing_kwargs:
|
gradient_checkpointing_kwargs:
|
||||||
use_reentrant: false
|
use_reentrant: false
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attention: flash
|
||||||
eager_attention:
|
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 1
|
evals_per_epoch: 1
|
||||||
|
|||||||
@@ -30,7 +30,7 @@ pad_to_sequence_len: false
|
|||||||
lora_r: 32
|
lora_r: 32
|
||||||
lora_alpha: 16
|
lora_alpha: 16
|
||||||
lora_dropout: 0.05
|
lora_dropout: 0.05
|
||||||
lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
|
lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
|
||||||
|
|
||||||
wandb_project:
|
wandb_project:
|
||||||
wandb_entity:
|
wandb_entity:
|
||||||
@@ -53,8 +53,7 @@ gradient_checkpointing: true
|
|||||||
gradient_checkpointing_kwargs:
|
gradient_checkpointing_kwargs:
|
||||||
use_reentrant: false
|
use_reentrant: false
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attention: flash
|
||||||
eager_attention:
|
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 1
|
evals_per_epoch: 1
|
||||||
|
|||||||
@@ -36,8 +36,7 @@ tf32: true
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention: true
|
attention: xformers
|
||||||
flash_attention:
|
|
||||||
gptq_groupsize:
|
gptq_groupsize:
|
||||||
gptq_model_v1:
|
gptq_model_v1:
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
|
|||||||
@@ -47,7 +47,8 @@ gradient_checkpointing_kwargs:
|
|||||||
use_reentrant: false
|
use_reentrant: false
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attention: flash
|
||||||
|
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch:
|
evals_per_epoch:
|
||||||
|
|||||||
@@ -46,7 +46,8 @@ gradient_checkpointing_kwargs:
|
|||||||
use_reentrant: false
|
use_reentrant: false
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attention: flash
|
||||||
|
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch:
|
evals_per_epoch:
|
||||||
|
|||||||
@@ -45,7 +45,8 @@ gradient_checkpointing: true
|
|||||||
gradient_checkpointing_kwargs:
|
gradient_checkpointing_kwargs:
|
||||||
use_reentrant: true
|
use_reentrant: true
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attention: flash
|
||||||
|
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch: 1
|
evals_per_epoch: 1
|
||||||
|
|||||||
@@ -37,8 +37,7 @@ bf16: auto
|
|||||||
tf32: true
|
tf32: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 5
|
logging_steps: 5
|
||||||
xformers_attention: true
|
attention: xformers
|
||||||
flash_attention:
|
|
||||||
gptq_groupsize:
|
gptq_groupsize:
|
||||||
gptq_model_v1:
|
gptq_model_v1:
|
||||||
warmup_steps: 20
|
warmup_steps: 20
|
||||||
|
|||||||
@@ -42,7 +42,8 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attention: flash
|
||||||
|
|
||||||
flash_attn_cross_entropy: false
|
flash_attn_cross_entropy: false
|
||||||
flash_attn_rms_norm: true
|
flash_attn_rms_norm: true
|
||||||
flash_attn_fuse_qkv: false
|
flash_attn_fuse_qkv: false
|
||||||
|
|||||||
@@ -53,9 +53,7 @@ tf32: true
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention:
|
attention: flash
|
||||||
sdp_attention:
|
|
||||||
flash_optimum:
|
|
||||||
warmup_steps: 100
|
warmup_steps: 100
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
|
|||||||
@@ -46,7 +46,8 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attention: flash
|
||||||
|
|
||||||
flash_attn_cross_entropy: false
|
flash_attn_cross_entropy: false
|
||||||
flash_attn_rms_norm: true
|
flash_attn_rms_norm: true
|
||||||
flash_attn_fuse_qkv: false
|
flash_attn_fuse_qkv: false
|
||||||
|
|||||||
@@ -45,7 +45,8 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attention: flash
|
||||||
|
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
|
|||||||
@@ -45,7 +45,8 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attention: flash
|
||||||
|
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
|
|||||||
@@ -48,7 +48,8 @@ gradient_checkpointing_kwargs:
|
|||||||
use_reentrant: true
|
use_reentrant: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attention: flash
|
||||||
|
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
|
|||||||
@@ -46,7 +46,8 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attention: flash
|
||||||
|
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
|
|||||||
@@ -48,7 +48,8 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attention: flash
|
||||||
|
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
|
|||||||
@@ -29,7 +29,7 @@ pad_to_sequence_len: false
|
|||||||
lora_r: 32
|
lora_r: 32
|
||||||
lora_alpha: 16
|
lora_alpha: 16
|
||||||
lora_dropout: 0.05
|
lora_dropout: 0.05
|
||||||
lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
|
lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
|
||||||
|
|
||||||
wandb_project:
|
wandb_project:
|
||||||
wandb_entity:
|
wandb_entity:
|
||||||
@@ -50,8 +50,7 @@ tf32: true
|
|||||||
|
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attention: flash
|
||||||
eager_attention:
|
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 1
|
evals_per_epoch: 1
|
||||||
|
|||||||
@@ -1,79 +0,0 @@
|
|||||||
base_model: meta-llama/Llama-3.2-3B
|
|
||||||
# Automatically upload checkpoint and final model to HF
|
|
||||||
# hub_model_id: username/custom_model_name
|
|
||||||
|
|
||||||
load_in_8bit: false
|
|
||||||
load_in_4bit: false
|
|
||||||
strict: false
|
|
||||||
|
|
||||||
plugins:
|
|
||||||
- axolotl.integrations.liger.LigerPlugin
|
|
||||||
|
|
||||||
liger_rope: true
|
|
||||||
liger_rms_norm: true
|
|
||||||
liger_glu_activation: true
|
|
||||||
liger_layer_norm: true
|
|
||||||
liger_fused_linear_cross_entropy: true
|
|
||||||
|
|
||||||
datasets:
|
|
||||||
- path: yahma/alpaca-cleaned
|
|
||||||
type: alpaca
|
|
||||||
|
|
||||||
output_dir: ./outputs/qat_out/
|
|
||||||
|
|
||||||
sample_packing: true
|
|
||||||
pad_to_sequence_len: true
|
|
||||||
sequence_len: 512
|
|
||||||
|
|
||||||
flex_attention: true
|
|
||||||
flex_attn_compile_kwargs:
|
|
||||||
dynamic: false
|
|
||||||
mode: max-autotune-no-cudagraphs
|
|
||||||
|
|
||||||
qat:
|
|
||||||
activation_dtype: int8
|
|
||||||
weight_dtype: int4
|
|
||||||
group_size: 32
|
|
||||||
|
|
||||||
wandb_project:
|
|
||||||
wandb_entity:
|
|
||||||
wandb_watch:
|
|
||||||
wandb_name:
|
|
||||||
wandb_log_model:
|
|
||||||
|
|
||||||
gradient_accumulation_steps: 1
|
|
||||||
micro_batch_size: 16
|
|
||||||
num_epochs: 1
|
|
||||||
optimizer: adamw_torch_fused
|
|
||||||
|
|
||||||
cosine_constant_lr_ratio: 0
|
|
||||||
cosine_min_lr_ratio: 1.0
|
|
||||||
learning_rate: 2e-5
|
|
||||||
save_only_model: true
|
|
||||||
bf16: true
|
|
||||||
|
|
||||||
resume_from_checkpoint:
|
|
||||||
logging_steps: 1
|
|
||||||
|
|
||||||
evals_per_epoch: 1
|
|
||||||
saves_per_epoch: 1
|
|
||||||
|
|
||||||
warmup_steps: 10
|
|
||||||
weight_decay: 0.0
|
|
||||||
fsdp:
|
|
||||||
- full_shard
|
|
||||||
- auto_wrap
|
|
||||||
|
|
||||||
fsdp_config:
|
|
||||||
fsdp_version: 2
|
|
||||||
fsdp_offload_params: false
|
|
||||||
fsdp_cpu_ram_efficient_loading: true
|
|
||||||
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
|
||||||
fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
|
|
||||||
fsdp_state_dict_type: FULL_STATE_DICT
|
|
||||||
fsdp_sharding_strategy: FULL_SHARD
|
|
||||||
fsdp_reshard_after_forward: true
|
|
||||||
fsdp_activation_checkpointing: true
|
|
||||||
|
|
||||||
special_tokens:
|
|
||||||
pad_token: <|end_of_text|>
|
|
||||||
@@ -49,7 +49,8 @@ gradient_checkpointing_kwargs:
|
|||||||
use_reentrant: false
|
use_reentrant: false
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attention: flash
|
||||||
|
|
||||||
|
|
||||||
warmup_steps: 100
|
warmup_steps: 100
|
||||||
evals_per_epoch: 2
|
evals_per_epoch: 2
|
||||||
|
|||||||
@@ -34,7 +34,8 @@ gradient_checkpointing_kwargs:
|
|||||||
use_reentrant: false
|
use_reentrant: false
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attention: flash
|
||||||
|
|
||||||
|
|
||||||
warmup_steps: 100
|
warmup_steps: 100
|
||||||
evals_per_epoch: 2
|
evals_per_epoch: 2
|
||||||
|
|||||||
@@ -61,7 +61,8 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attention: flash
|
||||||
|
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
|
|||||||
@@ -56,7 +56,8 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attention: flash
|
||||||
|
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
|
|||||||
@@ -77,7 +77,8 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attention: flash
|
||||||
|
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
|
|||||||
@@ -53,7 +53,8 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attention: flash
|
||||||
|
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
|
|||||||
@@ -54,7 +54,8 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attention: flash
|
||||||
|
|
||||||
|
|
||||||
loss_watchdog_threshold: 5.0
|
loss_watchdog_threshold: 5.0
|
||||||
loss_watchdog_patience: 3
|
loss_watchdog_patience: 3
|
||||||
|
|||||||
@@ -48,7 +48,8 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attention: flash
|
||||||
|
|
||||||
|
|
||||||
loss_watchdog_threshold: 5.0
|
loss_watchdog_threshold: 5.0
|
||||||
loss_watchdog_patience: 3
|
loss_watchdog_patience: 3
|
||||||
|
|||||||
@@ -55,7 +55,8 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attention: flash
|
||||||
|
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ base_model: NousResearch/Llama-3.2-1B
|
|||||||
datasets:
|
datasets:
|
||||||
- path: teknium/GPT4-LLM-Cleaned
|
- path: teknium/GPT4-LLM-Cleaned
|
||||||
type: alpaca
|
type: alpaca
|
||||||
|
dataset_prepared_path: last_run_prepared
|
||||||
val_set_size: 0.1
|
val_set_size: 0.1
|
||||||
output_dir: ./outputs/lora-out
|
output_dir: ./outputs/lora-out
|
||||||
|
|
||||||
@@ -38,7 +38,6 @@ wandb_log_model:
|
|||||||
gradient_accumulation_steps: 2
|
gradient_accumulation_steps: 2
|
||||||
micro_batch_size: 2
|
micro_batch_size: 2
|
||||||
num_epochs: 1
|
num_epochs: 1
|
||||||
|
|
||||||
optimizer: adamw_8bit
|
optimizer: adamw_8bit
|
||||||
lr_scheduler: cosine
|
lr_scheduler: cosine
|
||||||
learning_rate: 0.0002
|
learning_rate: 0.0002
|
||||||
@@ -49,7 +48,8 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attention: flash
|
||||||
|
|
||||||
|
|
||||||
loss_watchdog_threshold: 5.0
|
loss_watchdog_threshold: 5.0
|
||||||
loss_watchdog_patience: 3
|
loss_watchdog_patience: 3
|
||||||
|
|||||||
@@ -49,7 +49,8 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attention: flash
|
||||||
|
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
|
|||||||
@@ -53,7 +53,8 @@ gradient_checkpointing_kwargs:
|
|||||||
use_reentrant: false
|
use_reentrant: false
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attention: flash
|
||||||
|
|
||||||
|
|
||||||
warmup_steps: 20
|
warmup_steps: 20
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
|
|||||||
@@ -51,7 +51,8 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attention: flash
|
||||||
|
|
||||||
|
|
||||||
loss_watchdog_threshold: 5.0
|
loss_watchdog_threshold: 5.0
|
||||||
loss_watchdog_patience: 3
|
loss_watchdog_patience: 3
|
||||||
|
|||||||
@@ -39,7 +39,8 @@ gradient_checkpointing: true
|
|||||||
gradient_checkpointing_kwargs:
|
gradient_checkpointing_kwargs:
|
||||||
use_reentrant: true
|
use_reentrant: true
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attention: flash
|
||||||
|
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
|
|||||||
@@ -48,7 +48,8 @@ gradient_checkpointing_kwargs:
|
|||||||
use_reentrant: true
|
use_reentrant: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attention: flash
|
||||||
|
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
|
|||||||
@@ -46,7 +46,8 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attention: flash
|
||||||
|
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
|
|||||||
@@ -34,5 +34,3 @@ We provide a script to delinearize Llama 4 linearized models into regular Huggin
|
|||||||
```bash
|
```bash
|
||||||
axolotl delinearize-llama4 --model path/to/model_dir --output path/to/output_dir
|
axolotl delinearize-llama4 --model path/to/model_dir --output path/to/output_dir
|
||||||
```
|
```
|
||||||
|
|
||||||
Note: This only works with the non-quantized linearized model. If you have an adapter, merge it with the *non-quantized linearized* model before delinearizing.
|
|
||||||
|
|||||||
@@ -25,7 +25,7 @@ pad_to_sequence_len: false
|
|||||||
lora_r: 32
|
lora_r: 32
|
||||||
lora_alpha: 16
|
lora_alpha: 16
|
||||||
lora_dropout: 0.05
|
lora_dropout: 0.05
|
||||||
lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
|
lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
|
||||||
|
|
||||||
wandb_project:
|
wandb_project:
|
||||||
wandb_entity:
|
wandb_entity:
|
||||||
@@ -46,8 +46,7 @@ tf32: true
|
|||||||
|
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attention: flash
|
||||||
eager_attention:
|
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 1
|
evals_per_epoch: 1
|
||||||
|
|||||||
@@ -39,7 +39,7 @@ tf32: true
|
|||||||
gradient_checkpointing: false
|
gradient_checkpointing: false
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention:
|
attention: eager
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
|
|||||||
@@ -42,7 +42,8 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attention: flash
|
||||||
|
|
||||||
|
|
||||||
save_total_limit: 1
|
save_total_limit: 1
|
||||||
save_steps:
|
save_steps:
|
||||||
|
|||||||
@@ -36,7 +36,8 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attention: flash
|
||||||
|
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
|
|||||||
@@ -53,8 +53,7 @@ tf32: true
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: false
|
attention: sdpa
|
||||||
sdp_attention: true
|
|
||||||
|
|
||||||
loss_watchdog_threshold: 5.0
|
loss_watchdog_threshold: 5.0
|
||||||
loss_watchdog_patience: 3
|
loss_watchdog_patience: 3
|
||||||
|
|||||||
@@ -54,7 +54,8 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attention: flash
|
||||||
|
|
||||||
|
|
||||||
loss_watchdog_threshold: 5.0
|
loss_watchdog_threshold: 5.0
|
||||||
loss_watchdog_patience: 3
|
loss_watchdog_patience: 3
|
||||||
|
|||||||
@@ -71,7 +71,7 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: false
|
attention: eager
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
|
|||||||
@@ -51,7 +51,8 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attention: flash
|
||||||
|
|
||||||
|
|
||||||
loss_watchdog_threshold: 5.0
|
loss_watchdog_threshold: 5.0
|
||||||
loss_watchdog_patience: 3
|
loss_watchdog_patience: 3
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user