Compare commits
84 Commits
pre-commit
...
smaller-ra
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a0670abc94 | ||
|
|
08f287b57f | ||
|
|
b4c7d9c29d | ||
|
|
d2637fb01d | ||
|
|
9da730d6a4 | ||
|
|
32637fad00 | ||
|
|
f776f889a1 | ||
|
|
69eda209a6 | ||
|
|
b8c633aa97 | ||
|
|
682a9cf79b | ||
|
|
271b24cccc | ||
|
|
198d775d6d | ||
|
|
e4307fb7d7 | ||
|
|
dd8bad06d0 | ||
|
|
de8a625dd7 | ||
|
|
51267ded04 | ||
|
|
756a0559c1 | ||
|
|
9a8e3e9c7b | ||
|
|
7e7180fa10 | ||
|
|
22c562533d | ||
|
|
16823e1de6 | ||
|
|
e0420b3528 | ||
|
|
9f986f5e71 | ||
|
|
f85861a0b2 | ||
|
|
630e40dd13 | ||
|
|
bf9efe2a09 | ||
|
|
0dac2ddeac | ||
|
|
a6c03217f5 | ||
|
|
59cd472504 | ||
|
|
9b89591ead | ||
|
|
31498d0230 | ||
|
|
d25daebea9 | ||
|
|
e0e5d9b1d6 | ||
|
|
8bbad21bfd | ||
|
|
5f4af3665d | ||
|
|
a8f38c367c | ||
|
|
e7e0cd97ce | ||
|
|
949471039f | ||
|
|
de451f99a5 | ||
|
|
9f824ef76a | ||
|
|
dd66fb163c | ||
|
|
e0cc4f1a87 | ||
|
|
64d8035f50 | ||
|
|
5249e98058 | ||
|
|
3877c5c69d | ||
|
|
adb593abac | ||
|
|
a0117c9bce | ||
|
|
e6cfb093d2 | ||
|
|
7abc71dc0b | ||
|
|
45bf634d17 | ||
|
|
80ba4b69f1 | ||
|
|
0bfa180f7d | ||
|
|
9e22c4ca6a | ||
|
|
990b5896bc | ||
|
|
7d0eb66b54 | ||
|
|
df119e3724 | ||
|
|
f4ae8816bb | ||
|
|
9b95e06cbb | ||
|
|
e0aba74dd0 | ||
|
|
328d598114 | ||
|
|
4d36ecc724 | ||
|
|
7acf93b59f | ||
|
|
b6fc46ada8 | ||
|
|
b35992262e | ||
|
|
ef6eb77cc8 | ||
|
|
5410195e0b | ||
|
|
cf0c79d52e | ||
|
|
4ba80a0e5a | ||
|
|
c49682132b | ||
|
|
e46239f8d3 | ||
|
|
05f03b541a | ||
|
|
a4e430e7c4 | ||
|
|
6cdcb8ddd5 | ||
|
|
a7811ad4a0 | ||
|
|
e2da821e67 | ||
|
|
2c34a4634e | ||
|
|
a9b0733f2c | ||
|
|
9f00465a5c | ||
|
|
86bac48d14 | ||
|
|
e44953d50c | ||
|
|
23f0c51d88 | ||
|
|
113e9cd193 | ||
|
|
61825a464a | ||
|
|
c907ac173e |
14
.coveragerc
Normal file
14
.coveragerc
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
[run]
|
||||||
|
source = axolotl
|
||||||
|
omit =
|
||||||
|
*/tests/*
|
||||||
|
setup.py
|
||||||
|
|
||||||
|
[report]
|
||||||
|
exclude_lines =
|
||||||
|
pragma: no cover
|
||||||
|
def __repr__
|
||||||
|
raise NotImplementedError
|
||||||
|
if __name__ == .__main__.:
|
||||||
|
pass
|
||||||
|
raise ImportError
|
||||||
14
.github/workflows/base.yml
vendored
14
.github/workflows/base.yml
vendored
@@ -40,12 +40,24 @@ jobs:
|
|||||||
python_version: "3.11"
|
python_version: "3.11"
|
||||||
pytorch: 2.6.0
|
pytorch: 2.6.0
|
||||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||||
|
- cuda: "126"
|
||||||
|
cuda_version: 12.6.3
|
||||||
|
cudnn_version: ""
|
||||||
|
python_version: "3.11"
|
||||||
|
pytorch: 2.6.0
|
||||||
|
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||||
- cuda: "128"
|
- cuda: "128"
|
||||||
cuda_version: 12.8.1
|
cuda_version: 12.8.1
|
||||||
cudnn_version: ""
|
cudnn_version: ""
|
||||||
python_version: "3.11"
|
python_version: "3.11"
|
||||||
pytorch: nightly
|
pytorch: nightly
|
||||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||||
|
- cuda: "128"
|
||||||
|
cuda_version: 12.8.1
|
||||||
|
cudnn_version: ""
|
||||||
|
python_version: "3.11"
|
||||||
|
pytorch: next
|
||||||
|
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
@@ -67,7 +79,7 @@ jobs:
|
|||||||
uses: docker/build-push-action@v4
|
uses: docker/build-push-action@v4
|
||||||
with:
|
with:
|
||||||
context: .
|
context: .
|
||||||
file: ${{ matrix.pytorch == 'nightly' && './docker/Dockerfile-base-nightly' || './docker/Dockerfile-base' }}
|
file: ${{ matrix.pytorch == 'nightly' && './docker/Dockerfile-base-nightly' || matrix.pytorch == 'next' && './docker/Dockerfile-base-next' || './docker/Dockerfile-base' }}
|
||||||
push: ${{ github.event_name != 'pull_request' }}
|
push: ${{ github.event_name != 'pull_request' }}
|
||||||
tags: ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
|
tags: ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
|
||||||
labels: ${{ steps.metadata.outputs.labels }}
|
labels: ${{ steps.metadata.outputs.labels }}
|
||||||
|
|||||||
7
.github/workflows/docs.yml
vendored
7
.github/workflows/docs.yml
vendored
@@ -20,9 +20,12 @@ jobs:
|
|||||||
uses: actions/setup-python@v5
|
uses: actions/setup-python@v5
|
||||||
with:
|
with:
|
||||||
python-version: '3.11'
|
python-version: '3.11'
|
||||||
- name: install dependencies
|
- name: Install dependencies
|
||||||
run: |
|
run: |
|
||||||
python3 -m pip install jupyter
|
python3 -m pip install jupyter quartodoc
|
||||||
|
python3 -m pip install -e . --no-deps
|
||||||
|
- name: Build autodoc
|
||||||
|
run: quartodoc build
|
||||||
- name: Publish to GitHub Pages (and render)
|
- name: Publish to GitHub Pages (and render)
|
||||||
uses: quarto-dev/quarto-actions/publish@v2
|
uses: quarto-dev/quarto-actions/publish@v2
|
||||||
with:
|
with:
|
||||||
|
|||||||
6
.github/workflows/main.yml
vendored
6
.github/workflows/main.yml
vendored
@@ -25,12 +25,12 @@ jobs:
|
|||||||
python_version: "3.11"
|
python_version: "3.11"
|
||||||
pytorch: 2.5.1
|
pytorch: 2.5.1
|
||||||
axolotl_extras: vllm
|
axolotl_extras: vllm
|
||||||
is_latest: true
|
|
||||||
- cuda: 124
|
- cuda: 124
|
||||||
cuda_version: 12.4.1
|
cuda_version: 12.4.1
|
||||||
python_version: "3.11"
|
python_version: "3.11"
|
||||||
pytorch: 2.6.0
|
pytorch: 2.6.0
|
||||||
axolotl_extras:
|
axolotl_extras: vllm
|
||||||
|
is_latest: true
|
||||||
runs-on: axolotl-gpu-runner
|
runs-on: axolotl-gpu-runner
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
@@ -87,12 +87,12 @@ jobs:
|
|||||||
python_version: "3.11"
|
python_version: "3.11"
|
||||||
pytorch: 2.5.1
|
pytorch: 2.5.1
|
||||||
axolotl_extras:
|
axolotl_extras:
|
||||||
is_latest: true
|
|
||||||
- cuda: 124
|
- cuda: 124
|
||||||
cuda_version: 12.4.1
|
cuda_version: 12.4.1
|
||||||
python_version: "3.11"
|
python_version: "3.11"
|
||||||
pytorch: 2.6.0
|
pytorch: 2.6.0
|
||||||
axolotl_extras:
|
axolotl_extras:
|
||||||
|
is_latest: true
|
||||||
runs-on: axolotl-gpu-runner
|
runs-on: axolotl-gpu-runner
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
|
|||||||
15
.github/workflows/multi-gpu-e2e.yml
vendored
15
.github/workflows/multi-gpu-e2e.yml
vendored
@@ -24,6 +24,13 @@ jobs:
|
|||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
include:
|
include:
|
||||||
|
- cuda: 124
|
||||||
|
cuda_version: 12.4.1
|
||||||
|
python_version: "3.11"
|
||||||
|
pytorch: 2.6.0
|
||||||
|
axolotl_extras: vllm
|
||||||
|
num_gpus: 2
|
||||||
|
nightly_build: "true"
|
||||||
- cuda: 124
|
- cuda: 124
|
||||||
cuda_version: 12.4.1
|
cuda_version: 12.4.1
|
||||||
python_version: "3.11"
|
python_version: "3.11"
|
||||||
@@ -38,14 +45,6 @@ jobs:
|
|||||||
axolotl_extras: vllm
|
axolotl_extras: vllm
|
||||||
num_gpus: 2
|
num_gpus: 2
|
||||||
nightly_build: "true"
|
nightly_build: "true"
|
||||||
- cuda: 124
|
|
||||||
cuda_version: 12.4.1
|
|
||||||
python_version: "3.11"
|
|
||||||
pytorch: 2.6.0
|
|
||||||
# awaiting vllm#12721
|
|
||||||
axolotl_extras:
|
|
||||||
num_gpus: 2
|
|
||||||
nightly_build: "true"
|
|
||||||
runs-on: [self-hosted, modal]
|
runs-on: [self-hosted, modal]
|
||||||
timeout-minutes: 120
|
timeout-minutes: 120
|
||||||
steps:
|
steps:
|
||||||
|
|||||||
49
.github/workflows/precommit-autoupdate.yml
vendored
Normal file
49
.github/workflows/precommit-autoupdate.yml
vendored
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
name: Pre-commit auto-update
|
||||||
|
|
||||||
|
on:
|
||||||
|
schedule:
|
||||||
|
- cron: '0 0 * * 0' # Run weekly
|
||||||
|
workflow_dispatch: # Manual kickoff
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
auto-update:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
permissions:
|
||||||
|
contents: write
|
||||||
|
pull-requests: write
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- uses: actions/setup-python@v5
|
||||||
|
with:
|
||||||
|
python-version: '3.11'
|
||||||
|
|
||||||
|
- name: Update pre-commit hooks
|
||||||
|
id: update
|
||||||
|
run: |
|
||||||
|
pip install pre-commit
|
||||||
|
pre-commit autoupdate
|
||||||
|
if [[ -n $(git status --porcelain) ]]; then
|
||||||
|
echo "changes=true" >> $GITHUB_OUTPUT
|
||||||
|
git diff .pre-commit-config.yaml > pre-commit-update.diff
|
||||||
|
fi
|
||||||
|
|
||||||
|
- name: Create Pull Request
|
||||||
|
if: steps.update.outputs.changes == 'true'
|
||||||
|
uses: peter-evans/create-pull-request@v6
|
||||||
|
with:
|
||||||
|
token: ${{ secrets.GITHUB_TOKEN }}
|
||||||
|
branch: update/pre-commit-hooks
|
||||||
|
delete-branch: true
|
||||||
|
title: "chore: update pre-commit hooks"
|
||||||
|
commit-message: "chore: update pre-commit hooks"
|
||||||
|
body: |
|
||||||
|
Automated PR to update pre-commit hooks to their latest versions.
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>Changes:</summary>
|
||||||
|
|
||||||
|
```diff
|
||||||
|
${{ steps.update.outputs.diff }}
|
||||||
|
```
|
||||||
|
</details>
|
||||||
25
.github/workflows/tests-nightly.yml
vendored
25
.github/workflows/tests-nightly.yml
vendored
@@ -33,6 +33,15 @@ jobs:
|
|||||||
- name: Check out repository code
|
- name: Check out repository code
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Restore HF cache
|
||||||
|
id: hf-cache-restore
|
||||||
|
uses: actions/cache/restore@v4
|
||||||
|
with:
|
||||||
|
path: |
|
||||||
|
/home/runner/.cache/huggingface/hub/datasets--*
|
||||||
|
/home/runner/.cache/huggingface/hub/models--*
|
||||||
|
key: ${{ runner.os }}-hf-hub-cache-v2
|
||||||
|
|
||||||
- name: Setup Python
|
- name: Setup Python
|
||||||
uses: actions/setup-python@v5
|
uses: actions/setup-python@v5
|
||||||
with:
|
with:
|
||||||
@@ -46,7 +55,7 @@ jobs:
|
|||||||
|
|
||||||
- name: Install PyTorch
|
- name: Install PyTorch
|
||||||
run: |
|
run: |
|
||||||
pip3 install torch==${{ matrix.pytorch_version }} --index-url https://download.pytorch.org/whl/cpu
|
pip3 install torch==${{ matrix.pytorch_version }}
|
||||||
|
|
||||||
- name: Update requirements.txt
|
- name: Update requirements.txt
|
||||||
run: |
|
run: |
|
||||||
@@ -58,8 +67,7 @@ jobs:
|
|||||||
|
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
run: |
|
run: |
|
||||||
pip3 install --upgrade pip
|
pip3 show torch
|
||||||
pip3 install --upgrade packaging==23.2
|
|
||||||
pip3 install --no-build-isolation -U -e .
|
pip3 install --no-build-isolation -U -e .
|
||||||
python scripts/unsloth_install.py | sh
|
python scripts/unsloth_install.py | sh
|
||||||
python scripts/cutcrossentropy_install.py | sh
|
python scripts/cutcrossentropy_install.py | sh
|
||||||
@@ -73,10 +81,15 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
axolotl --help
|
axolotl --help
|
||||||
|
|
||||||
|
- name: Pre-Download dataset fixture
|
||||||
|
run: |
|
||||||
|
huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
|
||||||
|
|
||||||
- name: Run tests
|
- name: Run tests
|
||||||
run: |
|
run: |
|
||||||
pytest -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ tests/
|
pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/
|
||||||
pytest tests/patched/
|
pytest -v tests/patched/
|
||||||
|
pytest -v tests/cli/
|
||||||
|
|
||||||
- name: cleanup pip cache
|
- name: cleanup pip cache
|
||||||
run: |
|
run: |
|
||||||
@@ -136,4 +149,4 @@ jobs:
|
|||||||
echo "NIGHTLY_BUILD=${{ matrix.nightly_build }}" >> $GITHUB_ENV
|
echo "NIGHTLY_BUILD=${{ matrix.nightly_build }}" >> $GITHUB_ENV
|
||||||
- name: Run tests job on Modal
|
- name: Run tests job on Modal
|
||||||
run: |
|
run: |
|
||||||
modal run cicd.tests
|
modal run cicd.e2e_tests
|
||||||
|
|||||||
122
.github/workflows/tests.yml
vendored
122
.github/workflows/tests.yml
vendored
@@ -63,7 +63,7 @@ jobs:
|
|||||||
path: |
|
path: |
|
||||||
/home/runner/.cache/huggingface/hub/datasets--*
|
/home/runner/.cache/huggingface/hub/datasets--*
|
||||||
/home/runner/.cache/huggingface/hub/models--*
|
/home/runner/.cache/huggingface/hub/models--*
|
||||||
key: ${{ runner.os }}-hf-hub-cache-${{ hashFiles('**/conftest.py') }}
|
key: ${{ runner.os }}-hf-hub-cache-v2
|
||||||
|
|
||||||
- name: Setup Python
|
- name: Setup Python
|
||||||
uses: actions/setup-python@v5
|
uses: actions/setup-python@v5
|
||||||
@@ -96,10 +96,22 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
axolotl --help
|
axolotl --help
|
||||||
|
|
||||||
|
- name: Pre-Download dataset fixture
|
||||||
|
run: |
|
||||||
|
huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
|
||||||
|
|
||||||
- name: Run tests
|
- name: Run tests
|
||||||
run: |
|
run: |
|
||||||
pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ tests/
|
pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/ --cov=axolotl --cov-report=xml
|
||||||
pytest -v tests/patched/
|
pytest -v tests/patched/ --cov=axolotl --cov-append --cov-report=xml
|
||||||
|
pytest -v tests/cli/ --cov=axolotl --cov-append --cov-report=xml
|
||||||
|
|
||||||
|
- name: Upload coverage to Codecov
|
||||||
|
uses: codecov/codecov-action@v5
|
||||||
|
with:
|
||||||
|
files: ./coverage.xml
|
||||||
|
flags: unittests,pytorch-${{ matrix.pytorch_version }}
|
||||||
|
fail_ci_if_error: false
|
||||||
|
|
||||||
- name: cleanup pip cache
|
- name: cleanup pip cache
|
||||||
run: |
|
run: |
|
||||||
@@ -136,7 +148,7 @@ jobs:
|
|||||||
path: |
|
path: |
|
||||||
/home/runner/.cache/huggingface/hub/datasets--*
|
/home/runner/.cache/huggingface/hub/datasets--*
|
||||||
/home/runner/.cache/huggingface/hub/models--*
|
/home/runner/.cache/huggingface/hub/models--*
|
||||||
key: ${{ runner.os }}-hf-hub-cache-${{ hashFiles('**/conftest.py') }}
|
key: ${{ runner.os }}-hf-hub-cache-v2
|
||||||
|
|
||||||
- name: Setup Python
|
- name: Setup Python
|
||||||
uses: actions/setup-python@v5
|
uses: actions/setup-python@v5
|
||||||
@@ -170,10 +182,14 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
axolotl --help
|
axolotl --help
|
||||||
|
|
||||||
|
- name: Show HF cache
|
||||||
|
run: huggingface-cli scan-cache
|
||||||
|
|
||||||
- name: Run tests
|
- name: Run tests
|
||||||
run: |
|
run: |
|
||||||
pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ tests/
|
pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/
|
||||||
pytest -v tests/patched/
|
pytest -v tests/patched/
|
||||||
|
pytest -v tests/cli/
|
||||||
|
|
||||||
- name: cleanup pip cache
|
- name: cleanup pip cache
|
||||||
run: |
|
run: |
|
||||||
@@ -199,6 +215,53 @@ jobs:
|
|||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
include:
|
include:
|
||||||
|
- cuda: 124
|
||||||
|
cuda_version: 12.4.1
|
||||||
|
python_version: "3.11"
|
||||||
|
pytorch: 2.6.0
|
||||||
|
num_gpus: 1
|
||||||
|
axolotl_extras: vllm
|
||||||
|
steps:
|
||||||
|
- name: Checkout
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
- name: Install Python
|
||||||
|
uses: actions/setup-python@v5
|
||||||
|
with:
|
||||||
|
python-version: "3.11"
|
||||||
|
- name: Install Modal
|
||||||
|
run: |
|
||||||
|
python -m pip install --upgrade pip
|
||||||
|
pip install modal==0.71.8 jinja2
|
||||||
|
- name: Update env vars
|
||||||
|
run: |
|
||||||
|
echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
|
||||||
|
echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
|
||||||
|
echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
|
||||||
|
echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
|
||||||
|
echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
|
||||||
|
echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
|
||||||
|
echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
|
||||||
|
- name: Run tests job on Modal
|
||||||
|
run: |
|
||||||
|
modal run cicd.e2e_tests
|
||||||
|
|
||||||
|
docker-e2e-tests:
|
||||||
|
if: github.repository_owner == 'axolotl-ai-cloud'
|
||||||
|
# this job needs to be run on self-hosted GPU runners...
|
||||||
|
runs-on: [self-hosted, modal]
|
||||||
|
timeout-minutes: 90
|
||||||
|
needs: [pre-commit, pytest, docker-e2e-tests-1st]
|
||||||
|
|
||||||
|
strategy:
|
||||||
|
fail-fast: false
|
||||||
|
matrix:
|
||||||
|
include:
|
||||||
|
- cuda: 124
|
||||||
|
cuda_version: 12.4.1
|
||||||
|
python_version: "3.11"
|
||||||
|
pytorch: 2.4.1
|
||||||
|
num_gpus: 1
|
||||||
|
axolotl_extras:
|
||||||
- cuda: 124
|
- cuda: 124
|
||||||
cuda_version: 12.4.1
|
cuda_version: 12.4.1
|
||||||
python_version: "3.11"
|
python_version: "3.11"
|
||||||
@@ -227,51 +290,4 @@ jobs:
|
|||||||
echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
|
echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
|
||||||
- name: Run tests job on Modal
|
- name: Run tests job on Modal
|
||||||
run: |
|
run: |
|
||||||
modal run cicd.tests
|
modal run cicd.e2e_tests
|
||||||
|
|
||||||
docker-e2e-tests:
|
|
||||||
if: github.repository_owner == 'axolotl-ai-cloud'
|
|
||||||
# this job needs to be run on self-hosted GPU runners...
|
|
||||||
runs-on: [self-hosted, modal]
|
|
||||||
timeout-minutes: 90
|
|
||||||
needs: [pre-commit, pytest, docker-e2e-tests-1st]
|
|
||||||
|
|
||||||
strategy:
|
|
||||||
fail-fast: false
|
|
||||||
matrix:
|
|
||||||
include:
|
|
||||||
- cuda: 124
|
|
||||||
cuda_version: 12.4.1
|
|
||||||
python_version: "3.11"
|
|
||||||
pytorch: 2.4.1
|
|
||||||
num_gpus: 1
|
|
||||||
axolotl_extras:
|
|
||||||
- cuda: 124
|
|
||||||
cuda_version: 12.4.1
|
|
||||||
python_version: "3.11"
|
|
||||||
pytorch: 2.6.0
|
|
||||||
num_gpus: 1
|
|
||||||
axolotl_extras:
|
|
||||||
steps:
|
|
||||||
- name: Checkout
|
|
||||||
uses: actions/checkout@v4
|
|
||||||
- name: Install Python
|
|
||||||
uses: actions/setup-python@v5
|
|
||||||
with:
|
|
||||||
python-version: "3.11"
|
|
||||||
- name: Install Modal
|
|
||||||
run: |
|
|
||||||
python -m pip install --upgrade pip
|
|
||||||
pip install modal==0.71.8 jinja2
|
|
||||||
- name: Update env vars
|
|
||||||
run: |
|
|
||||||
echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
|
|
||||||
echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
|
|
||||||
echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
|
|
||||||
echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
|
|
||||||
echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
|
|
||||||
echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
|
|
||||||
echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
|
|
||||||
- name: Run tests job on Modal
|
|
||||||
run: |
|
|
||||||
modal run cicd.tests
|
|
||||||
|
|||||||
4
.gitignore
vendored
4
.gitignore
vendored
@@ -181,6 +181,10 @@ prepared-datasets/
|
|||||||
submit.sh
|
submit.sh
|
||||||
*.out*
|
*.out*
|
||||||
|
|
||||||
|
# Quartodoc generated files
|
||||||
|
objects.json
|
||||||
|
site_libs/
|
||||||
|
|
||||||
typings/
|
typings/
|
||||||
out/
|
out/
|
||||||
|
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
[settings]
|
[settings]
|
||||||
profile=black
|
profile=black
|
||||||
known_third_party=wandb,comet_ml
|
known_third_party=wandb,comet_ml
|
||||||
|
known_local_folder=src,tests
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ default_language_version:
|
|||||||
|
|
||||||
repos:
|
repos:
|
||||||
- repo: https://github.com/pre-commit/pre-commit-hooks
|
- repo: https://github.com/pre-commit/pre-commit-hooks
|
||||||
rev: v4.4.0
|
rev: v5.0.0
|
||||||
hooks:
|
hooks:
|
||||||
- id: check-yaml
|
- id: check-yaml
|
||||||
- id: end-of-file-fixer
|
- id: end-of-file-fixer
|
||||||
@@ -11,23 +11,23 @@ repos:
|
|||||||
- id: no-commit-to-branch
|
- id: no-commit-to-branch
|
||||||
args: ['--branch', 'main']
|
args: ['--branch', 'main']
|
||||||
- repo: https://github.com/psf/black
|
- repo: https://github.com/psf/black
|
||||||
rev: 23.3.0
|
rev: 25.1.0
|
||||||
hooks:
|
hooks:
|
||||||
- id: black
|
- id: black
|
||||||
- repo: https://github.com/pycqa/isort
|
- repo: https://github.com/pycqa/isort
|
||||||
rev: 5.12.0
|
rev: 6.0.1
|
||||||
hooks:
|
hooks:
|
||||||
- id: isort
|
- id: isort
|
||||||
- repo: https://github.com/PyCQA/flake8
|
- repo: https://github.com/PyCQA/flake8
|
||||||
rev: 6.1.0
|
rev: 7.1.2
|
||||||
hooks:
|
hooks:
|
||||||
- id: flake8
|
- id: flake8
|
||||||
- repo: https://github.com/pylint-dev/pylint
|
- repo: https://github.com/pylint-dev/pylint
|
||||||
rev: c8c96d20cde3552a79858c7456bb1483bf83d633
|
rev: v3.3.6
|
||||||
hooks:
|
hooks:
|
||||||
- id: pylint
|
- id: pylint
|
||||||
- repo: https://github.com/pre-commit/mirrors-mypy
|
- repo: https://github.com/pre-commit/mirrors-mypy
|
||||||
rev: v1.3.0
|
rev: v1.15.0
|
||||||
hooks:
|
hooks:
|
||||||
- id: mypy
|
- id: mypy
|
||||||
additional_dependencies:
|
additional_dependencies:
|
||||||
@@ -36,7 +36,7 @@ repos:
|
|||||||
'pydantic>=2.5.3',
|
'pydantic>=2.5.3',
|
||||||
]
|
]
|
||||||
- repo: https://github.com/PyCQA/bandit
|
- repo: https://github.com/PyCQA/bandit
|
||||||
rev: 1.7.5
|
rev: 1.8.3
|
||||||
hooks:
|
hooks:
|
||||||
- id: bandit
|
- id: bandit
|
||||||
args: [
|
args: [
|
||||||
|
|||||||
22
README.md
22
README.md
@@ -9,6 +9,7 @@
|
|||||||
<p align="center">
|
<p align="center">
|
||||||
<img src="https://img.shields.io/github/license/axolotl-ai-cloud/axolotl.svg?color=blue" alt="GitHub License">
|
<img src="https://img.shields.io/github/license/axolotl-ai-cloud/axolotl.svg?color=blue" alt="GitHub License">
|
||||||
<img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/tests.yml/badge.svg" alt="tests">
|
<img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/tests.yml/badge.svg" alt="tests">
|
||||||
|
<a href="https://codecov.io/gh/axolotl-ai-cloud/axolotl"><img src="https://codecov.io/gh/axolotl-ai-cloud/axolotl/branch/main/graph/badge.svg" alt="codecov"></a>
|
||||||
<a href="https://github.com/axolotl-ai-cloud/axolotl/releases"><img src="https://img.shields.io/github/release/axolotl-ai-cloud/axolotl.svg" alt="Releases"></a>
|
<a href="https://github.com/axolotl-ai-cloud/axolotl/releases"><img src="https://img.shields.io/github/release/axolotl-ai-cloud/axolotl.svg" alt="Releases"></a>
|
||||||
<br/>
|
<br/>
|
||||||
<a href="https://github.com/axolotl-ai-cloud/axolotl/graphs/contributors"><img src="https://img.shields.io/github/contributors-anon/axolotl-ai-cloud/axolotl?color=yellow&style=flat-square" alt="contributors" style="height: 20px;"></a>
|
<a href="https://github.com/axolotl-ai-cloud/axolotl/graphs/contributors"><img src="https://img.shields.io/github/contributors-anon/axolotl-ai-cloud/axolotl?color=yellow&style=flat-square" alt="contributors" style="height: 20px;"></a>
|
||||||
@@ -63,7 +64,7 @@ axolotl fetch examples
|
|||||||
axolotl fetch deepspeed_configs # OPTIONAL
|
axolotl fetch deepspeed_configs # OPTIONAL
|
||||||
```
|
```
|
||||||
|
|
||||||
Other installation approaches are described [here](https://axolotl-ai-cloud.github.io/axolotl/docs/installation.html).
|
Other installation approaches are described [here](https://docs.axolotl.ai/docs/installation.html).
|
||||||
|
|
||||||
### Your First Fine-tune
|
### Your First Fine-tune
|
||||||
|
|
||||||
@@ -78,7 +79,7 @@ axolotl fetch examples --dest path/to/folder
|
|||||||
axolotl train examples/llama-3/lora-1b.yml
|
axolotl train examples/llama-3/lora-1b.yml
|
||||||
```
|
```
|
||||||
|
|
||||||
That's it! Check out our [Getting Started Guide](https://axolotl-ai-cloud.github.io/axolotl/docs/getting-started.html) for a more detailed walkthrough.
|
That's it! Check out our [Getting Started Guide](https://docs.axolotl.ai/docs/getting-started.html) for a more detailed walkthrough.
|
||||||
|
|
||||||
## ✨ Key Features
|
## ✨ Key Features
|
||||||
|
|
||||||
@@ -91,19 +92,20 @@ That's it! Check out our [Getting Started Guide](https://axolotl-ai-cloud.github
|
|||||||
|
|
||||||
## 📚 Documentation
|
## 📚 Documentation
|
||||||
|
|
||||||
- [Installation Options](https://axolotl-ai-cloud.github.io/axolotl/docs/installation.html) - Detailed setup instructions for different environments
|
- [Installation Options](https://docs.axolotl.ai/docs/installation.html) - Detailed setup instructions for different environments
|
||||||
- [Configuration Guide](https://axolotl-ai-cloud.github.io/axolotl/docs/config.html) - Full configuration options and examples
|
- [Configuration Guide](https://docs.axolotl.ai/docs/config.html) - Full configuration options and examples
|
||||||
- [Dataset Guide](https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/) - Supported formats and how to use them
|
- [Dataset Guide](https://docs.axolotl.ai/docs/dataset-formats/) - Supported formats and how to use them
|
||||||
- [Multi-GPU Training](https://axolotl-ai-cloud.github.io/axolotl/docs/multi-gpu.html)
|
- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
|
||||||
- [Multi-Node Training](https://axolotl-ai-cloud.github.io/axolotl/docs/multi-node.html)
|
- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
|
||||||
- [Multipacking](https://axolotl-ai-cloud.github.io/axolotl/docs/multipack.html)
|
- [Multipacking](https://docs.axolotl.ai/docs/multipack.html)
|
||||||
- [FAQ](https://axolotl-ai-cloud.github.io/axolotl/docs/faq.html) - Frequently asked questions
|
- [API Reference](https://docs.axolotl.ai/docs/api/) - Auto-generated code documentation
|
||||||
|
- [FAQ](https://docs.axolotl.ai/docs/faq.html) - Frequently asked questions
|
||||||
|
|
||||||
## 🤝 Getting Help
|
## 🤝 Getting Help
|
||||||
|
|
||||||
- Join our [Discord community](https://discord.gg/HhrNrHJPRb) for support
|
- Join our [Discord community](https://discord.gg/HhrNrHJPRb) for support
|
||||||
- Check out our [Examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/) directory
|
- Check out our [Examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/) directory
|
||||||
- Read our [Debugging Guide](https://axolotl-ai-cloud.github.io/axolotl/docs/debugging.html)
|
- Read our [Debugging Guide](https://docs.axolotl.ai/docs/debugging.html)
|
||||||
- Need dedicated support? Please contact [✉️wing@axolotl.ai](mailto:wing@axolotl.ai) for options
|
- Need dedicated support? Please contact [✉️wing@axolotl.ai](mailto:wing@axolotl.ai) for options
|
||||||
|
|
||||||
## 🌟 Contributing
|
## 🌟 Contributing
|
||||||
|
|||||||
197
_quarto.yml
197
_quarto.yml
@@ -1,6 +1,180 @@
|
|||||||
project:
|
project:
|
||||||
type: website
|
type: website
|
||||||
|
|
||||||
|
quartodoc:
|
||||||
|
dir: docs/api
|
||||||
|
package: axolotl
|
||||||
|
title: API Reference
|
||||||
|
parser: google
|
||||||
|
|
||||||
|
sections:
|
||||||
|
- title: Core
|
||||||
|
desc: Core functionality for training
|
||||||
|
contents:
|
||||||
|
- train
|
||||||
|
- evaluate
|
||||||
|
- datasets
|
||||||
|
- convert
|
||||||
|
- prompt_tokenizers
|
||||||
|
- logging_config
|
||||||
|
- core.trainer_builder
|
||||||
|
- core.training_args
|
||||||
|
- core.chat.messages
|
||||||
|
- core.chat.format.chatml
|
||||||
|
- core.chat.format.llama3x
|
||||||
|
- core.chat.format.shared
|
||||||
|
- core.datasets.chat
|
||||||
|
- core.datasets.transforms.chat_builder
|
||||||
|
- title: CLI
|
||||||
|
desc: Command-line interface
|
||||||
|
contents:
|
||||||
|
- cli.main
|
||||||
|
- cli.train
|
||||||
|
- cli.evaluate
|
||||||
|
- cli.args
|
||||||
|
- cli.checks
|
||||||
|
- cli.config
|
||||||
|
- cli.inference
|
||||||
|
- cli.merge_lora
|
||||||
|
- cli.merge_sharded_fsdp_weights
|
||||||
|
- cli.preprocess
|
||||||
|
- cli.sweeps
|
||||||
|
- cli.utils
|
||||||
|
- cli.vllm_serve
|
||||||
|
- cli.cloud.base
|
||||||
|
- cli.cloud.modal_
|
||||||
|
- title: Trainers
|
||||||
|
desc: Training implementations
|
||||||
|
contents:
|
||||||
|
- core.trainers.base
|
||||||
|
- core.trainers.trl
|
||||||
|
- core.trainers.dpo.trainer
|
||||||
|
- core.trainers.grpo.trainer
|
||||||
|
- title: Prompt Strategies
|
||||||
|
desc: Prompt formatting strategies
|
||||||
|
contents:
|
||||||
|
- prompt_strategies.base
|
||||||
|
- prompt_strategies.chat_template
|
||||||
|
- prompt_strategies.alpaca_chat
|
||||||
|
- prompt_strategies.alpaca_instruct
|
||||||
|
- prompt_strategies.alpaca_w_system
|
||||||
|
- prompt_strategies.user_defined
|
||||||
|
- prompt_strategies.llama2_chat
|
||||||
|
- prompt_strategies.completion
|
||||||
|
- prompt_strategies.input_output
|
||||||
|
- prompt_strategies.stepwise_supervised
|
||||||
|
- prompt_strategies.metharme
|
||||||
|
- prompt_strategies.orcamini
|
||||||
|
- prompt_strategies.pygmalion
|
||||||
|
- prompt_strategies.messages.chat
|
||||||
|
- prompt_strategies.dpo.chat_template
|
||||||
|
- prompt_strategies.dpo.llama3
|
||||||
|
- prompt_strategies.dpo.chatml
|
||||||
|
- prompt_strategies.dpo.zephyr
|
||||||
|
- prompt_strategies.dpo.user_defined
|
||||||
|
- prompt_strategies.dpo.passthrough
|
||||||
|
- prompt_strategies.kto.llama3
|
||||||
|
- prompt_strategies.kto.chatml
|
||||||
|
- prompt_strategies.kto.user_defined
|
||||||
|
- prompt_strategies.orpo.chat_template
|
||||||
|
- prompt_strategies.bradley_terry.llama3
|
||||||
|
- title: Kernels
|
||||||
|
desc: Low-level performance optimizations
|
||||||
|
contents:
|
||||||
|
- kernels.lora
|
||||||
|
- kernels.geglu
|
||||||
|
- kernels.swiglu
|
||||||
|
- kernels.quantize
|
||||||
|
- kernels.utils
|
||||||
|
- title: MonkeyPatches
|
||||||
|
desc: Runtime patches for model optimizations
|
||||||
|
contents:
|
||||||
|
- monkeypatch.llama_attn_hijack_flash
|
||||||
|
- monkeypatch.llama_attn_hijack_xformers
|
||||||
|
- monkeypatch.mistral_attn_hijack_flash
|
||||||
|
- monkeypatch.multipack
|
||||||
|
- monkeypatch.relora
|
||||||
|
- monkeypatch.llama_expand_mask
|
||||||
|
- monkeypatch.lora_kernels
|
||||||
|
- monkeypatch.utils
|
||||||
|
- monkeypatch.btlm_attn_hijack_flash
|
||||||
|
- monkeypatch.llama_patch_multipack
|
||||||
|
- monkeypatch.stablelm_attn_hijack_flash
|
||||||
|
- monkeypatch.trainer_fsdp_optim
|
||||||
|
- monkeypatch.transformers_fa_utils
|
||||||
|
- monkeypatch.unsloth_
|
||||||
|
- monkeypatch.attention.mllama
|
||||||
|
- monkeypatch.data.batch_dataset_fetcher
|
||||||
|
- monkeypatch.mixtral
|
||||||
|
- title: Utils
|
||||||
|
desc: Utility functions
|
||||||
|
contents:
|
||||||
|
- utils.models
|
||||||
|
- utils.tokenization
|
||||||
|
- utils.chat_templates
|
||||||
|
- utils.lora
|
||||||
|
- utils.lora_embeddings
|
||||||
|
- utils.model_shard_quant
|
||||||
|
- utils.bench
|
||||||
|
- utils.freeze
|
||||||
|
- utils.trainer
|
||||||
|
- utils.schedulers
|
||||||
|
- utils.distributed
|
||||||
|
- utils.dict
|
||||||
|
- utils.optimizers.adopt
|
||||||
|
- utils.data.pretraining
|
||||||
|
- utils.data.sft
|
||||||
|
- utils.gradient_checkpointing.unsloth
|
||||||
|
- title: Schemas
|
||||||
|
desc: Pydantic data models for Axolotl config
|
||||||
|
contents:
|
||||||
|
- utils.schemas.config
|
||||||
|
- utils.schemas.model
|
||||||
|
- utils.schemas.training
|
||||||
|
- utils.schemas.datasets
|
||||||
|
- utils.schemas.peft
|
||||||
|
- utils.schemas.trl
|
||||||
|
- utils.schemas.multimodal
|
||||||
|
- utils.schemas.integrations
|
||||||
|
- utils.schemas.enums
|
||||||
|
- utils.schemas.utils
|
||||||
|
- title: Integrations
|
||||||
|
desc: Third-party integrations and extensions
|
||||||
|
contents:
|
||||||
|
- integrations.base
|
||||||
|
- integrations.cut_cross_entropy.args
|
||||||
|
- integrations.grokfast.optimizer
|
||||||
|
- integrations.kd.trainer
|
||||||
|
- integrations.liger.args
|
||||||
|
- integrations.lm_eval.args
|
||||||
|
- integrations.spectrum.args
|
||||||
|
- title: Common
|
||||||
|
desc: Common utilities and shared functionality
|
||||||
|
contents:
|
||||||
|
- common.architectures
|
||||||
|
- common.const
|
||||||
|
- common.datasets
|
||||||
|
- title: Models
|
||||||
|
desc: Custom model implementations
|
||||||
|
contents:
|
||||||
|
- models.mamba.modeling_mamba
|
||||||
|
- title: Data Processing
|
||||||
|
desc: Data processing utilities
|
||||||
|
contents:
|
||||||
|
- utils.collators.core
|
||||||
|
- utils.collators.batching
|
||||||
|
- utils.collators.mamba
|
||||||
|
- utils.collators.mm_chat
|
||||||
|
- utils.samplers.multipack
|
||||||
|
- title: Callbacks
|
||||||
|
desc: Training callbacks
|
||||||
|
contents:
|
||||||
|
- utils.callbacks.perplexity
|
||||||
|
- utils.callbacks.profiler
|
||||||
|
- utils.callbacks.lisa
|
||||||
|
- utils.callbacks.mlflow_
|
||||||
|
- utils.callbacks.comet_
|
||||||
|
|
||||||
website:
|
website:
|
||||||
title: "Axolotl"
|
title: "Axolotl"
|
||||||
description: "We make fine-tuning accessible, scalable, and fun"
|
description: "We make fine-tuning accessible, scalable, and fun"
|
||||||
@@ -35,6 +209,8 @@ website:
|
|||||||
- docs/inference.qmd
|
- docs/inference.qmd
|
||||||
- docs/cli.qmd
|
- docs/cli.qmd
|
||||||
- docs/config.qmd
|
- docs/config.qmd
|
||||||
|
- text: "API Reference"
|
||||||
|
href: docs/api
|
||||||
|
|
||||||
- section: "Dataset Formats"
|
- section: "Dataset Formats"
|
||||||
contents: docs/dataset-formats/*
|
contents: docs/dataset-formats/*
|
||||||
@@ -55,6 +231,7 @@ website:
|
|||||||
- docs/reward_modelling.qmd
|
- docs/reward_modelling.qmd
|
||||||
- docs/lr_groups.qmd
|
- docs/lr_groups.qmd
|
||||||
- docs/lora_optims.qmd
|
- docs/lora_optims.qmd
|
||||||
|
- docs/dataset_loading.qmd
|
||||||
|
|
||||||
- section: "Core Concepts"
|
- section: "Core Concepts"
|
||||||
contents:
|
contents:
|
||||||
@@ -68,6 +245,7 @@ website:
|
|||||||
- docs/unsloth.qmd
|
- docs/unsloth.qmd
|
||||||
- docs/torchao.qmd
|
- docs/torchao.qmd
|
||||||
- docs/custom_integrations.qmd
|
- docs/custom_integrations.qmd
|
||||||
|
- docs/sequence_parallelism.qmd
|
||||||
|
|
||||||
- section: "Troubleshooting"
|
- section: "Troubleshooting"
|
||||||
contents:
|
contents:
|
||||||
@@ -80,3 +258,22 @@ format:
|
|||||||
theme: darkly
|
theme: darkly
|
||||||
css: styles.css
|
css: styles.css
|
||||||
toc: true
|
toc: true
|
||||||
|
# Enable better handling of line breaks in markdown
|
||||||
|
preserve-tabs: true
|
||||||
|
html-math-method: mathjax
|
||||||
|
# Improved markdown processing options
|
||||||
|
md-extensions:
|
||||||
|
- markdown_it
|
||||||
|
- def_list
|
||||||
|
- attr_list
|
||||||
|
- fenced_divs
|
||||||
|
- tables
|
||||||
|
- html_admonition
|
||||||
|
- lineblocks
|
||||||
|
- fancy_lists
|
||||||
|
# Control whitespace handling
|
||||||
|
whitespace: preserve
|
||||||
|
# Process newlines in paragraphs
|
||||||
|
wrap: preserve
|
||||||
|
# Better line break handling
|
||||||
|
preserve-linebreaks: true
|
||||||
|
|||||||
@@ -33,9 +33,9 @@ RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
|
|||||||
|
|
||||||
RUN pip install packaging==23.2 setuptools==75.8.0
|
RUN pip install packaging==23.2 setuptools==75.8.0
|
||||||
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
|
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
|
||||||
pip install --no-build-isolation -e .[deepspeed,flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
|
pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
|
||||||
else \
|
else \
|
||||||
pip install --no-build-isolation -e .[deepspeed,flash-attn,optimizers,ray] $AXOLOTL_ARGS; \
|
pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray] $AXOLOTL_ARGS; \
|
||||||
fi
|
fi
|
||||||
|
|
||||||
RUN python scripts/unsloth_install.py | sh
|
RUN python scripts/unsloth_install.py | sh
|
||||||
|
|||||||
62
cicd/cicd.sh
62
cicd/cicd.sh
@@ -3,9 +3,59 @@ set -e
|
|||||||
|
|
||||||
python -c "import torch; assert '$PYTORCH_VERSION' in torch.__version__"
|
python -c "import torch; assert '$PYTORCH_VERSION' in torch.__version__"
|
||||||
|
|
||||||
pytest -v --durations=10 -n8 --ignore=tests/e2e/ --ignore=tests/patched/ /workspace/axolotl/tests/
|
# Run unit tests with initial coverage report
|
||||||
pytest -v --durations=10 /workspace/axolotl/tests/e2e/patched/lora_kernels # running these with the other patches causes a failure
|
pytest -v --durations=10 -n8 \
|
||||||
pytest -v --durations=10 --ignore=tests/e2e/patched/lora_kernels /workspace/axolotl/tests/e2e/patched
|
--ignore=tests/e2e/ \
|
||||||
pytest -v --durations=10 -n1 /workspace/axolotl/tests/e2e/solo/
|
--ignore=tests/patched/ \
|
||||||
pytest -v --durations=10 /workspace/axolotl/tests/e2e/integrations/
|
--ignore=tests/cli \
|
||||||
pytest -v --durations=10 --ignore=tests/e2e/solo/ --ignore=tests/e2e/patched/ --ignore=tests/e2e/multigpu/ --ignore=tests/e2e/integrations/ /workspace/axolotl/tests/e2e/
|
/workspace/axolotl/tests/ \
|
||||||
|
--cov=axolotl \
|
||||||
|
--cov-report=xml:coverage.xml
|
||||||
|
|
||||||
|
# Run lora kernels tests with coverage append
|
||||||
|
pytest -v --durations=10 \
|
||||||
|
/workspace/axolotl/tests/e2e/patched/lora_kernels \
|
||||||
|
--cov=axolotl \
|
||||||
|
--cov-append
|
||||||
|
|
||||||
|
# Run patched tests excluding lora kernels with coverage append
|
||||||
|
pytest -v --durations=10 \
|
||||||
|
--ignore=tests/e2e/patched/lora_kernels \
|
||||||
|
/workspace/axolotl/tests/e2e/patched \
|
||||||
|
--cov=axolotl \
|
||||||
|
--cov-append
|
||||||
|
|
||||||
|
# Run solo tests with coverage append
|
||||||
|
pytest -v --durations=10 -n1 \
|
||||||
|
/workspace/axolotl/tests/e2e/solo/ \
|
||||||
|
--cov=axolotl \
|
||||||
|
--cov-append
|
||||||
|
|
||||||
|
# Run integration tests with coverage append
|
||||||
|
pytest -v --durations=10 \
|
||||||
|
/workspace/axolotl/tests/e2e/integrations/ \
|
||||||
|
--cov=axolotl \
|
||||||
|
--cov-append
|
||||||
|
|
||||||
|
pytest -v --durations=10 /workspace/axolotl/tests/cli \
|
||||||
|
--cov=axolotl \
|
||||||
|
--cov-append
|
||||||
|
|
||||||
|
# Run remaining e2e tests with coverage append and final report
|
||||||
|
pytest -v --durations=10 \
|
||||||
|
--ignore=tests/e2e/solo/ \
|
||||||
|
--ignore=tests/e2e/patched/ \
|
||||||
|
--ignore=tests/e2e/multigpu/ \
|
||||||
|
--ignore=tests/e2e/integrations/ \
|
||||||
|
--ignore=tests/cli \
|
||||||
|
/workspace/axolotl/tests/e2e/ \
|
||||||
|
--cov=axolotl \
|
||||||
|
--cov-append \
|
||||||
|
--cov-report=xml:coverage.xml
|
||||||
|
|
||||||
|
# Upload coverage to Codecov
|
||||||
|
if [ -f e2e-coverage.xml ]; then
|
||||||
|
codecov -f e2e-coverage.xml -F e2e,pytorch-${PYTORCH_VERSION}
|
||||||
|
else
|
||||||
|
echo "Coverage file not found. Coverage report may have failed."
|
||||||
|
fi
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
"""Modal app to run axolotl GPU tests"""
|
"""Modal app to run axolotl GPU tests"""
|
||||||
|
|
||||||
# pylint: disable=duplicate-code
|
# pylint: disable=duplicate-code
|
||||||
|
|
||||||
import os
|
import os
|
||||||
@@ -1,6 +1,7 @@
|
|||||||
"""
|
"""
|
||||||
modal application to run axolotl gpu tests in Modal
|
modal application to run axolotl gpu tests in Modal
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# pylint: disable=duplicate-code
|
# pylint: disable=duplicate-code
|
||||||
|
|
||||||
import os
|
import os
|
||||||
@@ -67,7 +68,7 @@ def run_cmd(cmd: str, run_folder: str):
|
|||||||
@app.function(
|
@app.function(
|
||||||
image=cicd_image,
|
image=cicd_image,
|
||||||
gpu=GPU_CONFIG,
|
gpu=GPU_CONFIG,
|
||||||
timeout=60 * 60,
|
timeout=90 * 60,
|
||||||
cpu=8.0,
|
cpu=8.0,
|
||||||
memory=131072 * N_GPUS,
|
memory=131072 * N_GPUS,
|
||||||
volumes=VOLUME_CONFIG,
|
volumes=VOLUME_CONFIG,
|
||||||
|
|||||||
@@ -2,4 +2,24 @@
|
|||||||
set -e
|
set -e
|
||||||
|
|
||||||
# run at most two tests at a time so as not to OOM the GPU
|
# run at most two tests at a time so as not to OOM the GPU
|
||||||
pytest -v -n2 /workspace/axolotl/tests/e2e/multigpu/
|
pytest -v --durations=10 -n2 /workspace/axolotl/tests/e2e/multigpu/ --ignore=/workspace/axolotl/tests/e2e/multigpu/solo/
|
||||||
|
pytest -v --durations=10 -n1 /workspace/axolotl/tests/e2e/multigpu/solo/
|
||||||
|
|
||||||
|
# Only run two tests at a time to avoid OOM on GPU (with coverage collection)
|
||||||
|
pytest -v -n2 \
|
||||||
|
--ignore=/workspace/axolotl/tests/e2e/multigpu/solo/
|
||||||
|
/workspace/axolotl/tests/e2e/multigpu/ \
|
||||||
|
--cov=axolotl \
|
||||||
|
--cov-report=xml:multigpu-coverage.xml
|
||||||
|
|
||||||
|
pytest -v --durations=10 -n1 /workspace/axolotl/tests/e2e/multigpu/solo/ \
|
||||||
|
--cov=axolotl \
|
||||||
|
--cov-append \
|
||||||
|
--cov-report=xml:multigpu-coverage.xml
|
||||||
|
|
||||||
|
# Upload coverage to Codecov
|
||||||
|
if [ -f multigpu-coverage.xml ]; then
|
||||||
|
codecov -f multigpu-coverage.xml -F multigpu,docker-tests,pytorch-${PYTORCH_VERSION}
|
||||||
|
else
|
||||||
|
echo "Coverage file not found. Coverage report may have failed."
|
||||||
|
fi
|
||||||
|
|||||||
51
codecov.yml
Normal file
51
codecov.yml
Normal file
@@ -0,0 +1,51 @@
|
|||||||
|
codecov:
|
||||||
|
require_ci_to_pass: yes
|
||||||
|
|
||||||
|
coverage:
|
||||||
|
precision: 2
|
||||||
|
round: down
|
||||||
|
range: "70...100"
|
||||||
|
status:
|
||||||
|
project:
|
||||||
|
default:
|
||||||
|
# basic
|
||||||
|
target: auto
|
||||||
|
threshold: 0%
|
||||||
|
base: auto
|
||||||
|
# advanced
|
||||||
|
branches: null
|
||||||
|
if_no_uploads: error
|
||||||
|
if_not_found: success
|
||||||
|
if_ci_failed: error
|
||||||
|
only_pulls: false
|
||||||
|
flags: null
|
||||||
|
paths: null
|
||||||
|
patch:
|
||||||
|
default:
|
||||||
|
# basic
|
||||||
|
target: auto
|
||||||
|
threshold: 0%
|
||||||
|
base: auto
|
||||||
|
# advanced
|
||||||
|
branches: null
|
||||||
|
if_no_uploads: error
|
||||||
|
if_not_found: success
|
||||||
|
if_ci_failed: error
|
||||||
|
only_pulls: false
|
||||||
|
flags: null
|
||||||
|
paths: null
|
||||||
|
|
||||||
|
parsers:
|
||||||
|
gcov:
|
||||||
|
branch_detection:
|
||||||
|
conditional: yes
|
||||||
|
loop: yes
|
||||||
|
method: no
|
||||||
|
macro: no
|
||||||
|
|
||||||
|
comment:
|
||||||
|
layout: "reach,diff,flags,files,footer"
|
||||||
|
behavior: default
|
||||||
|
require_changes: no
|
||||||
|
require_base: no
|
||||||
|
require_head: yes
|
||||||
@@ -20,9 +20,9 @@ WORKDIR /workspace/axolotl
|
|||||||
|
|
||||||
# If AXOLOTL_EXTRAS is set, append it in brackets
|
# If AXOLOTL_EXTRAS is set, append it in brackets
|
||||||
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
|
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
|
||||||
pip install --no-build-isolation -e .[deepspeed,flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
|
pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
|
||||||
else \
|
else \
|
||||||
pip install --no-build-isolation -e .[deepspeed,flash-attn,optimizers,ray] $AXOLOTL_ARGS; \
|
pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray] $AXOLOTL_ARGS; \
|
||||||
fi
|
fi
|
||||||
|
|
||||||
RUN python scripts/unsloth_install.py | sh
|
RUN python scripts/unsloth_install.py | sh
|
||||||
|
|||||||
@@ -29,7 +29,7 @@ ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
|
|||||||
WORKDIR /workspace
|
WORKDIR /workspace
|
||||||
|
|
||||||
RUN python3 -m pip install --upgrade pip && pip3 install -U packaging==23.2 setuptools==75.8.0 wheel && \
|
RUN python3 -m pip install --upgrade pip && pip3 install -U packaging==23.2 setuptools==75.8.0 wheel && \
|
||||||
python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} --extra-index-url https://download.pytorch.org/whl/cu$CUDA && \
|
python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} torchvision --extra-index-url https://download.pytorch.org/whl/cu$CUDA && \
|
||||||
python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \
|
python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \
|
||||||
python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main"
|
python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main"
|
||||||
|
|
||||||
|
|||||||
38
docker/Dockerfile-base-next
Normal file
38
docker/Dockerfile-base-next
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
ARG CUDA_VERSION="12.8.1"
|
||||||
|
ARG CUDNN_VERSION="8"
|
||||||
|
ARG UBUNTU_VERSION="22.04"
|
||||||
|
ARG MAX_JOBS=4
|
||||||
|
|
||||||
|
FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION AS base-builder
|
||||||
|
|
||||||
|
ENV PATH="/root/miniconda3/bin:${PATH}"
|
||||||
|
|
||||||
|
ARG PYTHON_VERSION="3.11"
|
||||||
|
ARG PYTORCH_VERSION="next"
|
||||||
|
ARG CUDA="128"
|
||||||
|
ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
|
||||||
|
|
||||||
|
ENV PYTHON_VERSION=$PYTHON_VERSION
|
||||||
|
ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
|
||||||
|
|
||||||
|
RUN apt-get update \
|
||||||
|
&& apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev pkg-config && rm -rf /var/lib/apt/lists/* \
|
||||||
|
&& wget \
|
||||||
|
https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
|
||||||
|
&& mkdir /root/.conda \
|
||||||
|
&& bash Miniconda3-latest-Linux-x86_64.sh -b \
|
||||||
|
&& rm -f Miniconda3-latest-Linux-x86_64.sh \
|
||||||
|
&& conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}"
|
||||||
|
|
||||||
|
ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
|
||||||
|
|
||||||
|
WORKDIR /workspace
|
||||||
|
|
||||||
|
RUN python3 -m pip install --upgrade pip && pip3 install packaging && \
|
||||||
|
python3 -m pip install --no-cache-dir -U torch==2.7.0 --extra-index-url https://download.pytorch.org/whl/test/cu$CUDA && \
|
||||||
|
python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \
|
||||||
|
python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main"
|
||||||
|
|
||||||
|
RUN git lfs install --skip-repo && \
|
||||||
|
pip3 install awscli && \
|
||||||
|
pip3 install -U --no-cache-dir pydantic==2.10.6
|
||||||
2
docs/.gitignore
vendored
2
docs/.gitignore
vendored
@@ -1,2 +1,4 @@
|
|||||||
/.quarto/
|
/.quarto/
|
||||||
_site/
|
_site/
|
||||||
|
/api/*.qmd
|
||||||
|
/api/*.html
|
||||||
|
|||||||
42
docs/cli.qmd
42
docs/cli.qmd
@@ -1,5 +1,5 @@
|
|||||||
---
|
---
|
||||||
title: "CLI Reference"
|
title: "Command Line Interface (CLI)"
|
||||||
format:
|
format:
|
||||||
html:
|
html:
|
||||||
toc: true
|
toc: true
|
||||||
@@ -170,7 +170,7 @@ axolotl merge-sharded-fsdp-weights config.yml
|
|||||||
|
|
||||||
### evaluate
|
### evaluate
|
||||||
|
|
||||||
Evaluates a model's performance using metrics specified in the config.
|
Evaluates a model's performance (loss, etc.) on the train and eval datasets.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Basic evaluation
|
# Basic evaluation
|
||||||
@@ -197,6 +197,8 @@ lm_eval_batch_size: # Batch size for evaluation
|
|||||||
output_dir: # Directory to save evaluation results
|
output_dir: # Directory to save evaluation results
|
||||||
```
|
```
|
||||||
|
|
||||||
|
See [LM Eval Harness](https://github.com/EleutherAI/lm-evaluation-harness) for more details.
|
||||||
|
|
||||||
## Legacy CLI Usage
|
## Legacy CLI Usage
|
||||||
|
|
||||||
While the new Click-based CLI is preferred, Axolotl still supports the legacy module-based CLI:
|
While the new Click-based CLI is preferred, Axolotl still supports the legacy module-based CLI:
|
||||||
@@ -235,7 +237,7 @@ Create a cloud config YAML with your Modal settings:
|
|||||||
```yaml
|
```yaml
|
||||||
# cloud_config.yml
|
# cloud_config.yml
|
||||||
provider: modal
|
provider: modal
|
||||||
gpu: a100 # Supported: l40s, a100-40gb, a100-80gb, a10g, h100, t4, l4
|
gpu: a100 # Supported: l40s, a100-40gb, a100-80gb, a10g, h100, t4, l4
|
||||||
gpu_count: 1 # Number of GPUs to use
|
gpu_count: 1 # Number of GPUs to use
|
||||||
timeout: 86400 # Maximum runtime in seconds (24 hours)
|
timeout: 86400 # Maximum runtime in seconds (24 hours)
|
||||||
branch: main # Git branch to use (optional)
|
branch: main # Git branch to use (optional)
|
||||||
@@ -248,7 +250,7 @@ volumes: # Persistent storage volumes
|
|||||||
- name: axolotl-artifacts
|
- name: axolotl-artifacts
|
||||||
mount: /workspace/artifacts
|
mount: /workspace/artifacts
|
||||||
|
|
||||||
env: # Environment variables
|
secrets: # Secrets to inject
|
||||||
- WANDB_API_KEY
|
- WANDB_API_KEY
|
||||||
- HF_TOKEN
|
- HF_TOKEN
|
||||||
```
|
```
|
||||||
@@ -274,15 +276,27 @@ axolotl lm-eval config.yml --cloud cloud_config.yml
|
|||||||
### Cloud Configuration Options
|
### Cloud Configuration Options
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
provider: # compute provider, currently only `modal` is supported
|
provider: # compute provider, currently only `modal` is supported
|
||||||
gpu: # GPU type to use
|
gpu: # GPU type to use
|
||||||
gpu_count: # Number of GPUs (default: 1)
|
gpu_count: # Number of GPUs (default: 1)
|
||||||
memory: # RAM in GB (default: 128)
|
memory: # RAM in GB (default: 128)
|
||||||
timeout: # Maximum runtime in seconds
|
timeout: # Maximum runtime in seconds
|
||||||
timeout_preprocess: # Preprocessing timeout
|
timeout_preprocess: # Preprocessing timeout
|
||||||
branch: # Git branch to use
|
branch: # Git branch to use
|
||||||
docker_tag: # Custom Docker image tag
|
docker_tag: # Custom Docker image tag
|
||||||
volumes: # List of persistent storage volumes
|
volumes: # List of persistent storage volumes
|
||||||
env: # Environment variables to pass
|
|
||||||
secrets: # Secrets to inject
|
# Environment variables to pass. Can be specified in two ways:
|
||||||
|
# 1. As a string: Will load the value from the host computer's environment variables
|
||||||
|
# 2. As a key-value pair: Will use the specified value directly
|
||||||
|
# Example:
|
||||||
|
# env:
|
||||||
|
# - CUSTOM_VAR # Loads from host's $CUSTOM_VAR
|
||||||
|
# - {CUSTOM_VAR: "value"} # Uses "value" directly
|
||||||
|
env:
|
||||||
|
|
||||||
|
# Secrets to inject. Same input format as `env` but for sensitive data.
|
||||||
|
secrets:
|
||||||
|
# - HF_TOKEN
|
||||||
|
# - WANDB_API_KEY
|
||||||
```
|
```
|
||||||
|
|||||||
128
docs/config.qmd
128
docs/config.qmd
@@ -32,6 +32,9 @@ tokenizer_legacy:
|
|||||||
resize_token_embeddings_to_32x:
|
resize_token_embeddings_to_32x:
|
||||||
# Optional[bool] Whether to shrink the embeddings to len(tokenizer). By default, we won't shrink.
|
# Optional[bool] Whether to shrink the embeddings to len(tokenizer). By default, we won't shrink.
|
||||||
shrink_embeddings:
|
shrink_embeddings:
|
||||||
|
# Whether to load the model with randomly initialized weights. Useful for
|
||||||
|
# pre-training a model from scratch or debugging purposes.
|
||||||
|
random_init_weights:
|
||||||
|
|
||||||
# (Internal use only)
|
# (Internal use only)
|
||||||
# Used to identify what the model is based on
|
# Used to identify what the model is based on
|
||||||
@@ -87,7 +90,7 @@ lora_on_cpu: true
|
|||||||
|
|
||||||
# List[str]. Add plugins to extend the pipeline.
|
# List[str]. Add plugins to extend the pipeline.
|
||||||
# See `src/axolotl/integrations` for the available plugins or doc below for more details.
|
# See `src/axolotl/integrations` for the available plugins or doc below for more details.
|
||||||
# https://axolotl-ai-cloud.github.io/axolotl/docs/custom_integrations.html
|
# https://docs.axolotl.ai/docs/custom_integrations.html
|
||||||
plugins:
|
plugins:
|
||||||
# - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
|
# - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
|
||||||
|
|
||||||
@@ -106,7 +109,7 @@ datasets:
|
|||||||
preprocess_shards: # Optional[int] process dataset in N sequential chunks for memory efficiency (exclusive with `shards`)
|
preprocess_shards: # Optional[int] process dataset in N sequential chunks for memory efficiency (exclusive with `shards`)
|
||||||
|
|
||||||
name: # Optional[str] name of dataset configuration to load
|
name: # Optional[str] name of dataset configuration to load
|
||||||
train_on_split: train # Optional[str] name of dataset split to load from
|
split: train # Optional[str] name of dataset split to load from
|
||||||
revision: # Optional[str] The specific revision of the dataset to use when loading from the Hugging Face Hub. This can be a commit hash, tag, or branch name. If not specified, the latest version will be used. This parameter is ignored for local datasets.
|
revision: # Optional[str] The specific revision of the dataset to use when loading from the Hugging Face Hub. This can be a commit hash, tag, or branch name. If not specified, the latest version will be used. This parameter is ignored for local datasets.
|
||||||
trust_remote_code: # Optional[bool] Trust remote code for untrusted source
|
trust_remote_code: # Optional[bool] Trust remote code for untrusted source
|
||||||
|
|
||||||
@@ -162,7 +165,9 @@ datasets:
|
|||||||
content: value
|
content: value
|
||||||
# ...
|
# ...
|
||||||
|
|
||||||
# Optional[Dict[str, List]]. Roles mapping in the messages. The default is:
|
# Optional[Dict[str, List]]. Roles mapping in the messages.
|
||||||
|
# The format is {target_role: [source_roles]}. All source roles will be mapped to the target role.
|
||||||
|
# The default is:
|
||||||
roles:
|
roles:
|
||||||
user: ["human", "user"]
|
user: ["human", "user"]
|
||||||
assistant: ["gpt", "assistant"]
|
assistant: ["gpt", "assistant"]
|
||||||
@@ -235,10 +240,10 @@ simpo_gamma: 0.5 # Target reward margin for the SimPO loss
|
|||||||
# grpo
|
# grpo
|
||||||
trl:
|
trl:
|
||||||
use_vllm: # Optional[bool]. Whether to use VLLM for RL training.
|
use_vllm: # Optional[bool]. Whether to use VLLM for RL training.
|
||||||
vllm_device: # Optional[str]. Device to use for VLLM.
|
vllm_server_host: # Optional[str]. Host of the vLLM server to connect to.
|
||||||
vllm_gpu_memory_utilization: # Optional[float]. GPU memory utilization for VLLM.
|
vllm_server_port: # Optional[int]. Port of the vLLM server to connect to.
|
||||||
vllm_max_model_len: # Optional[int]. Maximum length of the model for VLLM.
|
vllm_server_timeout: # Optional[int]. Total timeout (in seconds) to wait for the vLLM server to respond.
|
||||||
vllm_dtype: # Optional[str]. Data type for VLLM.
|
vllm_guided_decoding_regex: # Optional[str]. Regex for vLLM guided decoding.
|
||||||
|
|
||||||
beta: # Optional[float]. Beta parameter for the RL training. Same as `rl_beta`. Use
|
beta: # Optional[float]. Beta parameter for the RL training. Same as `rl_beta`. Use
|
||||||
max_completion_length: # Optional[int]. Maximum length of the completion for RL training.
|
max_completion_length: # Optional[int]. Maximum length of the completion for RL training.
|
||||||
@@ -317,9 +322,13 @@ total_num_tokens:
|
|||||||
sample_packing_group_size: 100000
|
sample_packing_group_size: 100000
|
||||||
# The number of samples which can be packed into one sequence. Increase if using a large sequence_len with many short samples.
|
# The number of samples which can be packed into one sequence. Increase if using a large sequence_len with many short samples.
|
||||||
sample_packing_bin_size: 200
|
sample_packing_bin_size: 200
|
||||||
|
sample_pack_sequentially: # Optional[bool]. Whether to pack samples sequentially.
|
||||||
|
|
||||||
# whether to concatenate samples during pretraining
|
# whether to concatenate samples during pretraining
|
||||||
pretraining_sample_concatenation:
|
pretraining_sample_concatenation:
|
||||||
|
|
||||||
|
curriculum_sampling: # Optional[bool]. Whether to use sequential sampling for curriculum learning
|
||||||
|
|
||||||
# Use batch flattening for speedups when not using sample_packing
|
# Use batch flattening for speedups when not using sample_packing
|
||||||
batch_flattening:
|
batch_flattening:
|
||||||
|
|
||||||
@@ -351,7 +360,27 @@ lora_target_modules:
|
|||||||
# - down_proj
|
# - down_proj
|
||||||
# - up_proj
|
# - up_proj
|
||||||
lora_target_linear: # If true, will target all linear modules
|
lora_target_linear: # If true, will target all linear modules
|
||||||
peft_layers_to_transform: # The layer indices to transform, otherwise, apply to all layers
|
|
||||||
|
# List[int] | int. The layer indices to transform; if unset, apply to all layers.
|
||||||
|
# https://huggingface.co/docs/peft/v0.15.0/en/package_reference/lora#peft.LoraConfig.layers_to_transform
|
||||||
|
peft_layers_to_transform:
|
||||||
|
|
||||||
|
# Optional[bool]. Whether to use DoRA.
|
||||||
|
# https://huggingface.co/docs/peft/v0.15.0/en/developer_guides/lora#weight-decomposed-low-rank-adaptation-dora
|
||||||
|
peft_use_dora:
|
||||||
|
|
||||||
|
# Optional[bool]. Whether to use RSLoRA.
|
||||||
|
# https://huggingface.co/docs/peft/v0.15.0/en/developer_guides/lora#rank-stabilized-lora
|
||||||
|
peft_use_rslora:
|
||||||
|
|
||||||
|
# Optional[list[tuple[int, int]]]. List of layer indices to replicate.
|
||||||
|
# https://huggingface.co/docs/peft/v0.15.0/en/developer_guides/lora#memory-efficient-layer-replication-with-lora
|
||||||
|
peft_layer_replication:
|
||||||
|
|
||||||
|
# bool | Literal["gaussian", "eva", "olora", "pissa", "pissa_niter_[number of iters]", "corda", "loftq"]
|
||||||
|
# How to initialize LoRA weights. Defaults to True, which is the original Microsoft implementation.
|
||||||
|
# https://huggingface.co/docs/peft/v0.15.0/en/developer_guides/lora#initialization
|
||||||
|
peft_init_lora_weights:
|
||||||
|
|
||||||
# If you added new tokens to the tokenizer, you may need to save some LoRA modules because they need to know the new tokens.
|
# If you added new tokens to the tokenizer, you may need to save some LoRA modules because they need to know the new tokens.
|
||||||
# For LLaMA and Mistral, you need to save `embed_tokens` and `lm_head`. It may vary for other models.
|
# For LLaMA and Mistral, you need to save `embed_tokens` and `lm_head`. It may vary for other models.
|
||||||
@@ -365,7 +394,7 @@ lora_fan_in_fan_out: false
|
|||||||
|
|
||||||
# Apply custom LoRA autograd functions and activation function Triton kernels for
|
# Apply custom LoRA autograd functions and activation function Triton kernels for
|
||||||
# speed and memory savings
|
# speed and memory savings
|
||||||
# See: https://axolotl-ai-cloud.github.io/axolotl/docs/lora_optims.html
|
# See: https://docs.axolotl.ai/docs/lora_optims.html
|
||||||
lora_mlp_kernel: true
|
lora_mlp_kernel: true
|
||||||
lora_qkv_kernel: true
|
lora_qkv_kernel: true
|
||||||
lora_o_kernel: true
|
lora_o_kernel: true
|
||||||
@@ -463,6 +492,7 @@ auto_find_batch_size: # Optional[bool]
|
|||||||
|
|
||||||
eval_table_size: # Approximate number of predictions sent to wandb depending on batch size. Enabled above 0. Default is 0
|
eval_table_size: # Approximate number of predictions sent to wandb depending on batch size. Enabled above 0. Default is 0
|
||||||
eval_max_new_tokens: # Total number of tokens generated for predictions sent to wandb. Default is 128
|
eval_max_new_tokens: # Total number of tokens generated for predictions sent to wandb. Default is 128
|
||||||
|
do_causal_lm_eval: # Whether to run causal language model evaluation for metrics in `eval_causal_lm_metrics`.
|
||||||
eval_causal_lm_metrics: # HF evaluate metrics used during evaluation. Default is ["sacrebleu", "comet", "ter", "chrf", "perplexity"]
|
eval_causal_lm_metrics: # HF evaluate metrics used during evaluation. Default is ["sacrebleu", "comet", "ter", "chrf", "perplexity"]
|
||||||
|
|
||||||
profiler_steps: # enable the pytorch profiler to capture the first N steps of training to the output_dir.
|
profiler_steps: # enable the pytorch profiler to capture the first N steps of training to the output_dir.
|
||||||
@@ -482,7 +512,8 @@ train_on_inputs: false
|
|||||||
# Note that training loss may have an oscillating pattern with this enabled.
|
# Note that training loss may have an oscillating pattern with this enabled.
|
||||||
group_by_length: false
|
group_by_length: false
|
||||||
|
|
||||||
# Whether to use gradient checkpointing https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing
|
# Whether to use gradient checkpointing. Available options are: true, false, "offload".
|
||||||
|
# https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing
|
||||||
gradient_checkpointing: false
|
gradient_checkpointing: false
|
||||||
# additional kwargs to pass to the trainer for gradient checkpointing
|
# additional kwargs to pass to the trainer for gradient checkpointing
|
||||||
# gradient_checkpointing_kwargs:
|
# gradient_checkpointing_kwargs:
|
||||||
@@ -503,36 +534,58 @@ lr_div_factor: # Learning rate div factor
|
|||||||
|
|
||||||
# Specify optimizer
|
# Specify optimizer
|
||||||
# Valid values are driven by the Transformers OptimizerNames class, see:
|
# Valid values are driven by the Transformers OptimizerNames class, see:
|
||||||
# https://github.com/huggingface/transformers/blob/95b374952dc27d8511541d6f5a4e22c9ec11fb24/src/transformers/training_args.py#L134
|
# https://github.com/huggingface/transformers/blob/cbf924b76c03828101a34069a96d209314114fd5/src/transformers/training_args.py#L144-L189
|
||||||
#
|
#
|
||||||
# Note that not all optimizers may be available in your environment, ex: 'adamw_anyprecision' is part of
|
# Note that not all optimizers may be available in your environment, ex: 'adamw_anyprecision' is part of
|
||||||
# torchdistx, 'adamw_bnb_8bit' is part of bnb.optim.Adam8bit, etc. When in doubt, it is recommended to start with the optimizer used
|
# torchdistx, 'adamw_bnb_8bit' is part of bnb.optim.Adam8bit, etc. When in doubt, it is recommended to start with the optimizer used
|
||||||
# in the examples/ for your model and fine-tuning use case.
|
# in the examples/ for your model and fine-tuning use case.
|
||||||
#
|
#
|
||||||
# Valid values for 'optimizer' include:
|
# Valid values for 'optimizer' include:
|
||||||
# - adamw_hf
|
|
||||||
# - adamw_torch
|
# - adamw_torch
|
||||||
# - adamw_torch_fused
|
# - adamw_torch_fused
|
||||||
# - adamw_torch_xla
|
# - adamw_torch_xla
|
||||||
|
# - adamw_torch_npu_fused
|
||||||
# - adamw_apex_fused
|
# - adamw_apex_fused
|
||||||
# - adopt_adamw (an EXPERIMENTAL optimizer, only for torch version >= 2.5.1)
|
# - adopt_adamw (an EXPERIMENTAL optimizer, only for torch version >= 2.5.1)
|
||||||
# - adafactor
|
# - adafactor
|
||||||
# - adamw_anyprecision
|
# - adamw_anyprecision
|
||||||
|
# - adamw_torch_4bit
|
||||||
|
# - ademamix
|
||||||
# - sgd
|
# - sgd
|
||||||
# - adagrad
|
# - adagrad
|
||||||
# - adamw_bnb_8bit
|
# - adamw_bnb_8bit
|
||||||
|
# - adamw_8bit # alias for adamw_bnb_8bit
|
||||||
|
# - ademamix_8bit
|
||||||
# - lion_8bit
|
# - lion_8bit
|
||||||
# - lion_32bit
|
# - lion_32bit
|
||||||
# - paged_adamw_32bit
|
# - paged_adamw_32bit
|
||||||
# - paged_adamw_8bit
|
# - paged_adamw_8bit
|
||||||
|
# - paged_ademamix_32bit
|
||||||
|
# - paged_ademamix_8bit
|
||||||
# - paged_lion_32bit
|
# - paged_lion_32bit
|
||||||
# - paged_lion_8bit
|
# - paged_lion_8bit
|
||||||
|
# - rmsprop
|
||||||
|
# - rmsprop_bnb
|
||||||
|
# - rmsprop_bnb_8bit
|
||||||
|
# - rmsprop_bnb_32bit
|
||||||
# - galore_adamw
|
# - galore_adamw
|
||||||
# - galore_adamw_8bit
|
# - galore_adamw_8bit
|
||||||
# - galore_adafactor
|
# - galore_adafactor
|
||||||
# - galore_adamw_layerwise
|
# - galore_adamw_layerwise
|
||||||
# - galore_adamw_8bit_layerwise
|
# - galore_adamw_8bit_layerwise
|
||||||
# - galore_adafactor_layerwise
|
# - galore_adafactor_layerwise
|
||||||
|
# - lomo
|
||||||
|
# - adalomo
|
||||||
|
# - grokadamw
|
||||||
|
# - schedule_free_adamw
|
||||||
|
# - schedule_free_sgd
|
||||||
|
# - apollo_adamw
|
||||||
|
# - apollo_adamw_layerwise
|
||||||
|
#
|
||||||
|
# Additional custom optimizers include:
|
||||||
|
# - optimi_adamw
|
||||||
|
# - ao_adamw_8bit
|
||||||
|
# - ao_adamw_fp8
|
||||||
optimizer:
|
optimizer:
|
||||||
# Dictionary of arguments to pass to the optimizer
|
# Dictionary of arguments to pass to the optimizer
|
||||||
optim_args:
|
optim_args:
|
||||||
@@ -561,29 +614,42 @@ max_grad_norm:
|
|||||||
# currently only supported on Llama and Mistral
|
# currently only supported on Llama and Mistral
|
||||||
neftune_noise_alpha:
|
neftune_noise_alpha:
|
||||||
|
|
||||||
# Whether to bettertransformers
|
# Optional[bool]. Whether to bettertransformers
|
||||||
flash_optimum:
|
flash_optimum:
|
||||||
# Whether to use xformers attention patch https://github.com/facebookresearch/xformers:
|
|
||||||
|
# Note: Only one of the following attention patches can be used at a time.
|
||||||
|
# For example, if you set `xformers_attention` to `true`, do not set `flash_attention` to `true`.
|
||||||
|
|
||||||
|
# Optional[bool]. Whether to use xformers attention patch https://github.com/facebookresearch/xformers:
|
||||||
xformers_attention:
|
xformers_attention:
|
||||||
# Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention:
|
# Optional[bool]. Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention:
|
||||||
flash_attention:
|
flash_attention:
|
||||||
flash_attn_cross_entropy: # Whether to use flash-attention cross entropy implementation - advanced use only
|
flash_attn_cross_entropy: # Optional[bool]. Whether to use flash-attention cross entropy implementation - advanced use only
|
||||||
flash_attn_rms_norm: # Whether to use flash-attention rms norm implementation - advanced use only
|
flash_attn_rms_norm: # Optional[bool]. Whether to use flash-attention rms norm implementation - advanced use only
|
||||||
flash_attn_fuse_qkv: # Whether to fuse QKV into a single operation
|
flash_attn_fuse_qkv: # Optional[bool]. Whether to fuse QKV into a single operation
|
||||||
flash_attn_fuse_mlp: # Whether to fuse part of the MLP into a single operation
|
flash_attn_fuse_mlp: # Optional[bool]. Whether to fuse part of the MLP into a single operation
|
||||||
# Whether to use scaled-dot-product attention
|
# Optional[bool]. Whether to use scaled-dot-product attention
|
||||||
# https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
|
# https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
|
||||||
sdp_attention:
|
sdp_attention:
|
||||||
# Shifted-sparse attention (only llama) - https://arxiv.org/pdf/2309.12307.pdf
|
# Optional[bool]. Shifted-sparse attention (only llama) - https://arxiv.org/pdf/2309.12307.pdf
|
||||||
s2_attention:
|
s2_attention:
|
||||||
|
|
||||||
# Optional[bool]. Whether to use low_cpu_mem_usage
|
# Optional[bool]. Whether to use low_cpu_mem_usage
|
||||||
low_cpu_mem_usage:
|
low_cpu_mem_usage:
|
||||||
# Resume from a specific checkpoint dir
|
# Optional[str]. Resume from a specific checkpoint dir
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
# If resume_from_checkpoint isn't set and you simply want it to start where it left off.
|
# Optional[bool]. If resume_from_checkpoint isn't set and you simply want it to start where it left off.
|
||||||
# Be careful with this being turned on between different models.
|
# Be careful with this being turned on between different models.
|
||||||
auto_resume_from_checkpoints: false
|
auto_resume_from_checkpoints: false
|
||||||
|
|
||||||
|
## Multimodal section
|
||||||
|
# int | tuple[int, int] | None . Size to resize images to, width x height.
|
||||||
|
# Will read from model/processor config if not set.
|
||||||
|
image_size:
|
||||||
|
# str. Algorithm to use for image resizing. "bilinear", "bicubic", "lanczos". Default is "bilinear".
|
||||||
|
image_resize_algorithm: 'bilinear'
|
||||||
|
## End of multimodal section
|
||||||
|
|
||||||
# Don't mess with this, it's here for accelerate and torchrun
|
# Don't mess with this, it's here for accelerate and torchrun
|
||||||
local_rank:
|
local_rank:
|
||||||
|
|
||||||
@@ -617,6 +683,20 @@ ddp_timeout:
|
|||||||
ddp_bucket_cap_mb:
|
ddp_bucket_cap_mb:
|
||||||
ddp_broadcast_buffers:
|
ddp_broadcast_buffers:
|
||||||
|
|
||||||
|
# Sequence parallelism
|
||||||
|
# Set to a divisor of the number of GPUs available to split sequences into chunks of equal size.
|
||||||
|
# Use in long context training to prevent OOM when sequences cannot fit into a single GPU's VRAM.
|
||||||
|
# E.g., if 4 GPUs are available, set this value to 2 to split each sequence into two equal-sized
|
||||||
|
# subsequences, or set to 4 to split into four equal-sized subsequences.
|
||||||
|
# See https://docs.axolotl.ai/docs/sequence_parallelism.html for more details.
|
||||||
|
sequence_parallel_degree:
|
||||||
|
# Optional; strides across the key dimension. Larger values use more memory but should make training faster.
|
||||||
|
# Must evenly divide the number of KV heads in your model.
|
||||||
|
heads_k_stride: 1
|
||||||
|
# One of "varlen_llama3", "batch_ring", "batch_zigzag", "batch_stripe". Defaults to "varlen_llama3"
|
||||||
|
# in the sample packing case, and "batch_ring" in the non-sample packing case.
|
||||||
|
ring_attn_func:
|
||||||
|
|
||||||
# Path to torch distx for optim 'adamw_anyprecision'
|
# Path to torch distx for optim 'adamw_anyprecision'
|
||||||
torchdistx_path:
|
torchdistx_path:
|
||||||
|
|
||||||
|
|||||||
@@ -13,6 +13,13 @@ As there are a lot of available options in Axolotl, this guide aims to provide a
|
|||||||
|
|
||||||
Axolotl supports 3 kinds of training methods: pre-training, supervised fine-tuning, and preference-based post-training (e.g. DPO, ORPO, PRMs). Each method has their own dataset format which are described below.
|
Axolotl supports 3 kinds of training methods: pre-training, supervised fine-tuning, and preference-based post-training (e.g. DPO, ORPO, PRMs). Each method has their own dataset format which are described below.
|
||||||
|
|
||||||
|
::: {.callout-tip}
|
||||||
|
|
||||||
|
This guide will mainly use JSONL as an introduction. Please refer to the [dataset loading docs](../dataset_loading.qmd) to understand how to load datasets from other sources.
|
||||||
|
|
||||||
|
For `pretraining_dataset:` specifically, please refer to the [Pre-training section](#pre-training).
|
||||||
|
:::
|
||||||
|
|
||||||
## Pre-training
|
## Pre-training
|
||||||
|
|
||||||
When aiming to train on large corpora of text datasets, pre-training is your go-to choice. Due to the size of these datasets, downloading the entire-datasets before beginning training would be prohibitively time-consuming. Axolotl supports [streaming](https://huggingface.co/docs/datasets/en/stream) to only load batches into memory at a time.
|
When aiming to train on large corpora of text datasets, pre-training is your go-to choice. Due to the size of these datasets, downloading the entire-datasets before beginning training would be prohibitively time-consuming. Axolotl supports [streaming](https://huggingface.co/docs/datasets/en/stream) to only load batches into memory at a time.
|
||||||
@@ -450,10 +457,7 @@ datasets:
|
|||||||
type: alpaca
|
type: alpaca
|
||||||
```
|
```
|
||||||
|
|
||||||
Axolotl supports many kinds of instruction dataset. All of them can be found here (https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/inst_tune.html) with their respective type and sample row format.
|
Axolotl supports many kinds of instruction dataset. All of them can be found in the [Instruction Dataset Documentation](inst_tune.qmd) with their respective type and sample row format.
|
||||||
|
|
||||||
|
|
||||||
Reference: [Instruction Dataset Documentation](inst_tune.qmd).
|
|
||||||
|
|
||||||
#### Custom Instruct Prompt Format
|
#### Custom Instruct Prompt Format
|
||||||
|
|
||||||
|
|||||||
276
docs/dataset_loading.qmd
Normal file
276
docs/dataset_loading.qmd
Normal file
@@ -0,0 +1,276 @@
|
|||||||
|
---
|
||||||
|
title: Dataset Loading
|
||||||
|
description: Understanding how to load datasets from different sources
|
||||||
|
back-to-top-navigation: true
|
||||||
|
toc: true
|
||||||
|
toc-depth: 5
|
||||||
|
---
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Datasets can be loaded in a number of different ways depending on how they are saved (the extension of the file) and where they are stored.
|
||||||
|
|
||||||
|
## Loading Datasets
|
||||||
|
|
||||||
|
We use the `datasets` library to load datasets and a mix of `load_dataset` and `load_from_disk` to load them.
|
||||||
|
|
||||||
|
You may recognize the similarly named configs shared between `load_dataset` and the `datasets` section of the config file.
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
datasets:
|
||||||
|
- path:
|
||||||
|
name:
|
||||||
|
data_files:
|
||||||
|
split:
|
||||||
|
revision:
|
||||||
|
trust_remote_code:
|
||||||
|
```
|
||||||
|
|
||||||
|
::: {.callout-tip}
|
||||||
|
|
||||||
|
Do not feel overwhelmed by the number of options here. A lot of them are optional. In fact, the most common config to use would be `path` and sometimes `data_files`.
|
||||||
|
|
||||||
|
:::
|
||||||
|
|
||||||
|
This matches the API of [`datasets.load_dataset`](https://github.com/huggingface/datasets/blob/0b5998ac62f08e358f8dcc17ec6e2f2a5e9450b6/src/datasets/load.py#L1838-L1858), so if you're familiar with that, you will feel right at home.
|
||||||
|
|
||||||
|
For HuggingFace's guide to load different dataset types, see [here](https://huggingface.co/docs/datasets/loading).
|
||||||
|
|
||||||
|
For full details on the config, see [config.qmd](config.qmd).
|
||||||
|
|
||||||
|
::: {.callout-note}
|
||||||
|
|
||||||
|
You can set multiple datasets in the config file by adding more than one entry under `datasets`.
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
datasets:
|
||||||
|
- path: /path/to/your/dataset
|
||||||
|
- path: /path/to/your/other/dataset
|
||||||
|
```
|
||||||
|
|
||||||
|
:::
|
||||||
|
|
||||||
|
### Local dataset
|
||||||
|
|
||||||
|
#### Files
|
||||||
|
|
||||||
|
Usually, to load a JSON file, you would do something like this:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from datasets import load_dataset
|
||||||
|
|
||||||
|
dataset = load_dataset("json", data_files="data.json")
|
||||||
|
```
|
||||||
|
|
||||||
|
Which translates to the following config:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
datasets:
|
||||||
|
- path: json
|
||||||
|
data_files: /path/to/your/file.jsonl
|
||||||
|
```
|
||||||
|
|
||||||
|
However, to make things easier, we have added a few shortcuts for loading local dataset files.
|
||||||
|
|
||||||
|
You can just point the `path` to the file or directory along with the `ds_type` to load the dataset. The example below shows this for a JSON file:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
datasets:
|
||||||
|
- path: /path/to/your/file.jsonl
|
||||||
|
ds_type: json
|
||||||
|
```
|
||||||
|
|
||||||
|
This works for CSV, JSON, Parquet, and Arrow files.
|
||||||
|
|
||||||
|
::: {.callout-tip}
|
||||||
|
|
||||||
|
If `path` points to a file and `ds_type` is not specified, we will automatically infer the dataset type from the file extension, so you could omit `ds_type` if you'd like.
|
||||||
|
|
||||||
|
:::
|
||||||
|
|
||||||
|
#### Directory
|
||||||
|
|
||||||
|
If you're loading a directory, you can point the `path` to the directory.
|
||||||
|
|
||||||
|
Then, you have two options:
|
||||||
|
|
||||||
|
##### Loading entire directory
|
||||||
|
|
||||||
|
You do not need any additional configs.
|
||||||
|
|
||||||
|
We will attempt to load in the following order:
|
||||||
|
- datasets saved with `datasets.save_to_disk`
|
||||||
|
- loading entire directory of files (such as with parquet/arrow files)
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
datasets:
|
||||||
|
- path: /path/to/your/directory
|
||||||
|
```
|
||||||
|
|
||||||
|
##### Loading specific files in directory
|
||||||
|
|
||||||
|
Provide `data_files` with a list of files to load.
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
datasets:
|
||||||
|
# single file
|
||||||
|
- path: /path/to/your/directory
|
||||||
|
ds_type: csv
|
||||||
|
data_files: file1.csv
|
||||||
|
|
||||||
|
# multiple files
|
||||||
|
- path: /path/to/your/directory
|
||||||
|
ds_type: json
|
||||||
|
data_files:
|
||||||
|
- file1.jsonl
|
||||||
|
- file2.jsonl
|
||||||
|
|
||||||
|
# multiple files for parquet
|
||||||
|
- path: /path/to/your/directory
|
||||||
|
ds_type: parquet
|
||||||
|
data_files:
|
||||||
|
- file1.parquet
|
||||||
|
- file2.parquet
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
### HuggingFace Hub
|
||||||
|
|
||||||
|
The method you use to load the dataset depends on how the dataset was created, whether a folder was uploaded directly or a HuggingFace Dataset was pushed.
|
||||||
|
|
||||||
|
::: {.callout-note}
|
||||||
|
|
||||||
|
If you're using a private dataset, you will need to enable the `hf_use_auth_token` flag in the root-level of the config file.
|
||||||
|
|
||||||
|
:::
|
||||||
|
|
||||||
|
#### Folder uploaded
|
||||||
|
|
||||||
|
This would mean that the dataset is a single file or file(s) uploaded to the Hub.
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
datasets:
|
||||||
|
- path: org/dataset-name
|
||||||
|
data_files:
|
||||||
|
- file1.jsonl
|
||||||
|
- file2.jsonl
|
||||||
|
```
|
||||||
|
|
||||||
|
#### HuggingFace Dataset
|
||||||
|
|
||||||
|
This means that the dataset is created as a HuggingFace Dataset and pushed to the Hub via `datasets.push_to_hub`.
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
datasets:
|
||||||
|
- path: org/dataset-name
|
||||||
|
```
|
||||||
|
|
||||||
|
::: {.callout-note}
|
||||||
|
|
||||||
|
There are some other configs which may be required like `name`, `split`, `revision`, `trust_remote_code`, etc depending on the dataset.
|
||||||
|
|
||||||
|
:::
|
||||||
|
|
||||||
|
### Remote Filesystems
|
||||||
|
|
||||||
|
Via the `storage_options` config under `load_dataset`, you can load datasets from remote filesystems like S3, GCS, Azure, and OCI.
|
||||||
|
|
||||||
|
::: {.callout-warning}
|
||||||
|
|
||||||
|
This is currently experimental. Please let us know if you run into any issues!
|
||||||
|
|
||||||
|
:::
|
||||||
|
|
||||||
|
The only difference between the providers is that you need to prepend the path with the respective protocols.
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
datasets:
|
||||||
|
# Single file
|
||||||
|
- path: s3://bucket-name/path/to/your/file.jsonl
|
||||||
|
|
||||||
|
# Directory
|
||||||
|
- path: s3://bucket-name/path/to/your/directory
|
||||||
|
```
|
||||||
|
|
||||||
|
For directory, we load via `load_from_disk`.
|
||||||
|
|
||||||
|
#### S3
|
||||||
|
|
||||||
|
Prepend the path with `s3://`.
|
||||||
|
|
||||||
|
The credentials are pulled in the following order:
|
||||||
|
|
||||||
|
- `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, and `AWS_SESSION_TOKEN` environment variables
|
||||||
|
- from the `~/.aws/credentials` file
|
||||||
|
- for nodes on EC2, the IAM metadata provider
|
||||||
|
|
||||||
|
::: {.callout-note}
|
||||||
|
|
||||||
|
We assume you have credentials set up and are not using anonymous access. If you want to use anonymous access, let us know! We may have to open a config option for this.
|
||||||
|
|
||||||
|
:::
|
||||||
|
|
||||||
|
Other environment variables that can be set can be found in [boto3 docs](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html#using-environment-variables)
|
||||||
|
|
||||||
|
#### GCS
|
||||||
|
|
||||||
|
Prepend the path with `gs://` or `gcs://`.
|
||||||
|
|
||||||
|
The credentials are loaded in the following order:
|
||||||
|
|
||||||
|
- gcloud credentials
|
||||||
|
- for nodes on GCP, the google metadata service
|
||||||
|
- anonymous access
|
||||||
|
|
||||||
|
#### Azure
|
||||||
|
|
||||||
|
##### Gen 1
|
||||||
|
|
||||||
|
Prepend the path with `adl://`.
|
||||||
|
|
||||||
|
Ensure you have the following environment variables set:
|
||||||
|
|
||||||
|
- `AZURE_STORAGE_TENANT_ID`
|
||||||
|
- `AZURE_STORAGE_CLIENT_ID`
|
||||||
|
- `AZURE_STORAGE_CLIENT_SECRET`
|
||||||
|
|
||||||
|
##### Gen 2
|
||||||
|
|
||||||
|
Prepend the path with `abfs://` or `az://`.
|
||||||
|
|
||||||
|
Ensure you have the following environment variables set:
|
||||||
|
|
||||||
|
- `AZURE_STORAGE_ACCOUNT_NAME`
|
||||||
|
- `AZURE_STORAGE_ACCOUNT_KEY`
|
||||||
|
|
||||||
|
Other environment variables that can be set can be found in [adlfs docs](https://github.com/fsspec/adlfs?tab=readme-ov-file#setting-credentials)
|
||||||
|
|
||||||
|
#### OCI
|
||||||
|
|
||||||
|
Prepend the path with `oci://`.
|
||||||
|
|
||||||
|
It will attempt to read credentials in the following order:
|
||||||
|
|
||||||
|
- `OCIFS_IAM_TYPE`, `OCIFS_CONFIG_LOCATION`, and `OCIFS_CONFIG_PROFILE` environment variables
|
||||||
|
- when on OCI resource, resource principal
|
||||||
|
|
||||||
|
Other environment variables:
|
||||||
|
|
||||||
|
- `OCI_REGION_METADATA`
|
||||||
|
|
||||||
|
Please see the [ocifs docs](https://ocifs.readthedocs.io/en/latest/getting-connected.html#Using-Environment-Variables).
|
||||||
|
|
||||||
|
### HTTPS
|
||||||
|
|
||||||
|
The path should start with `https://`.
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
datasets:
|
||||||
|
- path: https://path/to/your/dataset/file.jsonl
|
||||||
|
```
|
||||||
|
|
||||||
|
This must be publicly accessible.
|
||||||
|
|
||||||
|
## Next steps
|
||||||
|
|
||||||
|
Now that you know how to load datasets, you can learn how to convert your specific dataset format into your target output format in the [dataset formats docs](dataset-formats).
|
||||||
@@ -6,7 +6,7 @@ description: How datasets are processed
|
|||||||
## Overview
|
## Overview
|
||||||
|
|
||||||
Dataset pre-processing is the step where Axolotl takes each dataset you've configured alongside
|
Dataset pre-processing is the step where Axolotl takes each dataset you've configured alongside
|
||||||
the [dataset format](docs/dataset-formats) and prompt strategies to:
|
the [dataset format](dataset-formats) and prompt strategies to:
|
||||||
|
|
||||||
- parse the dataset based on the *dataset format*
|
- parse the dataset based on the *dataset format*
|
||||||
- transform the dataset to how you would interact with the model based on the *prompt strategy*
|
- transform the dataset to how you would interact with the model based on the *prompt strategy*
|
||||||
|
|||||||
@@ -103,8 +103,7 @@ This uses the same tags as the [`main` image](#sec-main-tags).
|
|||||||
|
|
||||||
- `JUPYTER_DISABLE`: Disable Jupyter lab.
|
- `JUPYTER_DISABLE`: Disable Jupyter lab.
|
||||||
- `JUPYTER_PASSWORD`: Set a password for the Jupyter lab.
|
- `JUPYTER_PASSWORD`: Set a password for the Jupyter lab.
|
||||||
- `PUBLIC_KEY`: Add a public key for the SSH service.
|
- `PUBLIC_KEY` / `SSH_KEY`: Add a public key for the SSH service.
|
||||||
- `SSH_KEY`: Add a private key for the SSH service.
|
|
||||||
|
|
||||||
#### Volume mounts
|
#### Volume mounts
|
||||||
|
|
||||||
|
|||||||
16
docs/faq.qmd
16
docs/faq.qmd
@@ -35,7 +35,21 @@ description: Frequently asked questions
|
|||||||
|
|
||||||
**Q: How to call Axolotl via custom python scripts?**
|
**Q: How to call Axolotl via custom python scripts?**
|
||||||
|
|
||||||
> A: Yes, since Axolotl is just Python, please see `src/axolotl/cli/main.py` on how each command is called.
|
> A: Since Axolotl is just Python, please see `src/axolotl/cli/main.py` on how each command is called.
|
||||||
|
|
||||||
|
**Q: How to know the value to use for `fsdp_transformer_layer_cls_to_wrap`?**
|
||||||
|
|
||||||
|
> A: This is the class name of the transformer layer to wrap with FSDP. For example, for `LlamaForCausalLM`, the value is `LlamaDecoderLayer`. To find this for a specific model, check the model's `PreTrainedModel` definition and look for `_no_split_modules` variable in the `modeling_<model_name>.py` file within `transformers` library.
|
||||||
|
|
||||||
|
**Q: ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as pad_token**
|
||||||
|
|
||||||
|
> A: This is because the tokenizer does not have a padding token. Please add a padding token to the tokenizer via:
|
||||||
|
|
||||||
|
> ```yaml
|
||||||
|
> special_tokens:
|
||||||
|
> # str. If you're not sure, set to same as `eos_token`.
|
||||||
|
> pad_token: "..."
|
||||||
|
> ```
|
||||||
|
|
||||||
### Chat templates
|
### Chat templates
|
||||||
|
|
||||||
|
|||||||
@@ -17,6 +17,7 @@ We currently support several common model architectures, including (but not limi
|
|||||||
- `qwen2`
|
- `qwen2`
|
||||||
- `gemma`
|
- `gemma`
|
||||||
- `gemma2`
|
- `gemma2`
|
||||||
|
- `gemma3`
|
||||||
|
|
||||||
<details>
|
<details>
|
||||||
|
|
||||||
|
|||||||
@@ -18,6 +18,7 @@ Axolotl supports several methods for multi-GPU training:
|
|||||||
|
|
||||||
- DeepSpeed (recommended)
|
- DeepSpeed (recommended)
|
||||||
- FSDP (Fully Sharded Data Parallel)
|
- FSDP (Fully Sharded Data Parallel)
|
||||||
|
- Sequence parallelism
|
||||||
- FSDP + QLoRA
|
- FSDP + QLoRA
|
||||||
|
|
||||||
## DeepSpeed {#sec-deepspeed}
|
## DeepSpeed {#sec-deepspeed}
|
||||||
@@ -35,6 +36,9 @@ deepspeed: deepspeed_configs/zero1.json
|
|||||||
### Usage {#sec-deepspeed-usage}
|
### Usage {#sec-deepspeed-usage}
|
||||||
|
|
||||||
```{.bash}
|
```{.bash}
|
||||||
|
# Fetch deepspeed configs (if not already present)
|
||||||
|
axolotl fetch deepspeed_configs
|
||||||
|
|
||||||
# Passing arg via config
|
# Passing arg via config
|
||||||
axolotl train config.yml
|
axolotl train config.yml
|
||||||
|
|
||||||
@@ -47,10 +51,20 @@ axolotl train config.yml --deepspeed deepspeed_configs/zero1.json
|
|||||||
We provide default configurations for:
|
We provide default configurations for:
|
||||||
|
|
||||||
- ZeRO Stage 1 (`zero1.json`)
|
- ZeRO Stage 1 (`zero1.json`)
|
||||||
|
- ZeRO Stage 1 with torch compile (`zero1_torch_compile.json`)
|
||||||
- ZeRO Stage 2 (`zero2.json`)
|
- ZeRO Stage 2 (`zero2.json`)
|
||||||
- ZeRO Stage 3 (`zero3.json`)
|
- ZeRO Stage 3 (`zero3.json`)
|
||||||
|
- ZeRO Stage 3 with bf16 (`zero3_bf16.json`)
|
||||||
|
- ZeRO Stage 3 with bf16 and CPU offload params (`zero3_bf16_cpuoffload_params.json`)
|
||||||
|
- ZeRO Stage 3 with bf16 and CPU offload params and optimizer (`zero3_bf16_cpuoffload_all.json`)
|
||||||
|
|
||||||
Choose based on your memory requirements and performance needs.
|
::: {.callout-tip}
|
||||||
|
|
||||||
|
Choose the configuration that offloads the least amount to memory while still being able to fit on VRAM for best performance.
|
||||||
|
|
||||||
|
Start from Stage 1 -> Stage 2 -> Stage 3.
|
||||||
|
|
||||||
|
:::
|
||||||
|
|
||||||
## FSDP {#sec-fsdp}
|
## FSDP {#sec-fsdp}
|
||||||
|
|
||||||
@@ -66,6 +80,28 @@ fsdp_config:
|
|||||||
fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
|
fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Sequence parallelism {#sec-sequence-parallelism}
|
||||||
|
|
||||||
|
We support sequence parallelism (SP) via the
|
||||||
|
[ring-flash-attention](https://github.com/zhuzilin/ring-flash-attention) project. This
|
||||||
|
allows one to split up sequences across GPUs, which is useful in the event that a
|
||||||
|
single sequence causes OOM errors during model training.
|
||||||
|
|
||||||
|
First, install `ring-flash-attn`, recommended via `pip install axolotl[ring-flash-attn]`,
|
||||||
|
or from source with `pip install .[ring-flash-attn]`.
|
||||||
|
|
||||||
|
Your Axolotl YAML config should contain the following lines:
|
||||||
|
|
||||||
|
```{.yaml}
|
||||||
|
sequence_parallel_degree: 4 # Split each sequence into 4 parts, one per GPU
|
||||||
|
flash_attention: true # Required with sequence parallelism
|
||||||
|
|
||||||
|
# Optional; strides across the key dimension. Larger values use more memory but will make training faster.
|
||||||
|
heads_k_stride: 1
|
||||||
|
```
|
||||||
|
|
||||||
|
See our [dedicated guide](sequence_parallelism.qmd) for more details.
|
||||||
|
|
||||||
### FSDP + QLoRA {#sec-fsdp-qlora}
|
### FSDP + QLoRA {#sec-fsdp-qlora}
|
||||||
|
|
||||||
For combining FSDP with QLoRA, see our [dedicated guide](fsdp_qlora.qmd).
|
For combining FSDP with QLoRA, see our [dedicated guide](fsdp_qlora.qmd).
|
||||||
|
|||||||
@@ -1,28 +1,180 @@
|
|||||||
# MultiModal / Vision Language Models (BETA)
|
---
|
||||||
|
title: MultiModal / Vision Language Models (BETA)
|
||||||
|
format:
|
||||||
|
html:
|
||||||
|
toc: true
|
||||||
|
toc-depth: 3
|
||||||
|
---
|
||||||
|
|
||||||
### Supported Models
|
## Supported Models
|
||||||
|
|
||||||
- Mllama, i.e. llama with vision models
|
- [Mllama](#sec-mllama)
|
||||||
|
- [Llama4](#sec-llama4)
|
||||||
|
- [Pixtral](#sec-pixtral)
|
||||||
|
- [Llava-1.5](#sec-llava-15)
|
||||||
|
- [Mistral-Small-3.1](#sec-mistral-small-31)
|
||||||
|
- [Gemma-3](#sec-gemma-3)
|
||||||
|
- [Qwen2-VL](#sec-qwen2-vl)
|
||||||
|
- [Qwen2.5-VL](#sec-qwen25-vl)
|
||||||
|
|
||||||
### Usage
|
## Usage
|
||||||
|
|
||||||
Currently multimodal support is limited and doesn't have full feature parity. To finetune a multimodal Llama w/ LoRA,
|
Multimodal support is limited and doesn't have full feature parity.
|
||||||
you'll need to use the following in YAML in combination with the rest of the required hyperparams.
|
|
||||||
|
Here are the hyperparams you'll need to use to finetune a multimodal model.
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
base_model: alpindale/Llama-3.2-11B-Vision-Instruct
|
|
||||||
processor_type: AutoProcessor
|
processor_type: AutoProcessor
|
||||||
skip_prepare_dataset: true
|
|
||||||
|
|
||||||
chat_template: llama3_2_vision
|
skip_prepare_dataset: true
|
||||||
|
remove_unused_columns: false # leave columns in place as they are needed to handle image embeddings during training
|
||||||
|
sample_packing: false # not yet supported with multimodal
|
||||||
|
|
||||||
|
chat_template: # see in next section
|
||||||
|
|
||||||
|
# example dataset
|
||||||
datasets:
|
datasets:
|
||||||
- path: HuggingFaceH4/llava-instruct-mix-vsft
|
- path: HuggingFaceH4/llava-instruct-mix-vsft
|
||||||
type: chat_template
|
type: chat_template
|
||||||
split: train[:1%]
|
split: train[:1%]
|
||||||
field_messages: messages
|
field_messages: messages
|
||||||
remove_unused_columns: false
|
|
||||||
sample_packing: false
|
|
||||||
|
|
||||||
# only finetune the Language model, leave the vision model and vision tower frozen
|
# (optional) if doing lora, only finetune the Language model,
|
||||||
|
# leave the vision model and vision tower frozen
|
||||||
|
# load_in_8bit: true
|
||||||
|
adapter: lora
|
||||||
lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
|
lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
|
||||||
|
|
||||||
|
# (optional) if you want to resize images to a set size
|
||||||
|
image_size: 512
|
||||||
|
image_resize_algorithm: bilinear
|
||||||
|
```
|
||||||
|
|
||||||
|
Please see the [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples) folder for full configs.
|
||||||
|
|
||||||
|
::: {.callout-warning}
|
||||||
|
Some of our chat_templates have been extended to support broader dataset types. This should not break any existing configs.
|
||||||
|
:::
|
||||||
|
|
||||||
|
### Mllama {#sec-mllama}
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
base_model: meta-llama/Llama-3.2-11B-Vision-Instruct
|
||||||
|
|
||||||
|
chat_template: llama3_2_vision
|
||||||
|
```
|
||||||
|
|
||||||
|
### Llama4 {#sec-llama4}
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
base_model: meta-llama/Llama-4-Scout-17B-16E-Instruct
|
||||||
|
|
||||||
|
chat_template: llama4
|
||||||
|
```
|
||||||
|
|
||||||
|
### Pixtral {#sec-pixtral}
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
base_model: mistralai/Pixtral-12B-2409
|
||||||
|
|
||||||
|
chat_template: pixtral
|
||||||
|
```
|
||||||
|
|
||||||
|
### Llava-1.5 {#sec-llava-15}
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
base_model: llava-hf/llava-1.5-7b-hf
|
||||||
|
|
||||||
|
chat_template: llava
|
||||||
|
```
|
||||||
|
|
||||||
|
### Mistral-Small-3.1 {#sec-mistral-small-31}
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
base_model: mistralai/Mistral-Small-3.1-24B-Instruct-2503
|
||||||
|
|
||||||
|
chat_template: mistral_v7_tekken
|
||||||
|
```
|
||||||
|
|
||||||
|
### Gemma-3 {#sec-gemma-3}
|
||||||
|
|
||||||
|
::: {.callout-tip}
|
||||||
|
The Gemma3-1B model is a text-only model, so please train it as a regular text model.
|
||||||
|
:::
|
||||||
|
|
||||||
|
For multi-modal 4B/12B/27B models, use the following config:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
base_model: google/gemma-3-4b-it
|
||||||
|
|
||||||
|
chat_template: gemma3
|
||||||
|
```
|
||||||
|
|
||||||
|
### Qwen2-VL {#sec-qwen2-vl}
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
base_model: Qwen/Qwen2-VL-7B-Instruct
|
||||||
|
|
||||||
|
chat_template: qwen2_vl
|
||||||
|
```
|
||||||
|
|
||||||
|
### Qwen2.5-VL {#sec-qwen25-vl}
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
base_model: Qwen/Qwen2.5-VL-7B-Instruct
|
||||||
|
|
||||||
|
chat_template: qwen2_vl # same as qwen2-vl
|
||||||
|
```
|
||||||
|
|
||||||
|
## Dataset Format
|
||||||
|
|
||||||
|
For multi-modal datasets, we adopt an extended `chat_template` format similar to OpenAI's Message format.
|
||||||
|
|
||||||
|
- A message is a list of `role` and `content`.
|
||||||
|
- `role` can be `system`, `user`, `assistant`, etc.
|
||||||
|
- `content` is a list of `type` and (`text` or `image` or `path` or `url` or `base64`).
|
||||||
|
|
||||||
|
::: {.callout-note}
|
||||||
|
For backwards compatibility:
|
||||||
|
|
||||||
|
- If the dataset has an `images` or `image` column of `list[Image]`, it will be appended to the first `content` list as `{"type": "image", "image": ...}`. However, if the content already has a `{"type": "image"}` entry without an `image` key, that entry's `image` key will be set instead.
|
||||||
|
- If `content` is a string, it will be converted to a list with `type` as `text`.
|
||||||
|
:::
|
||||||
|
|
||||||
|
::: {.callout-tip}
|
||||||
|
For image loading, you can use the following keys within `content` alongside `"type": "image"`:
|
||||||
|
|
||||||
|
- `"path": "/path/to/image.jpg"`
|
||||||
|
- `"url": "https://example.com/image.jpg"`
|
||||||
|
- `"base64": "..."`
|
||||||
|
- `"image": PIL.Image`
|
||||||
|
:::
|
||||||
|
|
||||||
|
Here is an example of a multi-modal dataset:
|
||||||
|
```json
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"messages": [
|
||||||
|
{
|
||||||
|
"role": "system",
|
||||||
|
"content": [
|
||||||
|
{"type": "text", "text": "You are a helpful assistant."}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": [
|
||||||
|
{"type": "image", "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"},
|
||||||
|
{"type": "text", "text": "Describe this image in detail."}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"role": "assistant",
|
||||||
|
"content": [
|
||||||
|
{"type": "text", "text": "The image is a bee."}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
```
|
```
|
||||||
|
|||||||
@@ -502,9 +502,48 @@ The input format is a simple JSON input with customizable fields based on the ab
|
|||||||
Check out our [GRPO cookbook](https://github.com/axolotl-ai-cloud/axolotl-cookbook/tree/main/grpo#training-an-r1-style-large-language-model-using-grpo).
|
Check out our [GRPO cookbook](https://github.com/axolotl-ai-cloud/axolotl-cookbook/tree/main/grpo#training-an-r1-style-large-language-model-using-grpo).
|
||||||
:::
|
:::
|
||||||
|
|
||||||
|
If you have multiple GPUs available, we recommend using `vLLM` with the `GRPOTrainer` to significantly speed up trajectory generation during training.
|
||||||
|
First, launch a `vLLM` server using `axolotl vllm-serve` - you may use a config file or CLI overrides to configure your vLLM server. In this example, we're
|
||||||
|
using 4 GPUs - 2 for training, and 2 for vLLM:
|
||||||
|
|
||||||
|
::: {.callout-important}
|
||||||
|
Make sure you've installed the correct version of vLLM by including it as an extra when installing axolotl, e.g. `pip install axolotl[vllm]`.
|
||||||
|
:::
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
base_model: Qwen/Qwen2.5-1.5B-Instruct
|
||||||
|
|
||||||
|
vllm:
|
||||||
|
host: 0.0.0.0
|
||||||
|
port: 8000
|
||||||
|
tensor_parallel_size: 2
|
||||||
|
gpu_memory_utilization: 0.85
|
||||||
|
dtype: auto
|
||||||
|
# max_model_len: # you may find it useful to set the vLLM model context length if you know this beforehand
|
||||||
|
|
||||||
|
rl: grpo
|
||||||
|
trl:
|
||||||
|
use_vllm: true
|
||||||
|
vllm_server_host: 0.0.0.0
|
||||||
|
vllm_server_port: 8000
|
||||||
|
vllm_server_timeout: 300
|
||||||
|
```
|
||||||
|
|
||||||
|
```bash
|
||||||
|
CUDA_VISIBLE_DEVICES=2,3 axolotl vllm-serve grpo.yaml
|
||||||
|
```
|
||||||
|
|
||||||
|
Your `vLLM` instance will now attempt to spin up, and it's time to kick off training utilizing our remaining two GPUs. In another terminal, execute:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
CUDA_VISIBLE_DEVICES=0,1 axolotl train grpo.yaml --num-processes 2
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Reward functions
|
||||||
|
|
||||||
GRPO uses custom reward functions and transformations. Please have them ready locally.
|
GRPO uses custom reward functions and transformations. Please have them ready locally.
|
||||||
|
|
||||||
For ex, to load OpenAI's GSM8K and use a random reward for completions:
|
For example, to load OpenAI's GSM8K and use a random reward for completions:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
# rewards.py
|
# rewards.py
|
||||||
@@ -530,8 +569,6 @@ trl:
|
|||||||
beta: 0.001
|
beta: 0.001
|
||||||
max_completion_length: 256
|
max_completion_length: 256
|
||||||
use_vllm: True
|
use_vllm: True
|
||||||
vllm_device: auto
|
|
||||||
vllm_gpu_memory_utilization: 0.15
|
|
||||||
num_generations: 4
|
num_generations: 4
|
||||||
reward_funcs: ["rewards.rand_reward_func"] # format: '{file_name}.{fn_name}'
|
reward_funcs: ["rewards.rand_reward_func"] # format: '{file_name}.{fn_name}'
|
||||||
reward_weights: [1.0]
|
reward_weights: [1.0]
|
||||||
|
|||||||
100
docs/sequence_parallelism.qmd
Normal file
100
docs/sequence_parallelism.qmd
Normal file
@@ -0,0 +1,100 @@
|
|||||||
|
---
|
||||||
|
title: Sequence Parallelism
|
||||||
|
description: Train with long sequences split across multiple GPUs.
|
||||||
|
---
|
||||||
|
|
||||||
|
# Sequence Parallelism
|
||||||
|
|
||||||
|
Sequence parallelism is a technique that splits sequences across multiple GPUs,
|
||||||
|
allowing you to train with very long sequences that wouldn't fit on a single GPU. Each
|
||||||
|
GPU processes a different portion of the sequence, and the results are aggregated
|
||||||
|
through a ring communication pattern.
|
||||||
|
|
||||||
|
## When to Use Sequence Parallelism
|
||||||
|
|
||||||
|
Use sequence parallelism when:
|
||||||
|
|
||||||
|
- You need to train with sequence lengths that don't fit into a single GPU's memory
|
||||||
|
- You have multiple GPUs available
|
||||||
|
- You're experiencing OOM (Out Of Memory) errors with long sequences
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
To enable sequence parallelism, add the following to your configuration file:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# Set to a divisor (> 1) of the number of GPUs available
|
||||||
|
sequence_parallel_degree: 4 # Split sequences across 4 GPUs
|
||||||
|
# Optional; strides across the key dimension. Larger values use more memory but should make training faster.
|
||||||
|
heads_k_stride: 1
|
||||||
|
# Optional; one of "varlen_llama3", "batch_ring", "batch_zigzag", "batch_stripe". Defaults to
|
||||||
|
# "varlen_llama3" when `sample_packing: true`, and "batch_ring" otherwise.
|
||||||
|
ring_attn_func:
|
||||||
|
```
|
||||||
|
|
||||||
|
The `sequence_parallel_degree` should be a divisor of the total number of GPUs. For example:
|
||||||
|
|
||||||
|
- With 8 GPUs, valid values would be 2, 4, or 8
|
||||||
|
- With 4 GPUs, valid values would be 2 or 4
|
||||||
|
|
||||||
|
## Implementation Details
|
||||||
|
|
||||||
|
When sequence parallelism is enabled:
|
||||||
|
|
||||||
|
1. Each sequence is divided into equal chunks across the GPUs in a sequence parallel group
|
||||||
|
2. The data collator handles the chunking of input_ids, attention_mask, labels, and position_ids
|
||||||
|
3. Position IDs are adjusted to maintain proper relative positions, especially for packed sequences
|
||||||
|
4. The trainer uses special ring communication patterns for attention operations
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
To use sequence parallelism, you need:
|
||||||
|
|
||||||
|
- Multiple GPUs (at least 2)
|
||||||
|
- The `ring-flash-attn` package. Install with:
|
||||||
|
- `pip install axolotl[ring-flash-attn]` (preferred)
|
||||||
|
- `pip install "ring-flash-attn>=0.1.4"`
|
||||||
|
|
||||||
|
## Limitations
|
||||||
|
|
||||||
|
- Flash attention must be enabled for this to work (`flash_attention: true` in config YAML)
|
||||||
|
- May have a small performance overhead due to communication between GPUs
|
||||||
|
|
||||||
|
## Example
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
base_model: meta-llama/Llama-3-8B-Instruct
|
||||||
|
sequence_len: 8192
|
||||||
|
|
||||||
|
...
|
||||||
|
|
||||||
|
sequence_parallel_degree: 4 # Split each sequence into 4 parts, one per GPU
|
||||||
|
flash_attention: true # Required with sequence parallelism
|
||||||
|
# Optional; strides across the key dimension. Larger values use more memory but should make training faster.
|
||||||
|
heads_k_stride: 1
|
||||||
|
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
||||||
|
This will train the Llama 3 8B model with 8K context length, with each sequence split
|
||||||
|
into 4 subsequences of length 2048 across 4 GPUs.
|
||||||
|
|
||||||
|
## Sample Packing with Sequence Parallelism
|
||||||
|
|
||||||
|
Sequence parallelism is compatible with Axolotl's sample packing functionality. When using both features together:
|
||||||
|
|
||||||
|
1. Samples are first packed together
|
||||||
|
2. The packed sequences are then divided across GPUs in the sequence parallel group
|
||||||
|
3. Position IDs are automatically adjusted to maintain proper relative positions
|
||||||
|
|
||||||
|
## Effect on Batch Size
|
||||||
|
|
||||||
|
When using sequence parallelism, your effective global batch size is **divided** by the `sequence_parallel_degree`. This happens because:
|
||||||
|
|
||||||
|
- Each group of `sequence_parallel_degree` GPUs works on the same batch (just different parts of each sequence)
|
||||||
|
- The number of batches processed per step decreases
|
||||||
|
|
||||||
|
For example:
|
||||||
|
- With 8 GPUs and no sequence parallelism: 8 different batches processed per step
|
||||||
|
- With 8 GPUs and `sequence_parallel_degree=4`: Only 2 different batches processed per step (each split across 4 GPUs)
|
||||||
|
- If your per-GPU `micro_batch_size` is 2, the global batch size decreases from 16 to 4
|
||||||
@@ -8,10 +8,6 @@ tokenizer_type: GPT2Tokenizer
|
|||||||
trust_remote_code: true
|
trust_remote_code: true
|
||||||
tokenizer_use_fast: true
|
tokenizer_use_fast: true
|
||||||
tokenizer_legacy: true
|
tokenizer_legacy: true
|
||||||
|
|
||||||
load_in_8bit: false
|
|
||||||
load_in_4bit: false
|
|
||||||
strict: false
|
|
||||||
push_dataset_to_hub:
|
push_dataset_to_hub:
|
||||||
hf_use_auth_token: true
|
hf_use_auth_token: true
|
||||||
datasets:
|
datasets:
|
||||||
@@ -34,7 +30,6 @@ lora_alpha:
|
|||||||
lora_dropout:
|
lora_dropout:
|
||||||
lora_target_modules:
|
lora_target_modules:
|
||||||
lora_target_linear:
|
lora_target_linear:
|
||||||
lora_fan_in_fan_out:
|
|
||||||
|
|
||||||
wandb_project:
|
wandb_project:
|
||||||
wandb_entity:
|
wandb_entity:
|
||||||
@@ -58,16 +53,12 @@ learning_rate: 0.000085
|
|||||||
train_on_inputs: true
|
train_on_inputs: true
|
||||||
group_by_length: false
|
group_by_length: false
|
||||||
bf16: auto
|
bf16: auto
|
||||||
fp16:
|
|
||||||
tf32: true
|
tf32: true
|
||||||
|
|
||||||
gradient_checkpointing: false
|
gradient_checkpointing: false
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
local_rank:
|
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
|
|
||||||
xformers_attention:
|
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
sdp_attention:
|
sdp_attention:
|
||||||
flash_optimum:
|
flash_optimum:
|
||||||
@@ -80,8 +71,6 @@ evals_per_epoch: 4
|
|||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
save_total_limit:
|
save_total_limit:
|
||||||
|
|
||||||
debug:
|
|
||||||
deepspeed:
|
|
||||||
weight_decay: 0.1
|
weight_decay: 0.1
|
||||||
special_tokens:
|
special_tokens:
|
||||||
pad_token: "<|endoftext|>"
|
pad_token: "<|endoftext|>"
|
||||||
|
|||||||
@@ -4,7 +4,6 @@ base_model: cerebras/Cerebras-GPT-1.3B
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
strict: false
|
|
||||||
push_dataset_to_hub:
|
push_dataset_to_hub:
|
||||||
datasets:
|
datasets:
|
||||||
- path: teknium/GPT4-LLM-Cleaned
|
- path: teknium/GPT4-LLM-Cleaned
|
||||||
@@ -22,7 +21,6 @@ lora_target_modules:
|
|||||||
- c_attn
|
- c_attn
|
||||||
- c_proj
|
- c_proj
|
||||||
lora_target_linear:
|
lora_target_linear:
|
||||||
lora_fan_in_fan_out:
|
|
||||||
wandb_project:
|
wandb_project:
|
||||||
wandb_entity:
|
wandb_entity:
|
||||||
wandb_watch:
|
wandb_watch:
|
||||||
@@ -36,15 +34,10 @@ optimizer: paged_adamw_8bit
|
|||||||
torchdistx_path:
|
torchdistx_path:
|
||||||
lr_scheduler: cosine
|
lr_scheduler: cosine
|
||||||
learning_rate: 0.0002
|
learning_rate: 0.0002
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: auto
|
bf16: auto
|
||||||
fp16:
|
|
||||||
tf32: true
|
tf32: true
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
local_rank:
|
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention: true
|
xformers_attention: true
|
||||||
flash_attention:
|
flash_attention:
|
||||||
@@ -53,10 +46,6 @@ gptq_model_v1:
|
|||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
debug:
|
|
||||||
deepspeed:
|
|
||||||
weight_decay: 0.1
|
weight_decay: 0.1
|
||||||
fsdp:
|
|
||||||
fsdp_config:
|
|
||||||
special_tokens:
|
special_tokens:
|
||||||
pad_token: "<|endoftext|>"
|
pad_token: "<|endoftext|>"
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ tokenizer_type: CodeLlamaTokenizer
|
|||||||
|
|
||||||
load_in_8bit: true
|
load_in_8bit: true
|
||||||
load_in_4bit: false
|
load_in_4bit: false
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
@@ -26,7 +25,6 @@ lora_r: 32
|
|||||||
lora_alpha: 16
|
lora_alpha: 16
|
||||||
lora_dropout: 0.05
|
lora_dropout: 0.05
|
||||||
lora_target_linear: true
|
lora_target_linear: true
|
||||||
lora_fan_in_fan_out:
|
|
||||||
|
|
||||||
wandb_project:
|
wandb_project:
|
||||||
wandb_entity:
|
wandb_entity:
|
||||||
@@ -41,29 +39,18 @@ optimizer: adamw_bnb_8bit
|
|||||||
lr_scheduler: cosine
|
lr_scheduler: cosine
|
||||||
learning_rate: 0.0002
|
learning_rate: 0.0002
|
||||||
|
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: auto
|
bf16: auto
|
||||||
fp16:
|
|
||||||
tf32: false
|
tf32: false
|
||||||
|
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
local_rank:
|
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention:
|
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
s2_attention:
|
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
debug:
|
|
||||||
deepspeed:
|
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
fsdp:
|
|
||||||
fsdp_config:
|
|
||||||
special_tokens:
|
special_tokens:
|
||||||
bos_token: "<s>"
|
bos_token: "<s>"
|
||||||
eos_token: "</s>"
|
eos_token: "</s>"
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ tokenizer_type: CodeLlamaTokenizer
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
@@ -26,9 +25,7 @@ pad_to_sequence_len: true
|
|||||||
lora_r: 32
|
lora_r: 32
|
||||||
lora_alpha: 16
|
lora_alpha: 16
|
||||||
lora_dropout: 0.05
|
lora_dropout: 0.05
|
||||||
lora_target_modules:
|
|
||||||
lora_target_linear: true
|
lora_target_linear: true
|
||||||
lora_fan_in_fan_out:
|
|
||||||
|
|
||||||
wandb_project:
|
wandb_project:
|
||||||
wandb_entity:
|
wandb_entity:
|
||||||
@@ -43,28 +40,18 @@ optimizer: paged_adamw_32bit
|
|||||||
lr_scheduler: cosine
|
lr_scheduler: cosine
|
||||||
learning_rate: 0.0002
|
learning_rate: 0.0002
|
||||||
|
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: auto
|
bf16: auto
|
||||||
fp16:
|
|
||||||
tf32: false
|
tf32: false
|
||||||
|
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
local_rank:
|
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention:
|
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
debug:
|
|
||||||
deepspeed:
|
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
fsdp:
|
|
||||||
fsdp_config:
|
|
||||||
special_tokens:
|
special_tokens:
|
||||||
bos_token: "<s>"
|
bos_token: "<s>"
|
||||||
eos_token: "</s>"
|
eos_token: "</s>"
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ tokenizer_type: CodeLlamaTokenizer
|
|||||||
|
|
||||||
load_in_8bit: true
|
load_in_8bit: true
|
||||||
load_in_4bit: false
|
load_in_4bit: false
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
@@ -26,7 +25,6 @@ lora_r: 32
|
|||||||
lora_alpha: 16
|
lora_alpha: 16
|
||||||
lora_dropout: 0.05
|
lora_dropout: 0.05
|
||||||
lora_target_linear: true
|
lora_target_linear: true
|
||||||
lora_fan_in_fan_out:
|
|
||||||
|
|
||||||
wandb_project:
|
wandb_project:
|
||||||
wandb_entity:
|
wandb_entity:
|
||||||
@@ -41,29 +39,18 @@ optimizer: adamw_bnb_8bit
|
|||||||
lr_scheduler: cosine
|
lr_scheduler: cosine
|
||||||
learning_rate: 0.0002
|
learning_rate: 0.0002
|
||||||
|
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: auto
|
bf16: auto
|
||||||
fp16:
|
|
||||||
tf32: false
|
tf32: false
|
||||||
|
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
local_rank:
|
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention:
|
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
s2_attention:
|
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
debug:
|
|
||||||
deepspeed:
|
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
fsdp:
|
|
||||||
fsdp_config:
|
|
||||||
special_tokens:
|
special_tokens:
|
||||||
bos_token: "<s>"
|
bos_token: "<s>"
|
||||||
eos_token: "</s>"
|
eos_token: "</s>"
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ tokenizer_type: CodeLlamaTokenizer
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
@@ -26,9 +25,7 @@ pad_to_sequence_len: true
|
|||||||
lora_r: 32
|
lora_r: 32
|
||||||
lora_alpha: 16
|
lora_alpha: 16
|
||||||
lora_dropout: 0.05
|
lora_dropout: 0.05
|
||||||
lora_target_modules:
|
|
||||||
lora_target_linear: true
|
lora_target_linear: true
|
||||||
lora_fan_in_fan_out:
|
|
||||||
|
|
||||||
wandb_project:
|
wandb_project:
|
||||||
wandb_entity:
|
wandb_entity:
|
||||||
@@ -43,28 +40,18 @@ optimizer: paged_adamw_32bit
|
|||||||
lr_scheduler: cosine
|
lr_scheduler: cosine
|
||||||
learning_rate: 0.0002
|
learning_rate: 0.0002
|
||||||
|
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: auto
|
bf16: auto
|
||||||
fp16:
|
|
||||||
tf32: false
|
tf32: false
|
||||||
|
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
local_rank:
|
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention:
|
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
debug:
|
|
||||||
deepspeed:
|
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
fsdp:
|
|
||||||
fsdp_config:
|
|
||||||
special_tokens:
|
special_tokens:
|
||||||
bos_token: "<s>"
|
bos_token: "<s>"
|
||||||
eos_token: "</s>"
|
eos_token: "</s>"
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ tokenizer_type: CodeLlamaTokenizer
|
|||||||
|
|
||||||
load_in_8bit: true
|
load_in_8bit: true
|
||||||
load_in_4bit: false
|
load_in_4bit: false
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
@@ -26,7 +25,6 @@ lora_r: 32
|
|||||||
lora_alpha: 16
|
lora_alpha: 16
|
||||||
lora_dropout: 0.05
|
lora_dropout: 0.05
|
||||||
lora_target_linear: true
|
lora_target_linear: true
|
||||||
lora_fan_in_fan_out:
|
|
||||||
|
|
||||||
wandb_project:
|
wandb_project:
|
||||||
wandb_entity:
|
wandb_entity:
|
||||||
@@ -41,29 +39,18 @@ optimizer: adamw_bnb_8bit
|
|||||||
lr_scheduler: cosine
|
lr_scheduler: cosine
|
||||||
learning_rate: 0.0002
|
learning_rate: 0.0002
|
||||||
|
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: auto
|
bf16: auto
|
||||||
fp16:
|
|
||||||
tf32: false
|
tf32: false
|
||||||
|
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
local_rank:
|
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention:
|
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
s2_attention:
|
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
debug:
|
|
||||||
deepspeed:
|
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
fsdp:
|
|
||||||
fsdp_config:
|
|
||||||
special_tokens:
|
special_tokens:
|
||||||
bos_token: "<s>"
|
bos_token: "<s>"
|
||||||
eos_token: "</s>"
|
eos_token: "</s>"
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ tokenizer_type: CodeLlamaTokenizer
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
@@ -26,9 +25,7 @@ pad_to_sequence_len: true
|
|||||||
lora_r: 32
|
lora_r: 32
|
||||||
lora_alpha: 16
|
lora_alpha: 16
|
||||||
lora_dropout: 0.05
|
lora_dropout: 0.05
|
||||||
lora_target_modules:
|
|
||||||
lora_target_linear: true
|
lora_target_linear: true
|
||||||
lora_fan_in_fan_out:
|
|
||||||
|
|
||||||
wandb_project:
|
wandb_project:
|
||||||
wandb_entity:
|
wandb_entity:
|
||||||
@@ -43,28 +40,18 @@ optimizer: paged_adamw_32bit
|
|||||||
lr_scheduler: cosine
|
lr_scheduler: cosine
|
||||||
learning_rate: 0.0002
|
learning_rate: 0.0002
|
||||||
|
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: auto
|
bf16: auto
|
||||||
fp16:
|
|
||||||
tf32: false
|
tf32: false
|
||||||
|
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
local_rank:
|
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention:
|
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
debug:
|
|
||||||
deepspeed:
|
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
fsdp:
|
|
||||||
fsdp_config:
|
|
||||||
special_tokens:
|
special_tokens:
|
||||||
bos_token: "<s>"
|
bos_token: "<s>"
|
||||||
eos_token: "</s>"
|
eos_token: "</s>"
|
||||||
|
|||||||
58
examples/cohere/command-r-7b-qlora.yml
Normal file
58
examples/cohere/command-r-7b-qlora.yml
Normal file
@@ -0,0 +1,58 @@
|
|||||||
|
base_model: CohereForAI/c4ai-command-r7b-12-2024
|
||||||
|
model_type: AutoModelForCausalLM
|
||||||
|
tokenizer_type: AutoTokenizer
|
||||||
|
|
||||||
|
load_in_8bit: false
|
||||||
|
load_in_4bit: true
|
||||||
|
|
||||||
|
# huggingface repo
|
||||||
|
chat_template: cohere
|
||||||
|
datasets:
|
||||||
|
- path: cgato/SlimOrcaDedupCleaned
|
||||||
|
type: chat_template
|
||||||
|
field_messages: conversations
|
||||||
|
message_property_mappings:
|
||||||
|
role: from
|
||||||
|
content: value
|
||||||
|
|
||||||
|
val_set_size: 0.0
|
||||||
|
output_dir: ./outputs/out
|
||||||
|
|
||||||
|
adapter: qlora
|
||||||
|
lora_r: 32
|
||||||
|
lora_alpha: 16
|
||||||
|
lora_dropout: 0.05
|
||||||
|
lora_target_linear: true
|
||||||
|
|
||||||
|
sequence_len: 2048
|
||||||
|
sample_packing: true
|
||||||
|
eval_sample_packing: false
|
||||||
|
pad_to_sequence_len: true
|
||||||
|
|
||||||
|
wandb_project:
|
||||||
|
wandb_entity:
|
||||||
|
wandb_watch:
|
||||||
|
wandb_name:
|
||||||
|
wandb_log_model:
|
||||||
|
|
||||||
|
|
||||||
|
gradient_accumulation_steps: 4
|
||||||
|
micro_batch_size: 1
|
||||||
|
num_epochs: 4
|
||||||
|
optimizer: adamw_bnb_8bit
|
||||||
|
lr_scheduler: cosine
|
||||||
|
learning_rate: 0.0002
|
||||||
|
|
||||||
|
bf16: auto
|
||||||
|
tf32: true
|
||||||
|
|
||||||
|
gradient_checkpointing: true
|
||||||
|
resume_from_checkpoint:
|
||||||
|
logging_steps: 1
|
||||||
|
flash_attention: true
|
||||||
|
|
||||||
|
warmup_ratio: 0.1
|
||||||
|
evals_per_epoch:
|
||||||
|
saves_per_epoch: 1
|
||||||
|
weight_decay: 0.0
|
||||||
|
special_tokens:
|
||||||
@@ -4,10 +4,6 @@ base_model: LnL-AI/dbrx-base-converted-v2
|
|||||||
|
|
||||||
trust_remote_code: true
|
trust_remote_code: true
|
||||||
|
|
||||||
load_in_8bit: false
|
|
||||||
load_in_4bit: false
|
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: tatsu-lab/alpaca
|
- path: tatsu-lab/alpaca
|
||||||
type: alpaca
|
type: alpaca
|
||||||
@@ -48,26 +44,20 @@ optimizer: paged_adamw_8bit
|
|||||||
lr_scheduler: cosine
|
lr_scheduler: cosine
|
||||||
learning_rate: 0.0002
|
learning_rate: 0.0002
|
||||||
|
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: auto
|
bf16: auto
|
||||||
fp16:
|
|
||||||
tf32: false
|
tf32: false
|
||||||
|
|
||||||
gradient_checkpointing: false # don't use with fsdp_activation_checkpointing
|
gradient_checkpointing: false # don't use with fsdp_activation_checkpointing
|
||||||
gradient_checkpointing_kwargs:
|
gradient_checkpointing_kwargs:
|
||||||
use_reentrant: false
|
use_reentrant: false
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
local_rank:
|
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention:
|
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch:
|
evals_per_epoch:
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
debug:
|
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
fsdp:
|
fsdp:
|
||||||
- full_shard
|
- full_shard
|
||||||
|
|||||||
@@ -6,7 +6,6 @@ trust_remote_code: true
|
|||||||
|
|
||||||
load_in_8bit: true
|
load_in_8bit: true
|
||||||
load_in_4bit: false
|
load_in_4bit: false
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: tatsu-lab/alpaca
|
- path: tatsu-lab/alpaca
|
||||||
@@ -48,26 +47,20 @@ optimizer: paged_adamw_8bit
|
|||||||
lr_scheduler: cosine
|
lr_scheduler: cosine
|
||||||
learning_rate: 0.0002
|
learning_rate: 0.0002
|
||||||
|
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: auto
|
bf16: auto
|
||||||
fp16:
|
|
||||||
tf32: false
|
tf32: false
|
||||||
|
|
||||||
gradient_checkpointing: false # don't use with fsdp_activation_checkpointing
|
gradient_checkpointing: false # don't use with fsdp_activation_checkpointing
|
||||||
gradient_checkpointing_kwargs:
|
gradient_checkpointing_kwargs:
|
||||||
use_reentrant: false
|
use_reentrant: false
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
local_rank:
|
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention:
|
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch:
|
evals_per_epoch:
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
debug:
|
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
fsdp:
|
fsdp:
|
||||||
- full_shard
|
- full_shard
|
||||||
|
|||||||
@@ -4,10 +4,6 @@ base_model: LnL-AI/dbrx-base-converted-v2
|
|||||||
|
|
||||||
trust_remote_code: true
|
trust_remote_code: true
|
||||||
|
|
||||||
load_in_8bit: false
|
|
||||||
load_in_4bit: false
|
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: tatsu-lab/alpaca
|
- path: tatsu-lab/alpaca
|
||||||
type: alpaca
|
type: alpaca
|
||||||
@@ -35,25 +31,19 @@ optimizer: paged_adamw_8bit
|
|||||||
lr_scheduler: cosine
|
lr_scheduler: cosine
|
||||||
learning_rate: 0.0002
|
learning_rate: 0.0002
|
||||||
|
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: auto
|
bf16: auto
|
||||||
fp16:
|
|
||||||
tf32: false
|
tf32: false
|
||||||
|
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
gradient_checkpointing_kwargs:
|
gradient_checkpointing_kwargs:
|
||||||
use_reentrant: false
|
use_reentrant: false
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
local_rank:
|
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention:
|
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch:
|
evals_per_epoch:
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
debug:
|
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
deepspeed: deepspeed_configs/zero3_bf16.json
|
deepspeed: deepspeed_configs/zero3_bf16.json
|
||||||
|
|||||||
58
examples/deepcoder/deepcoder-14B-preview-lora.yml
Normal file
58
examples/deepcoder/deepcoder-14B-preview-lora.yml
Normal file
@@ -0,0 +1,58 @@
|
|||||||
|
base_model: agentica-org/DeepCoder-14B-Preview
|
||||||
|
# Automatically upload checkpoint and final model to HF
|
||||||
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
|
load_in_8bit: true
|
||||||
|
load_in_4bit: false
|
||||||
|
strict: false
|
||||||
|
|
||||||
|
datasets:
|
||||||
|
- path: fozziethebeat/alpaca_messages_2k_test
|
||||||
|
type: chat_template
|
||||||
|
field_messages: messages
|
||||||
|
message_property_mappings:
|
||||||
|
role: role
|
||||||
|
content: content
|
||||||
|
|
||||||
|
dataset_prepared_path:
|
||||||
|
val_set_size: 0.05
|
||||||
|
output_dir: ./outputs/lora-out
|
||||||
|
|
||||||
|
sequence_len: 4096
|
||||||
|
sample_packing: true
|
||||||
|
eval_sample_packing: false
|
||||||
|
pad_to_sequence_len: true
|
||||||
|
|
||||||
|
adapter: lora
|
||||||
|
lora_model_dir:
|
||||||
|
lora_r: 32
|
||||||
|
lora_alpha: 16
|
||||||
|
lora_dropout: 0.05
|
||||||
|
lora_target_linear: true
|
||||||
|
|
||||||
|
wandb_project:
|
||||||
|
wandb_entity:
|
||||||
|
wandb_watch:
|
||||||
|
wandb_name:
|
||||||
|
wandb_log_model:
|
||||||
|
|
||||||
|
gradient_accumulation_steps: 2
|
||||||
|
micro_batch_size: 2
|
||||||
|
num_epochs: 4
|
||||||
|
optimizer: adamw_bnb_8bit
|
||||||
|
lr_scheduler: cosine
|
||||||
|
learning_rate: 0.0002
|
||||||
|
|
||||||
|
bf16: auto
|
||||||
|
tf32: true
|
||||||
|
|
||||||
|
gradient_checkpointing: true
|
||||||
|
resume_from_checkpoint:
|
||||||
|
logging_steps: 1
|
||||||
|
flash_attention: true
|
||||||
|
|
||||||
|
warmup_steps: 10
|
||||||
|
evals_per_epoch: 1
|
||||||
|
saves_per_epoch: 1
|
||||||
|
weight_decay: 0.0
|
||||||
|
special_tokens:
|
||||||
58
examples/deepcogito/cogito-v1-preview-llama-3B-lora.yml
Normal file
58
examples/deepcogito/cogito-v1-preview-llama-3B-lora.yml
Normal file
@@ -0,0 +1,58 @@
|
|||||||
|
base_model: deepcogito/cogito-v1-preview-llama-3B
|
||||||
|
# Automatically upload checkpoint and final model to HF
|
||||||
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
|
load_in_8bit: true
|
||||||
|
load_in_4bit: false
|
||||||
|
strict: false
|
||||||
|
|
||||||
|
datasets:
|
||||||
|
- path: fozziethebeat/alpaca_messages_2k_test
|
||||||
|
type: chat_template
|
||||||
|
field_messages: messages
|
||||||
|
message_property_mappings:
|
||||||
|
role: role
|
||||||
|
content: content
|
||||||
|
|
||||||
|
dataset_prepared_path:
|
||||||
|
val_set_size: 0.05
|
||||||
|
output_dir: ./outputs/lora-out
|
||||||
|
|
||||||
|
sequence_len: 4096
|
||||||
|
sample_packing: true
|
||||||
|
eval_sample_packing: false
|
||||||
|
pad_to_sequence_len: true
|
||||||
|
|
||||||
|
adapter: lora
|
||||||
|
lora_model_dir:
|
||||||
|
lora_r: 32
|
||||||
|
lora_alpha: 16
|
||||||
|
lora_dropout: 0.05
|
||||||
|
lora_target_linear: true
|
||||||
|
|
||||||
|
wandb_project:
|
||||||
|
wandb_entity:
|
||||||
|
wandb_watch:
|
||||||
|
wandb_name:
|
||||||
|
wandb_log_model:
|
||||||
|
|
||||||
|
gradient_accumulation_steps: 2
|
||||||
|
micro_batch_size: 2
|
||||||
|
num_epochs: 1
|
||||||
|
optimizer: adamw_bnb_8bit
|
||||||
|
lr_scheduler: cosine
|
||||||
|
learning_rate: 0.0002
|
||||||
|
|
||||||
|
bf16: auto
|
||||||
|
tf32: true
|
||||||
|
|
||||||
|
gradient_checkpointing: true
|
||||||
|
resume_from_checkpoint:
|
||||||
|
logging_steps: 1
|
||||||
|
flash_attention: true
|
||||||
|
|
||||||
|
warmup_steps: 10
|
||||||
|
evals_per_epoch: 1
|
||||||
|
saves_per_epoch: 1
|
||||||
|
weight_decay: 0.0
|
||||||
|
special_tokens:
|
||||||
58
examples/deepcogito/cogito-v1-preview-qwen-14B-lora.yml
Normal file
58
examples/deepcogito/cogito-v1-preview-qwen-14B-lora.yml
Normal file
@@ -0,0 +1,58 @@
|
|||||||
|
base_model: deepcogito/cogito-v1-preview-qwen-14B
|
||||||
|
# Automatically upload checkpoint and final model to HF
|
||||||
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
|
load_in_8bit: true
|
||||||
|
load_in_4bit: false
|
||||||
|
strict: false
|
||||||
|
|
||||||
|
datasets:
|
||||||
|
- path: fozziethebeat/alpaca_messages_2k_test
|
||||||
|
type: chat_template
|
||||||
|
field_messages: messages
|
||||||
|
message_property_mappings:
|
||||||
|
role: role
|
||||||
|
content: content
|
||||||
|
|
||||||
|
dataset_prepared_path:
|
||||||
|
val_set_size: 0.05
|
||||||
|
output_dir: ./outputs/lora-out
|
||||||
|
|
||||||
|
sequence_len: 4096
|
||||||
|
sample_packing: true
|
||||||
|
eval_sample_packing: false
|
||||||
|
pad_to_sequence_len: true
|
||||||
|
|
||||||
|
adapter: lora
|
||||||
|
lora_model_dir:
|
||||||
|
lora_r: 32
|
||||||
|
lora_alpha: 16
|
||||||
|
lora_dropout: 0.05
|
||||||
|
lora_target_linear: true
|
||||||
|
|
||||||
|
wandb_project:
|
||||||
|
wandb_entity:
|
||||||
|
wandb_watch:
|
||||||
|
wandb_name:
|
||||||
|
wandb_log_model:
|
||||||
|
|
||||||
|
gradient_accumulation_steps: 2
|
||||||
|
micro_batch_size: 2
|
||||||
|
num_epochs: 1
|
||||||
|
optimizer: adamw_bnb_8bit
|
||||||
|
lr_scheduler: cosine
|
||||||
|
learning_rate: 0.0002
|
||||||
|
|
||||||
|
bf16: auto
|
||||||
|
tf32: true
|
||||||
|
|
||||||
|
gradient_checkpointing: true
|
||||||
|
resume_from_checkpoint:
|
||||||
|
logging_steps: 1
|
||||||
|
flash_attention: true
|
||||||
|
|
||||||
|
warmup_steps: 10
|
||||||
|
evals_per_epoch: 1
|
||||||
|
saves_per_epoch: 1
|
||||||
|
weight_decay: 0.0
|
||||||
|
special_tokens:
|
||||||
@@ -3,10 +3,6 @@ base_model: deepseek-ai/DeepSeek-V2-Lite
|
|||||||
# hub_model_id: username/custom_model_name
|
# hub_model_id: username/custom_model_name
|
||||||
trust_remote_code: true
|
trust_remote_code: true
|
||||||
|
|
||||||
load_in_8bit: false
|
|
||||||
load_in_4bit: false
|
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: tatsu-lab/alpaca
|
- path: tatsu-lab/alpaca
|
||||||
type: alpaca
|
type: alpaca
|
||||||
@@ -31,27 +27,19 @@ optimizer: adamw_torch_fused
|
|||||||
lr_scheduler: cosine
|
lr_scheduler: cosine
|
||||||
learning_rate: 2e-5
|
learning_rate: 2e-5
|
||||||
|
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: auto
|
bf16: auto
|
||||||
fp16:
|
|
||||||
tf32: false
|
tf32: false
|
||||||
|
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
gradient_checkpointing_kwargs:
|
gradient_checkpointing_kwargs:
|
||||||
use_reentrant: false
|
use_reentrant: false
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention:
|
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 100
|
warmup_steps: 100
|
||||||
evals_per_epoch: 2
|
evals_per_epoch: 2
|
||||||
eval_table_size:
|
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
debug:
|
|
||||||
deepspeed:
|
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
special_tokens:
|
special_tokens:
|
||||||
fsdp:
|
fsdp:
|
||||||
|
|||||||
@@ -6,7 +6,6 @@ trust_remote_code: true
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
strict: false
|
|
||||||
|
|
||||||
|
|
||||||
plugins:
|
plugins:
|
||||||
@@ -52,27 +51,19 @@ optimizer: adamw_torch_fused
|
|||||||
lr_scheduler: cosine
|
lr_scheduler: cosine
|
||||||
learning_rate: 2e-5
|
learning_rate: 2e-5
|
||||||
|
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: auto
|
bf16: auto
|
||||||
fp16:
|
|
||||||
tf32: false
|
tf32: false
|
||||||
|
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
gradient_checkpointing_kwargs:
|
gradient_checkpointing_kwargs:
|
||||||
use_reentrant: false
|
use_reentrant: false
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention:
|
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 100
|
warmup_steps: 100
|
||||||
evals_per_epoch: 2
|
evals_per_epoch: 2
|
||||||
eval_table_size:
|
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
debug:
|
|
||||||
deepspeed:
|
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
special_tokens:
|
special_tokens:
|
||||||
fsdp:
|
fsdp:
|
||||||
|
|||||||
@@ -11,7 +11,6 @@ trust_remote_code: true
|
|||||||
load_in_8bit: true
|
load_in_8bit: true
|
||||||
load_in_4bit: false
|
load_in_4bit: false
|
||||||
gptq: false
|
gptq: false
|
||||||
strict: false
|
|
||||||
push_dataset_to_hub:
|
push_dataset_to_hub:
|
||||||
datasets:
|
datasets:
|
||||||
- path: teknium/GPT4-LLM-Cleaned
|
- path: teknium/GPT4-LLM-Cleaned
|
||||||
@@ -25,9 +24,7 @@ max_packed_sequence_len:
|
|||||||
lora_r: 16
|
lora_r: 16
|
||||||
lora_alpha: 32
|
lora_alpha: 32
|
||||||
lora_dropout: 0.0
|
lora_dropout: 0.0
|
||||||
lora_target_modules:
|
|
||||||
lora_target_linear: true
|
lora_target_linear: true
|
||||||
lora_fan_in_fan_out:
|
|
||||||
wandb_project:
|
wandb_project:
|
||||||
wandb_entity:
|
wandb_entity:
|
||||||
wandb_watch:
|
wandb_watch:
|
||||||
@@ -41,15 +38,10 @@ optimizer: adamw_bnb_8bit
|
|||||||
torchdistx_path:
|
torchdistx_path:
|
||||||
lr_scheduler: cosine
|
lr_scheduler: cosine
|
||||||
learning_rate: 0.00003
|
learning_rate: 0.00003
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: auto
|
bf16: auto
|
||||||
fp16:
|
|
||||||
tf32: true
|
tf32: true
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
local_rank:
|
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention: true
|
xformers_attention: true
|
||||||
flash_attention:
|
flash_attention:
|
||||||
@@ -58,11 +50,7 @@ gptq_model_v1:
|
|||||||
warmup_steps: 40
|
warmup_steps: 40
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
debug:
|
|
||||||
deepspeed:
|
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
fsdp:
|
|
||||||
fsdp_config:
|
|
||||||
special_tokens:
|
special_tokens:
|
||||||
pad_token: "<|endoftext|>"
|
pad_token: "<|endoftext|>"
|
||||||
bos_token: "<|endoftext|>"
|
bos_token: "<|endoftext|>"
|
||||||
|
|||||||
@@ -15,7 +15,6 @@ load_in_8bit: false
|
|||||||
# enable 4bit for QLoRA
|
# enable 4bit for QLoRA
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
gptq: false
|
gptq: false
|
||||||
strict: false
|
|
||||||
push_dataset_to_hub:
|
push_dataset_to_hub:
|
||||||
datasets:
|
datasets:
|
||||||
- path: QingyiSi/Alpaca-CoT
|
- path: QingyiSi/Alpaca-CoT
|
||||||
@@ -38,9 +37,7 @@ lora_alpha: 16
|
|||||||
# 0.05 for 33B and 65B models
|
# 0.05 for 33B and 65B models
|
||||||
lora_dropout: 0.05
|
lora_dropout: 0.05
|
||||||
# add LoRA modules on all linear layers of the base model
|
# add LoRA modules on all linear layers of the base model
|
||||||
lora_target_modules:
|
|
||||||
lora_target_linear: true
|
lora_target_linear: true
|
||||||
lora_fan_in_fan_out:
|
|
||||||
|
|
||||||
wandb_project:
|
wandb_project:
|
||||||
wandb_entity:
|
wandb_entity:
|
||||||
@@ -67,10 +64,7 @@ lr_scheduler: cosine
|
|||||||
# - 2e-4 for 7b & 13b
|
# - 2e-4 for 7b & 13b
|
||||||
# - 1e-4 for 33b & 65b
|
# - 1e-4 for 33b & 65b
|
||||||
learning_rate: 0.0002
|
learning_rate: 0.0002
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: auto
|
bf16: auto
|
||||||
fp16:
|
|
||||||
tf32: true
|
tf32: true
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
# stop training after this many evaluation losses have increased in a row
|
# stop training after this many evaluation losses have increased in a row
|
||||||
@@ -78,7 +72,6 @@ gradient_checkpointing: true
|
|||||||
early_stopping_patience: 3
|
early_stopping_patience: 3
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
auto_resume_from_checkpoints: true
|
auto_resume_from_checkpoints: true
|
||||||
local_rank:
|
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention: true
|
xformers_attention: true
|
||||||
flash_attention:
|
flash_attention:
|
||||||
@@ -87,11 +80,7 @@ gptq_model_v1:
|
|||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
debug:
|
|
||||||
deepspeed:
|
|
||||||
weight_decay: 0.000001
|
weight_decay: 0.000001
|
||||||
fsdp:
|
|
||||||
fsdp_config:
|
|
||||||
special_tokens:
|
special_tokens:
|
||||||
pad_token: "<|endoftext|>"
|
pad_token: "<|endoftext|>"
|
||||||
bos_token: "<|endoftext|>"
|
bos_token: "<|endoftext|>"
|
||||||
|
|||||||
@@ -7,11 +7,7 @@ tokenizer_type: AutoTokenizer
|
|||||||
|
|
||||||
# required by falcon custom model code: https://huggingface.co/tiiuae/falcon-7b/tree/main
|
# required by falcon custom model code: https://huggingface.co/tiiuae/falcon-7b/tree/main
|
||||||
trust_remote_code: true
|
trust_remote_code: true
|
||||||
|
|
||||||
load_in_8bit: false
|
|
||||||
load_in_4bit: false
|
|
||||||
gptq: false
|
gptq: false
|
||||||
strict: false
|
|
||||||
push_dataset_to_hub:
|
push_dataset_to_hub:
|
||||||
datasets:
|
datasets:
|
||||||
- path: teknium/GPT4-LLM-Cleaned
|
- path: teknium/GPT4-LLM-Cleaned
|
||||||
@@ -25,9 +21,7 @@ max_packed_sequence_len:
|
|||||||
lora_r: 64
|
lora_r: 64
|
||||||
lora_alpha: 32
|
lora_alpha: 32
|
||||||
lora_dropout: 0.0
|
lora_dropout: 0.0
|
||||||
lora_target_modules:
|
|
||||||
lora_target_linear: true
|
lora_target_linear: true
|
||||||
lora_fan_in_fan_out:
|
|
||||||
wandb_project:
|
wandb_project:
|
||||||
wandb_entity:
|
wandb_entity:
|
||||||
wandb_watch:
|
wandb_watch:
|
||||||
@@ -41,15 +35,10 @@ optimizer: adamw_bnb_8bit
|
|||||||
torchdistx_path:
|
torchdistx_path:
|
||||||
lr_scheduler: cosine
|
lr_scheduler: cosine
|
||||||
learning_rate: 0.00003
|
learning_rate: 0.00003
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: auto
|
bf16: auto
|
||||||
fp16:
|
|
||||||
tf32: true
|
tf32: true
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
local_rank:
|
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention: true
|
xformers_attention: true
|
||||||
flash_attention:
|
flash_attention:
|
||||||
@@ -58,11 +47,7 @@ gptq_model_v1:
|
|||||||
warmup_steps: 40
|
warmup_steps: 40
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
debug:
|
|
||||||
deepspeed:
|
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
fsdp:
|
|
||||||
fsdp_config:
|
|
||||||
special_tokens:
|
special_tokens:
|
||||||
pad_token: "<|endoftext|>"
|
pad_token: "<|endoftext|>"
|
||||||
bos_token: "<|endoftext|>"
|
bos_token: "<|endoftext|>"
|
||||||
|
|||||||
@@ -8,7 +8,6 @@ tokenizer_type: AutoTokenizer
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
strict: false
|
|
||||||
|
|
||||||
# huggingface repo
|
# huggingface repo
|
||||||
datasets:
|
datasets:
|
||||||
@@ -42,28 +41,16 @@ optimizer: adamw_bnb_8bit
|
|||||||
lr_scheduler: cosine
|
lr_scheduler: cosine
|
||||||
learning_rate: 0.0002
|
learning_rate: 0.0002
|
||||||
|
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: auto
|
bf16: auto
|
||||||
fp16:
|
|
||||||
tf32: false
|
tf32: false
|
||||||
|
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
local_rank:
|
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention:
|
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
eval_table_size:
|
|
||||||
eval_max_new_tokens: 128
|
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
debug:
|
|
||||||
deepspeed:
|
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
fsdp:
|
|
||||||
fsdp_config:
|
|
||||||
special_tokens:
|
special_tokens:
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ tokenizer_type: AutoTokenizer
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
strict: false
|
|
||||||
|
|
||||||
# huggingface repo
|
# huggingface repo
|
||||||
chat_template: gemma
|
chat_template: gemma
|
||||||
@@ -48,28 +47,16 @@ optimizer: adamw_bnb_8bit
|
|||||||
lr_scheduler: cosine
|
lr_scheduler: cosine
|
||||||
learning_rate: 0.0002
|
learning_rate: 0.0002
|
||||||
|
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: auto
|
bf16: auto
|
||||||
fp16:
|
|
||||||
tf32: true
|
tf32: true
|
||||||
|
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
local_rank:
|
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention:
|
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch:
|
evals_per_epoch:
|
||||||
eval_table_size:
|
|
||||||
eval_max_new_tokens: 128
|
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
debug:
|
|
||||||
deepspeed:
|
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
fsdp:
|
|
||||||
fsdp_config:
|
|
||||||
special_tokens:
|
special_tokens:
|
||||||
|
|||||||
@@ -6,10 +6,6 @@ tokenizer_type: AutoTokenizer
|
|||||||
# Automatically upload checkpoint and final model to HF
|
# Automatically upload checkpoint and final model to HF
|
||||||
# hub_model_id: username/custom_model_name
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
load_in_8bit: false
|
|
||||||
load_in_4bit: false
|
|
||||||
strict: false
|
|
||||||
|
|
||||||
reward_model: true
|
reward_model: true
|
||||||
chat_template: gemma
|
chat_template: gemma
|
||||||
datasets:
|
datasets:
|
||||||
@@ -38,8 +34,6 @@ optimizer: adamw_bnb_8bit
|
|||||||
lr_scheduler: cosine
|
lr_scheduler: cosine
|
||||||
learning_rate: 0.0002
|
learning_rate: 0.0002
|
||||||
|
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: true
|
bf16: true
|
||||||
fp16:
|
fp16:
|
||||||
tf32: true
|
tf32: true
|
||||||
@@ -47,21 +41,12 @@ tf32: true
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
gradient_checkpointing_kwargs:
|
gradient_checkpointing_kwargs:
|
||||||
use_reentrant: false
|
use_reentrant: false
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
local_rank:
|
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention:
|
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch:
|
evals_per_epoch:
|
||||||
eval_table_size:
|
|
||||||
eval_max_new_tokens: 128
|
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
debug:
|
|
||||||
deepspeed:
|
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
fsdp:
|
|
||||||
fsdp_config:
|
|
||||||
special_tokens:
|
special_tokens:
|
||||||
|
|||||||
66
examples/gemma3/gemma-3-1b-qlora.yml
Normal file
66
examples/gemma3/gemma-3-1b-qlora.yml
Normal file
@@ -0,0 +1,66 @@
|
|||||||
|
base_model: google/gemma-3-1b-it
|
||||||
|
# optionally might have model_type or tokenizer_type
|
||||||
|
model_type: AutoModelForCausalLM
|
||||||
|
tokenizer_type: AutoTokenizer
|
||||||
|
# Automatically upload checkpoint and final model to HF
|
||||||
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
|
# gemma3 doesn't seem to play nice with ddp
|
||||||
|
ddp_find_unused_parameters: true
|
||||||
|
|
||||||
|
load_in_8bit: false
|
||||||
|
load_in_4bit: true
|
||||||
|
|
||||||
|
# huggingface repo
|
||||||
|
chat_template: gemma3
|
||||||
|
datasets:
|
||||||
|
- path: cgato/SlimOrcaDedupCleaned
|
||||||
|
type: chat_template
|
||||||
|
field_messages: conversations
|
||||||
|
message_property_mappings:
|
||||||
|
role: from
|
||||||
|
content: value
|
||||||
|
|
||||||
|
val_set_size: 0.0
|
||||||
|
output_dir: ./outputs/out
|
||||||
|
|
||||||
|
adapter: qlora
|
||||||
|
lora_r: 32
|
||||||
|
lora_alpha: 16
|
||||||
|
lora_dropout: 0.05
|
||||||
|
lora_target_linear: true
|
||||||
|
|
||||||
|
sequence_len: 2048
|
||||||
|
sample_packing: true
|
||||||
|
eval_sample_packing: false
|
||||||
|
pad_to_sequence_len: true
|
||||||
|
|
||||||
|
wandb_project:
|
||||||
|
wandb_entity:
|
||||||
|
wandb_watch:
|
||||||
|
wandb_name:
|
||||||
|
wandb_log_model:
|
||||||
|
|
||||||
|
|
||||||
|
gradient_accumulation_steps: 4
|
||||||
|
micro_batch_size: 1
|
||||||
|
num_epochs: 4
|
||||||
|
optimizer: adamw_bnb_8bit
|
||||||
|
lr_scheduler: cosine
|
||||||
|
learning_rate: 0.0002
|
||||||
|
|
||||||
|
bf16: auto
|
||||||
|
tf32: true
|
||||||
|
|
||||||
|
gradient_checkpointing: true
|
||||||
|
gradient_checkpointing_kwargs:
|
||||||
|
use_reentrant: false
|
||||||
|
resume_from_checkpoint:
|
||||||
|
logging_steps: 1
|
||||||
|
flash_attention: true
|
||||||
|
|
||||||
|
warmup_ratio: 0.1
|
||||||
|
evals_per_epoch:
|
||||||
|
saves_per_epoch: 1
|
||||||
|
weight_decay: 0.0
|
||||||
|
special_tokens:
|
||||||
60
examples/gemma3/gemma-3-4b-qlora.yml
Normal file
60
examples/gemma3/gemma-3-4b-qlora.yml
Normal file
@@ -0,0 +1,60 @@
|
|||||||
|
base_model: google/gemma-3-4b-it
|
||||||
|
|
||||||
|
load_in_4bit: true
|
||||||
|
|
||||||
|
# gemma3 doesn't seem to play nice with ddp
|
||||||
|
ddp_find_unused_parameters: true
|
||||||
|
|
||||||
|
chat_template: gemma3
|
||||||
|
datasets:
|
||||||
|
- path: cgato/SlimOrcaDedupCleaned
|
||||||
|
type: chat_template
|
||||||
|
field_messages: conversations
|
||||||
|
message_property_mappings:
|
||||||
|
role: from
|
||||||
|
content: value
|
||||||
|
|
||||||
|
dataset_prepared_path: last_run_prepared
|
||||||
|
val_set_size: 0.01
|
||||||
|
output_dir: ./outputs/out
|
||||||
|
|
||||||
|
adapter: qlora
|
||||||
|
lora_model_dir:
|
||||||
|
|
||||||
|
sequence_len: 2048
|
||||||
|
sample_packing: true
|
||||||
|
pad_to_sequence_len: true
|
||||||
|
|
||||||
|
lora_r: 32
|
||||||
|
lora_alpha: 16
|
||||||
|
lora_dropout: 0.05
|
||||||
|
lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
|
||||||
|
|
||||||
|
wandb_project:
|
||||||
|
wandb_entity:
|
||||||
|
wandb_watch:
|
||||||
|
wandb_name:
|
||||||
|
wandb_log_model:
|
||||||
|
|
||||||
|
gradient_accumulation_steps: 4
|
||||||
|
micro_batch_size: 2
|
||||||
|
num_epochs: 1
|
||||||
|
optimizer: adamw_bnb_8bit
|
||||||
|
lr_scheduler: cosine
|
||||||
|
learning_rate: 0.0002
|
||||||
|
|
||||||
|
bf16: true
|
||||||
|
fp16:
|
||||||
|
tf32: true
|
||||||
|
|
||||||
|
gradient_checkpointing: true
|
||||||
|
gradient_checkpointing_kwargs:
|
||||||
|
use_reentrant: false
|
||||||
|
logging_steps: 1
|
||||||
|
flash_attention: true
|
||||||
|
eager_attention:
|
||||||
|
|
||||||
|
warmup_ratio: 0.1
|
||||||
|
evals_per_epoch: 1
|
||||||
|
saves_per_epoch: 1
|
||||||
|
weight_decay: 0.0
|
||||||
62
examples/gemma3/gemma-3-4b-vision-qlora.yml
Normal file
62
examples/gemma3/gemma-3-4b-vision-qlora.yml
Normal file
@@ -0,0 +1,62 @@
|
|||||||
|
base_model: google/gemma-3-4b-it
|
||||||
|
processor_type: AutoProcessor
|
||||||
|
|
||||||
|
load_in_4bit: true
|
||||||
|
|
||||||
|
# these 3 lines are needed for now to handle vision chat templates with images
|
||||||
|
skip_prepare_dataset: true
|
||||||
|
remove_unused_columns: false
|
||||||
|
sample_packing: false
|
||||||
|
|
||||||
|
# gemma3 doesn't seem to play nice with ddp
|
||||||
|
ddp_find_unused_parameters: true
|
||||||
|
|
||||||
|
chat_template: gemma3
|
||||||
|
datasets:
|
||||||
|
- path: HuggingFaceH4/llava-instruct-mix-vsft
|
||||||
|
type: chat_template
|
||||||
|
split: train[:1%]
|
||||||
|
field_messages: messages
|
||||||
|
dataset_prepared_path: last_run_prepared
|
||||||
|
val_set_size: 0.01
|
||||||
|
output_dir: ./outputs/out
|
||||||
|
|
||||||
|
adapter: qlora
|
||||||
|
lora_model_dir:
|
||||||
|
|
||||||
|
sequence_len: 2048
|
||||||
|
pad_to_sequence_len: false
|
||||||
|
|
||||||
|
lora_r: 32
|
||||||
|
lora_alpha: 16
|
||||||
|
lora_dropout: 0.05
|
||||||
|
lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
|
||||||
|
|
||||||
|
wandb_project:
|
||||||
|
wandb_entity:
|
||||||
|
wandb_watch:
|
||||||
|
wandb_name:
|
||||||
|
wandb_log_model:
|
||||||
|
|
||||||
|
gradient_accumulation_steps: 4
|
||||||
|
micro_batch_size: 2
|
||||||
|
num_epochs: 1
|
||||||
|
optimizer: adamw_bnb_8bit
|
||||||
|
lr_scheduler: cosine
|
||||||
|
learning_rate: 0.0002
|
||||||
|
|
||||||
|
bf16: true
|
||||||
|
fp16:
|
||||||
|
tf32: true
|
||||||
|
|
||||||
|
gradient_checkpointing: true
|
||||||
|
gradient_checkpointing_kwargs:
|
||||||
|
use_reentrant: false
|
||||||
|
logging_steps: 1
|
||||||
|
flash_attention: true
|
||||||
|
eager_attention:
|
||||||
|
|
||||||
|
warmup_ratio: 0.1
|
||||||
|
evals_per_epoch: 1
|
||||||
|
saves_per_epoch: 1
|
||||||
|
weight_decay: 0.0
|
||||||
@@ -4,7 +4,6 @@ base_model: EleutherAI/gpt-j-6b
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
strict: false
|
|
||||||
push_dataset_to_hub:
|
push_dataset_to_hub:
|
||||||
datasets:
|
datasets:
|
||||||
- path: teknium/GPT4-LLM-Cleaned
|
- path: teknium/GPT4-LLM-Cleaned
|
||||||
@@ -18,9 +17,7 @@ max_packed_sequence_len:
|
|||||||
lora_r: 8
|
lora_r: 8
|
||||||
lora_alpha: 32
|
lora_alpha: 32
|
||||||
lora_dropout: 0.05
|
lora_dropout: 0.05
|
||||||
lora_target_modules:
|
|
||||||
lora_target_linear: true
|
lora_target_linear: true
|
||||||
lora_fan_in_fan_out:
|
|
||||||
wandb_project:
|
wandb_project:
|
||||||
wandb_entity:
|
wandb_entity:
|
||||||
wandb_watch:
|
wandb_watch:
|
||||||
@@ -34,15 +31,10 @@ optimizer: paged_adamw_8bit
|
|||||||
torchdistx_path:
|
torchdistx_path:
|
||||||
lr_scheduler: cosine
|
lr_scheduler: cosine
|
||||||
learning_rate: 0.0001
|
learning_rate: 0.0001
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: auto
|
bf16: auto
|
||||||
fp16:
|
|
||||||
tf32: true
|
tf32: true
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
local_rank:
|
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention: true
|
xformers_attention: true
|
||||||
flash_attention:
|
flash_attention:
|
||||||
@@ -51,10 +43,6 @@ gptq_model_v1:
|
|||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
debug:
|
|
||||||
deepspeed:
|
|
||||||
weight_decay: 0.1
|
weight_decay: 0.1
|
||||||
fsdp:
|
|
||||||
fsdp_config:
|
|
||||||
special_tokens:
|
special_tokens:
|
||||||
pad_token: "<|endoftext|>"
|
pad_token: "<|endoftext|>"
|
||||||
|
|||||||
@@ -6,7 +6,6 @@ trust_remote_code: true
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
@@ -40,26 +39,18 @@ optimizer: paged_adamw_8bit
|
|||||||
lr_scheduler: cosine
|
lr_scheduler: cosine
|
||||||
learning_rate: 0.00001
|
learning_rate: 0.00001
|
||||||
|
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: auto
|
bf16: auto
|
||||||
fp16:
|
|
||||||
tf32: false
|
tf32: false
|
||||||
|
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
gradient_checkpointing_kwargs:
|
gradient_checkpointing_kwargs:
|
||||||
use_reentrant: false
|
use_reentrant: false
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
local_rank:
|
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention:
|
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch:
|
evals_per_epoch:
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
debug:
|
|
||||||
deepspeed:
|
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
special_tokens:
|
special_tokens:
|
||||||
|
|||||||
@@ -5,7 +5,6 @@ trust_remote_code: true
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
@@ -39,26 +38,20 @@ optimizer: paged_adamw_8bit
|
|||||||
lr_scheduler: cosine
|
lr_scheduler: cosine
|
||||||
learning_rate: 0.00001
|
learning_rate: 0.00001
|
||||||
|
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: auto
|
bf16: auto
|
||||||
fp16:
|
|
||||||
tf32: false
|
tf32: false
|
||||||
|
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
gradient_checkpointing_kwargs:
|
gradient_checkpointing_kwargs:
|
||||||
use_reentrant: false
|
use_reentrant: false
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
local_rank:
|
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention:
|
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch:
|
evals_per_epoch:
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
debug:
|
|
||||||
deepspeed: deepspeed_configs/zero2.json
|
deepspeed: deepspeed_configs/zero2.json
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
special_tokens:
|
special_tokens:
|
||||||
|
|||||||
@@ -5,7 +5,6 @@ tokenizer_type: AutoTokenizer
|
|||||||
# hub_model_id: username/custom_model_name
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
strict: false
|
|
||||||
use_tensorboard: true
|
use_tensorboard: true
|
||||||
chat_template: jamba
|
chat_template: jamba
|
||||||
datasets:
|
datasets:
|
||||||
@@ -39,8 +38,6 @@ optimizer: adamw_torch_fused
|
|||||||
lr_scheduler: cosine
|
lr_scheduler: cosine
|
||||||
learning_rate: 0.00001
|
learning_rate: 0.00001
|
||||||
|
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: true
|
bf16: true
|
||||||
tf32: true
|
tf32: true
|
||||||
|
|
||||||
|
|||||||
@@ -33,13 +33,9 @@ optimizer: adamw_bnb_8bit
|
|||||||
torchdistx_path:
|
torchdistx_path:
|
||||||
lr_scheduler: cosine
|
lr_scheduler: cosine
|
||||||
learning_rate: 0.00003
|
learning_rate: 0.00003
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: auto
|
bf16: auto
|
||||||
tf32: true
|
tf32: true
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
local_rank:
|
|
||||||
logging_steps: 5
|
logging_steps: 5
|
||||||
xformers_attention: true
|
xformers_attention: true
|
||||||
flash_attention:
|
flash_attention:
|
||||||
@@ -48,11 +44,7 @@ gptq_model_v1:
|
|||||||
warmup_steps: 20
|
warmup_steps: 20
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
debug:
|
|
||||||
deepspeed:
|
|
||||||
weight_decay: 0.1
|
weight_decay: 0.1
|
||||||
fsdp:
|
|
||||||
fsdp_config:
|
|
||||||
tokens:
|
tokens:
|
||||||
bos_token: "<s>"
|
bos_token: "<s>"
|
||||||
eos_token: "</s>"
|
eos_token: "</s>"
|
||||||
|
|||||||
@@ -5,10 +5,6 @@ tokenizer_type: LlamaTokenizer
|
|||||||
# Automatically upload checkpoint and final model to HF
|
# Automatically upload checkpoint and final model to HF
|
||||||
# hub_model_id: username/custom_model_name
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
load_in_8bit: false
|
|
||||||
load_in_4bit: false
|
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
type: alpaca
|
type: alpaca
|
||||||
@@ -26,7 +22,6 @@ lora_r:
|
|||||||
lora_alpha:
|
lora_alpha:
|
||||||
lora_dropout:
|
lora_dropout:
|
||||||
lora_target_linear:
|
lora_target_linear:
|
||||||
lora_fan_in_fan_out:
|
|
||||||
|
|
||||||
wandb_project:
|
wandb_project:
|
||||||
wandb_entity:
|
wandb_entity:
|
||||||
@@ -41,18 +36,12 @@ optimizer: adamw_bnb_8bit
|
|||||||
lr_scheduler: cosine
|
lr_scheduler: cosine
|
||||||
learning_rate: 0.0002
|
learning_rate: 0.0002
|
||||||
|
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: auto
|
bf16: auto
|
||||||
fp16:
|
|
||||||
tf32: false
|
tf32: false
|
||||||
|
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
local_rank:
|
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention:
|
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
flash_attn_cross_entropy: false
|
flash_attn_cross_entropy: false
|
||||||
flash_attn_rms_norm: true
|
flash_attn_rms_norm: true
|
||||||
@@ -61,11 +50,8 @@ flash_attn_fuse_mlp: true
|
|||||||
|
|
||||||
warmup_steps: 100
|
warmup_steps: 100
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
eval_table_size:
|
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
debug:
|
|
||||||
deepspeed: #deepspeed_configs/zero2.json # multi-gpu only
|
deepspeed: #deepspeed_configs/zero2.json # multi-gpu only
|
||||||
weight_decay: 0.1
|
weight_decay: 0.1
|
||||||
fsdp:
|
|
||||||
fsdp_config:
|
|
||||||
special_tokens:
|
special_tokens:
|
||||||
|
|||||||
@@ -10,9 +10,6 @@ gptq_disable_exllama: true
|
|||||||
|
|
||||||
tokenizer_use_fast: true
|
tokenizer_use_fast: true
|
||||||
tokenizer_legacy: true
|
tokenizer_legacy: true
|
||||||
load_in_8bit: false
|
|
||||||
load_in_4bit: false
|
|
||||||
strict: false
|
|
||||||
push_dataset_to_hub:
|
push_dataset_to_hub:
|
||||||
hf_use_auth_token: true
|
hf_use_auth_token: true
|
||||||
datasets:
|
datasets:
|
||||||
@@ -33,7 +30,6 @@ lora_target_modules:
|
|||||||
- q_proj
|
- q_proj
|
||||||
- v_proj
|
- v_proj
|
||||||
lora_target_linear:
|
lora_target_linear:
|
||||||
lora_fan_in_fan_out:
|
|
||||||
wandb_project:
|
wandb_project:
|
||||||
wandb_watch:
|
wandb_watch:
|
||||||
wandb_name:
|
wandb_name:
|
||||||
@@ -50,26 +46,19 @@ torchdistx_path:
|
|||||||
lr_scheduler: cosine
|
lr_scheduler: cosine
|
||||||
lr_quadratic_warmup: true
|
lr_quadratic_warmup: true
|
||||||
learning_rate: 0.000017
|
learning_rate: 0.000017
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: false
|
bf16: false
|
||||||
fp16: false
|
fp16: false
|
||||||
float16: true
|
float16: true
|
||||||
tf32: true
|
tf32: true
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
local_rank:
|
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention:
|
|
||||||
flash_attention:
|
flash_attention:
|
||||||
sdp_attention:
|
sdp_attention:
|
||||||
flash_optimum:
|
flash_optimum:
|
||||||
warmup_steps: 100
|
warmup_steps: 100
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
debug:
|
|
||||||
deepspeed:
|
|
||||||
weight_decay: 0.1
|
weight_decay: 0.1
|
||||||
special_tokens:
|
special_tokens:
|
||||||
bos_token: "<s>"
|
bos_token: "<s>"
|
||||||
|
|||||||
@@ -5,10 +5,6 @@ tokenizer_type: LlamaTokenizer
|
|||||||
# Automatically upload checkpoint and final model to HF
|
# Automatically upload checkpoint and final model to HF
|
||||||
# hub_model_id: username/custom_model_name
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
load_in_8bit: false
|
|
||||||
load_in_4bit: false
|
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: teknium/GPT4-LLM-Cleaned
|
- path: teknium/GPT4-LLM-Cleaned
|
||||||
type: alpaca
|
type: alpaca
|
||||||
@@ -26,7 +22,6 @@ lora_r:
|
|||||||
lora_alpha:
|
lora_alpha:
|
||||||
lora_dropout:
|
lora_dropout:
|
||||||
lora_target_linear:
|
lora_target_linear:
|
||||||
lora_fan_in_fan_out:
|
|
||||||
|
|
||||||
lisa_n_layers: 4
|
lisa_n_layers: 4
|
||||||
lisa_step_interval: 20
|
lisa_step_interval: 20
|
||||||
@@ -45,18 +40,12 @@ optimizer: adamw_bnb_8bit
|
|||||||
lr_scheduler: cosine
|
lr_scheduler: cosine
|
||||||
learning_rate: 5e-5 # recommendation from lisa paper for 7b
|
learning_rate: 5e-5 # recommendation from lisa paper for 7b
|
||||||
|
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: auto
|
bf16: auto
|
||||||
fp16:
|
|
||||||
tf32: false
|
tf32: false
|
||||||
|
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
local_rank:
|
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention:
|
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
flash_attn_cross_entropy: false
|
flash_attn_cross_entropy: false
|
||||||
flash_attn_rms_norm: true
|
flash_attn_rms_norm: true
|
||||||
@@ -65,13 +54,8 @@ flash_attn_fuse_mlp: true
|
|||||||
|
|
||||||
warmup_steps: 100
|
warmup_steps: 100
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
eval_table_size:
|
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
debug:
|
|
||||||
deepspeed:
|
|
||||||
weight_decay: 0.1
|
weight_decay: 0.1
|
||||||
fsdp:
|
|
||||||
fsdp_config:
|
|
||||||
special_tokens:
|
special_tokens:
|
||||||
bos_token: "<s>"
|
bos_token: "<s>"
|
||||||
eos_token: "</s>"
|
eos_token: "</s>"
|
||||||
|
|||||||
@@ -5,10 +5,6 @@ tokenizer_type: LlamaTokenizer
|
|||||||
# Automatically upload checkpoint and final model to HF
|
# Automatically upload checkpoint and final model to HF
|
||||||
# hub_model_id: username/custom_model_name
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
load_in_8bit: false
|
|
||||||
load_in_4bit: false
|
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
type: alpaca
|
type: alpaca
|
||||||
@@ -26,7 +22,6 @@ lora_r: 32
|
|||||||
lora_alpha: 16
|
lora_alpha: 16
|
||||||
lora_dropout: 0.05
|
lora_dropout: 0.05
|
||||||
lora_target_linear: true
|
lora_target_linear: true
|
||||||
lora_fan_in_fan_out:
|
|
||||||
peft:
|
peft:
|
||||||
loftq_config:
|
loftq_config:
|
||||||
loftq_bits: 4
|
loftq_bits: 4
|
||||||
@@ -44,29 +39,16 @@ optimizer: adamw_bnb_8bit
|
|||||||
lr_scheduler: cosine
|
lr_scheduler: cosine
|
||||||
learning_rate: 0.0002
|
learning_rate: 0.0002
|
||||||
|
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: auto
|
bf16: auto
|
||||||
fp16:
|
|
||||||
tf32: false
|
tf32: false
|
||||||
|
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
local_rank:
|
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention:
|
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
s2_attention:
|
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
eval_table_size:
|
|
||||||
eval_max_new_tokens: 128
|
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
debug:
|
|
||||||
deepspeed:
|
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
fsdp:
|
|
||||||
fsdp_config:
|
|
||||||
special_tokens:
|
special_tokens:
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ tokenizer_type: LlamaTokenizer
|
|||||||
|
|
||||||
load_in_8bit: true
|
load_in_8bit: true
|
||||||
load_in_4bit: false
|
load_in_4bit: false
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
@@ -26,7 +25,6 @@ lora_r: 32
|
|||||||
lora_alpha: 16
|
lora_alpha: 16
|
||||||
lora_dropout: 0.05
|
lora_dropout: 0.05
|
||||||
lora_target_linear: true
|
lora_target_linear: true
|
||||||
lora_fan_in_fan_out:
|
|
||||||
|
|
||||||
wandb_project:
|
wandb_project:
|
||||||
wandb_entity:
|
wandb_entity:
|
||||||
@@ -41,29 +39,16 @@ optimizer: adamw_bnb_8bit
|
|||||||
lr_scheduler: cosine
|
lr_scheduler: cosine
|
||||||
learning_rate: 0.0002
|
learning_rate: 0.0002
|
||||||
|
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: auto
|
bf16: auto
|
||||||
fp16:
|
|
||||||
tf32: false
|
tf32: false
|
||||||
|
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
local_rank:
|
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention:
|
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
s2_attention:
|
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
eval_table_size:
|
|
||||||
eval_max_new_tokens: 128
|
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
debug:
|
|
||||||
deepspeed:
|
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
fsdp:
|
|
||||||
fsdp_config:
|
|
||||||
special_tokens:
|
special_tokens:
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ tokenizer_type: LlamaTokenizer
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: yahma/alpaca-cleaned
|
- path: yahma/alpaca-cleaned
|
||||||
@@ -26,9 +25,7 @@ pad_to_sequence_len: true
|
|||||||
lora_r: 32
|
lora_r: 32
|
||||||
lora_alpha: 16
|
lora_alpha: 16
|
||||||
lora_dropout: 0.05
|
lora_dropout: 0.05
|
||||||
lora_target_modules:
|
|
||||||
lora_target_linear: true
|
lora_target_linear: true
|
||||||
lora_fan_in_fan_out:
|
|
||||||
|
|
||||||
wandb_project:
|
wandb_project:
|
||||||
wandb_entity:
|
wandb_entity:
|
||||||
@@ -43,28 +40,19 @@ optimizer: adamw_torch_fused
|
|||||||
lr_scheduler: cosine
|
lr_scheduler: cosine
|
||||||
learning_rate: 0.00001
|
learning_rate: 0.00001
|
||||||
|
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: auto
|
bf16: auto
|
||||||
fp16:
|
|
||||||
tf32: false
|
tf32: false
|
||||||
|
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
gradient_checkpointing_kwargs:
|
gradient_checkpointing_kwargs:
|
||||||
use_reentrant: true
|
use_reentrant: true
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
local_rank:
|
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention:
|
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
eval_table_size:
|
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
debug:
|
|
||||||
deepspeed:
|
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
fsdp:
|
fsdp:
|
||||||
- full_shard
|
- full_shard
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ tokenizer_type: LlamaTokenizer
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
@@ -26,9 +25,7 @@ pad_to_sequence_len: true
|
|||||||
lora_r: 32
|
lora_r: 32
|
||||||
lora_alpha: 16
|
lora_alpha: 16
|
||||||
lora_dropout: 0.05
|
lora_dropout: 0.05
|
||||||
lora_target_modules:
|
|
||||||
lora_target_linear: true
|
lora_target_linear: true
|
||||||
lora_fan_in_fan_out:
|
|
||||||
|
|
||||||
wandb_project:
|
wandb_project:
|
||||||
wandb_entity:
|
wandb_entity:
|
||||||
@@ -43,27 +40,16 @@ optimizer: paged_adamw_32bit
|
|||||||
lr_scheduler: cosine
|
lr_scheduler: cosine
|
||||||
learning_rate: 0.0002
|
learning_rate: 0.0002
|
||||||
|
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: auto
|
bf16: auto
|
||||||
fp16:
|
|
||||||
tf32: false
|
tf32: false
|
||||||
|
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
local_rank:
|
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention:
|
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
eval_table_size:
|
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
debug:
|
|
||||||
deepspeed:
|
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
fsdp:
|
|
||||||
fsdp_config:
|
|
||||||
special_tokens:
|
special_tokens:
|
||||||
|
|||||||
@@ -5,7 +5,6 @@ tokenizer_type: LlamaTokenizer
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: teknium/GPT4-LLM-Cleaned
|
- path: teknium/GPT4-LLM-Cleaned
|
||||||
@@ -24,9 +23,7 @@ pad_to_sequence_len: true
|
|||||||
lora_r: 8
|
lora_r: 8
|
||||||
lora_alpha: 16
|
lora_alpha: 16
|
||||||
lora_dropout: 0.05
|
lora_dropout: 0.05
|
||||||
lora_target_modules:
|
|
||||||
lora_target_linear: true
|
lora_target_linear: true
|
||||||
lora_fan_in_fan_out:
|
|
||||||
|
|
||||||
relora_steps: 150
|
relora_steps: 150
|
||||||
relora_warmup_steps: 10
|
relora_warmup_steps: 10
|
||||||
@@ -45,28 +42,18 @@ optimizer: adamw_bnb_8bit
|
|||||||
lr_scheduler: cosine
|
lr_scheduler: cosine
|
||||||
learning_rate: 0.0002
|
learning_rate: 0.0002
|
||||||
|
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: auto
|
bf16: auto
|
||||||
fp16:
|
|
||||||
tf32: false
|
tf32: false
|
||||||
|
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
local_rank:
|
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention:
|
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
debug:
|
|
||||||
deepspeed:
|
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
fsdp:
|
|
||||||
fsdp_config:
|
|
||||||
special_tokens:
|
special_tokens:
|
||||||
bos_token: "<s>"
|
bos_token: "<s>"
|
||||||
eos_token: "</s>"
|
eos_token: "</s>"
|
||||||
|
|||||||
@@ -4,7 +4,6 @@ processor_type: AutoProcessor
|
|||||||
# Automatically upload checkpoint and final model to HF
|
# Automatically upload checkpoint and final model to HF
|
||||||
# hub_model_id: username/custom_model_name
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
strict: false
|
|
||||||
|
|
||||||
# these 3 lines are needed for now to handle vision chat templates w images
|
# these 3 lines are needed for now to handle vision chat templates w images
|
||||||
skip_prepare_dataset: true
|
skip_prepare_dataset: true
|
||||||
@@ -45,14 +44,11 @@ optimizer: adamw_bnb_8bit
|
|||||||
lr_scheduler: cosine
|
lr_scheduler: cosine
|
||||||
learning_rate: 0.0002
|
learning_rate: 0.0002
|
||||||
|
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: true
|
bf16: true
|
||||||
fp16:
|
fp16:
|
||||||
tf32: true
|
tf32: true
|
||||||
|
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
local_rank:
|
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
eager_attention:
|
eager_attention:
|
||||||
@@ -60,8 +56,4 @@ eager_attention:
|
|||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 1
|
evals_per_epoch: 1
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
debug:
|
|
||||||
deepspeed:
|
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
fsdp:
|
|
||||||
fsdp_config:
|
|
||||||
|
|||||||
@@ -9,7 +9,6 @@ liger_rms_norm: true
|
|||||||
liger_glu_activation: true
|
liger_glu_activation: true
|
||||||
liger_fused_linear_cross_entropy: true
|
liger_fused_linear_cross_entropy: true
|
||||||
|
|
||||||
strict: false
|
|
||||||
|
|
||||||
chat_template: llama3
|
chat_template: llama3
|
||||||
datasets:
|
datasets:
|
||||||
@@ -42,27 +41,19 @@ optimizer: adamw_torch_fused
|
|||||||
lr_scheduler: cosine
|
lr_scheduler: cosine
|
||||||
learning_rate: 2e-5
|
learning_rate: 2e-5
|
||||||
|
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: auto
|
bf16: auto
|
||||||
fp16:
|
|
||||||
tf32: false
|
tf32: false
|
||||||
|
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
gradient_checkpointing_kwargs:
|
gradient_checkpointing_kwargs:
|
||||||
use_reentrant: false
|
use_reentrant: false
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention:
|
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 100
|
warmup_steps: 100
|
||||||
evals_per_epoch: 2
|
evals_per_epoch: 2
|
||||||
eval_table_size:
|
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
debug:
|
|
||||||
deepspeed:
|
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
fsdp:
|
fsdp:
|
||||||
- full_shard
|
- full_shard
|
||||||
|
|||||||
@@ -2,10 +2,6 @@ base_model: NousResearch/Meta-Llama-3.1-8B
|
|||||||
# Automatically upload checkpoint and final model to HF
|
# Automatically upload checkpoint and final model to HF
|
||||||
# hub_model_id: username/custom_model_name
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
load_in_8bit: false
|
|
||||||
load_in_4bit: false
|
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: tatsu-lab/alpaca
|
- path: tatsu-lab/alpaca
|
||||||
type: alpaca
|
type: alpaca
|
||||||
@@ -30,29 +26,19 @@ optimizer: paged_adamw_8bit
|
|||||||
lr_scheduler: cosine
|
lr_scheduler: cosine
|
||||||
learning_rate: 2e-5
|
learning_rate: 2e-5
|
||||||
|
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: auto
|
bf16: auto
|
||||||
fp16:
|
|
||||||
tf32: false
|
tf32: false
|
||||||
|
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
gradient_checkpointing_kwargs:
|
gradient_checkpointing_kwargs:
|
||||||
use_reentrant: false
|
use_reentrant: false
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention:
|
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 100
|
warmup_steps: 100
|
||||||
evals_per_epoch: 2
|
evals_per_epoch: 2
|
||||||
eval_table_size:
|
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
debug:
|
|
||||||
deepspeed:
|
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
fsdp:
|
|
||||||
fsdp_config:
|
|
||||||
special_tokens:
|
special_tokens:
|
||||||
pad_token: <|end_of_text|>
|
pad_token: <|end_of_text|>
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ tokenizer_type: AutoTokenizer
|
|||||||
|
|
||||||
load_in_8bit: true
|
load_in_8bit: true
|
||||||
load_in_4bit: false
|
load_in_4bit: false
|
||||||
strict: false
|
|
||||||
|
|
||||||
chat_template: llama3
|
chat_template: llama3
|
||||||
rl: dpo
|
rl: dpo
|
||||||
@@ -42,7 +41,6 @@ lora_r: 32
|
|||||||
lora_alpha: 16
|
lora_alpha: 16
|
||||||
lora_dropout: 0.05
|
lora_dropout: 0.05
|
||||||
lora_target_linear: true
|
lora_target_linear: true
|
||||||
lora_fan_in_fan_out:
|
|
||||||
|
|
||||||
wandb_project:
|
wandb_project:
|
||||||
wandb_entity:
|
wandb_entity:
|
||||||
@@ -57,28 +55,15 @@ optimizer: adamw_bnb_8bit
|
|||||||
lr_scheduler: cosine
|
lr_scheduler: cosine
|
||||||
learning_rate: 0.0002
|
learning_rate: 0.0002
|
||||||
|
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: auto
|
bf16: auto
|
||||||
fp16:
|
|
||||||
tf32: false
|
tf32: false
|
||||||
|
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
local_rank:
|
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention:
|
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
s2_attention:
|
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
eval_table_size:
|
|
||||||
eval_max_new_tokens: 128
|
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
debug:
|
|
||||||
deepspeed:
|
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
fsdp:
|
|
||||||
fsdp_config:
|
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ tokenizer_type: AutoTokenizer
|
|||||||
|
|
||||||
load_in_8bit: true
|
load_in_8bit: true
|
||||||
load_in_4bit: false
|
load_in_4bit: false
|
||||||
strict: false
|
|
||||||
|
|
||||||
chat_template: llama3
|
chat_template: llama3
|
||||||
datasets:
|
datasets:
|
||||||
@@ -37,7 +36,6 @@ lora_r: 32
|
|||||||
lora_alpha: 16
|
lora_alpha: 16
|
||||||
lora_dropout: 0.05
|
lora_dropout: 0.05
|
||||||
lora_target_linear: true
|
lora_target_linear: true
|
||||||
lora_fan_in_fan_out:
|
|
||||||
|
|
||||||
wandb_project:
|
wandb_project:
|
||||||
wandb_entity:
|
wandb_entity:
|
||||||
@@ -52,30 +50,17 @@ optimizer: adamw_bnb_8bit
|
|||||||
lr_scheduler: cosine
|
lr_scheduler: cosine
|
||||||
learning_rate: 0.0002
|
learning_rate: 0.0002
|
||||||
|
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: auto
|
bf16: auto
|
||||||
fp16:
|
|
||||||
tf32: false
|
tf32: false
|
||||||
|
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
local_rank:
|
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention:
|
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
s2_attention:
|
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
eval_table_size:
|
|
||||||
eval_max_new_tokens: 128
|
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
debug:
|
|
||||||
deepspeed:
|
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
fsdp:
|
|
||||||
fsdp_config:
|
|
||||||
special_tokens:
|
special_tokens:
|
||||||
pad_token: <|end_of_text|>
|
pad_token: <|end_of_text|>
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ tokenizer_type: AutoTokenizer
|
|||||||
|
|
||||||
load_in_8bit: true
|
load_in_8bit: true
|
||||||
load_in_4bit: false
|
load_in_4bit: false
|
||||||
strict: false
|
|
||||||
|
|
||||||
chat_template: llama3
|
chat_template: llama3
|
||||||
rl: dpo
|
rl: dpo
|
||||||
@@ -58,7 +57,6 @@ lora_r: 32
|
|||||||
lora_alpha: 16
|
lora_alpha: 16
|
||||||
lora_dropout: 0.05
|
lora_dropout: 0.05
|
||||||
lora_target_linear: true
|
lora_target_linear: true
|
||||||
lora_fan_in_fan_out:
|
|
||||||
|
|
||||||
wandb_project:
|
wandb_project:
|
||||||
wandb_entity:
|
wandb_entity:
|
||||||
@@ -73,28 +71,15 @@ optimizer: adamw_bnb_8bit
|
|||||||
lr_scheduler: cosine
|
lr_scheduler: cosine
|
||||||
learning_rate: 0.0002
|
learning_rate: 0.0002
|
||||||
|
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: auto
|
bf16: auto
|
||||||
fp16:
|
|
||||||
tf32: false
|
tf32: false
|
||||||
|
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
local_rank:
|
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention:
|
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
s2_attention:
|
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
eval_table_size:
|
|
||||||
eval_max_new_tokens: 128
|
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
debug:
|
|
||||||
deepspeed:
|
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
fsdp:
|
|
||||||
fsdp_config:
|
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ tokenizer_type: AutoTokenizer
|
|||||||
|
|
||||||
load_in_8bit: true
|
load_in_8bit: true
|
||||||
load_in_4bit: false
|
load_in_4bit: false
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
@@ -19,7 +18,6 @@ val_set_size: 0.0
|
|||||||
output_dir: ./outputs/lora-out
|
output_dir: ./outputs/lora-out
|
||||||
|
|
||||||
dataset_exact_deduplication: true
|
dataset_exact_deduplication: true
|
||||||
test_value: true
|
|
||||||
|
|
||||||
sequence_len: 4096
|
sequence_len: 4096
|
||||||
sample_packing: true
|
sample_packing: true
|
||||||
@@ -32,7 +30,6 @@ lora_r: 32
|
|||||||
lora_alpha: 16
|
lora_alpha: 16
|
||||||
lora_dropout: 0.05
|
lora_dropout: 0.05
|
||||||
lora_target_linear: true
|
lora_target_linear: true
|
||||||
lora_fan_in_fan_out:
|
|
||||||
lora_modules_to_save:
|
lora_modules_to_save:
|
||||||
- embed_tokens
|
- embed_tokens
|
||||||
- lm_head
|
- lm_head
|
||||||
@@ -50,30 +47,17 @@ optimizer: adamw_bnb_8bit
|
|||||||
lr_scheduler: cosine
|
lr_scheduler: cosine
|
||||||
learning_rate: 0.0002
|
learning_rate: 0.0002
|
||||||
|
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: auto
|
bf16: auto
|
||||||
fp16:
|
|
||||||
tf32: false
|
tf32: false
|
||||||
|
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
local_rank:
|
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention:
|
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
s2_attention:
|
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
eval_table_size:
|
|
||||||
eval_max_new_tokens: 128
|
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
debug:
|
|
||||||
deepspeed:
|
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
fsdp:
|
|
||||||
fsdp_config:
|
|
||||||
special_tokens:
|
special_tokens:
|
||||||
pad_token: <|end_of_text|>
|
pad_token: <|end_of_text|>
|
||||||
|
|||||||
@@ -2,10 +2,6 @@ base_model: NousResearch/Llama-3.2-1B
|
|||||||
# Automatically upload checkpoint and final model to HF
|
# Automatically upload checkpoint and final model to HF
|
||||||
# hub_model_id: username/custom_model_name
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
load_in_8bit: false
|
|
||||||
load_in_4bit: false
|
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: teknium/GPT4-LLM-Cleaned
|
- path: teknium/GPT4-LLM-Cleaned
|
||||||
type: alpaca
|
type: alpaca
|
||||||
@@ -24,7 +20,6 @@ lora_r: 16
|
|||||||
lora_alpha: 32
|
lora_alpha: 32
|
||||||
# Currently, we don't support dropout with our custom Triton kernels
|
# Currently, we don't support dropout with our custom Triton kernels
|
||||||
# lora_dropout: 0.05
|
# lora_dropout: 0.05
|
||||||
lora_fan_in_fan_out:
|
|
||||||
lora_target_modules:
|
lora_target_modules:
|
||||||
- gate_proj
|
- gate_proj
|
||||||
- down_proj
|
- down_proj
|
||||||
@@ -53,18 +48,12 @@ optimizer: adamw_8bit
|
|||||||
lr_scheduler: cosine
|
lr_scheduler: cosine
|
||||||
learning_rate: 0.0002
|
learning_rate: 0.0002
|
||||||
|
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: auto
|
bf16: auto
|
||||||
fp16:
|
|
||||||
tf32: false
|
tf32: false
|
||||||
|
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
local_rank:
|
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention:
|
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
loss_watchdog_threshold: 5.0
|
loss_watchdog_threshold: 5.0
|
||||||
@@ -73,10 +62,6 @@ loss_watchdog_patience: 3
|
|||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
debug:
|
|
||||||
deepspeed:
|
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
fsdp:
|
|
||||||
fsdp_config:
|
|
||||||
special_tokens:
|
special_tokens:
|
||||||
pad_token: "<|end_of_text|>"
|
pad_token: "<|end_of_text|>"
|
||||||
|
|||||||
@@ -2,10 +2,6 @@ base_model: NousResearch/Llama-3.2-1B
|
|||||||
# Automatically upload checkpoint and final model to HF
|
# Automatically upload checkpoint and final model to HF
|
||||||
# hub_model_id: username/custom_model_name
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
load_in_8bit: false
|
|
||||||
load_in_4bit: false
|
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: teknium/GPT4-LLM-Cleaned
|
- path: teknium/GPT4-LLM-Cleaned
|
||||||
type: alpaca
|
type: alpaca
|
||||||
@@ -24,7 +20,6 @@ pad_to_sequence_len: true
|
|||||||
lora_r: 16
|
lora_r: 16
|
||||||
lora_alpha: 32
|
lora_alpha: 32
|
||||||
lora_dropout: 0.05
|
lora_dropout: 0.05
|
||||||
lora_fan_in_fan_out:
|
|
||||||
lora_target_modules:
|
lora_target_modules:
|
||||||
- gate_proj
|
- gate_proj
|
||||||
- down_proj
|
- down_proj
|
||||||
@@ -47,18 +42,12 @@ optimizer: adamw_8bit
|
|||||||
lr_scheduler: cosine
|
lr_scheduler: cosine
|
||||||
learning_rate: 0.0002
|
learning_rate: 0.0002
|
||||||
|
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: auto
|
bf16: auto
|
||||||
fp16:
|
|
||||||
tf32: false
|
tf32: false
|
||||||
|
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
local_rank:
|
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention:
|
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
loss_watchdog_threshold: 5.0
|
loss_watchdog_threshold: 5.0
|
||||||
@@ -67,11 +56,9 @@ loss_watchdog_patience: 3
|
|||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
debug:
|
|
||||||
deepspeed: deepspeed_configs/zero3.json
|
deepspeed: deepspeed_configs/zero3.json
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
fsdp:
|
|
||||||
fsdp_config:
|
|
||||||
special_tokens:
|
special_tokens:
|
||||||
pad_token: "<|end_of_text|>"
|
pad_token: "<|end_of_text|>"
|
||||||
|
|
||||||
|
|||||||
65
examples/llama-3/lora-1b-sample-packing-sequentially.yml
Normal file
65
examples/llama-3/lora-1b-sample-packing-sequentially.yml
Normal file
@@ -0,0 +1,65 @@
|
|||||||
|
base_model: meta-llama/Llama-3.2-1B
|
||||||
|
# optionally might have model_type or tokenizer_type
|
||||||
|
model_type: LlamaForCausalLM
|
||||||
|
tokenizer_type: AutoTokenizer
|
||||||
|
# Automatically upload checkpoint and final model to HF
|
||||||
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
|
load_in_8bit: true
|
||||||
|
load_in_4bit: false
|
||||||
|
|
||||||
|
datasets:
|
||||||
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
|
type: alpaca
|
||||||
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
|
type: alpaca
|
||||||
|
dataset_prepared_path:
|
||||||
|
val_set_size: 0.0
|
||||||
|
output_dir: ./outputs/lora-out
|
||||||
|
|
||||||
|
test_value: true
|
||||||
|
|
||||||
|
sequence_len: 4096
|
||||||
|
sample_packing: true
|
||||||
|
sample_packing_sequentially: true
|
||||||
|
curriculum_sampling: true
|
||||||
|
eval_sample_packing: false
|
||||||
|
pad_to_sequence_len: true
|
||||||
|
|
||||||
|
adapter: lora
|
||||||
|
lora_model_dir:
|
||||||
|
lora_r: 32
|
||||||
|
lora_alpha: 16
|
||||||
|
lora_dropout: 0.05
|
||||||
|
lora_target_linear: true
|
||||||
|
lora_modules_to_save:
|
||||||
|
- embed_tokens
|
||||||
|
- lm_head
|
||||||
|
|
||||||
|
wandb_project:
|
||||||
|
wandb_entity:
|
||||||
|
wandb_watch:
|
||||||
|
wandb_name:
|
||||||
|
wandb_log_model:
|
||||||
|
|
||||||
|
gradient_accumulation_steps: 4
|
||||||
|
micro_batch_size: 2
|
||||||
|
num_epochs: 4
|
||||||
|
optimizer: adamw_bnb_8bit
|
||||||
|
lr_scheduler: cosine
|
||||||
|
learning_rate: 0.0002
|
||||||
|
|
||||||
|
bf16: auto
|
||||||
|
tf32: false
|
||||||
|
|
||||||
|
gradient_checkpointing: true
|
||||||
|
resume_from_checkpoint:
|
||||||
|
logging_steps: 1
|
||||||
|
flash_attention: true
|
||||||
|
|
||||||
|
warmup_steps: 10
|
||||||
|
evals_per_epoch: 4
|
||||||
|
saves_per_epoch: 1
|
||||||
|
weight_decay: 0.0
|
||||||
|
special_tokens:
|
||||||
|
pad_token: <|end_of_text|>
|
||||||
@@ -2,10 +2,6 @@ base_model: NousResearch/Llama-3.2-1B
|
|||||||
# Automatically upload checkpoint and final model to HF
|
# Automatically upload checkpoint and final model to HF
|
||||||
# hub_model_id: username/custom_model_name
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
load_in_8bit: false
|
|
||||||
load_in_4bit: false
|
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: teknium/GPT4-LLM-Cleaned
|
- path: teknium/GPT4-LLM-Cleaned
|
||||||
type: alpaca
|
type: alpaca
|
||||||
@@ -24,7 +20,6 @@ pad_to_sequence_len: true
|
|||||||
lora_r: 16
|
lora_r: 16
|
||||||
lora_alpha: 32
|
lora_alpha: 32
|
||||||
lora_dropout: 0.05
|
lora_dropout: 0.05
|
||||||
lora_fan_in_fan_out:
|
|
||||||
lora_target_modules:
|
lora_target_modules:
|
||||||
- gate_proj
|
- gate_proj
|
||||||
- down_proj
|
- down_proj
|
||||||
@@ -47,18 +42,12 @@ optimizer: adamw_8bit
|
|||||||
lr_scheduler: cosine
|
lr_scheduler: cosine
|
||||||
learning_rate: 0.0002
|
learning_rate: 0.0002
|
||||||
|
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: auto
|
bf16: auto
|
||||||
fp16:
|
|
||||||
tf32: false
|
tf32: false
|
||||||
|
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
local_rank:
|
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention:
|
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
loss_watchdog_threshold: 5.0
|
loss_watchdog_threshold: 5.0
|
||||||
@@ -67,10 +56,6 @@ loss_watchdog_patience: 3
|
|||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
debug:
|
|
||||||
deepspeed:
|
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
fsdp:
|
|
||||||
fsdp_config:
|
|
||||||
special_tokens:
|
special_tokens:
|
||||||
pad_token: "<|end_of_text|>"
|
pad_token: "<|end_of_text|>"
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ tokenizer_type: AutoTokenizer
|
|||||||
|
|
||||||
load_in_8bit: true
|
load_in_8bit: true
|
||||||
load_in_4bit: false
|
load_in_4bit: false
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
@@ -27,7 +26,6 @@ lora_r: 32
|
|||||||
lora_alpha: 16
|
lora_alpha: 16
|
||||||
lora_dropout: 0.05
|
lora_dropout: 0.05
|
||||||
lora_target_linear: true
|
lora_target_linear: true
|
||||||
lora_fan_in_fan_out:
|
|
||||||
lora_modules_to_save:
|
lora_modules_to_save:
|
||||||
- embed_tokens
|
- embed_tokens
|
||||||
- lm_head
|
- lm_head
|
||||||
@@ -45,30 +43,17 @@ optimizer: adamw_bnb_8bit
|
|||||||
lr_scheduler: cosine
|
lr_scheduler: cosine
|
||||||
learning_rate: 0.0002
|
learning_rate: 0.0002
|
||||||
|
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: auto
|
bf16: auto
|
||||||
fp16:
|
|
||||||
tf32: false
|
tf32: false
|
||||||
|
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
local_rank:
|
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention:
|
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
s2_attention:
|
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
eval_table_size:
|
|
||||||
eval_max_new_tokens: 128
|
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
debug:
|
|
||||||
deepspeed:
|
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
fsdp:
|
|
||||||
fsdp_config:
|
|
||||||
special_tokens:
|
special_tokens:
|
||||||
pad_token: <|end_of_text|>
|
pad_token: <|end_of_text|>
|
||||||
|
|||||||
@@ -4,7 +4,6 @@ base_model: meta-llama/Llama-3.2-1B
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
strict: false
|
|
||||||
|
|
||||||
rl: kto
|
rl: kto
|
||||||
rl_beta: 0.5
|
rl_beta: 0.5
|
||||||
@@ -32,7 +31,6 @@ lora_r: 32
|
|||||||
lora_alpha: 64
|
lora_alpha: 64
|
||||||
lora_dropout: 0.05
|
lora_dropout: 0.05
|
||||||
lora_target_linear: true
|
lora_target_linear: true
|
||||||
lora_fan_in_fan_out:
|
|
||||||
|
|
||||||
wandb_project:
|
wandb_project:
|
||||||
wandb_entity:
|
wandb_entity:
|
||||||
@@ -47,31 +45,19 @@ optimizer: adamw_8bit
|
|||||||
lr_scheduler: cosine
|
lr_scheduler: cosine
|
||||||
learning_rate: 0.0002
|
learning_rate: 0.0002
|
||||||
|
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: auto
|
bf16: auto
|
||||||
fp16:
|
|
||||||
tf32: true
|
tf32: true
|
||||||
|
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
gradient_checkpointing_kwargs:
|
gradient_checkpointing_kwargs:
|
||||||
use_reentrant: false
|
use_reentrant: false
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
local_rank:
|
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention:
|
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 20
|
warmup_steps: 20
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
eval_table_size:
|
|
||||||
eval_max_new_tokens: 128
|
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
debug:
|
|
||||||
deepspeed:
|
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
fsdp:
|
|
||||||
fsdp_config:
|
|
||||||
special_tokens:
|
special_tokens:
|
||||||
pad_token: "<|end_of_text|>"
|
pad_token: "<|end_of_text|>"
|
||||||
|
|||||||
@@ -4,7 +4,6 @@ base_model: NousResearch/Llama-3.2-1B
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: teknium/GPT4-LLM-Cleaned
|
- path: teknium/GPT4-LLM-Cleaned
|
||||||
@@ -24,7 +23,6 @@ pad_to_sequence_len: true
|
|||||||
lora_r: 32
|
lora_r: 32
|
||||||
lora_alpha: 16
|
lora_alpha: 16
|
||||||
lora_dropout: 0.05
|
lora_dropout: 0.05
|
||||||
lora_fan_in_fan_out:
|
|
||||||
lora_target_modules:
|
lora_target_modules:
|
||||||
- gate_proj
|
- gate_proj
|
||||||
- down_proj
|
- down_proj
|
||||||
@@ -47,18 +45,12 @@ optimizer: adamw_bnb_8bit
|
|||||||
lr_scheduler: cosine
|
lr_scheduler: cosine
|
||||||
learning_rate: 0.0002
|
learning_rate: 0.0002
|
||||||
|
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: auto
|
bf16: auto
|
||||||
fp16:
|
|
||||||
tf32: false
|
tf32: false
|
||||||
|
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
local_rank:
|
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention:
|
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
loss_watchdog_threshold: 5.0
|
loss_watchdog_threshold: 5.0
|
||||||
@@ -66,13 +58,7 @@ loss_watchdog_patience: 3
|
|||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
eval_table_size:
|
|
||||||
eval_max_new_tokens: 128
|
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
debug:
|
|
||||||
deepspeed:
|
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
fsdp:
|
|
||||||
fsdp_config:
|
|
||||||
special_tokens:
|
special_tokens:
|
||||||
pad_token: "<|end_of_text|>"
|
pad_token: "<|end_of_text|>"
|
||||||
|
|||||||
@@ -5,7 +5,6 @@ tokenizer_type: AutoTokenizer
|
|||||||
# hub_model_id: username/custom_model_name
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: tatsu-lab/alpaca
|
- path: tatsu-lab/alpaca
|
||||||
@@ -24,7 +23,6 @@ pad_to_sequence_len: true
|
|||||||
lora_r: 16
|
lora_r: 16
|
||||||
lora_alpha: 16
|
lora_alpha: 16
|
||||||
lora_dropout: 0.05
|
lora_dropout: 0.05
|
||||||
lora_target_modules:
|
|
||||||
lora_target_linear: true
|
lora_target_linear: true
|
||||||
|
|
||||||
gradient_accumulation_steps: 4
|
gradient_accumulation_steps: 4
|
||||||
@@ -34,8 +32,6 @@ optimizer: adamw_torch_fused
|
|||||||
lr_scheduler: cosine
|
lr_scheduler: cosine
|
||||||
learning_rate: 0.00001
|
learning_rate: 0.00001
|
||||||
|
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: true
|
bf16: true
|
||||||
tf32: true
|
tf32: true
|
||||||
|
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ tokenizer_type: AutoTokenizer # PreTrainedTokenizerFast
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: tatsu-lab/alpaca
|
- path: tatsu-lab/alpaca
|
||||||
@@ -26,9 +25,7 @@ pad_to_sequence_len: true
|
|||||||
lora_r: 8
|
lora_r: 8
|
||||||
lora_alpha: 16
|
lora_alpha: 16
|
||||||
lora_dropout: 0.05
|
lora_dropout: 0.05
|
||||||
lora_target_modules:
|
|
||||||
lora_target_linear: true
|
lora_target_linear: true
|
||||||
lora_fan_in_fan_out:
|
|
||||||
|
|
||||||
wandb_project:
|
wandb_project:
|
||||||
wandb_entity:
|
wandb_entity:
|
||||||
@@ -43,28 +40,19 @@ optimizer: adamw_torch_fused
|
|||||||
lr_scheduler: cosine
|
lr_scheduler: cosine
|
||||||
learning_rate: 0.00001
|
learning_rate: 0.00001
|
||||||
|
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: auto
|
bf16: auto
|
||||||
fp16:
|
|
||||||
tf32: false
|
tf32: false
|
||||||
|
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
gradient_checkpointing_kwargs:
|
gradient_checkpointing_kwargs:
|
||||||
use_reentrant: true
|
use_reentrant: true
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
local_rank:
|
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention:
|
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
eval_table_size:
|
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
debug:
|
|
||||||
deepspeed:
|
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
fsdp:
|
fsdp:
|
||||||
- full_shard
|
- full_shard
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ tokenizer_type: AutoTokenizer
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: aaditya/alpaca_subset_1
|
- path: aaditya/alpaca_subset_1
|
||||||
@@ -26,9 +25,7 @@ pad_to_sequence_len: true
|
|||||||
lora_r: 32
|
lora_r: 32
|
||||||
lora_alpha: 16
|
lora_alpha: 16
|
||||||
lora_dropout: 0.05
|
lora_dropout: 0.05
|
||||||
lora_target_modules:
|
|
||||||
lora_target_linear: true
|
lora_target_linear: true
|
||||||
lora_fan_in_fan_out:
|
|
||||||
|
|
||||||
wandb_project:
|
wandb_project:
|
||||||
wandb_entity:
|
wandb_entity:
|
||||||
@@ -43,28 +40,17 @@ optimizer: paged_adamw_32bit
|
|||||||
lr_scheduler: cosine
|
lr_scheduler: cosine
|
||||||
learning_rate: 0.0002
|
learning_rate: 0.0002
|
||||||
|
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: auto
|
bf16: auto
|
||||||
fp16:
|
|
||||||
tf32: false
|
tf32: false
|
||||||
|
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
local_rank:
|
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention:
|
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
eval_table_size:
|
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
debug:
|
|
||||||
deepspeed:
|
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
fsdp:
|
|
||||||
fsdp_config:
|
|
||||||
special_tokens:
|
special_tokens:
|
||||||
pad_token: "<|end_of_text|>"
|
pad_token: "<|end_of_text|>"
|
||||||
|
|||||||
28
examples/llama-4/README.md
Normal file
28
examples/llama-4/README.md
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
# Llama 4 by Meta AI
|
||||||
|
|
||||||
|
## Flash Attention vs Flex Attention
|
||||||
|
|
||||||
|
While Flash Attention support is "enabled" for Llama-4, the upstream implementation is not correct, so using Flex Attention is recommended.
|
||||||
|
|
||||||
|
## Available Examples
|
||||||
|
|
||||||
|
### Llama 4 Scout 17Bx16Experts (109B)
|
||||||
|
|
||||||
|
Flex Attention
|
||||||
|
- [Text Single GPU (H100) QLoRA](./scout-qlora-single-h100-flex.yaml)
|
||||||
|
- [Text Multi GPU QLoRA w/ FSDP2](./scout-qlora-flexattn-fsdp2.yaml)
|
||||||
|
|
||||||
|
[//]: # (Flash Attention (Do not use))
|
||||||
|
|
||||||
|
[//]: # (- [Multi-Modal/Vision QLoRA w/ FSDP1](./scout-vision-qlora-fsdp.yaml))
|
||||||
|
|
||||||
|
[//]: # (- [Text Single GPU (H100) QLoRA](./scout-qlora-single-h100.yaml))
|
||||||
|
|
||||||
|
[//]: # (- [Text Multi GPU QLoRA w/ FSDP1](./scout-qlora-fsdp1.yaml))
|
||||||
|
|
||||||
|
Our Single H100 implementation for Llama 4 Scout uses only 64.5GB VRAM for post-training with 4k context length @ 519 tokens/second. [WandB logs here](https://wandb.ai/axolotl-ai/llama4-flexattn-qlora/runs/wpie7dkj)
|
||||||
|
Multi-GPU (4xH100) for Llama 4 Scout uses 62.8GB VRAM/GPU with 4k context length @ 280 tokens/second per GPU. [WandB logs here](https://wandb.ai/axolotl-ai/llama4-flexattn-qlora/runs/2lkezdj8)
|
||||||
|
|
||||||
|
### Llama 4 Maverick 17Bx128Experts (400B)
|
||||||
|
|
||||||
|
Coming Soon
|
||||||
88
examples/llama-4/do-no-use-fa2/maverick-qlora-fsdp1.yaml
Normal file
88
examples/llama-4/do-no-use-fa2/maverick-qlora-fsdp1.yaml
Normal file
@@ -0,0 +1,88 @@
|
|||||||
|
base_model: axolotl-quants/Llama-4-Maverick-17B-128E-Linearized-bnb-nf4-bf16
|
||||||
|
model_type: Llama4ForConditionalGeneration
|
||||||
|
# Automatically upload checkpoint and final model to HF
|
||||||
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
|
|
||||||
|
plugins:
|
||||||
|
- axolotl.integrations.liger.LigerPlugin
|
||||||
|
|
||||||
|
liger_glu_activation: true
|
||||||
|
liger_rms_norm: true
|
||||||
|
liger_layer_norm: true
|
||||||
|
|
||||||
|
llama4_linearized_experts: true
|
||||||
|
load_in_4bit: true
|
||||||
|
adapter: qlora
|
||||||
|
lora_r: 32
|
||||||
|
lora_alpha: 64
|
||||||
|
lora_target_modules:
|
||||||
|
- self_attn.q_proj
|
||||||
|
- self_attn.k_proj
|
||||||
|
- self_attn.v_proj
|
||||||
|
- self_attn.o_proj
|
||||||
|
- shared_expert.gate_proj
|
||||||
|
- shared_expert.up_proj
|
||||||
|
- shared_expert.down_proj
|
||||||
|
# - experts.gate_projs.[0-9]+$
|
||||||
|
# - experts.up_projs.[0-9]+$
|
||||||
|
# - experts.down_projs.[0-9]+$
|
||||||
|
lora_modules_to_save:
|
||||||
|
# - lm_head
|
||||||
|
# - embed_tokens
|
||||||
|
|
||||||
|
chat_template: llama4
|
||||||
|
datasets:
|
||||||
|
- path: mlabonne/FineTome-100k
|
||||||
|
type: chat_template
|
||||||
|
split: train[:20%]
|
||||||
|
field_messages: conversations
|
||||||
|
message_property_mappings:
|
||||||
|
role: from
|
||||||
|
content: value
|
||||||
|
|
||||||
|
dataset_prepared_path: last_run_prepared
|
||||||
|
val_set_size: 0.0
|
||||||
|
output_dir: ./outputs/out
|
||||||
|
|
||||||
|
sequence_len: 4096
|
||||||
|
sample_packing: true
|
||||||
|
pad_to_sequence_len: true
|
||||||
|
|
||||||
|
gradient_accumulation_steps: 1
|
||||||
|
micro_batch_size: 1
|
||||||
|
num_epochs: 1
|
||||||
|
optimizer: adamw_torch_fused
|
||||||
|
lr_scheduler: cosine
|
||||||
|
learning_rate: 1e-4
|
||||||
|
|
||||||
|
bf16: true
|
||||||
|
tf32: true
|
||||||
|
|
||||||
|
logging_steps: 1
|
||||||
|
flash_attention: true
|
||||||
|
|
||||||
|
gradient_checkpointing: offload
|
||||||
|
gradient_checkpointing_kwargs:
|
||||||
|
use_reentrant: false
|
||||||
|
|
||||||
|
warmup_steps: 20
|
||||||
|
evals_per_epoch: 1
|
||||||
|
saves_per_epoch: 1
|
||||||
|
weight_decay: 0.0
|
||||||
|
fsdp:
|
||||||
|
- auto_wrap
|
||||||
|
- full_shard
|
||||||
|
fsdp_config:
|
||||||
|
fsdp_transformer_layer_cls_to_wrap: Llama4TextDecoderLayer
|
||||||
|
fsdp_limit_all_gathers: true
|
||||||
|
fsdp_sync_module_states: true
|
||||||
|
fsdp_offload_params: true
|
||||||
|
fsdp_use_orig_params: false
|
||||||
|
fsdp_cpu_ram_efficient_loading: true
|
||||||
|
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||||
|
fsdp_state_dict_type: FULL_STATE_DICT
|
||||||
|
fsdp_sharding_strategy: FULL_SHARD
|
||||||
|
special_tokens:
|
||||||
|
pad_token: <|finetune_right_pad_id|>
|
||||||
|
eos_token: <|eot|>
|
||||||
92
examples/llama-4/do-no-use-fa2/scout-qlora-fsdp1.yaml
Normal file
92
examples/llama-4/do-no-use-fa2/scout-qlora-fsdp1.yaml
Normal file
@@ -0,0 +1,92 @@
|
|||||||
|
base_model: axolotl-quants/Llama-4-Scout-17B-16E-Linearized-bnb-nf4-bf16
|
||||||
|
model_type: Llama4ForConditionalGeneration
|
||||||
|
# Automatically upload checkpoint and final model to HF
|
||||||
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
|
|
||||||
|
# torch_compile: true
|
||||||
|
plugins:
|
||||||
|
- axolotl.integrations.liger.LigerPlugin
|
||||||
|
|
||||||
|
liger_glu_activation: true
|
||||||
|
liger_rms_norm: true
|
||||||
|
liger_layer_norm: true
|
||||||
|
|
||||||
|
llama4_linearized_experts: true
|
||||||
|
load_in_4bit: true
|
||||||
|
adapter: qlora
|
||||||
|
lora_r: 32
|
||||||
|
lora_alpha: 64
|
||||||
|
lora_target_modules:
|
||||||
|
- self_attn.q_proj
|
||||||
|
- self_attn.k_proj
|
||||||
|
- self_attn.v_proj
|
||||||
|
- self_attn.o_proj
|
||||||
|
- shared_expert.gate_proj
|
||||||
|
- shared_expert.up_proj
|
||||||
|
- shared_expert.down_proj
|
||||||
|
# - experts.gate_projs.[0-9]+$
|
||||||
|
# - experts.up_projs.[0-9]+$
|
||||||
|
# - experts.down_projs.[0-9]+$
|
||||||
|
lora_modules_to_save:
|
||||||
|
- lm_head
|
||||||
|
- embed_tokens
|
||||||
|
|
||||||
|
chat_template: llama4
|
||||||
|
datasets:
|
||||||
|
- path: mlabonne/FineTome-100k
|
||||||
|
type: chat_template
|
||||||
|
split: train[:20%]
|
||||||
|
field_messages: conversations
|
||||||
|
message_property_mappings:
|
||||||
|
role: from
|
||||||
|
content: value
|
||||||
|
|
||||||
|
dataset_prepared_path: last_run_prepared
|
||||||
|
val_set_size: 0.0
|
||||||
|
output_dir: ./outputs/out
|
||||||
|
|
||||||
|
sequence_len: 4096
|
||||||
|
sample_packing: true
|
||||||
|
pad_to_sequence_len: true
|
||||||
|
|
||||||
|
wandb_project:
|
||||||
|
wandb_entity:
|
||||||
|
wandb_watch:
|
||||||
|
wandb_name:
|
||||||
|
wandb_log_model:
|
||||||
|
|
||||||
|
gradient_accumulation_steps: 1
|
||||||
|
micro_batch_size: 1
|
||||||
|
num_epochs: 1
|
||||||
|
optimizer: adamw_torch_fused
|
||||||
|
lr_scheduler: cosine
|
||||||
|
learning_rate: 2e-5
|
||||||
|
|
||||||
|
bf16: true
|
||||||
|
tf32: true
|
||||||
|
|
||||||
|
logging_steps: 1
|
||||||
|
flash_attention: true
|
||||||
|
|
||||||
|
warmup_steps: 100
|
||||||
|
evals_per_epoch: 1
|
||||||
|
saves_per_epoch: 1
|
||||||
|
weight_decay: 0.0
|
||||||
|
fsdp:
|
||||||
|
- auto_wrap
|
||||||
|
- full_shard
|
||||||
|
fsdp_config:
|
||||||
|
fsdp_transformer_layer_cls_to_wrap: Llama4TextDecoderLayer
|
||||||
|
fsdp_limit_all_gathers: true
|
||||||
|
fsdp_sync_module_states: true
|
||||||
|
fsdp_offload_params: true
|
||||||
|
fsdp_use_orig_params: false
|
||||||
|
fsdp_cpu_ram_efficient_loading: true
|
||||||
|
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||||
|
fsdp_state_dict_type: FULL_STATE_DICT
|
||||||
|
fsdp_sharding_strategy: FULL_SHARD
|
||||||
|
fsdp_activation_checkpointing: true
|
||||||
|
special_tokens:
|
||||||
|
pad_token: <|finetune_right_pad_id|>
|
||||||
|
eos_token: <|eot|>
|
||||||
85
examples/llama-4/do-no-use-fa2/scout-qlora-single-h100.yaml
Normal file
85
examples/llama-4/do-no-use-fa2/scout-qlora-single-h100.yaml
Normal file
@@ -0,0 +1,85 @@
|
|||||||
|
base_model: axolotl-quants/Llama-4-Scout-17B-16E-Linearized-bnb-nf4-bf16
|
||||||
|
model_type: Llama4ForConditionalGeneration
|
||||||
|
# Automatically upload checkpoint and final model to HF
|
||||||
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
|
|
||||||
|
plugins:
|
||||||
|
- axolotl.integrations.liger.LigerPlugin
|
||||||
|
|
||||||
|
liger_glu_activation: true
|
||||||
|
liger_rms_norm: true
|
||||||
|
liger_layer_norm: true
|
||||||
|
|
||||||
|
llama4_linearized_experts: true
|
||||||
|
load_in_4bit: true
|
||||||
|
adapter: qlora
|
||||||
|
lora_r: 32
|
||||||
|
lora_alpha: 64
|
||||||
|
lora_target_modules:
|
||||||
|
- self_attn.q_proj
|
||||||
|
- self_attn.k_proj
|
||||||
|
- self_attn.v_proj
|
||||||
|
- self_attn.o_proj
|
||||||
|
- shared_expert.gate_proj
|
||||||
|
- shared_expert.up_proj
|
||||||
|
- shared_expert.down_proj
|
||||||
|
# - experts.gate_projs.[0-9]+$
|
||||||
|
# - experts.up_projs.[0-9]+$
|
||||||
|
# - experts.down_projs.[0-9]+$
|
||||||
|
lora_modules_to_save:
|
||||||
|
# - lm_head
|
||||||
|
# - embed_tokens
|
||||||
|
|
||||||
|
lora_mlp_kernel: true
|
||||||
|
lora_qkv_kernel: true
|
||||||
|
lora_o_kernel: true
|
||||||
|
|
||||||
|
chat_template: llama4
|
||||||
|
datasets:
|
||||||
|
- path: mlabonne/FineTome-100k
|
||||||
|
type: chat_template
|
||||||
|
split: train[:20%]
|
||||||
|
field_messages: conversations
|
||||||
|
message_property_mappings:
|
||||||
|
role: from
|
||||||
|
content: value
|
||||||
|
|
||||||
|
dataset_prepared_path: last_run_prepared
|
||||||
|
val_set_size: 0.0
|
||||||
|
output_dir: ./outputs/out
|
||||||
|
|
||||||
|
sequence_len: 4096 # up to 8k will work on a single H100
|
||||||
|
sample_packing: true
|
||||||
|
pad_to_sequence_len: true
|
||||||
|
|
||||||
|
wandb_project:
|
||||||
|
wandb_entity:
|
||||||
|
wandb_watch:
|
||||||
|
wandb_name:
|
||||||
|
wandb_log_model:
|
||||||
|
|
||||||
|
gradient_accumulation_steps: 1
|
||||||
|
micro_batch_size: 1
|
||||||
|
num_epochs: 1
|
||||||
|
optimizer: adamw_torch_4bit
|
||||||
|
lr_scheduler: cosine
|
||||||
|
learning_rate: 1e-4
|
||||||
|
|
||||||
|
bf16: true
|
||||||
|
tf32: true
|
||||||
|
|
||||||
|
logging_steps: 1
|
||||||
|
flash_attention: true
|
||||||
|
|
||||||
|
gradient_checkpointing: offload
|
||||||
|
gradient_checkpointing_kwargs:
|
||||||
|
use_reentrant: false
|
||||||
|
|
||||||
|
warmup_steps: 20
|
||||||
|
evals_per_epoch: 1
|
||||||
|
saves_per_epoch: 1
|
||||||
|
weight_decay: 0.0
|
||||||
|
special_tokens:
|
||||||
|
pad_token: <|finetune_right_pad_id|>
|
||||||
|
eos_token: <|eot|>
|
||||||
88
examples/llama-4/do-no-use-fa2/scout-vision-qlora-fsdp.yaml
Normal file
88
examples/llama-4/do-no-use-fa2/scout-vision-qlora-fsdp.yaml
Normal file
@@ -0,0 +1,88 @@
|
|||||||
|
base_model: axolotl-quants/Llama-4-Scout-17B-16E-Linearized-bnb-nf4-bf16
|
||||||
|
model_type: Llama4ForConditionalGeneration
|
||||||
|
processor_type: Llama4Processor
|
||||||
|
# Automatically upload checkpoint and final model to HF
|
||||||
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
|
|
||||||
|
# these 3 lines are needed for now to handle vision chat templates w images
|
||||||
|
skip_prepare_dataset: true
|
||||||
|
remove_unused_columns: false
|
||||||
|
sample_packing: false
|
||||||
|
|
||||||
|
sequence_len: 4096
|
||||||
|
|
||||||
|
plugins:
|
||||||
|
- axolotl.integrations.liger.LigerPlugin
|
||||||
|
|
||||||
|
liger_glu_activation: true
|
||||||
|
liger_rms_norm: true
|
||||||
|
liger_layer_norm: true
|
||||||
|
|
||||||
|
llama4_linearized_experts: true # use Axolotl's customized model
|
||||||
|
load_in_4bit: true
|
||||||
|
adapter: qlora
|
||||||
|
lora_r: 32
|
||||||
|
lora_alpha: 64
|
||||||
|
lora_target_modules:
|
||||||
|
- self_attn.q_proj
|
||||||
|
- self_attn.k_proj
|
||||||
|
- self_attn.v_proj
|
||||||
|
- self_attn.o_proj
|
||||||
|
- shared_expert.gate_proj
|
||||||
|
- shared_expert.up_proj
|
||||||
|
- shared_expert.down_proj
|
||||||
|
- vision_adapter.mlp.fc1
|
||||||
|
- vision_adapter.mlp.fc2
|
||||||
|
# - experts.gate_projs.[0-9]+$
|
||||||
|
# - experts.up_projs.[0-9]+$
|
||||||
|
# - experts.down_projs.[0-9]+$
|
||||||
|
lora_modules_to_save:
|
||||||
|
- lm_head
|
||||||
|
- embed_tokens
|
||||||
|
|
||||||
|
chat_template: llama4
|
||||||
|
datasets:
|
||||||
|
- path: HuggingFaceH4/llava-instruct-mix-vsft
|
||||||
|
type: chat_template
|
||||||
|
split: train[:1%]
|
||||||
|
field_messages: messages
|
||||||
|
|
||||||
|
dataset_prepared_path: last_run_prepared
|
||||||
|
val_set_size: 0.0
|
||||||
|
output_dir: ./outputs/out
|
||||||
|
|
||||||
|
gradient_accumulation_steps: 1
|
||||||
|
micro_batch_size: 1
|
||||||
|
num_epochs: 1
|
||||||
|
optimizer: adamw_torch_4bit
|
||||||
|
lr_scheduler: cosine
|
||||||
|
learning_rate: 2e-5
|
||||||
|
|
||||||
|
bf16: true
|
||||||
|
tf32: true
|
||||||
|
|
||||||
|
logging_steps: 1
|
||||||
|
flash_attention: true
|
||||||
|
|
||||||
|
warmup_steps: 100
|
||||||
|
evals_per_epoch: 1
|
||||||
|
saves_per_epoch: 1
|
||||||
|
weight_decay: 0.0
|
||||||
|
fsdp:
|
||||||
|
- auto_wrap
|
||||||
|
- full_shard
|
||||||
|
fsdp_config:
|
||||||
|
fsdp_transformer_layer_cls_to_wrap: Llama4TextDecoderLayer
|
||||||
|
fsdp_limit_all_gathers: true
|
||||||
|
fsdp_sync_module_states: true
|
||||||
|
fsdp_offload_params: true
|
||||||
|
fsdp_use_orig_params: false
|
||||||
|
fsdp_cpu_ram_efficient_loading: true
|
||||||
|
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||||
|
fsdp_state_dict_type: FULL_STATE_DICT
|
||||||
|
fsdp_sharding_strategy: FULL_SHARD
|
||||||
|
fsdp_activation_checkpointing: true
|
||||||
|
special_tokens:
|
||||||
|
pad_token: <|finetune_right_pad_id|>
|
||||||
|
eos_token: <|eot|>
|
||||||
86
examples/llama-4/scout-qlora-flexattn-fsdp2.yaml
Normal file
86
examples/llama-4/scout-qlora-flexattn-fsdp2.yaml
Normal file
@@ -0,0 +1,86 @@
|
|||||||
|
base_model: axolotl-quants/Llama-4-Scout-17B-16E-Linearized-bnb-nf4-bf16
|
||||||
|
model_type: Llama4ForConditionalGeneration
|
||||||
|
# Automatically upload checkpoint and final model to HF
|
||||||
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
|
plugins:
|
||||||
|
- axolotl.integrations.liger.LigerPlugin
|
||||||
|
|
||||||
|
liger_glu_activation: true
|
||||||
|
liger_rms_norm: true
|
||||||
|
liger_layer_norm: true
|
||||||
|
|
||||||
|
llama4_linearized_experts: true
|
||||||
|
load_in_4bit: true
|
||||||
|
adapter: qlora
|
||||||
|
lora_r: 32
|
||||||
|
lora_alpha: 64
|
||||||
|
lora_target_modules:
|
||||||
|
- self_attn.q_proj
|
||||||
|
- self_attn.k_proj
|
||||||
|
- self_attn.v_proj
|
||||||
|
- self_attn.o_proj
|
||||||
|
- shared_expert.gate_proj
|
||||||
|
- shared_expert.up_proj
|
||||||
|
- shared_expert.down_proj
|
||||||
|
# - experts.gate_projs.[0-9]+$
|
||||||
|
# - experts.up_projs.[0-9]+$
|
||||||
|
# - experts.down_projs.[0-9]+$
|
||||||
|
lora_modules_to_save:
|
||||||
|
# - lm_head
|
||||||
|
# - embed_tokens
|
||||||
|
|
||||||
|
chat_template: llama4
|
||||||
|
datasets:
|
||||||
|
- path: mlabonne/FineTome-100k
|
||||||
|
type: chat_template
|
||||||
|
split: train[:20%]
|
||||||
|
field_messages: conversations
|
||||||
|
message_property_mappings:
|
||||||
|
role: from
|
||||||
|
content: value
|
||||||
|
|
||||||
|
dataset_prepared_path: last_run_prepared
|
||||||
|
val_set_size: 0.0
|
||||||
|
output_dir: ./outputs/out
|
||||||
|
|
||||||
|
sequence_len: 4096
|
||||||
|
sample_packing: true
|
||||||
|
pad_to_sequence_len: true
|
||||||
|
|
||||||
|
gradient_accumulation_steps: 1
|
||||||
|
micro_batch_size: 2
|
||||||
|
num_epochs: 3
|
||||||
|
optimizer: adamw_torch_4bit
|
||||||
|
lr_scheduler: cosine
|
||||||
|
learning_rate: 1e-4
|
||||||
|
|
||||||
|
bf16: true
|
||||||
|
tf32: true
|
||||||
|
|
||||||
|
logging_steps: 1
|
||||||
|
flex_attention: true
|
||||||
|
flex_attn_compile_kwargs:
|
||||||
|
dynamic: false
|
||||||
|
mode: max-autotune-no-cudagraphs
|
||||||
|
|
||||||
|
warmup_steps: 10
|
||||||
|
evals_per_epoch: 1
|
||||||
|
saves_per_epoch: 1
|
||||||
|
weight_decay: 0.0
|
||||||
|
fsdp:
|
||||||
|
- auto_wrap
|
||||||
|
- full_shard
|
||||||
|
fsdp_config:
|
||||||
|
fsdp_version: 2
|
||||||
|
fsdp_offload_params: false
|
||||||
|
fsdp_cpu_ram_efficient_loading: true
|
||||||
|
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||||
|
fsdp_transformer_layer_cls_to_wrap: Llama4TextDecoderLayer
|
||||||
|
fsdp_state_dict_type: SHARDED_STATE_DICT
|
||||||
|
fsdp_sharding_strategy: FULL_SHARD
|
||||||
|
fsdp_reshard_after_forward: true
|
||||||
|
fsdp_activation_checkpointing: true
|
||||||
|
special_tokens:
|
||||||
|
pad_token: <|finetune_right_pad_id|>
|
||||||
|
eos_token: <|eot|>
|
||||||
85
examples/llama-4/scout-qlora-single-h100-flex.yaml
Normal file
85
examples/llama-4/scout-qlora-single-h100-flex.yaml
Normal file
@@ -0,0 +1,85 @@
|
|||||||
|
base_model: axolotl-quants/Llama-4-Scout-17B-16E-Linearized-bnb-nf4-bf16
|
||||||
|
model_type: Llama4ForConditionalGeneration
|
||||||
|
# Automatically upload checkpoint and final model to HF
|
||||||
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
|
plugins:
|
||||||
|
- axolotl.integrations.liger.LigerPlugin
|
||||||
|
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
|
||||||
|
|
||||||
|
liger_glu_activation: true
|
||||||
|
liger_rms_norm: true
|
||||||
|
liger_layer_norm: true
|
||||||
|
cut_cross_entropy: true
|
||||||
|
|
||||||
|
llama4_linearized_experts: true # needed with custom linearized experts model
|
||||||
|
load_in_4bit: true
|
||||||
|
adapter: qlora
|
||||||
|
lora_r: 32
|
||||||
|
lora_alpha: 64
|
||||||
|
lora_target_modules:
|
||||||
|
- self_attn.q_proj
|
||||||
|
- self_attn.k_proj
|
||||||
|
- self_attn.v_proj
|
||||||
|
- self_attn.o_proj
|
||||||
|
- shared_expert.gate_proj
|
||||||
|
- shared_expert.up_proj
|
||||||
|
- shared_expert.down_proj
|
||||||
|
# - experts.gate_projs.[0-9]+$ # optionally train the moe experts
|
||||||
|
# - experts.up_projs.[0-9]+$
|
||||||
|
# - experts.down_projs.[0-9]+$
|
||||||
|
lora_modules_to_save:
|
||||||
|
# - lm_head # needed if modifying vocabulary
|
||||||
|
# - embed_tokens
|
||||||
|
|
||||||
|
lora_mlp_kernel: true
|
||||||
|
lora_qkv_kernel: true
|
||||||
|
lora_o_kernel: true
|
||||||
|
|
||||||
|
chat_template: llama4
|
||||||
|
datasets:
|
||||||
|
- path: mlabonne/FineTome-100k
|
||||||
|
type: chat_template
|
||||||
|
split: train[:20%]
|
||||||
|
field_messages: conversations
|
||||||
|
message_property_mappings:
|
||||||
|
role: from
|
||||||
|
content: value
|
||||||
|
|
||||||
|
dataset_prepared_path: last_run_prepared
|
||||||
|
val_set_size: 0.0
|
||||||
|
output_dir: ./outputs/out
|
||||||
|
|
||||||
|
sequence_len: 4096 # up to 8k will work on a single H100
|
||||||
|
sample_packing: true
|
||||||
|
pad_to_sequence_len: true
|
||||||
|
|
||||||
|
gradient_accumulation_steps: 1
|
||||||
|
micro_batch_size: 1
|
||||||
|
num_epochs: 1
|
||||||
|
optimizer: adamw_torch_4bit
|
||||||
|
lr_scheduler: cosine
|
||||||
|
learning_rate: 1e-4
|
||||||
|
|
||||||
|
bf16: true
|
||||||
|
tf32: true
|
||||||
|
|
||||||
|
torch_compile: true
|
||||||
|
flex_attention: true
|
||||||
|
flex_attn_compile_kwargs:
|
||||||
|
dynamic: false
|
||||||
|
mode: max-autotune-no-cudagraphs
|
||||||
|
|
||||||
|
gradient_checkpointing: offload
|
||||||
|
gradient_checkpointing_kwargs:
|
||||||
|
use_reentrant: false
|
||||||
|
|
||||||
|
logging_steps: 1
|
||||||
|
warmup_steps: 20
|
||||||
|
evals_per_epoch: 1
|
||||||
|
saves_per_epoch: 1
|
||||||
|
|
||||||
|
weight_decay: 0.0
|
||||||
|
special_tokens:
|
||||||
|
pad_token: <|finetune_right_pad_id|>
|
||||||
|
eos_token: <|eot|>
|
||||||
89
examples/llama-4/scout-vision-qlora-fsdp2-flex.yaml
Normal file
89
examples/llama-4/scout-vision-qlora-fsdp2-flex.yaml
Normal file
@@ -0,0 +1,89 @@
|
|||||||
|
base_model: axolotl-quants/Llama-4-Scout-17B-16E-Linearized-bnb-nf4-bf16
|
||||||
|
model_type: Llama4ForConditionalGeneration
|
||||||
|
processor_type: Llama4Processor
|
||||||
|
# Automatically upload checkpoint and final model to HF
|
||||||
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
|
# these 3 lines are needed for now to handle vision chat templates w images
|
||||||
|
skip_prepare_dataset: true
|
||||||
|
remove_unused_columns: false
|
||||||
|
sample_packing: false
|
||||||
|
|
||||||
|
sequence_len: 4096
|
||||||
|
|
||||||
|
plugins:
|
||||||
|
- axolotl.integrations.liger.LigerPlugin
|
||||||
|
|
||||||
|
liger_glu_activation: true
|
||||||
|
liger_rms_norm: true
|
||||||
|
liger_layer_norm: true
|
||||||
|
|
||||||
|
llama4_linearized_experts: true # use Axolotl's customized model
|
||||||
|
load_in_4bit: true
|
||||||
|
adapter: qlora
|
||||||
|
lora_r: 32
|
||||||
|
lora_alpha: 64
|
||||||
|
lora_target_modules:
|
||||||
|
- self_attn.q_proj
|
||||||
|
- self_attn.k_proj
|
||||||
|
- self_attn.v_proj
|
||||||
|
- self_attn.o_proj
|
||||||
|
- shared_expert.gate_proj
|
||||||
|
- shared_expert.up_proj
|
||||||
|
- shared_expert.down_proj
|
||||||
|
- vision_adapter.mlp.fc1
|
||||||
|
- vision_adapter.mlp.fc2
|
||||||
|
# - experts.gate_projs.[0-9]+$
|
||||||
|
# - experts.up_projs.[0-9]+$
|
||||||
|
# - experts.down_projs.[0-9]+$
|
||||||
|
lora_modules_to_save:
|
||||||
|
- lm_head
|
||||||
|
- embed_tokens
|
||||||
|
|
||||||
|
chat_template: llama4
|
||||||
|
datasets:
|
||||||
|
- path: HuggingFaceH4/llava-instruct-mix-vsft
|
||||||
|
type: chat_template
|
||||||
|
split: train[:1%]
|
||||||
|
field_messages: messages
|
||||||
|
|
||||||
|
dataset_prepared_path: last_run_prepared
|
||||||
|
val_set_size: 0.0
|
||||||
|
output_dir: ./outputs/out
|
||||||
|
|
||||||
|
gradient_accumulation_steps: 1
|
||||||
|
micro_batch_size: 1
|
||||||
|
num_epochs: 1
|
||||||
|
optimizer: adamw_torch_4bit
|
||||||
|
lr_scheduler: cosine
|
||||||
|
learning_rate: 1e-4
|
||||||
|
|
||||||
|
bf16: true
|
||||||
|
tf32: true
|
||||||
|
|
||||||
|
logging_steps: 1
|
||||||
|
flex_attention: true
|
||||||
|
flex_attn_compile_kwargs:
|
||||||
|
dynamic: false
|
||||||
|
mode: max-autotune-no-cudagraphs
|
||||||
|
|
||||||
|
warmup_steps: 10
|
||||||
|
evals_per_epoch: 1
|
||||||
|
saves_per_epoch: 1
|
||||||
|
weight_decay: 0.0
|
||||||
|
fsdp:
|
||||||
|
- auto_wrap
|
||||||
|
- full_shard
|
||||||
|
fsdp_config:
|
||||||
|
fsdp_version: 2
|
||||||
|
fsdp_offload_params: false
|
||||||
|
fsdp_cpu_ram_efficient_loading: true
|
||||||
|
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||||
|
fsdp_transformer_layer_cls_to_wrap: Llama4TextDecoderLayer
|
||||||
|
fsdp_state_dict_type: SHARDED_STATE_DICT
|
||||||
|
fsdp_sharding_strategy: FULL_SHARD
|
||||||
|
fsdp_reshard_after_forward: true
|
||||||
|
fsdp_activation_checkpointing: true
|
||||||
|
special_tokens:
|
||||||
|
pad_token: <|finetune_right_pad_id|>
|
||||||
|
eos_token: <|eot|>
|
||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user