Compare commits

..

4 Commits

Author SHA1 Message Date
Wing Lian
d1f36d7b78 set download to use revision 2024-10-11 11:03:43 -04:00
Wing Lian
87248027d0 use revision tied to head 2024-10-11 11:03:43 -04:00
Wing Lian
d0d22b7812 only use revision on hf hub backed datasets 2024-10-11 11:03:43 -04:00
Thomas Cleberg
68db5b1b67 Add support for revision dataset parameter 2024-10-11 11:03:43 -04:00
163 changed files with 3829 additions and 8588 deletions

View File

@@ -24,12 +24,6 @@ jobs:
python_version: "3.11"
pytorch: 2.3.1
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
- cuda: "124"
cuda_version: 12.4.1
cudnn_version: ""
python_version: "3.10"
pytorch: 2.4.1
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
- cuda: "124"
cuda_version: 12.4.1
cudnn_version: ""
@@ -40,25 +34,23 @@ jobs:
cuda_version: 12.4.1
cudnn_version: ""
python_version: "3.11"
pytorch: 2.5.1
pytorch: 2.4.1
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
steps:
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
- name: Docker metadata
id: metadata
uses: docker/metadata-action@v5
uses: docker/metadata-action@v3
with:
images: |
winglian/axolotl-base
axolotlai/axolotl-base
images: winglian/axolotl-base
- name: Login to Docker Hub
uses: docker/login-action@v2
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
uses: docker/setup-buildx-action@v2
- name: Build
uses: docker/build-push-action@v4
with:

View File

@@ -17,7 +17,7 @@ jobs:
- name: Set up Quarto
uses: quarto-dev/quarto-actions/setup@v2
- name: Setup Python
uses: actions/setup-python@v5
uses: actions/setup-python@v3
with:
python-version: '3.10'
- name: install dependencies

View File

@@ -15,9 +15,9 @@ jobs:
name: pre-commit
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
python-version: "3.10"
cache: 'pip' # caching pip dependencies
- uses: pre-commit/action@v3.0.1
- uses: pre-commit/action@v3.0.0

View File

@@ -4,13 +4,11 @@ on:
push:
branches:
- "main"
tags:
- "v*"
workflow_dispatch:
jobs:
build-axolotl:
if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'axolotl-ai-cloud' }}
strategy:
fail-fast: false
matrix:
@@ -31,11 +29,6 @@ jobs:
python_version: "3.11"
pytorch: 2.4.1
axolotl_extras:
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.5.1
axolotl_extras:
runs-on: axolotl-gpu-runner
steps:
- name: Checkout
@@ -44,12 +37,7 @@ jobs:
id: metadata
uses: docker/metadata-action@v5
with:
images: |
winglian/axolotl
axolotlai/axolotl
tags: |
type=ref,event=branch
type=pep440,pattern={{version}}
images: winglian/axolotl
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Login to Docker Hub
@@ -63,7 +51,7 @@ jobs:
with:
context: .
build-args: |
BASE_TAG=${{ github.ref_type == 'tag' && 'main' || github.ref_name }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
BASE_TAG=${{ github.ref_name }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
CUDA=${{ matrix.cuda }}
PYTORCH_VERSION=${{ matrix.pytorch }}
AXOLOTL_ARGS=${{ matrix.axolotl_args }}
@@ -77,7 +65,7 @@ jobs:
build-axolotl-cloud:
needs: build-axolotl
if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'axolotl-ai-cloud' }}
# this job needs to be run on self-hosted GPU runners...
strategy:
matrix:
@@ -98,11 +86,6 @@ jobs:
python_version: "3.11"
pytorch: 2.4.1
axolotl_extras:
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.5.1
axolotl_extras:
runs-on: axolotl-gpu-runner
steps:
- name: Checkout
@@ -111,25 +94,20 @@ jobs:
id: metadata
uses: docker/metadata-action@v5
with:
images: |
winglian/axolotl-cloud
axolotlai/axolotl-cloud
tags: |
type=ref,event=branch
type=pep440,pattern={{version}}
images: winglian/axolotl-cloud
- name: Login to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
uses: docker/setup-buildx-action@v2
- name: Build
uses: docker/build-push-action@v5
with:
context: .
build-args: |
BASE_TAG=${{ github.ref_type == 'tag' && 'main' || github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
BASE_TAG=${{ github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
CUDA=${{ matrix.cuda }}
file: ./docker/Dockerfile-cloud
push: ${{ github.event_name != 'pull_request' }}
@@ -140,7 +118,7 @@ jobs:
build-axolotl-cloud-no-tmux:
needs: build-axolotl
if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'axolotl-ai-cloud' }}
# this job needs to be run on self-hosted GPU runners...
strategy:
matrix:
@@ -158,25 +136,20 @@ jobs:
id: metadata
uses: docker/metadata-action@v5
with:
images: |
winglian/axolotl-cloud-term
axolotlai/axolotl-cloud-term
tags: |
type=ref,event=branch
type=pep440,pattern={{version}}
images: winglian/axolotl-cloud-term
- name: Login to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
uses: docker/setup-buildx-action@v2
- name: Build
uses: docker/build-push-action@v5
with:
context: .
build-args: |
BASE_TAG=${{ github.ref_type == 'tag' && 'main' || github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
BASE_TAG=${{ github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
CUDA=${{ matrix.cuda }}
file: ./docker/Dockerfile-cloud-no-tmux
push: ${{ github.event_name != 'pull_request' }}

View File

@@ -8,14 +8,9 @@ on:
schedule:
- cron: '0 0 * * 1,4' # Runs at 00:00 UTC every monday & thursday
# Cancel jobs on the same ref if a new one is triggered
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
jobs:
test-axolotl-multigpu:
if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' }}
if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'axolotl-ai-cloud' }}
strategy:
fail-fast: false
matrix:
@@ -26,17 +21,10 @@ jobs:
pytorch: 2.3.1
axolotl_extras:
num_gpus: 2
- cuda: 124
cuda_version: 12.4.1
- cuda: 121
cuda_version: 12.1.1
python_version: "3.11"
pytorch: 2.4.1
axolotl_extras:
num_gpus: 2
nightly_build: "true"
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.5.1
pytorch: 2.3.1
axolotl_extras:
num_gpus: 2
nightly_build: "true"

View File

@@ -7,7 +7,7 @@ on:
jobs:
build-axolotl:
if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'axolotl-ai-cloud' }}
strategy:
fail-fast: false
matrix:
@@ -28,11 +28,6 @@ jobs:
python_version: "3.11"
pytorch: 2.4.1
axolotl_extras:
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.5.1
axolotl_extras:
runs-on: axolotl-gpu-runner
steps:
- name: Checkout
@@ -41,9 +36,7 @@ jobs:
id: metadata
uses: docker/metadata-action@v5
with:
images: |
winglian/axolotl
axolotlai/axolotl
images: winglian/axolotl
tags: |
type=raw,value={{ branch }}-{{ date 'YYYYMMDD' }}
- name: Set up Docker Buildx
@@ -71,7 +64,7 @@ jobs:
build-axolotl-cloud:
needs: build-axolotl
if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'axolotl-ai-cloud' }}
# this job needs to be run on self-hosted GPU runners...
strategy:
matrix:
@@ -92,11 +85,6 @@ jobs:
python_version: "3.11"
pytorch: 2.4.1
axolotl_extras:
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.5.1
axolotl_extras:
runs-on: axolotl-gpu-runner
steps:
- name: Checkout
@@ -105,9 +93,7 @@ jobs:
id: metadata
uses: docker/metadata-action@v5
with:
images: |
winglian/axolotl-cloud
axolotlai/axolotl-cloud
images: winglian/axolotl-cloud
tags: |
type=raw,value={{ branch }}-{{ date 'YYYYMMDD' }}
- name: Login to Docker Hub
@@ -116,7 +102,7 @@ jobs:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
uses: docker/setup-buildx-action@v2
- name: Build
uses: docker/build-push-action@v5
with:

View File

@@ -3,24 +3,12 @@ name: publish pypi
on:
push:
tags:
- 'v*'
workflow_dispatch:
- '*'
jobs:
setup_release:
name: Create Release
runs-on: ubuntu-latest
permissions:
contents: write
steps:
- name: Create release
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: gh release create "$GITHUB_REF_NAME" # GITHUB_REF_NAME is the tag name in `on.push.tags` workflows
pypi-publish:
name: Upload release to PyPI
runs-on: ubuntu-latest
needs: [setup_release]
environment:
name: pypi
url: https://pypi.org/p/axolotl
@@ -28,10 +16,10 @@ jobs:
id-token: write # IMPORTANT: this permission is mandatory for trusted publishing
steps:
- name: Check out repository code
uses: actions/checkout@v4
uses: actions/checkout@v3
- name: Setup Python
uses: actions/setup-python@v5
uses: actions/setup-python@v4
with:
python-version: "3.10"
@@ -39,7 +27,7 @@ jobs:
run: |
pip3 install wheel packaging
pip3 install -e .
pip3 install -r requirements-dev.txt -r requirements-tests.txt
pip3 install -r requirements-tests.txt
- name: Extract tag name
id: tag
@@ -49,9 +37,9 @@ jobs:
run: |
sed -i -E 's/version="([0-9.]+)",/version="${{ steps.tag.outputs.TAG_NAME }}",/g' setup.py
- name: Build a source dist
- name: Build a binary wheel
run: |
python setup.py sdist
python setup.py sdist bdist_wheel
- name: Publish package distributions to PyPI
uses: pypa/gh-action-pypi-publish@release/v1

View File

@@ -9,12 +9,12 @@ jobs:
name: pre-commit
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
python-version: "3.10"
cache: 'pip' # caching pip dependencies
- uses: pre-commit/action@v3.0.1
- uses: pre-commit/action@v3.0.0
env:
SKIP: no-commit-to-branch
@@ -25,15 +25,15 @@ jobs:
fail-fast: false
matrix:
python_version: ["3.10", "3.11"]
pytorch_version: ["2.3.1", "2.4.1", "2.5.1"]
pytorch_version: ["2.3.1", "2.4.1"]
timeout-minutes: 20
steps:
- name: Check out repository code
uses: actions/checkout@v4
uses: actions/checkout@v3
- name: Setup Python
uses: actions/setup-python@v5
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python_version }}
cache: 'pip' # caching pip dependencies
@@ -47,15 +47,13 @@ jobs:
sed -i 's#^transformers.*#transformers @ git+https://github.com/huggingface/transformers.git@main#' requirements.txt
sed -i 's#^peft.*#peft @ git+https://github.com/huggingface/peft.git@main#' requirements.txt
sed -i 's#^accelerate.*#accelerate @ git+https://github.com/huggingface/accelerate.git@main#' requirements.txt
sed -i 's#^trl.*#trl @ git+https://github.com/huggingface/trl.git@main#' requirements.txt
sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt
- name: Install dependencies
run: |
pip3 install --upgrade pip
pip3 install --upgrade packaging
pip3 install -U -e .
pip3 install -r requirements-dev.txt -r requirements-tests.txt
pip3 install -r requirements-tests.txt
- name: Run tests
run: |
@@ -83,17 +81,17 @@ jobs:
num_gpus: 1
axolotl_extras: mamba-ssm
nightly_build: "true"
- cuda: 124
cuda_version: 12.4.1
- cuda: 121
cuda_version: 12.1.1
python_version: "3.11"
pytorch: 2.4.1
pytorch: 2.3.1
num_gpus: 1
axolotl_extras:
axolotl_extras: mamba-ssm
nightly_build: "true"
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.5.1
pytorch: 2.4.1
num_gpus: 1
axolotl_extras:
nightly_build: "true"

View File

@@ -15,22 +15,17 @@ on:
- '.github/workflows/*.yml'
workflow_dispatch:
# Cancel jobs on the same ref if a new one is triggered
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
jobs:
pre-commit:
name: pre-commit
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
python-version: "3.10"
cache: 'pip' # caching pip dependencies
- uses: pre-commit/action@v3.0.1
- uses: pre-commit/action@v3.0.0
env:
SKIP: no-commit-to-branch
@@ -41,97 +36,61 @@ jobs:
fail-fast: false
matrix:
python_version: ["3.10", "3.11"]
pytorch_version: ["2.3.1", "2.4.1", "2.5.1"]
pytorch_version: ["2.3.1", "2.4.1"]
timeout-minutes: 20
steps:
- name: Check out repository code
uses: actions/checkout@v4
uses: actions/checkout@v3
- name: Setup Python
uses: actions/setup-python@v5
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python_version }}
cache: 'pip' # caching pip dependencies
- name: upgrade pip
run: |
pip3 install --upgrade pip
pip3 install --upgrade packaging setuptools wheel
- name: Install PyTorch
run: |
pip3 install torch==${{ matrix.pytorch_version }}
pip3 install torch==${{ matrix.pytorch_version }} --index-url https://download.pytorch.org/whl/cpu
- name: Install dependencies
run: |
pip3 show torch
pip3 install --upgrade pip
pip3 install --upgrade packaging
pip3 install -U -e .
pip3 install -r requirements-dev.txt -r requirements-tests.txt
pip3 install -r requirements-tests.txt
- name: Run tests
run: |
pytest -n8 --ignore=tests/e2e/ tests/
pytest --ignore=tests/e2e/ tests/
- name: cleanup pip cache
run: |
find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
pytest-sdist:
name: PyTest from Source Dist
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python_version: ["3.11"]
pytorch_version: ["2.4.1", "2.5.1"]
timeout-minutes: 20
steps:
- name: Check out repository code
uses: actions/checkout@v4
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python_version }}
cache: 'pip' # caching pip dependencies
- name: upgrade pip
run: |
pip3 install --upgrade pip
pip3 install --upgrade packaging setuptools wheel
- name: Install PyTorch
run: |
pip3 install torch==${{ matrix.pytorch_version }}
- name: Install dependencies
run: |
pip3 show torch
python3 setup.py sdist
pip3 install dist/axolotl*.tar.gz
pip3 install -r requirements-dev.txt -r requirements-tests.txt
- name: Run tests
run: |
pytest -n8 --ignore=tests/e2e/ tests/
- name: cleanup pip cache
run: |
find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
docker-e2e-tests-1st:
if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' }}
docker-e2e-tests:
if: github.repository_owner == 'axolotl-ai-cloud'
# this job needs to be run on self-hosted GPU runners...
runs-on: [self-hosted, modal]
timeout-minutes: 90
needs: [pre-commit, pytest, pytest-sdist]
timeout-minutes: 60
needs: [pre-commit, pytest]
strategy:
fail-fast: false
matrix:
include:
- cuda: 121
cuda_version: 12.1.1
python_version: "3.10"
pytorch: 2.3.1
num_gpus: 1
axolotl_extras: mamba-ssm
- cuda: 121
cuda_version: 12.1.1
python_version: "3.11"
pytorch: 2.3.1
num_gpus: 1
axolotl_extras: mamba-ssm
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
@@ -160,49 +119,3 @@ jobs:
- name: Run tests job on Modal
run: |
modal run cicd.tests
docker-e2e-tests:
if: github.repository_owner == 'axolotl-ai-cloud'
# this job needs to be run on self-hosted GPU runners...
runs-on: [self-hosted, modal]
timeout-minutes: 90
needs: [pre-commit, pytest, docker-e2e-tests-1st]
strategy:
fail-fast: false
matrix:
include:
- cuda: 121
cuda_version: 12.1.1
python_version: "3.10"
pytorch: 2.3.1
num_gpus: 1
axolotl_extras: mamba-ssm
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.5.1
num_gpus: 1
axolotl_extras:
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Install Python
uses: actions/setup-python@v5
with:
python-version: "3.10"
- name: Install Modal
run: |
python -m pip install --upgrade pip
pip install modal==0.63.64 jinja2
- name: Update env vars
run: |
echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
- name: Run tests job on Modal
run: |
modal run cicd.tests

3
.gitignore vendored
View File

@@ -182,6 +182,3 @@ submit.sh
typings/
out/
# vim
*.swp

View File

@@ -1,4 +0,0 @@
include requirements.txt
include README.md
include LICENSE
recursive-include axolotl *.py

View File

@@ -1,21 +1,8 @@
<p align="center">
<picture>
<source media="(prefers-color-scheme: dark)" srcset="image/axolotl_logo_digital_white.svg">
<source media="(prefers-color-scheme: light)" srcset="image/axolotl_logo_digital_black.svg">
<img alt="Axolotl" src="image/axolotl_logo_digital_black.svg" width="400" height="104" style="max-width: 100%;">
</picture>
</p>
# Axolotl
<p align="center">
<img src="https://img.shields.io/github/license/axolotl-ai-cloud/axolotl.svg?color=blue" alt="GitHub License">
<img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/tests.yml/badge.svg" alt="tests">
<a href="https://github.com/axolotl-ai-cloud/axolotl/releases"><img src="https://img.shields.io/github/release/axolotl-ai-cloud/axolotl.svg" alt="Releases"></a>
<img src="https://img.shields.io/github/stars/axolotl-ai-cloud/axolotl" alt="GitHub Repo stars">
</p>
<p align="center">
<img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/tests-nightly.yml/badge.svg" alt="tests-nightly">
<img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/multi-gpu-e2e.yml/badge.svg" alt="multigpu-semi-weekly tests">
</p>
![tests](https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/tests.yml/badge.svg)
![tests-nightly](https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/tests-nightly.yml/badge.svg)
![multigpu-semi-weekly tests](https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/multi-gpu-e2e.yml/badge.svg)
Axolotl is a tool designed to streamline the fine-tuning of various AI models, offering support for multiple configurations and architectures.
@@ -88,7 +75,7 @@ Features:
<td>
<div align="center">
<img src="image/axolotl_symbol_digital_white.svg" alt="axolotl" width="160">
<img src="image/axolotl.png" alt="axolotl" width="160">
<div>
<p>
<b>Axolotl provides a unified repository for fine-tuning <br />a variety of AI models with ease</b>
@@ -134,7 +121,7 @@ Features:
Get started with Axolotl in just a few steps! This quickstart guide will walk you through setting up and running a basic fine-tuning task.
**Requirements**: Nvidia GPU (Ampere architecture or newer for `bf16` and Flash Attention), Python >=3.10 and PyTorch >=2.3.1.
**Requirements**: Python >=3.10 and PyTorch >=2.1.1.
```bash
git clone https://github.com/axolotl-ai-cloud/axolotl
@@ -172,7 +159,7 @@ accelerate launch -m axolotl.cli.train https://raw.githubusercontent.com/axolotl
#### Docker
```bash
docker run --gpus '"all"' --rm -it axolotlai/axolotl:main-latest
docker run --gpus '"all"' --rm -it winglian/axolotl:main-latest
```
Or run on the current files for development:
@@ -191,7 +178,7 @@ accelerate launch -m axolotl.cli.train https://raw.githubusercontent.com/axolotl
A more powerful Docker command to run would be this:
```bash
docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --mount type=bind,src="${PWD}",target=/workspace/axolotl -v ${HOME}/.cache/huggingface:/root/.cache/huggingface axolotlai/axolotl:main-latest
docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --mount type=bind,src="${PWD}",target=/workspace/axolotl -v ${HOME}/.cache/huggingface:/root/.cache/huggingface winglian/axolotl:main-latest
```
It additionally:
@@ -223,7 +210,7 @@ docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --
#### Cloud GPU
For cloud GPU providers that support docker images, use [`axolotlai/axolotl-cloud:main-latest`](https://hub.docker.com/r/axolotlai/axolotl-cloud/tags)
For cloud GPU providers that support docker images, use [`winglian/axolotl-cloud:main-latest`](https://hub.docker.com/r/winglian/axolotl-cloud/tags)
- on Latitude.sh use this [direct link](https://latitude.sh/blueprint/989e0e79-3bf6-41ea-a46b-1f246e309d5c)
- on JarvisLabs.ai use this [direct link](https://jarvislabs.ai/templates/axolotl)
@@ -332,7 +319,7 @@ Write a job description in YAML as below:
# dstack.yaml
type: task
image: axolotlai/axolotl-cloud:main-latest
image: winglian/axolotl-cloud:main-20240429-py3.11-cu121-2.2.2
env:
- HUGGING_FACE_HUB_TOKEN
@@ -396,10 +383,11 @@ See [examples](examples) for quick start. It is recommended to duplicate and mod
- typescript
type: ... # unimplemented custom format
# chat_template https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/conversation.html#chat_template
# fastchat conversation
# See 'conversation' options: https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
- path: ...
type: chat_template
chat_template: chatml # defaults to tokenizer's chat_template
type: sharegpt
conversation: chatml # default: vicuna_v1.1
# local
- path: data.jsonl # or json
@@ -574,8 +562,7 @@ plugins:
- axolotl.integrations.liger.LigerPlugin
liger_rope: true
liger_rms_norm: true
liger_glu_activation: true
liger_layer_norm: true
liger_swiglu: true
liger_fused_linear_cross_entropy: true
```

View File

@@ -1,4 +1,4 @@
FROM axolotlai/axolotl-base:{{ BASE_TAG }}
FROM winglian/axolotl-base:{{ BASE_TAG }}
ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
ENV AXOLOTL_EXTRAS="{{ AXOLOTL_EXTRAS }}"
@@ -23,12 +23,11 @@ RUN git fetch origin +$GITHUB_REF && \
git checkout FETCH_HEAD
# If AXOLOTL_EXTRAS is set, append it in brackets
RUN pip install causal_conv1d
RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
sed -i 's#^transformers.*#transformers @ git+https://github.com/huggingface/transformers.git@main#' requirements.txt; \
sed -i 's#^peft.*#peft @ git+https://github.com/huggingface/peft.git@main#' requirements.txt; \
sed -i 's#^accelerate.*#accelerate @ git+https://github.com/huggingface/accelerate.git@main#' requirements.txt; \
sed -i 's#^trl.*#trl @ git+https://github.com/huggingface/trl.git@main#' requirements.txt; \
sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
fi
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
@@ -38,7 +37,7 @@ RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
fi
# So we can test the Docker image
RUN pip install -r requirements-dev.txt -r requirements-tests.txt
RUN pip install -r requirements-tests.txt
# fix so that git fetch/pull from remote works
RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \

View File

@@ -1,6 +1,6 @@
#!/bin/bash
set -e
pytest -n8 --ignore=tests/e2e/ /workspace/axolotl/tests/
pytest --ignore=tests/e2e/ /workspace/axolotl/tests/
pytest -n1 --dist loadfile -v /workspace/axolotl/tests/e2e/patched/ /workspace/axolotl/tests/e2e/integrations/
pytest --ignore=tests/e2e/patched/ --ignore=tests/e2e/multigpu/ --ignore=tests/e2e/integrations/ /workspace/axolotl/tests/e2e/

View File

@@ -10,7 +10,7 @@ import tempfile
import jinja2
import modal
from jinja2 import select_autoescape
from modal import App, Image
from modal import Image, Stub
cicd_path = pathlib.Path(__file__).parent.resolve()
@@ -46,7 +46,7 @@ cicd_image = (
.pip_install("fastapi==0.110.0", "pydantic==2.6.3")
)
app = App("Axolotl CI/CD", secrets=[])
stub = Stub("Axolotl CI/CD", secrets=[])
N_GPUS = int(os.environ.get("N_GPUS", 2))
@@ -61,10 +61,10 @@ def run_cmd(cmd: str, run_folder: str):
exit(exit_code) # pylint: disable=consider-using-sys-exit
@app.function(
@stub.function(
image=cicd_image,
gpu=GPU_CONFIG,
timeout=60 * 60,
timeout=45 * 60,
cpu=8.0,
memory=131072 * N_GPUS,
)
@@ -72,6 +72,6 @@ def cicd_pytest():
run_cmd("./cicd/multigpu.sh", "/workspace/axolotl")
@app.local_entrypoint()
@stub.local_entrypoint()
def main():
cicd_pytest.remote()

View File

@@ -2,4 +2,4 @@
set -e
# only run one test at a time so as not to OOM the GPU
pytest -v -n2 /workspace/axolotl/tests/e2e/multigpu/
pytest -n1 /workspace/axolotl/tests/e2e/multigpu/

View File

@@ -10,7 +10,7 @@ import tempfile
import jinja2
import modal
from jinja2 import select_autoescape
from modal import App, Image
from modal import Image, Stub
cicd_path = pathlib.Path(__file__).parent.resolve()
@@ -47,7 +47,7 @@ cicd_image = (
.pip_install("fastapi==0.110.0", "pydantic==2.6.3")
)
app = App("Axolotl CI/CD", secrets=[])
stub = Stub("Axolotl CI/CD", secrets=[])
N_GPUS = int(os.environ.get("N_GPUS", 1))
@@ -62,10 +62,10 @@ def run_cmd(cmd: str, run_folder: str):
exit(exit_code) # pylint: disable=consider-using-sys-exit
@app.function(
@stub.function(
image=cicd_image,
gpu=GPU_CONFIG,
timeout=60 * 60,
timeout=45 * 60,
cpu=8.0,
memory=131072,
)
@@ -73,6 +73,6 @@ def cicd_pytest():
run_cmd("./cicd/cicd.sh", "/workspace/axolotl")
@app.local_entrypoint()
@stub.local_entrypoint()
def main():
cicd_pytest.remote()

View File

@@ -14,6 +14,15 @@
"bf16": {
"enabled": true
},
"fp16": {
"enabled": "auto",
"auto_cast": false,
"loss_scale": 0,
"initial_scale_power": 32,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"train_batch_size": "auto",

View File

@@ -24,6 +24,15 @@
"bf16": {
"enabled": true
},
"fp16": {
"enabled": "auto",
"auto_cast": false,
"loss_scale": 0,
"initial_scale_power": 32,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"train_batch_size": "auto",

View File

@@ -20,6 +20,15 @@
"bf16": {
"enabled": true
},
"fp16": {
"enabled": "auto",
"auto_cast": false,
"loss_scale": 0,
"initial_scale_power": 32,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"train_batch_size": "auto",

View File

@@ -1,4 +1,4 @@
# Example config for debugging the chat_template prompt format
# Example config for debugging the sharegpt prompt format
base_model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
@@ -7,8 +7,8 @@ load_in_8bit: true
load_in_4bit: false
datasets:
- path: fozziethebeat/alpaca_messages_2k_test
type: chat_template
- path: philschmid/guanaco-sharegpt-style
type: sharegpt
shards: 10
val_set_size: 0
output_dir: temp_debug/axolotl_outputs/model

View File

@@ -1,5 +1,5 @@
ARG BASE_TAG=main-base
FROM axolotlai/axolotl-base:$BASE_TAG
FROM winglian/axolotl-base:$BASE_TAG
ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
ARG AXOLOTL_EXTRAS=""
@@ -20,6 +20,7 @@ RUN git clone --depth=1 https://github.com/axolotl-ai-cloud/axolotl.git
WORKDIR /workspace/axolotl
# If AXOLOTL_EXTRAS is set, append it in brackets
RUN pip install causal_conv1d
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
pip install -e .[deepspeed,flash-attn,optimizers,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
else \

View File

@@ -1,5 +1,5 @@
ARG BASE_TAG=main
FROM axolotlai/axolotl:$BASE_TAG
FROM winglian/axolotl:$BASE_TAG
ENV HF_DATASETS_CACHE="/workspace/data/huggingface-cache/datasets"
ENV HUGGINGFACE_HUB_CACHE="/workspace/data/huggingface-cache/hub"

View File

@@ -1,5 +1,5 @@
ARG BASE_TAG=main
FROM axolotlai/axolotl:$BASE_TAG
FROM winglian/axolotl:$BASE_TAG
ENV HF_DATASETS_CACHE="/workspace/data/huggingface-cache/datasets"
ENV HUGGINGFACE_HUB_CACHE="/workspace/data/huggingface-cache/hub"

View File

@@ -1,5 +1,5 @@
ARG BASE_TAG=main-base
FROM axolotlai/axolotl-base:$BASE_TAG
FROM winglian/axolotl-base:$BASE_TAG
ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
ARG AXOLOTL_EXTRAS=""

View File

@@ -83,7 +83,7 @@ lora_on_cpu: true
datasets:
# HuggingFace dataset repo | s3://,gs:// path | "json" for local dataset, make sure to fill data_files
- path: vicgalle/alpaca-gpt4
# The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]
# The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
ds_type: # Optional[str] (json|arrow|parquet|text|csv) defines the datatype when path is a file
data_files: # Optional[str] path to source data files
@@ -91,7 +91,15 @@ datasets:
name: # Optional[str] name of dataset configuration to load
train_on_split: train # Optional[str] name of dataset split to load from
revision: # Optional[str] The specific revision of the dataset to use when loading from the Hugging Face Hub. This can be a commit hash, tag, or branch name. If not specified, the latest version will be used. This parameter is ignored for local datasets.
trust_remote_code: # Optional[bool] Trust remote code for untrusted source
# Optional[str] fastchat conversation type, only used with type: sharegpt
conversation: # Options (see Conversation 'name'): https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
field_human: # Optional[str]. Human key to use for conversation.
field_model: # Optional[str]. Assistant key to use for conversation.
# Add additional keys from your dataset as input or output roles
roles:
input: # Optional[List[str]]. These will be masked based on train_on_input
output: # Optional[List[str]].
# Custom user instruction prompt
- path: repo
@@ -116,48 +124,6 @@ datasets:
# For `completion` datasets only, uses the provided field instead of `text` column
field:
# Using chat template
- path: ...
# Set type to `chat_template` to use this strategy
type: chat_template
# Specify the name of the chat template to use
# The name of the chat template to use for training, following values are supported:
# - tokenizer_default: Uses the chat template that is available in the tokenizer_config.json. If the chat template is not available in the tokenizer, it will raise an error. This is the default.
# - alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates are available in the axolotl codebase at src/axolotl/utils/chat_templates.py
# - tokenizer_default_fallback_*: where * is the name of the chat template to fallback to if the tokenizer does not have a chat template else default to tokenizer. E.g. tokenizer_default_fallback_chatml.
# - jinja: Uses a custom jinja template for the chat template. The custom jinja template should be provided in the chat_template_jinja field.
chat_template: tokenizer_default
# Custom jinja template for chat template. This will be only used if `chat_template` is set to `jinja` or empty (in which case chat_template is automatically set to `jinja`).
chat_template_jinja:
# The key in the data example that contains the messages. Default is "messages".
field_messages: messages
# The key in the message turn that contains the role. Default is "role".
message_field_role: role
# The key in the message turn that contains the content. Default is "content".
message_field_content: content
# Optional[Dict[str, List]]. Roles mapping for the messages.
roles:
user: ["human", "user"]
assistant: ["gpt", "assistant", "ai"]
system: ["system"]
## NOTE: Leaving the below empty will default to using the simple legacy tokenization strategy where only last message is trained on.
# Optional[List[str]]. Roles to train on. The tokens from these roles will be considered for the loss.
roles_to_train: ["gpt", "assistant"]
# Optional[str]. Which EOS tokens to train on in the conversation. Possible values are:
# - all: train on all EOS tokens
# - turn: train on the EOS token at the end of each trainable turn
# - last: train on the last EOS token in the conversation
train_on_eos: last
# The key in the message turn that indicates via boolean whether tokens of a turn should be considered for training. Useful to selectively train on certain turns besides the `roles_to_train`.
message_field_training: training
# The key in the message turn that contains the training details. Useful to selectively train on certain tokens in a turn.
# The value of the key is a List[Dict] containing `begin_offset` (start character index in content), `end_offset` (end character index in content), and `train` (boolean whether to train).
# See example at `docs/dataset-formats/conversation.qmd`
message_field_training_detail: train_detail
# If false, the datasets will not be shuffled and will keep their original order in `datasets`.
# The same applies to the `test_datasets` option and the `pretraining_dataset` option. Default is true.
shuffle_merged_datasets: true
@@ -175,19 +141,10 @@ test_datasets:
# use RL training: 'dpo', 'ipo', 'kto'
rl:
# whether to perform weighting if doing DPO training. Boolean.
dpo_use_weighting:
# The name of the chat template to use for training, following values are supported:
# - tokenizer_default: Uses the chat template that is available in the tokenizer_config.json. If the chat template is not available in the tokenizer, it will raise an error. This is the default value.
# - alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates are available in the axolotl codebase at src/axolotl/utils/chat_templates.py
# - tokenizer_default_fallback_*: where * is the name of the chat template to fallback to. E.g. tokenizer_default_fallback_chatml. This is useful when the chat template is not available in the tokenizer.
# - jinja: Uses a custom jinja template for the chat template. The custom jinja template should be provided in the chat_template_jinja field.
# The selected chat template will be saved to the tokenizer_config.json for easier inferencing
# Note: It is recommended to set train_on_inputs to true when using a chat template that is different from the model's default chat template.
chat_template: tokenizer_default
# custom jinja template for chat template. This will be only used if chat_template is set to `jinja` or `null` (in which case chat_template is automatically set to `jinja`). Default is null.
chat_template_jinja: null
# Saves the desired chat template to the tokenizer_config.json for easier inferencing
# Currently supports chatml and inst (mistral/mixtral)
chat_template: chatml
# Changes the default system message
default_system_message: You are a helpful assistant. Please give a long and detailed answer. # Currently only supports chatml.
# Axolotl attempts to save the dataset as an arrow after packing the data together so
@@ -309,7 +266,6 @@ wandb_log_model: # "checkpoint" to log model to wandb Artifacts every `save_step
# mlflow configuration if you're using it
mlflow_tracking_uri: # URI to mlflow
mlflow_experiment_name: # Your experiment name
mlflow_run_name: # Your run name
hf_mlflow_log_artifacts: # set to true to copy each saved checkpoint on each save to mlflow artifact registry
# Comet configuration if you're using it
@@ -358,7 +314,7 @@ max_steps:
eval_table_size: # Approximate number of predictions sent to wandb depending on batch size. Enabled above 0. Default is 0
eval_max_new_tokens: # Total number of tokens generated for predictions sent to wandb. Default is 128
eval_causal_lm_metrics: # HF evaluate metrics used during evaluation. Default is ["sacrebleu", "comet", "ter", "chrf", "perplexity"]
eval_causal_lm_metrics: # HF evaluate metrics used during evaluation. Default is ["sacrebleu", "comet", "ter", "chrf"]
loss_watchdog_threshold: # High loss value, indicating the learning has broken down (a good estimate is ~2 times the loss at the start of training)
loss_watchdog_patience: # Number of high-loss steps in a row before the trainer aborts (default: 3)
@@ -406,7 +362,6 @@ lr_div_factor: # Learning rate div factor
# - adamw_torch_fused
# - adamw_torch_xla
# - adamw_apex_fused
# - adopt_adamw (only for torch version >= 2.5.1)
# - adafactor
# - adamw_anyprecision
# - sgd

View File

@@ -6,8 +6,31 @@ order: 3
## sharegpt
IMPORTANT: ShareGPT is deprecated! Please see the `chat_template` section below.
conversations where `from` is `human`/`gpt`. (optional: first row with role `system` to override default system prompt)
```{.json filename="data.jsonl"}
{"conversations": [{"from": "...", "value": "..."}]}
```
Note: `type: sharegpt` opens special configs:
- `conversation`: enables conversions to many Conversation types. Refer to the 'name' [here](https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py) for options.
- `roles`: allows you to specify the roles for input and output. This is useful for datasets with custom roles such as `tool` etc to support masking.
- `field_human`: specify the key to use instead of `human` in the conversation.
- `field_model`: specify the key to use instead of `gpt` in the conversation.
```yaml
datasets:
path: ...
type: sharegpt
conversation: # Options (see Conversation 'name'): https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
field_human: # Optional[str]. Human key to use for conversation.
field_model: # Optional[str]. Assistant key to use for conversation.
# Add additional keys from your dataset as input or output roles
roles:
input: # Optional[List[str]]. These will be masked based on train_on_input
output: # Optional[List[str]].
```
## pygmalion
@@ -15,137 +38,34 @@ IMPORTANT: ShareGPT is deprecated!. Please see `chat_template` section below.
{"conversations": [{"role": "...", "value": "..."}]}
```
## sharegpt.load_role
## chat_template
Chat Template strategy uses a jinja2 template that converts a list of messages into a prompt. Support using tokenizer's template, a supported template, or custom jinja2.
conversations where `role` is used instead of `from`
```{.json filename="data.jsonl"}
{"conversations": [{"role": "...", "content": "..."}]}
{"conversations": [{"role": "...", "value": "..."}]}
```
See `config.qmd` for full configs and supported templates.
## sharegpt.load_guanaco
### Migrating from sharegpt
Most configs can be adapted as follows:
```yaml
# old
chat_template: chatml
datasets:
- path: ...
type: sharegpt
conversation: chatml
# new (if using tokenizer's chat_template)
datasets:
- path: ...
type: chat_template
field_messages: conversations
message_field_role: from
message_field_content: value
# new (if setting a new chat_template like chatml, gemma, etc)
chat_template: chatml
datasets:
- path: ...
type: chat_template
field_messages: conversations
message_field_role: from
message_field_content: value
```
We recommend checking the below examples for other usecases.
### Examples
1. Using the default chat template in the tokenizer_config.json on OpenAI messages format, training on only last message.
```yaml
datasets:
- path: ...
type: chat_template
```
2. Using the `gemma` chat template to override the tokenizer_config.json's chat template on OpenAI messages format, training on all assistant messages.
```yaml
chat_template: gemma # this overwrites the tokenizer's chat_template
datasets:
- path: ...
type: chat_template
roles_to_train: ["assistant"]
```
3. Using the tokenizer_config.json's chat template or `chatml` as fallback if the former's chat template does not exist, on OpenAI messages format, training on all assistant messages.
```yaml
chat_template: tokenizer_default_fallback_chatml # this overwrites the tokenizer's chat_template
datasets:
- path: ...
type: chat_template
roles_to_train: ["assistant"]
```
4. Using a custom jinja template on OpenAI messages format, training on all assistant messages.
```yaml
# chat_template: jinja # `jinja` will be implied if the `chat_template_jinja` is set and this field is empty
chat_template_jinja: "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'system') %}{{'<|system|>' + '\n' + message['content'] + '<|end|>' + '\n'}}{% elif (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif message['role'] == 'assistant' %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}"
datasets:
- path: ...
type: chat_template
roles_to_train: ["assistant"]
```
5. (Advanced) Using fine-grained control over tokens and turns to train in a conversation
For a data sample that looks like:
conversations where `from` is `prompter` `assistant` instead of default sharegpt
```{.json filename="data.jsonl"}
{
"conversations": [
{"from": "system", "value": "You are an AI assistant.", "train": false},
{"from": "human", "value": "Hello", "train": false},
{"from": "assistant", "value": "Hello", "train": true},
{"from": "human", "value": "How are you?", "train": true},
{
"from": "assistant",
"value": "I'm doing very well, thank you!",
"train_detail": [
{"begin_offset": 0, "end_offset": 8, "train": false},
{"begin_offset": 9, "end_offset": 18, "train": true},
{"begin_offset": 19, "end_offset": 30, "train": false},
],
},
{
"from": "human",
"value": "I'm doing very well, thank you!",
"train": true,
},
{"from": "assistant", "value": "Hi there!", "train": true}
]
}
{"conversations": [{"from": "...", "value": "..."}]}
```
The configuration would look like:
## sharegpt.load_ultrachat
```yaml
datasets:
- path: ...
type: chat_template
chat_template: tokenizer_default
field_messages: conversations
message_field_role: from
message_field_content: value
roles_to_train: []
train_on_eos: turn
message_field_training: train
message_field_training_detail: train_detail
conversations where the turns field is 'messages', human is 'user' and gpt is 'assistant'.
```{.json filename="data.jsonl"}
{"messages": [{"user": "...", "assistant": "..."}]}
```
Tip: It is not necessary to use both `message_field_training` and `message_field_training_detail` at a time.
## sharegpt_jokes
creates a chat where bot is asked to tell a joke, then explain why the joke is funny
```{.json filename="data.jsonl"}
{"conversations": [{"title": "...", "text": "...", "explanation": "..."}]}
```

View File

@@ -51,12 +51,12 @@ While debugging it's helpful to simplify your test scenario as much as possible.
### Background
The below example shows how to configure VSCode to debug data preprocessing of the `chat_template` format. This is the format used when you have the following in your axolotl config:
The below example shows how to configure VSCode to debug data preprocessing of the `sharegpt` format. This is the format used when you have the following in your axolotl config:
```yaml
datasets:
- path: <path to your chat_template formatted dataset> # example on HF Hub: fozziethebeat/alpaca_messages_2k_test
type: chat_template
- path: <path to your sharegpt formatted dataset> # example on HF Hub: philschmid/guanaco-sharegpt-style
type: sharegpt
```
>[!Important]
@@ -83,7 +83,7 @@ If you developing on a remote host, you can easily use VSCode to debug remotely.
The easiest way to get started is to modify the [.vscode/launch.json](../.vscode/launch.json) file in this project. This is just an example configuration, so you may need to modify or copy it to suit your needs.
For example, to mimic the command `cd devtools && CUDA_VISIBLE_DEVICES=0 accelerate launch -m axolotl.cli.train dev_chat_template.yml`, you would use the below configuration[^1]. Note that we add additional flags that override the axolotl config and incorporate the tips above (see the comments). We also set the working directory to `devtools` and set the `env` variable `HF_HOME` to a temporary folder that is later partially deleted. This is because we want to delete the HF dataset cache before each run in order to ensure that the data preprocessing code is run from scratch.
For example, to mimic the command `cd devtools && CUDA_VISIBLE_DEVICES=0 accelerate launch -m axolotl.cli.train dev_sharegpt.yml`, you would use the below configuration[^1]. Note that we add additional flags that override the axolotl config and incorporate the tips above (see the comments). We also set the working directory to `devtools` and set the `env` variable `HF_HOME` to a temporary folder that is later partially deleted. This is because we want to delete the HF dataset cache before each run in order to ensure that the data preprocessing code is run from scratch.
```jsonc
// .vscode/launch.json
@@ -91,12 +91,12 @@ For example, to mimic the command `cd devtools && CUDA_VISIBLE_DEVICES=0 acceler
"version": "0.2.0",
"configurations": [
{
"name": "Debug axolotl prompt - chat_template",
"name": "Debug axolotl prompt - sharegpt",
"type": "python",
"module": "accelerate.commands.launch",
"request": "launch",
"args": [
"-m", "axolotl.cli.train", "dev_chat_template.yml",
"-m", "axolotl.cli.train", "dev_sharegpt.yml",
// The flags below simplify debugging by overriding the axolotl config
// with the debugging tips above. Modify as needed.
"--dataset_processes=1", // limits data preprocessing to one process
@@ -185,7 +185,7 @@ style="border-radius: 10px; display: block; margin: auto;" width="560" height="3
## Debugging With Docker
Using [official Axolotl Docker images](https://hub.docker.com/r/axolotlai/axolotl/tags) is a great way to debug your code, and is a very popular way to use Axolotl. Attaching VSCode to Docker takes a few more steps.
Using [official Axolotl Docker images](https://hub.docker.com/r/winglian/axolotl/tags) is a great way to debug your code, and is a very popular way to use Axolotl. Attaching VSCode to Docker takes a few more steps.
### Setup
@@ -202,11 +202,11 @@ cd axolotl
Next, run the desired docker image and mount the current directory. Below is a docker command you can run to do this:[^2]
```bash
docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --mount type=bind,src="${PWD}",target=/workspace/axolotl -v ${HOME}/.cache/huggingface:/root/.cache/huggingface axolotlai/axolotl:main-py3.10-cu118-2.0.1
docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --mount type=bind,src="${PWD}",target=/workspace/axolotl -v ${HOME}/.cache/huggingface:/root/.cache/huggingface winglian/axolotl:main-py3.10-cu118-2.0.1
```
>[!Tip]
> To understand which containers are available, see the [Docker section of the README](../README.md#docker) and the [DockerHub repo](https://hub.docker.com/r/axolotlai/axolotl/tags). For details of how the Docker containers are built, see axolotl's [Docker CI builds](../.github/workflows/main.yml).
> To understand which containers are available, see the [Docker section of the README](../README.md#docker) and the [DockerHub repo](https://hub.docker.com/r/winglian/axolotl/tags). For details of how the Docker containers are built, see axolotl's [Docker CI builds](../.github/workflows/main.yml).
You will now be in the container. Next, perform an editable install of Axolotl:
@@ -240,6 +240,6 @@ style="border-radius: 10px; display: block; margin: auto;" width="560" height="3
</div>
<br>
[^1]: The config actually mimics the command `CUDA_VISIBLE_DEVICES=0 python -m accelerate.commands.launch -m axolotl.cli.train devtools/chat_template.yml`, but this is the same thing.
[^1]: The config actually mimics the command `CUDA_VISIBLE_DEVICES=0 python -m accelerate.commands.launch -m axolotl.cli.train devtools/sharegpt.yml`, but this is the same thing.
[^2]: Many of the below flags are recommended best practices by Nvidia when using nvidia-container-toolkit. You can read more about these flags [here](https://docs.nvidia.com/deeplearning/frameworks/user-guide/index.html).

View File

@@ -11,10 +11,12 @@ standard industry baselines.
### Installation
The following will install the correct unsloth and extras from source.
The following will install unsloth from source and downgrade xformers as unsloth is incompatible with the most up
to date libraries.
```bash
python scripts/unsloth_install.py | sh
pip install --no-deps "unsloth @ git+https://github.com/unslothai/unsloth.git"
pip install --no-deps --force-reinstall xformers==0.0.26.post1
```
### Using unsloth w Axolotl

View File

@@ -2,15 +2,19 @@
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"metadata": {
"id": "AKjdG7tbTb-n"
},
"source": [
"## Setting up"
"# Example notebook for running Axolotl on google colab"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"id": "RcbNpOgWRcii"
},
"outputs": [],
"source": [
"import torch\n",
@@ -18,76 +22,82 @@
"assert (torch.cuda.is_available()==True)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "h3nLav8oTRA5"
},
"source": [
"## Install Axolotl and dependencies"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "3c3yGAwnOIdi",
"outputId": "e3777b5a-40ef-424f-e181-62dfecd1dd01"
},
"outputs": [],
"source": [
"!pip install axolotl[deepspeed]"
"!pip install -e git+https://github.com/axolotl-ai-cloud/axolotl#egg=axolotl\n",
"!pip install flash-attn==\"2.5.0\"\n",
"!pip install deepspeed==\"0.13.1\"\n",
"!pip install mlflow==\"2.13.0\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"metadata": {
"id": "BW2MFr7HTjub"
},
"source": [
"## Hugging Face login (optional)"
"## Create a yaml config file"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from huggingface_hub import notebook_login\n",
"notebook_login()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Example configuration"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"id": "9pkF2dSoQEUN"
},
"outputs": [],
"source": [
"import yaml\n",
"\n",
"# Your YAML string\n",
"yaml_string = \"\"\"\n",
"base_model: NousResearch/Meta-Llama-3.1-8B\n",
"base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T\n",
"model_type: LlamaForCausalLM\n",
"tokenizer_type: LlamaTokenizer\n",
"\n",
"load_in_8bit: false\n",
"load_in_4bit: true\n",
"strict: false\n",
"\n",
"datasets:\n",
" - path: tatsu-lab/alpaca\n",
" - path: mhenrichsen/alpaca_2k_test\n",
" type: alpaca\n",
"dataset_prepared_path: last_run_prepared\n",
"dataset_prepared_path:\n",
"val_set_size: 0.05\n",
"output_dir: ./outputs/lora-out\n",
"\n",
"sequence_len: 2048\n",
"sample_packing: true\n",
"eval_sample_packing: true\n",
"pad_to_sequence_len: true\n",
"output_dir: ./outputs/qlora-out\n",
"\n",
"adapter: qlora\n",
"lora_model_dir:\n",
"\n",
"sequence_len: 4096\n",
"sample_packing: true\n",
"eval_sample_packing: false\n",
"pad_to_sequence_len: true\n",
"\n",
"lora_r: 32\n",
"lora_alpha: 16\n",
"lora_dropout: 0.05\n",
"lora_target_modules:\n",
"lora_target_linear: true\n",
"lora_fan_in_fan_out:\n",
"lora_modules_to_save:\n",
" - embed_tokens\n",
" - lm_head\n",
"\n",
"wandb_project:\n",
"wandb_entity:\n",
@@ -95,12 +105,12 @@
"wandb_name:\n",
"wandb_log_model:\n",
"\n",
"gradient_accumulation_steps: 2\n",
"micro_batch_size: 1\n",
"num_epochs: 1\n",
"optimizer: paged_adamw_8bit\n",
"gradient_accumulation_steps: 4\n",
"micro_batch_size: 2\n",
"num_epochs: 4\n",
"optimizer: paged_adamw_32bit\n",
"lr_scheduler: cosine\n",
"learning_rate: 2e-5\n",
"learning_rate: 0.0002\n",
"\n",
"train_on_inputs: false\n",
"group_by_length: false\n",
@@ -111,15 +121,13 @@
"gradient_checkpointing: true\n",
"early_stopping_patience:\n",
"resume_from_checkpoint:\n",
"local_rank:\n",
"logging_steps: 1\n",
"xformers_attention:\n",
"flash_attention: false\n",
"sdp_attention: true\n",
"flash_attention: true\n",
"\n",
"warmup_steps: 1\n",
"max_steps: 25\n",
"evals_per_epoch: 1\n",
"eval_table_size:\n",
"warmup_steps: 10\n",
"evals_per_epoch: 4\n",
"saves_per_epoch: 1\n",
"debug:\n",
"deepspeed:\n",
@@ -127,9 +135,8 @@
"fsdp:\n",
"fsdp_config:\n",
"special_tokens:\n",
" pad_token: <|end_of_text|>\n",
"\"\"\"\n",
"\n",
"\"\"\"\n",
"\n",
"# Convert the YAML string to a Python dictionary\n",
"yaml_dict = yaml.safe_load(yaml_string)\n",
@@ -139,124 +146,31 @@
"\n",
"# Write the YAML file\n",
"with open(file_path, 'w') as file:\n",
" yaml.dump(yaml_dict, file)"
" yaml.dump(yaml_dict, file)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"metadata": {
"id": "bidoj8YLTusD"
},
"source": [
"Above we have a configuration file with base LLM model and datasets specified, among many other things. Axolotl can automatically detect whether the specified datasets are on HuggingFace repo or local machine.\n",
"\n",
"The Axolotl configuration options encompass model and dataset selection, data pre-processing, and training. Let's go through them line by line:\n",
"\n",
"* \"base model\": String value, specifies the underlying pre-trained LLM that will be used for finetuning\n",
"\n",
"Next we have options for model weights quantization. Quantization allows for reduction in occupied memory on GPUs.\n",
"\n",
"* \"load_in_8bit\": Boolean value, whether to quantize the model weights into 8-bit integer.\n",
"\n",
"* \"load_in_4bit\": Boolean value, whether to quantize the model weights into 4-bit integer.\n",
"\n",
"* \"strict\": Boolean value. If false, it allows for overriding established configuration options in the yaml file when executing in command-line interface.\n",
"\n",
"* \"datasets\": a list of dicts that contain path and type of data sets as well as other optional configurations where datasets are concerned. Supports multiple datasets.\n",
"\n",
"* \"val_set_size\": Either a float value less than one or an integer less than the total size of dataset. Sets the size of validation set from the whole dataset. If float, sets the proportion of the dataset assigned for validation. If integer, sets the direct size of validation set.\n",
"\n",
"* \"output_dir\": String value. Path of trained model.\n",
"\n",
"For data preprocessing:\n",
"\n",
"* \"sequence_len\": Integer. Specifies the maximum sequence length of the input. Typically 2048 or less.\n",
"\n",
"* \"pad_to_sequence_len\": Boolean. Padding input to maximum sequence length.\n",
"\n",
"* \"sample_packing\": Boolean. Specifies whether to use multi-packing with block diagonal attention.\n",
"\n",
"* \"special_tokens\": Python dict, optional. Allows users to specify the additional special tokens to be ignored by the tokenizer.\n",
"\n",
"For LoRA configuration and its hyperparameters:\n",
"\n",
"* \"adapter\": String. Either \"lora\" or \"qlora\", depending on user's choice.\n",
"\n",
"* \"lora_model_dir\": String, Optional. Path to directory that contains LoRA model, if there is already a trained LoRA model the user would like to use.\n",
"\n",
"* \"lora_r\": Integer. Refers to the rank of LoRA decomposition matrices. Higher value will reduce LoRA efficiency. Recommended to be set to 8.\n",
"\n",
"* \"lora_alpha\": Integer. Scale the weight matrices by $\\frac{\\text{lora_alpha}}{\\text{lora_r}}$. Recommended to be fixed at 16.\n",
"\n",
"* \"lora_dropout\": Float that is 1 or less. The dropout probability of a lora layer.\n",
"\n",
"* \"lora_target_linear\": Boolean. If true, lora will target all linear modules in the transformers architecture.\n",
"\n",
"* \"lora_modules_to_save\": If you added new tokens to the tokenizer, you may need to save some LoRA modules because they need to know the new tokens.\n",
"\n",
"See [LoRA](https://arxiv.org/abs/2106.09685) for detailed explanation of LoRA implementation.\n",
"\n",
"For the training configurations:\n",
"\n",
"* \"gradient_accumulation_steps\": Integer. The number of steps over which to accumulate gradient for batch training. E.g. if 2, backprop is performed every two steps.\n",
"\n",
"* \"micro_batch_size\": Integer. Batch size per gpu / gradient_accumulation_steps\n",
"\n",
"* \"num_epochs\": Integer. Number of epochs. One epoch is when training has looped over every batch in the whole data set once.\n",
"\n",
"* \"optimizer\": The optimizer to use for the training.\n",
"\n",
"* \"learning_rate\": The learning rate.\n",
"\n",
"* \"lr_scheduler\": The learning rate scheduler to use for adjusting learning rate during training.\n",
"\n",
"* \"train_on_inputs\": Boolean. Whether to ignore or include the user's prompt from the training labels.\n",
"\n",
"* \"group_by_length\": Boolean. Whether to group similarly sized data to minimize padding.\n",
"\n",
"* \"bf16\": Either \"auto\", \"true\", or \"false\". Whether to use CUDA bf16 floating point format. If set to \"auto\", will automatically apply bf16 should the gpu support it.\n",
"\n",
"* \"fp16\": Optional. Specifies whether to use CUDA fp16. Automatically set to true if \"bf16\" is set to true. Otherwise false.\n",
"\n",
"* \"tf32\": Boolean. Whether to use CUDA tf32. Will override bf16.\n",
"\n",
"* \"gradient_checkpointing\": Boolean. Whether to use gradient checkpointing https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing\n",
"\n",
"* \"gradient_checkpointing_kwargs\": Python Dict. Fed into the trainer.\n",
"\n",
"* \"logging_steps\": Integer. Log training information over every specified number of steps.\n",
"\n",
"* \"flash_attention\": Boolean. Whether to use the [flash attention](https://github.com/Dao-AILab/flash-attention) mechanism.\n",
"\n",
"* \"sdp_attention\": Boolean. Whether to use the Scaled Dot Product attention mechanism (the attention mechanism in the [original implementation](https://arxiv.org/abs/1706.03762) of transformers.)\n",
"\n",
"* \"warmup_steps\": Integer. The number of pre-training steps where a very low learning rate is used.\n",
"\n",
"* \"evals_per_epoch\": Integer. Number of evaluations to be performed within one training epoch.\n",
"\n",
"* \"saves_per_epoch\": Integer. Number of times the model is saved in one training epoch.\n",
"\n",
"* \"weight_decay\": Positive Float. Sets the \"strength\" of weight decay (i.e. setting the coefficient of L2 regularization)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The above is but a snippet aiming to get users familiarized with the types of streamlined configuration options axolotl provides. For a full list of configuration options, see [here](https://axolotl-ai-cloud.github.io/axolotl/docs/config.html)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Train the model"
"## Launch the training"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ydTI2Jk2RStU",
"outputId": "d6d0df17-4b53-439c-c802-22c0456d301b"
},
"outputs": [],
"source": [
"# By using the ! the command will be executed as a bash command\n",
"!accelerate launch -m axolotl.cli.train /content/test_axolotl.yaml"
]
},
@@ -264,7 +178,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Predict with trained model"
"## Play with inference"
]
},
{
@@ -273,85 +187,36 @@
"metadata": {},
"outputs": [],
"source": [
"# By using the ! the command will be executed as a bash command\n",
"!accelerate launch -m axolotl.cli.inference /content/test_axolotl.yaml \\\n",
" --lora_model_dir=\"./outputs/lora-out\" --gradio"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Deeper Dive"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"It is also helpful to gain some familiarity over some of the core inner workings of axolotl"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Configuration Normalization"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Axolotl uses a custom Dict class, called ```DictDefault```\n",
"to store configurations specified in the yaml configuration file (into a Python variable named ```cfg```). The definition for this custom Dict can be found in the [utils/dict.py](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/utils/dict.py)\n",
"\n",
"```DictDefault``` is amended such that calling a missing key from it will result in a ```None``` return type. This is important because if some configuration options aren't specified by the user, the ```None``` type allows Axolotl to perform boolean operations to determine the default settings for missing configurations. For more examples on how this is done, check out [utils/config/__init__.py](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/utils/config/__init__.py)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Loading Models, Tokenizers, and Trainer"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If we inspect [cli.train.py](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/cli/train.py), we will find that most of the heavy lifting was done by the function ```train()```, which is itself imported from [src/axolotl/train.py](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/train.py).\n",
"\n",
"```train()``` takes care of loading the appropriate tokenizer and pre-trained model through ```load_model()``` and ```load_tokenizer()``` from [src/axolotl/utils/models.py](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/utils/models.py) respectively.\n",
"\n",
"```load_tokenizer()``` loads in the appropriate tokenizer given the desired model, as well as chat templates.\n",
"\n",
"```ModelLoader``` class follows after tokenizer has been selected. It will automatically discern the base model type, load in the desired model, as well as applying model-appropriate attention mechanism modifications (e.g. flash attention). Depending on which base model the user chooses in the configuration, ```ModelLoader``` will utilize the corresponding \"attention hijacking\" script. For example, if the user specified the base model to be ```NousResearch/Meta-Llama-3.1-8B```, which is of llama type, and set ```flash_attn``` to ```True```, ```ModelLoader``` will load in [llama_attn_hijack_flash.py](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/monkeypatch/llama_attn_hijack_flash.py). For a list of supported attention hijacking, please refer to the directory [/src/axolotl/monkeypatch/](https://github.com/axolotl-ai-cloud/axolotl/tree/main/src/axolotl/monkeypatch)\n",
"\n",
"Another important operation encompassed in ```train()``` is setting up the training that takes into account user-specified training configurations (e.g. num_epochs, optimizer) through the use of ```setup_trainer()``` from [/src/axolotl/utils/trainer.py](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/utils/trainer.py), which in turn relies on modules from [/src/axolotl/core/trainer_builder.py](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/core/trainer_builder.py).\n",
"```trainer_builder.py``` provides a list of trainer object options bespoke for the task type (Causal or Reinforcement learning ('dpo', 'ipo', 'kto') )"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Monkey patch\n",
"\n",
"The [Monkey patch directory](https://github.com/axolotl-ai-cloud/axolotl/tree/main/src/axolotl/monkeypatch) is where model architecture/optimization patching scripts are stored (these are modifications that are not implemented in the official releases, hence the name monkey patch). It includes attention jacking, ReLoRA, and unsloth optimization."
" --qlora_model_dir=\"./qlora-out\" --gradio"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"gpuType": "T4",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"version": "3.9.6"
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
"nbformat_minor": 4
}

View File

@@ -9,17 +9,14 @@ strict: false
plugins:
- axolotl.integrations.liger.LigerPlugin
liger_rms_norm: true
liger_glu_activation: true
liger_swiglu: true
liger_fused_linear_cross_entropy: true
chat_template: deepseek_v2
datasets:
- path: mlabonne/FineTome-100k
type: chat_template
split: train[:20%]
field_messages: conversations
message_field_role: from
message_field_content: value
split: train
dataset_prepared_path: last_run_prepared
val_set_size: 0.0

View File

@@ -11,11 +11,8 @@ chat_template: gemma
datasets:
- path: cgato/SlimOrcaDedupCleaned
type: chat_template
chat_template: gemma
drop_system_message: true
field_messages: conversations
message_field_role: from
message_field_content: value
val_set_size: 0.0
output_dir: ./outputs/out

View File

@@ -1,63 +0,0 @@
base_model: google/gemma-2-2b
model_type: AutoModelForSequenceClassification
tokenizer_type: AutoTokenizer
load_in_8bit: false
load_in_4bit: false
strict: false
reward_model: true
chat_template: gemma
datasets:
- path: argilla/distilabel-intel-orca-dpo-pairs
type: bradley_terry.chat_template
val_set_size: 0.0
output_dir: ./outputs/out
remove_unused_columns: false
sequence_len: 2048
sample_packing: false
eval_sample_packing: false
pad_to_sequence_len: true
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: true
fp16:
tf32: true
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
warmup_ratio: 0.1
evals_per_epoch:
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:

View File

@@ -4,15 +4,11 @@ tokenizer_type: AutoTokenizer
load_in_4bit: true
strict: false
use_tensorboard: true
chat_template: jamba
datasets:
- path: cgato/SlimOrcaDedupCleaned
type: chat_template
chat_template: jamba
drop_system_message: true
field_messages: conversations
message_field_role: from
message_field_content: value
dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: jamba-large-fsdp-qlora-ft

View File

@@ -4,7 +4,7 @@ plugins:
- axolotl.integrations.liger.LigerPlugin
liger_rope: true
liger_rms_norm: true
liger_glu_activation: true
liger_swiglu: true
liger_fused_linear_cross_entropy: true
strict: false
@@ -14,10 +14,6 @@ datasets:
- path: mlabonne/FineTome-100k
type: chat_template
split: train[:20%]
field_messages: conversations
message_field_role: from
message_field_content: value
dataset_prepared_path: last_run_prepared
val_set_size: 0.02
output_dir: ./outputs/out

View File

@@ -11,6 +11,7 @@ rl: dpo
datasets:
- path: fozziethebeat/alpaca_messages_2k_dpo_test
type: chat_template.default
chat_template: llama3
field_messages: conversation
field_chosen: chosen
field_rejected: rejected

View File

@@ -10,6 +10,7 @@ chat_template: llama3
datasets:
- path: fozziethebeat/alpaca_messages_2k_test
type: chat_template
chat_template: llama3
field_messages: messages
message_field_role: role
message_field_content: content

View File

@@ -1,77 +0,0 @@
base_model: meta-llama/Llama-3.2-1B
load_in_8bit: false
load_in_4bit: true
strict: false
datasets:
- path: teknium/GPT4-LLM-Cleaned
type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.1
output_dir: ./outputs/qlora-out
adapter: qlora
lora_model_dir:
sequence_len: 2048
sample_packing: true
eval_sample_packing: true
pad_to_sequence_len: true
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:
lora_target_modules:
- gate_proj
- down_proj
- up_proj
- q_proj
- v_proj
- k_proj
- o_proj
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3
warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
pad_token: "<|end_of_text|>"

View File

@@ -1,93 +0,0 @@
#Note that we are switching from the regular chat template to chatml.
#If you experience problems with the special tokens, training for more epochs can help.
#After training, merge the model before inference otherwise you might
#face problems with the special tokens.
base_model: mistralai/Mistral-7B-Instruct-v0.2
model_type: MistralForCausalLM
tokenizer_type: LlamaTokenizer
load_in_8bit: false
load_in_4bit: true
strict: false
chat_template: chatml
rl: dpo
datasets:
- path: olivermolenschot/alpaca_messages_dpo_test
type: chat_template.default
field_messages: conversation
field_chosen: chosen
field_rejected: rejected
message_field_role: role
message_field_content: content
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/dpo-qlora
sequence_len: 2048
sample_packing: false
pad_to_sequence_len: true
adapter: qlora
lora_model_dir:
lora_r: 8
lora_alpha: 16
lora_dropout: 0.2
lora_target_linear: true
lora_fan_in_fan_out:
lora_target_modules:
- gate_proj
- down_proj
- up_proj
- q_proj
- v_proj
- k_proj
- o_proj
lora_modules_to_save:
- embed_tokens
- lm_head
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 4
micro_batch_size: 16
num_epochs: 6
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0001
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: false
s2_attention:
warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
bos_token: "<|im_start|>"
eos_token: "<|im_end|>"

View File

@@ -10,6 +10,7 @@ chat_template: phi_3
datasets:
- path: fozziethebeat/alpaca_messages_2k_test
type: chat_template
chat_template: phi_3
field_messages: messages
message_field_role: role
message_field_content: content

View File

@@ -1,67 +0,0 @@
base_model: Qwen/Qwen2.5-0.5B
strict: false
chat_template: qwen_25
rl: dpo
datasets:
- path: fozziethebeat/alpaca_messages_2k_dpo_test
type: chat_template.default
field_messages: conversation
field_chosen: chosen
field_rejected: rejected
message_field_role: role
message_field_content: content
roles:
system:
- system
user:
- user
assistant:
- assistant
dataset_prepared_path:
val_set_size: 0.0
output_dir: ./outputs/dpo-out
sequence_len: 2048
sample_packing: false
pad_to_sequence_len: true
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:

Binary file not shown.

Before

Width:  |  Height:  |  Size: 11 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 24 KiB

After

Width:  |  Height:  |  Size: 11 KiB

View File

@@ -1,19 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<svg xmlns="http://www.w3.org/2000/svg" version="1.1" viewBox="0 0 1113 283.5">
<path fill="#141310" d="M435,234.3l-12.1-48.8h-54.4l-12.1,48.8h-24.7l48.2-185.1h31.6l47.9,185.1h-24.5ZM417.7,164.9l-13.8-55.6c-2.7-10.7-4.8-19.7-6.3-26.9-.9-4.2-1.5-7.5-2-9.9-.5,2.5-1.2,5.8-2,9.9-1.5,7.1-3.6,16.1-6.3,26.7l-13.8,55.9h44.3Z"/>
<path fill="#141310" d="M568.2,234.3l-29.9-45.6c-1.2-1.9-2.4-4.1-3.5-6.5-.8-1.7-1.5-3.3-2.1-4.5-.6,1.3-1.4,2.8-2.3,4.5-1.3,2.4-2.6,4.6-4,6.5l-29.9,45.6h-28.5l49.6-71.9-46.5-67.9h28.5l27.6,43.1c1.2,1.9,2.3,3.9,3.4,6.1.7,1.4,1.4,2.7,1.9,3.8.5-1.1,1.1-2.4,1.8-3.8,1.1-2.2,2.2-4.2,3.4-6.1l27.6-43.1h28.5l-46.5,68.2,49.3,71.7h-28.5Z"/>
<path fill="#141310" d="M658.6,236.3c-16.7,0-30.2-5-40.1-14.8-9.9-9.8-14.9-23.7-14.9-41.3v-31.7c0-17.7,5-31.7,14.8-41.4,9.8-9.7,23.4-14.7,40.3-14.7s30.4,4.9,40.3,14.7c9.8,9.7,14.8,23.7,14.8,41.4v31.7c0,17.6-5,31.5-14.9,41.3-9.9,9.8-23.4,14.8-40.1,14.8ZM658.6,114.1c-9.5,0-17.1,2.7-22.6,8.1-5.5,5.4-8.3,13.4-8.3,23.8v36.7c0,10.5,2.8,18.5,8.3,23.8,5.5,5.4,13.1,8.1,22.6,8.1s17.3-2.7,22.7-8.1c5.4-5.4,8.2-13.4,8.2-23.9v-36.7c0-10.5-2.8-18.5-8.2-23.9-5.4-5.4-13.1-8.1-22.7-8.1Z"/>
<path fill="#141310" d="M860.6,236.3c-16.7,0-30.2-5-40.1-14.8-9.9-9.8-14.9-23.7-14.9-41.3v-31.7c0-17.7,5-31.7,14.8-41.4,9.8-9.7,23.4-14.7,40.3-14.7s30.4,4.9,40.3,14.7c9.8,9.7,14.8,23.7,14.8,41.4v31.7c0,17.6-5,31.5-14.9,41.3-9.9,9.8-23.4,14.8-40.1,14.8ZM860.6,114.1c-9.5,0-17.1,2.7-22.6,8.1-5.5,5.4-8.3,13.4-8.3,23.8v36.7c0,10.5,2.8,18.5,8.3,23.8,5.5,5.4,13.1,8.1,22.6,8.1s17.3-2.7,22.7-8.1c5.4-5.4,8.2-13.4,8.2-23.9v-36.7c0-10.5-2.8-18.5-8.2-23.9-5.4-5.4-13.1-8.1-22.7-8.1Z"/>
<path fill="#141310" d="M773.9,234c-18,0-32.6-14.6-32.6-32.6V48.8h24.1v152.6c0,4.7,3.8,8.5,8.5,8.5h16.8v24.1h-16.8Z"/>
<path fill="#141310" d="M1036.2,234.3V81.4c0-4.7-3.8-8.5-8.5-8.5h-16.8v-24.1h16.8c18,0,32.6,14.6,32.6,32.6v152.9h-24.1Z"/>
<path fill="#141310" d="M978.6,234.3c-18,0-32.6-14.6-32.6-32.6v-85.1h-20.3v-22.1h20.3v-45.3h24.1v45.3h30.2v22.1h-30.2v85.1c0,4.7,3.8,8.5,8.5,8.5h21.7v24.1h-21.7Z"/>
<path fill="#141310" d="M51.5,49h12.2v-20.6h-12.2c-16,0-29,13-29,29v32.8h20.6v-32.8c0-4.7,3.8-8.4,8.4-8.4Z"/>
<path fill="#141310" d="M92.8,49h12.2v-20.6h-12.2c-16,0-29,13-29,29v12.2h20.6v-12.2c0-4.7,3.8-8.4,8.4-8.4Z"/>
<path fill="#141310" d="M249.3,57.4c0-16-13-29-29-29h-12.2v20.6h12.2c4.7,0,8.4,3.8,8.4,8.4v32.8h20.6v-32.8Z"/>
<path fill="#141310" d="M187.4,90.2v-20.6h-103.1v20.6h-41.2v20.6h-20.6v41.2c0,11.4,9.2,20.6,20.6,20.6h185.5c11.4,0,20.6-9.2,20.6-20.6v-41.2h-20.6v-20.6h-41.2ZM166.8,141.7c0-5.7-4.6-10.3-10.3-10.3s-10.3,4.6-10.3,10.3v10.3h-20.6v-20.6c0-11.4,9.2-20.6,20.6-20.6s20.6,9.2,20.6,20.6v10.3ZM228.7,141.7c0-5.7-4.6-10.3-10.3-10.3s-10.3,4.6-10.3,10.3v10.3h-20.6v-20.6c0-11.4,9.2-20.6,20.6-20.6s20.6,9.2,20.6,20.6v10.3Z"/>
<path fill="#141310" d="M208,57.4c0-16-13-29-29-29h-12.2v20.6h12.2c4.7,0,8.4,3.8,8.4,8.4v12.2h20.6v-12.2Z"/>
<rect fill="#141310" x="22.5" y="234.5" width="41.2" height="20.6"/>
<rect fill="#141310" x="84.3" y="234.5" width="164.9" height="20.6"/>
<rect fill="#141310" x="208" y="193.3" width="41.2" height="20.6"/>
<rect fill="#141310" x="22.5" y="193.3" width="164.9" height="20.6"/>
</svg>

Before

Width:  |  Height:  |  Size: 3.2 KiB

View File

@@ -1,11 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<svg xmlns="http://www.w3.org/2000/svg" version="1.1" viewBox="0 0 1113 283.5">
<path fill="#fff" d="M462.9,234.2l-12.1-48.8h-54.4l-12.1,48.8h-24.7l48.2-185h31.6l47.9,185h-24.4ZM445.7,164.8l-13.8-55.6c-2.7-10.7-4.8-19.7-6.3-26.9-.9-4.2-1.5-7.5-2-9.9-.5,2.5-1.2,5.8-2,9.9-1.5,7.1-3.6,16.1-6.3,26.7l-13.8,55.9h44.3Z"/>
<path fill="#fff" d="M596.1,234.2l-29.9-45.6c-1.2-1.9-2.4-4.1-3.5-6.5-.8-1.7-1.5-3.3-2.1-4.5-.6,1.3-1.4,2.8-2.3,4.5-1.3,2.4-2.6,4.6-4,6.5l-29.9,45.6h-28.5l49.5-71.9-46.5-67.9h28.5l27.6,43.1c1.2,1.9,2.3,3.9,3.4,6.1.7,1.4,1.3,2.7,1.9,3.8.5-1.1,1.1-2.4,1.8-3.8,1.1-2.2,2.2-4.2,3.4-6.1l27.6-43.1h28.5l-46.5,68.1,49.3,71.6h-28.5Z"/>
<path fill="#fff" d="M686.4,236.2c-16.7,0-30.2-5-40.1-14.8-9.9-9.8-14.9-23.7-14.9-41.3v-31.7c0-17.7,5-31.6,14.8-41.4,9.8-9.7,23.4-14.7,40.2-14.7s30.4,4.9,40.2,14.7c9.8,9.7,14.8,23.7,14.8,41.4v31.7c0,17.6-5,31.4-14.9,41.3-9.9,9.8-23.4,14.8-40.1,14.8ZM686.4,114.1c-9.5,0-17.1,2.7-22.6,8.1-5.5,5.4-8.3,13.4-8.3,23.8v36.7c0,10.5,2.8,18.5,8.3,23.8,5.5,5.4,13.1,8.1,22.6,8.1s17.3-2.7,22.7-8.1c5.4-5.4,8.2-13.4,8.2-23.8v-36.7c0-10.5-2.8-18.5-8.2-23.8-5.4-5.4-13.1-8.1-22.7-8.1Z"/>
<path fill="#fff" d="M888.3,236.2c-16.7,0-30.2-5-40.1-14.8-9.9-9.8-14.9-23.7-14.9-41.3v-31.7c0-17.7,5-31.6,14.8-41.4,9.8-9.7,23.4-14.7,40.2-14.7s30.4,4.9,40.2,14.7c9.8,9.7,14.8,23.7,14.8,41.4v31.7c0,17.6-5,31.4-14.9,41.3-9.9,9.8-23.4,14.8-40.1,14.8ZM888.3,114.1c-9.5,0-17.1,2.7-22.6,8.1-5.5,5.4-8.3,13.4-8.3,23.8v36.7c0,10.5,2.8,18.5,8.3,23.8,5.5,5.4,13.1,8.1,22.6,8.1s17.3-2.7,22.7-8.1c5.4-5.4,8.2-13.4,8.2-23.8v-36.7c0-10.5-2.8-18.5-8.2-23.8-5.4-5.4-13.1-8.1-22.7-8.1Z"/>
<path fill="#fff" d="M801.7,234c-18,0-32.6-14.6-32.6-32.6V48.8h24.1v152.5c0,4.7,3.8,8.5,8.5,8.5h16.7v24.1h-16.7Z"/>
<path fill="#fff" d="M1063.8,234.2V81.4c0-4.7-3.8-8.5-8.5-8.5h-16.7v-24.1h16.7c18,0,32.6,14.6,32.6,32.6v152.8h-24.1Z"/>
<path fill="#fff" d="M1006.2,234.2c-18,0-32.6-14.6-32.6-32.6v-85h-20.3v-22.1h20.3v-45.2h24.1v45.2h30.2v22.1h-30.2v85c0,4.7,3.8,8.5,8.5,8.5h21.7v24.1h-21.7Z"/>
<path fill="#fff" d="M160.2,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM222,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM222,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM160.2,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM160.2,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM222,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM222,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM160.2,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM160.2,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM222,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM222,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM160.2,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM277.3,57.4c0-23.8-19.3-43.1-43.1-43.1h-12.2c-3.9,0-7.6,1.6-10.2,4.4-5.9-2.9-12.3-4.4-18.9-4.4h-12.2c-7.7,0-14.1,6.3-14.1,14.1v20.6c0,2.4.6,4.6,1.6,6.6h-37c1-2,1.6-4.2,1.6-6.6v-20.6c0-7.7-6.3-14.1-14.1-14.1h-12.2c-6.5,0-13,1.5-18.9,4.4-2.6-2.8-6.3-4.4-10.2-4.4h-12.2c-23.8,0-43.1,19.3-43.1,43.1v32.8c0,4.1,1.7,7.7,4.5,10.3-2.8,2.6-4.5,6.2-4.5,10.3v41.2c0,11,5.2,20.8,13.2,27.2-7.3.4-13.2,6.6-13.2,14v20.6c0,
4.1,1.7,7.7,4.5,10.3-2.8,2.6-4.5,6.2-4.5,10.3v20.6c0,7.7,6.3,14.1,14.1,14.1h41.2c4.1,0,7.7-1.7,10.3-4.5,2.6,2.8,6.2,4.5,10.3,4.5h164.9c7.7,0,14.1-6.3,14.1-14.1v-20.6c0-4.1-1.7-7.7-4.5-10.3,2.8-2.6,4.5-6.2,4.5-10.3v-20.6c0-7.5-5.8-13.6-13.2-14,8-6.4,13.2-16.2,13.2-27.2v-41.2c0-4.1-1.7-7.7-4.5-10.3,2.8-2.6,4.5-6.2,4.5-10.3v-32.8ZM77.8,255.1h-41.2v-20.6h41.2v20.6ZM36.5,213.9v-20.6h164.9v20.6H36.5ZM263.3,255.1H98.4v-20.6h164.9v20.6ZM263.3,213.9h-41.2v-20.6h41.2v20.6ZM263.3,90.2h-20.6v20.6h20.6v41.2c0,11.4-9.2,20.6-20.6,20.6H57.2c-11.4,0-20.6-9.2-20.6-20.6v-41.2h20.6v-20.6h-20.6v-32.8c0-16,13-29,29-29h12.2v20.6h-12.2c-4.7,0-8.4,3.8-8.4,8.4v32.8h41.2v-20.6h-20.6v-12.2c0-16,13-29,29-29h12.2v20.6h-12.2c-4.7,0-8.4,3.8-8.4,8.4v12.2h103.1v-12.2c0-4.7-3.8-8.4-8.4-8.4h-12.2v-20.6h12.2c16,0,29,13,29,29v12.2h-20.6v20.6h41.2v-32.8c0-4.7-3.8-8.4-8.4-8.4h-12.2v-20.6h12.2c16,0,29,13,29,29v32.8ZM201.4,152h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6s-20.6,9.2-20.6,20.6v20.6ZM160.2,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM160.2,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM222,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM222,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM160.2,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM160.2,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM222,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM222,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10
.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM160.2,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM160.2,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM222,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM222,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM160.2,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6Z"/>
</svg>

Before

Width:  |  Height:  |  Size: 6.6 KiB

View File

@@ -1,26 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<svg xmlns="http://www.w3.org/2000/svg" version="1.1" viewBox="0 0 283.5 283.5">
<defs>
<style>
.cls-1 {
fill: #141310;
}
</style>
</defs>
<!-- Generator: Adobe Illustrator 28.7.1, SVG Export Plug-In . SVG Version: 1.2.0 Build 142) -->
<g>
<g id="Layer_1">
<g>
<path class="cls-1" d="M46.9,37.4h13.7V14.2h-13.7c-18,0-32.7,14.6-32.7,32.7v36.9h23.2v-36.9c0-5.2,4.2-9.5,9.5-9.5Z"/>
<path class="cls-1" d="M93.2,37.4h13.7V14.2h-13.7c-18,0-32.7,14.6-32.7,32.7v13.7h23.2v-13.7c0-5.2,4.2-9.5,9.5-9.5Z"/>
<path class="cls-1" d="M269.3,46.9c0-18-14.6-32.7-32.7-32.7h-13.7v23.2h13.7c5.2,0,9.5,4.2,9.5,9.5v36.9h23.2v-36.9Z"/>
<path class="cls-1" d="M199.7,83.8v-23.2h-116v23.2h-46.4v23.2H14.2v46.4c0,12.8,10.4,23.2,23.2,23.2h208.7c12.8,0,23.2-10.4,23.2-23.2v-46.4h-23.2v-23.2h-46.4ZM176.5,141.7c0-6.4-5.2-11.6-11.6-11.6s-11.6,5.2-11.6,11.6v11.6h-23.2v-23.2c0-12.8,10.4-23.2,23.2-23.2s23.2,10.4,23.2,23.2v11.6ZM246.1,141.7c0-6.4-5.2-11.6-11.6-11.6s-11.6,5.2-11.6,11.6v11.6h-23.2v-23.2c0-12.8,10.4-23.2,23.2-23.2s23.2,10.4,23.2,23.2v11.6Z"/>
<path class="cls-1" d="M222.9,46.9c0-18-14.6-32.7-32.7-32.7h-13.7v23.2h13.7c5.2,0,9.5,4.2,9.5,9.5v13.7h23.2v-13.7Z"/>
<rect class="cls-1" x="14.2" y="246.1" width="46.4" height="23.2"/>
<rect class="cls-1" x="83.8" y="246.1" width="185.5" height="23.2"/>
<rect class="cls-1" x="222.9" y="199.7" width="46.4" height="23.2"/>
<rect class="cls-1" x="14.2" y="199.7" width="185.5" height="23.2"/>
</g>
</g>
</g>
</svg>

Before

Width:  |  Height:  |  Size: 1.6 KiB

View File

@@ -1,16 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<svg xmlns="http://www.w3.org/2000/svg" version="1.1" viewBox="0 0 283.5 283.5">
<defs>
<style>
.cls-1 {
fill: #fff;
}
</style>
</defs>
<!-- Generator: Adobe Illustrator 28.7.1, SVG Export Plug-In . SVG Version: 1.2.0 Build 142) -->
<g>
<g id="Layer_1">
<path class="cls-1" d="M152,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM214,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM214,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM152,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM152,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM214,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM214,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM152,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM152,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM214,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM214,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM152,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM269.3,57.3c0-23.8-19.4-43.1-43.1-43.1h-12.2c-3.9,0-7.6,1.6-10.2,4.4-5.9-2.9-12.3-4.4-18.9-4.4h-12.2c-7.8,0-14.1,6.3-14.1,14.1v20.6c0,2.4.6,4.6,1.6,6.6h-37c1-2,1.6-4.2,1.6-6.6v-20.6c0-7.8-6.3-14.1-14.1-14.1h-12.2c-6.6,0-13,1.5-18.9,4.4-2.6-2.8-6.3-4.4-10.2-4.4h-12.2c-23.8,0-43.1,19.4-43.1,43.1v32.8c0,4.1,1.7,7.7,4.5,10.3-2.8,2.6-4.5,6.2-4.5,10.3v41.3c0,11,5.2,20.9,13.2,27.2-7.4.4-13.2,6.6-13.2,14v20.6c0,4.1,1.7,7.
7,4.5,10.3-2.8,2.6-4.5,6.2-4.5,10.3v20.6c0,7.8,6.3,14.1,14.1,14.1h41.3c4.1,0,7.7-1.7,10.3-4.5,2.6,2.8,6.2,4.5,10.3,4.5h165.1c7.8,0,14.1-6.3,14.1-14.1v-20.6c0-4.1-1.7-7.7-4.5-10.3,2.8-2.6,4.5-6.2,4.5-10.3v-20.6c0-7.5-5.9-13.6-13.2-14,8-6.4,13.2-16.2,13.2-27.2v-41.3c0-4.1-1.7-7.7-4.5-10.3,2.8-2.6,4.5-6.2,4.5-10.3v-32.8ZM69.5,255.2H28.2v-20.6h41.3v20.6ZM28.2,214v-20.6h165.1v20.6H28.2ZM255.2,255.2H90.1v-20.6h165.1v20.6ZM255.2,214h-41.3v-20.6h41.3v20.6ZM255.2,90.1h-20.6v20.6h20.6v41.3c0,11.4-9.2,20.6-20.6,20.6H48.9c-11.4,0-20.6-9.2-20.6-20.6v-41.3h20.6v-20.6h-20.6v-32.8c0-16.1,13-29.1,29.1-29.1h12.2v20.6h-12.2c-4.7,0-8.4,3.8-8.4,8.4v32.8h41.3v-20.6h-20.6v-12.2c0-16.1,13-29.1,29.1-29.1h12.2v20.6h-12.2c-4.7,0-8.4,3.8-8.4,8.4v12.2h103.2v-12.2c0-4.7-3.8-8.4-8.4-8.4h-12.2v-20.6h12.2c16.1,0,29.1,13,29.1,29.1v12.2h-20.6v20.6h41.3v-32.8c0-4.7-3.8-8.4-8.4-8.4h-12.2v-20.6h12.2c16.1,0,29.1,13,29.1,29.1v32.8ZM193.3,152h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6s-20.6,9.2-20.6,20.6v20.6ZM152,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM152,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM214,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM214,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM152,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM152,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM214,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM214,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.
6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM152,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM152,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM214,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM214,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM152,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6Z"/>
</g>
</g>
</svg>

Before

Width:  |  Height:  |  Size: 5.0 KiB

View File

@@ -1,17 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<svg xmlns="http://www.w3.org/2000/svg" version="1.1" viewBox="0 0 765.4 212.6">
<!-- Generator: Adobe Illustrator 28.7.1, SVG Export Plug-In . SVG Version: 1.2.0 Build 142) -->
<g>
<g id="Layer_1">
<g>
<path d="M121.6,198.1l-12.1-48.8h-54.4l-12.1,48.8h-24.7L66.6,12.9h31.6l47.9,185.1h-24.5ZM104.4,128.6l-13.8-55.6c-2.7-10.7-4.8-19.7-6.3-26.9-.9-4.2-1.5-7.5-2-9.9-.5,2.5-1.2,5.8-2,9.9-1.5,7.1-3.6,16.1-6.3,26.7l-13.8,55.9h44.3Z"/>
<path d="M254.9,198.1l-29.9-45.6c-1.2-1.9-2.4-4.1-3.5-6.5-.8-1.7-1.5-3.3-2.1-4.5-.6,1.3-1.4,2.8-2.3,4.5-1.3,2.4-2.6,4.6-4,6.5l-29.9,45.6h-28.5l49.6-71.9-46.5-67.9h28.5l27.6,43.1c1.2,1.9,2.3,3.9,3.4,6.1.7,1.4,1.4,2.7,1.9,3.8.5-1.1,1.1-2.4,1.8-3.8,1.1-2.2,2.2-4.2,3.4-6.1l27.6-43.1h28.5l-46.5,68.2,49.3,71.7h-28.5Z"/>
<path d="M345.2,200.1c-16.7,0-30.2-5-40.1-14.8-9.9-9.8-14.9-23.7-14.9-41.3v-31.7c0-17.7,5-31.7,14.8-41.4,9.8-9.7,23.4-14.7,40.3-14.7s30.4,4.9,40.3,14.7c9.8,9.7,14.8,23.7,14.8,41.4v31.7c0,17.6-5,31.5-14.9,41.3-9.9,9.8-23.4,14.8-40.1,14.8ZM345.2,77.8c-9.5,0-17.1,2.7-22.6,8.1-5.5,5.4-8.3,13.4-8.3,23.8v36.7c0,10.5,2.8,18.5,8.3,23.8,5.5,5.4,13.1,8.1,22.6,8.1s17.3-2.7,22.7-8.1c5.4-5.4,8.2-13.4,8.2-23.9v-36.7c0-10.5-2.8-18.5-8.2-23.9-5.4-5.4-13.1-8.1-22.7-8.1Z"/>
<path d="M547.3,200.1c-16.7,0-30.2-5-40.1-14.8-9.9-9.8-14.9-23.7-14.9-41.3v-31.7c0-17.7,5-31.7,14.8-41.4,9.8-9.7,23.4-14.7,40.3-14.7s30.4,4.9,40.3,14.7c9.8,9.7,14.8,23.7,14.8,41.4v31.7c0,17.6-5,31.5-14.9,41.3-9.9,9.8-23.4,14.8-40.1,14.8ZM547.3,77.8c-9.5,0-17.1,2.7-22.6,8.1-5.5,5.4-8.3,13.4-8.3,23.8v36.7c0,10.5,2.8,18.5,8.3,23.8,5.5,5.4,13.1,8.1,22.6,8.1s17.3-2.7,22.7-8.1c5.4-5.4,8.2-13.4,8.2-23.9v-36.7c0-10.5-2.8-18.5-8.2-23.9-5.4-5.4-13.1-8.1-22.7-8.1Z"/>
<path d="M460.6,197.8c-18,0-32.6-14.6-32.6-32.6V12.5h24.1v152.6c0,4.7,3.8,8.5,8.5,8.5h16.8v24.1h-16.8Z"/>
<path d="M722.8,198.1V45.2c0-4.7-3.8-8.5-8.5-8.5h-16.8V12.5h16.8c18,0,32.6,14.6,32.6,32.6v152.9h-24.1Z"/>
<path d="M665.2,198.1c-18,0-32.6-14.6-32.6-32.6v-85.1h-20.3v-22.1h20.3V12.9h24.1v45.3h30.2v22.1h-30.2v85.1c0,4.7,3.8,8.5,8.5,8.5h21.7v24.1h-21.7Z"/>
</g>
</g>
</g>
</svg>

Before

Width:  |  Height:  |  Size: 2.1 KiB

View File

@@ -1,24 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<svg xmlns="http://www.w3.org/2000/svg" version="1.1" viewBox="0 0 765.4 212.6">
<defs>
<style>
.cls-1 {
fill: #fff;
}
</style>
</defs>
<!-- Generator: Adobe Illustrator 28.7.1, SVG Export Plug-In . SVG Version: 1.2.0 Build 142) -->
<g>
<g id="Layer_1">
<g>
<path class="cls-1" d="M121.6,198.1l-12.1-48.8h-54.4l-12.1,48.8h-24.7L66.6,12.9h31.6l47.9,185.1h-24.5ZM104.4,128.6l-13.8-55.6c-2.7-10.7-4.8-19.7-6.3-26.9-.9-4.2-1.5-7.5-2-9.9-.5,2.5-1.2,5.8-2,9.9-1.5,7.1-3.6,16.1-6.3,26.7l-13.8,55.9h44.3Z"/>
<path class="cls-1" d="M254.9,198.1l-29.9-45.6c-1.2-1.9-2.4-4.1-3.5-6.5-.8-1.7-1.5-3.3-2.1-4.5-.6,1.3-1.4,2.8-2.3,4.5-1.3,2.4-2.6,4.6-4,6.5l-29.9,45.6h-28.5l49.6-71.9-46.5-67.9h28.5l27.6,43.1c1.2,1.9,2.3,3.9,3.4,6.1.7,1.4,1.4,2.7,1.9,3.8.5-1.1,1.1-2.4,1.8-3.8,1.1-2.2,2.2-4.2,3.4-6.1l27.6-43.1h28.5l-46.5,68.2,49.3,71.7h-28.5Z"/>
<path class="cls-1" d="M345.2,200.1c-16.7,0-30.2-5-40.1-14.8-9.9-9.8-14.9-23.7-14.9-41.3v-31.7c0-17.7,5-31.7,14.8-41.4,9.8-9.7,23.4-14.7,40.3-14.7s30.4,4.9,40.3,14.7c9.8,9.7,14.8,23.7,14.8,41.4v31.7c0,17.6-5,31.5-14.9,41.3-9.9,9.8-23.4,14.8-40.1,14.8ZM345.2,77.8c-9.5,0-17.1,2.7-22.6,8.1-5.5,5.4-8.3,13.4-8.3,23.8v36.7c0,10.5,2.8,18.5,8.3,23.8,5.5,5.4,13.1,8.1,22.6,8.1s17.3-2.7,22.7-8.1c5.4-5.4,8.2-13.4,8.2-23.9v-36.7c0-10.5-2.8-18.5-8.2-23.9-5.4-5.4-13.1-8.1-22.7-8.1Z"/>
<path class="cls-1" d="M547.3,200.1c-16.7,0-30.2-5-40.1-14.8-9.9-9.8-14.9-23.7-14.9-41.3v-31.7c0-17.7,5-31.7,14.8-41.4,9.8-9.7,23.4-14.7,40.3-14.7s30.4,4.9,40.3,14.7c9.8,9.7,14.8,23.7,14.8,41.4v31.7c0,17.6-5,31.5-14.9,41.3-9.9,9.8-23.4,14.8-40.1,14.8ZM547.3,77.8c-9.5,0-17.1,2.7-22.6,8.1-5.5,5.4-8.3,13.4-8.3,23.8v36.7c0,10.5,2.8,18.5,8.3,23.8,5.5,5.4,13.1,8.1,22.6,8.1s17.3-2.7,22.7-8.1c5.4-5.4,8.2-13.4,8.2-23.9v-36.7c0-10.5-2.8-18.5-8.2-23.9-5.4-5.4-13.1-8.1-22.7-8.1Z"/>
<path class="cls-1" d="M460.6,197.8c-18,0-32.6-14.6-32.6-32.6V12.5h24.1v152.6c0,4.7,3.8,8.5,8.5,8.5h16.8v24.1h-16.8Z"/>
<path class="cls-1" d="M722.8,198.1V45.2c0-4.7-3.8-8.5-8.5-8.5h-16.8V12.5h16.8c18,0,32.6,14.6,32.6,32.6v152.9h-24.1Z"/>
<path class="cls-1" d="M665.2,198.1c-18,0-32.6-14.6-32.6-32.6v-85.1h-20.3v-22.1h20.3V12.9h24.1v45.3h30.2v22.1h-30.2v85.1c0,4.7,3.8,8.5,8.5,8.5h21.7v24.1h-21.7Z"/>
</g>
</g>
</g>
</svg>

Before

Width:  |  Height:  |  Size: 2.3 KiB

View File

@@ -2,4 +2,3 @@ pre-commit
black
mypy
types-requests
tbparse

View File

@@ -1,3 +1,2 @@
pytest
pytest-xdist
pytest-retry

View File

@@ -1,22 +1,22 @@
--extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
packaging==23.2
peft==0.13.2
transformers==4.46.3
tokenizers>=0.20.1
bitsandbytes==0.44.1
accelerate==1.1.0
datasets==3.1.0
deepspeed==0.15.4
peft==0.13.0
transformers==4.45.1
tokenizers>=0.19.1
bitsandbytes==0.44.0
accelerate==0.34.2
datasets==2.21.0
deepspeed==0.14.4
pydantic==2.6.3
addict
fire
PyYAML>=6.0
requests
flash-attn==2.7.0.post2
flash-attn==2.6.3
sentencepiece
wandb
einops
xformers>=0.0.23.post1
xformers==0.0.28.post1
optimum==1.16.2
hf_transfer
colorama
@@ -28,12 +28,13 @@ scipy
scikit-learn==1.4.2
pynvml
art
fschat @ git+https://github.com/lm-sys/FastChat.git@27a05b04a35510afb1d767ae7e5990cbd278f8fe
gradio==3.50.2
tensorboard
python-dotenv==1.0.1
autoawq==0.2.7.post2
autoawq>=0.2.5
triton>=2.3.0
liger-kernel==0.4.2
liger-kernel==0.3.0
mamba-ssm==1.2.0.post1
@@ -42,7 +43,7 @@ s3fs>=2024.5.0
gcsfs>=2024.5.0
# adlfs
trl==0.12.0
trl==0.9.6
zstandard==0.22.0
fastcore
@@ -51,6 +52,3 @@ lm_eval==0.4.4
langdetect==1.0.9
immutabledict==4.2.0
antlr4-python3-runtime==4.13.2
torchao==0.5.0
schedulefree==1.3.0

View File

@@ -1,315 +0,0 @@
accelerate==0.34.1
addict==2.4.0
aiofiles==23.2.1
aiohttp==3.9.0
aiosignal==1.3.1
aiostream==0.5.2
alembic==1.13.1
annotated-types==0.6.0
annoy==1.17.3
ansible==6.7.0
ansible-core==2.13.13
ansible-vault==2.1.0
anyio==3.7.1
appdirs==1.4.4
art==6.0
asgiref==3.7.2
async-timeout==4.0.2
attrdict==2.0.1
attrs==22.2.0
awscli==1.32.75
-e git+ssh://git@github.com/OpenAccess-AI-Collective/axolotl.git@6e354682e3c1735d3f7fb9e362280c38e922260f#egg=axolotl
backoff==2.2.1
base58==2.1.1
beartype==0.17.2
bitnet==0.2.1
bitsandbytes==0.42.0
bittensor==6.7.0
black==23.7.0
blinker==1.7.0
boto3==1.34.75
botocore==1.34.75
cachetools==5.3.3
cachy==0.1.1
certifi==2023.7.22
cffi==1.16.0
cfgv==3.3.1
chai-guanaco==1.2.4
charset-normalizer==3.2.0
cleo==0.6.8
click==8.1.7
cloudpickle==2.0.0
cohere==4.11.2
colorama==0.4.4
coloredlogs==15.0.1
CoLT5-attention==0.10.20
contextlib2==21.6.0
contourpy==1.2.0
cryptography==41.0.3
cycler==0.12.1
cytoolz==0.12.3
databricks-cli==0.18.0
dataclasses-json==0.5.7
datasets==2.11.0
ddt==1.6.0
decorator==5.1.1
deepspeed==0.15.0
# Editable Git install with no remote (dialogpt==0.1)
-e /Users/wing/Projects/ml/dialogpt/src
dill==0.3.6
distlib==0.3.6
docker==7.0.0
docker-pycreds==0.4.0
docstring-parser==0.15
docutils==0.16
ecdsa==0.18.0
einops==0.7.0
einops-exts==0.0.4
einx==0.1.3
entrypoints==0.4
eth-hash==0.6.0
eth-keys==0.5.0
eth-typing==4.0.0
eth-utils==2.3.1
evaluate==0.4.0
exceptiongroup==1.1.1
fastapi==0.109.2
fastcore==1.5.29
ffmpy==0.4.0
filelock==3.12.2
-e git+https://github.com/NousResearch/finetuning-subnet.git@24e9407d6b4430a7ca39d344692f89ce5a97d27e#egg=finetuning_subnet
fire==0.5.0
first==2.0.2
flake8==7.0.0
Flask==3.0.1
fonttools==4.47.2
frozendict==2.4.1
frozenlist==1.3.3
fschat @ git+https://github.com/lm-sys/FastChat.git@27a05b04a35510afb1d767ae7e5990cbd278f8fe
fsspec==2023.6.0
fuzzywuzzy==0.18.0
gitdb==4.0.10
GitPython==3.1.31
google-pasta==0.2.0
gradio==4.42.0
gradio_client==1.3.0
greenlet==2.0.2
grpclib==0.4.7
gunicorn==21.2.0
h11==0.14.0
h2==4.1.0
hpack==4.0.0
httpcore==0.17.3
httpx==0.24.1
huggingface-hub==0.23.4
humanfriendly==10.0
hyperframe==6.0.1
identify==2.5.24
idna==3.4
immutables==0.20
importlib-metadata==6.7.0
importlib-resources==6.1.1
inflection==0.5.1
iniconfig==2.0.0
itsdangerous==2.1.2
Jinja2==3.1.2
jmespath==1.0.1
joblib==1.3.2
jsonlines==3.1.0
jsonschema==2.6.0
kiwisolver==1.4.5
langchain==0.0.144
Levenshtein==0.24.0
libcst==1.1.0
liger-kernel==0.0.0
lion-pytorch==0.1.2
llama-cpp-python==0.1.36
llvmlite==0.40.1
local-attention==1.9.0
loguru==0.7.0
Mako==1.3.2
Markdown==3.5.2
markdown-it-py==3.0.0
markdown2==2.4.10
MarkupSafe==2.1.2
marshmallow==3.19.0
marshmallow-enum==1.5.1
matplotlib==3.8.2
mccabe==0.7.0
mdurl==0.1.2
MEGABYTE-pytorch==0.0.7
-e git+https://github.com/cg123/mergekit.git@53c5f414774a0558b8d84858fb6374bc93a8f1c1#egg=mergekit
mlflow==2.10.0
modal==0.62.77
more-itertools==10.2.0
mpmath==1.2.1
msgpack==1.0.7
msgpack-numpy-opentensor==0.5.0
multidict==6.0.4
multiprocess==0.70.14
munch==2.5.0
mypy==1.3.0
mypy-extensions==1.0.0
nest-asyncio==1.6.0
netaddr==0.10.1
networkx==3.0rc1
nh3==0.2.14
nodeenv==1.8.0
nomic==2.0.2
numba==0.57.1
numexpr==2.8.4
numpy==1.24.4
oauthlib==3.2.2
openai==0.27.4
openapi==1.1.0
openapi-schema-pydantic==1.2.4
optimum==1.8.6
orjson==3.10.7
packaging==23.1
pandas==2.0.0
parameterized==0.9.0
password-strength==0.0.3.post2
pastel==0.1.1
pathos==0.3.0
pathspec==0.11.1
pathtools==0.1.2
peft==0.11.1
pendulum==3.0.0
Pillow==9.5.0
pip-tools==1.11.0
platformdirs==3.2.0
pluggy==1.4.0
poetry==0.7.1
pox==0.3.2
ppft==1.7.6.6
pre-commit==3.3.2
prettytable==3.10.0
prompt-toolkit==3.0.39
protobuf==3.20.2
protobuf3-to-dict==0.1.5
psutil==5.9.5
psycopg==3.1.18
PuLP==2.8.0
py==1.11.0
py-bip39-bindings==0.1.11
py-cpuinfo==9.0.0
py-ed25519-zebra-bindings==1.0.1
py-sr25519-bindings==0.2.0
pyarrow==11.0.0
pyasn1==0.6.0
pycodestyle==2.11.1
pycparser==2.21
pycryptodome==3.20.0
pydantic==2.5.3
pydantic_core==2.14.6
pydub==0.25.1
pyfiglet==0.8.post1
pyflakes==3.2.0
Pygments==2.15.1
PyJWT==2.8.0
pylev==1.4.0
PyNaCl==1.5.0
pynvml==11.5.0
pyparsing==2.4.7
pyrsistent==0.14.11
pytest==8.0.2
pytest-asyncio==0.23.4
python-dateutil==2.8.2
python-dotenv==1.0.1
python-Levenshtein==0.24.0
python-multipart==0.0.9
pytz==2023.3
PyYAML==6.0.1
querystring-parser==1.2.4
rapidfuzz==3.6.1
regex==2023.6.3
requests==2.31.0
requests-toolbelt==0.8.0
resolvelib==0.8.1
responses==0.18.0
retry==0.9.2
rich==13.7.0
rsa==4.7.2
ruff==0.6.3
s3transfer==0.10.1
safetensors==0.4.5
sagemaker==2.148.0
scalecodec==1.2.7
schedulefree==1.2.1
schema==0.7.5
scikit-learn==1.4.0
scipy==1.9.3
seaborn==0.13.2
semantic-version==2.10.0
sentencepiece==0.2.0
sentry-sdk==1.19.1
setproctitle==1.3.2
shellingham==1.5.4
shortuuid==1.0.11
shtab==1.6.5
sigtools==4.0.1
six==1.16.0
skypilot==0.4.1
smdebug-rulesconfig==1.0.1
smmap==5.0.0
sniffio==1.3.0
SQLAlchemy==1.4.47
sqlparse==0.4.4
starlette==0.36.3
substrate-interface==1.5.2
svgwrite==1.4.3
sympy==1.11.1
synchronicity==0.6.7
tabulate==0.9.0
tblib==1.7.0
tenacity==8.2.2
tensor-parallel==2.0.0
termcolor==2.2.0
text2art==0.2.0
threadpoolctl==3.2.0
tiktoken==0.6.0
time-machine==2.14.1
timm==0.9.16
tokenizers==0.19.1
tokenmonster==1.1.12
toml==0.9.6
tomli==2.0.1
tomlkit==0.12.0
toolz==0.12.1
torch==2.2.0
torchdata==0.6.1
torchdiffeq==0.2.3
TorchFix==0.4.0
torchtext==0.15.2
torchvision==0.17.0
tqdm==4.66.2
transformers==4.44.2
trl==0.9.6
typer==0.12.5
types-certifi==2021.10.8.3
types-requests==2.31.0.20240125
types-setuptools==69.0.0.20240125
types-toml==0.10.8.7
typing==3.7.4.3
typing-inspect==0.8.0
typing_extensions==4.9.0
tyro==0.5.18
tzdata==2023.3
unique-names-generator==1.0.2
urllib3==2.2.2
uvicorn==0.22.0
vector_quantize_pytorch==1.14.1
virtualenv==20.23.0
voyager==2.0.2
wandb==0.16.2
watchfiles==0.21.0
wavedrom==2.0.3.post3
wcwidth==0.2.6
websocket-client==1.7.0
websockets==12.0
Werkzeug==3.0.1
wonderwords==2.2.0
xxhash==3.2.0
yarl==1.8.2
zetascale==2.2.7
zipp==3.15.0

View File

@@ -1,60 +0,0 @@
"""
helper script to parse chat datasets into a usable yaml
"""
import click
import yaml
from datasets import load_dataset
@click.command()
@click.argument("dataset", type=str)
@click.option("--split", type=str, default="train")
def parse_dataset(dataset=None, split="train"):
    """
    Inspect a chat dataset and print a matching `datasets:` YAML config entry.

    The requested split is loaded, the conversation column and the role/content
    keys of its messages are looked up among a few known spellings, and the
    resulting `chat_template` dataset config is dumped to stdout.

    Args:
        dataset: dataset path or name handed to `load_dataset`.
        split: name of the split to load.

    Raises:
        ValueError: if no known conversation, role, or content field is found.
    """
    ds_cfg = {}
    ds_cfg["path"] = dataset
    ds_cfg["split"] = split
    ds_cfg["type"] = "chat_template"
    ds_cfg["chat_template"] = "<<<Replace based on your model>>>"

    dataset = load_dataset(dataset, split=split)
    features = dataset.features
    feature_keys = features.keys()

    field_messages = next(
        (
            key
            for key in ["conversation", "conversations", "messages"]
            if key in feature_keys
        ),
        None,
    )
    if not field_messages:
        raise ValueError(
            f'No conversation field found in dataset: {", ".join(feature_keys)}'
        )
    ds_cfg["field_messages"] = field_messages

    # read the message keys from the column that was actually detected; the
    # column is not necessarily called "conversations"
    message_fields = features[field_messages][0].keys()

    message_field_role = next(
        (key for key in ["from", "role"] if key in message_fields), None
    )
    if not message_field_role:
        raise ValueError(
            f'No role field found in messages: {", ".join(message_fields)}'
        )
    ds_cfg["message_field_role"] = message_field_role

    message_field_content = next(
        (key for key in ["content", "text", "value"] if key in message_fields), None
    )
    if not message_field_content:
        raise ValueError(
            f'No content field found in messages: {", ".join(message_fields)}'
        )
    ds_cfg["message_field_content"] = message_field_content

    print(yaml.dump({"datasets": [ds_cfg]}))
if __name__ == "__main__":
parse_dataset()

View File

@@ -2,7 +2,7 @@
# Export specific ENV variables to /etc/rp_environment
echo "Exporting environment variables..."
printenv | grep -E '^HF_|^BNB_|^CUDA_|^NCCL_|^NV|^RUNPOD_|^PATH=|^_=' | sed 's/^\([^=]*\)=\(.*\)$/export \1="\2"/' | grep -v 'printenv' >> /etc/rp_environment
printenv | grep -E '^RUNPOD_|^PATH=|^_=' | sed 's/^\(.*\)=\(.*\)$/export \1="\2"/' >> /etc/rp_environment
echo 'source /etc/rp_environment' >> ~/.bashrc
add_keys_to_authorized() {

View File

@@ -1,33 +0,0 @@
# noqa
# pylint: skip-file
# torch must be importable: its version and CUDA build drive everything below
try:
    import torch
except ImportError:
    raise ImportError("Install torch via `pip install torch`")

from packaging.version import Version as V

v = V(torch.__version__)
cuda = str(torch.version.cuda)
# a device compute-capability major of 8 or more selects the "-ampere" extra
is_ampere = torch.cuda.get_device_capability()[0] >= 8

if cuda not in ("12.1", "11.8", "12.4"):
    raise RuntimeError(f"CUDA = {cuda} not supported!")

if v <= V("2.1.0"):
    raise RuntimeError(f"Torch = {v} too old!")

# (upper bound, whether the bound itself is included, extra-name template);
# rows are checked in order and the first match wins
_TORCH_EXTRAS = (
    (V("2.1.1"), True, "cu{}{}-torch211"),
    (V("2.1.2"), True, "cu{}{}-torch212"),
    (V("2.3.0"), False, "cu{}{}-torch220"),
    (V("2.4.0"), False, "cu{}{}-torch230"),
    (V("2.5.0"), False, "cu{}{}-torch240"),
    (V("2.6.0"), False, "cu{}{}-torch250"),
)
for _bound, _inclusive, _template in _TORCH_EXTRAS:
    if v < _bound or (_inclusive and v == _bound):
        x = _template
        break
else:
    raise RuntimeError(f"Torch = {v} too new!")

x = x.format(cuda.replace(".", ""), "-ampere" if is_ampere else "")
print(
    f'pip install unsloth-zoo && pip install --no-deps "unsloth[{x}] @ git+https://github.com/unslothai/unsloth.git"'
)

View File

@@ -30,19 +30,13 @@ def parse_requirements():
try:
xformers_version = [req for req in _install_requires if "xformers" in req][0]
torchao_version = [req for req in _install_requires if "torchao" in req][0]
autoawq_version = [req for req in _install_requires if "autoawq" in req][0]
if "Darwin" in platform.system():
# don't install xformers on MacOS
_install_requires.pop(_install_requires.index(xformers_version))
else:
# detect the version of torch already installed
# and set it so dependencies don't clobber the torch version
try:
torch_version = version("torch")
except PackageNotFoundError:
torch_version = "2.5.1"
torch_version = version("torch")
_install_requires.append(f"torch=={torch_version}")
version_match = re.match(r"^(\d+)\.(\d+)(?:\.(\d+))?", torch_version)
@@ -55,22 +49,11 @@ def parse_requirements():
else:
raise ValueError("Invalid version format")
if (major, minor) >= (2, 5):
_install_requires.pop(_install_requires.index(xformers_version))
if patch == 0:
_install_requires.append("xformers==0.0.28.post2")
else:
_install_requires.append("xformers==0.0.28.post3")
_install_requires.pop(_install_requires.index(autoawq_version))
elif (major, minor) >= (2, 4):
if (major, minor) >= (2, 4):
if patch == 0:
_install_requires.pop(_install_requires.index(xformers_version))
_install_requires.append("xformers>=0.0.27")
else:
_install_requires.pop(_install_requires.index(xformers_version))
_install_requires.append("xformers==0.0.28.post1")
elif (major, minor) >= (2, 3):
_install_requires.pop(_install_requires.index(torchao_version))
if (major, minor) >= (2, 3):
if patch == 0:
_install_requires.pop(_install_requires.index(xformers_version))
_install_requires.append("xformers>=0.0.26.post1")
@@ -78,16 +61,15 @@ def parse_requirements():
_install_requires.pop(_install_requires.index(xformers_version))
_install_requires.append("xformers>=0.0.27")
elif (major, minor) >= (2, 2):
_install_requires.pop(_install_requires.index(torchao_version))
_install_requires.pop(_install_requires.index(xformers_version))
_install_requires.append("xformers>=0.0.25.post1")
else:
_install_requires.pop(_install_requires.index(torchao_version))
_install_requires.pop(_install_requires.index(xformers_version))
_install_requires.append("xformers>=0.0.23.post1")
except PackageNotFoundError:
pass
return _install_requires, _dependency_links
@@ -96,24 +78,26 @@ install_requires, dependency_links = parse_requirements()
setup(
name="axolotl",
version="0.5.2",
version="0.4.1",
description="LLM Trainer",
long_description="Axolotl is a tool designed to streamline the fine-tuning of various AI models, offering support for multiple configurations and architectures.",
package_dir={"": "src"},
packages=find_packages("src"),
packages=find_packages(),
install_requires=install_requires,
dependency_links=dependency_links,
extras_require={
"flash-attn": [
"flash-attn==2.7.0.post2",
"flash-attn==2.6.3",
],
"fused-dense-lib": [
"fused-dense-lib @ git+https://github.com/Dao-AILab/flash-attention@v2.6.2#subdirectory=csrc/fused_dense_lib",
],
"deepspeed": [
"deepspeed==0.15.4",
"deepspeed==0.14.4",
"deepspeed-kernels",
],
"mamba-ssm": [
"mamba-ssm==1.2.0.post1",
"causal_conv1d",
],
"auto-gptq": [
"auto-gptq==0.5.1",

View File

@@ -30,10 +30,7 @@ from axolotl.common.cli import TrainerCliArgs, load_model_and_tokenizer
from axolotl.integrations.base import PluginManager
from axolotl.logging_config import configure_logging
from axolotl.train import TrainDatasetMeta
from axolotl.utils.chat_templates import (
get_chat_template,
get_chat_template_from_config,
)
from axolotl.utils.chat_templates import chat_templates
from axolotl.utils.comet_ import setup_comet_env_vars
from axolotl.utils.config import (
normalize_cfg_datasets,
@@ -193,19 +190,18 @@ def do_inference(
):
model, tokenizer = load_model_and_tokenizer(cfg=cfg, cli_args=cli_args)
prompter = cli_args.prompter
default_tokens = {"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>"}
for token, symbol in default_tokens.items():
# If the token isn't already specified in the config, add it
if not (cfg.special_tokens and token in cfg.special_tokens):
tokenizer.add_special_tokens({token: symbol})
prompter_module = None
chat_template_str = None
if prompter:
prompter_module = getattr(
importlib.import_module("axolotl.prompters"), prompter
)
elif cfg.chat_template:
chat_template_str = get_chat_template(cfg.chat_template)
elif cfg.datasets[0].type == "chat_template":
chat_template_str = get_chat_template_from_config(
cfg=cfg, ds_cfg=cfg.datasets[0], tokenizer=tokenizer
)
model = model.to(cfg.device, dtype=cfg.torch_dtype)
@@ -215,31 +211,13 @@ def do_inference(
instruction = get_multi_line_input()
if not instruction:
return
if prompter_module:
prompt: str = next(
prompter_module().build_prompt(instruction=instruction.strip("\n"))
)
else:
prompt = instruction.strip()
if chat_template_str:
batch = tokenizer.apply_chat_template(
[
{
"role": "user",
"content": prompt,
}
],
return_tensors="pt",
add_special_tokens=True,
add_generation_prompt=True,
chat_template=chat_template_str,
tokenize=True,
return_dict=True,
)
else:
batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
print("=" * 40)
model.eval()
@@ -279,6 +257,13 @@ def do_inference_gradio(
model, tokenizer = load_model_and_tokenizer(cfg=cfg, cli_args=cli_args)
prompter = cli_args.prompter
# default_tokens = {"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>"}
default_tokens: Dict[str, str] = {}
for token, symbol in default_tokens.items():
# If the token isn't already specified in the config, add it
if not (cfg.special_tokens and token in cfg.special_tokens):
tokenizer.add_special_tokens({token: symbol})
prompter_module = None
chat_template_str = None
@@ -287,7 +272,7 @@ def do_inference_gradio(
importlib.import_module("axolotl.prompters"), prompter
)
elif cfg.chat_template:
chat_template_str = get_chat_template(cfg.chat_template, tokenizer=tokenizer)
chat_template_str = chat_templates(cfg.chat_template)
model = model.to(cfg.device, dtype=cfg.torch_dtype)
@@ -477,12 +462,7 @@ def load_datasets(
processor=processor,
)
if (
cli_args.debug
or cfg.debug
or cli_args.debug_text_only
or int(cli_args.debug_num_examples) > 0
):
if cli_args.debug or cfg.debug:
LOG.info("check_dataset_labels...")
check_dataset_labels(
train_dataset.select(

View File

@@ -23,7 +23,10 @@ from axolotl.cli import (
)
from axolotl.common.cli import PreprocessCliArgs
from axolotl.common.const import DEFAULT_DATASET_PREPARED_PATH
from axolotl.utils.trainer import disable_datasets_caching
from axolotl.prompt_strategies.sharegpt import (
register_chatml_template,
register_llama3_template,
)
LOG = logging.getLogger("axolotl.cli.preprocess")
@@ -40,6 +43,23 @@ def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs):
return_remaining_strings=True
)
if parsed_cfg.chat_template == "chatml":
if parsed_cfg.default_system_message:
LOG.info(
f"ChatML set. Adding default system message: {parsed_cfg.default_system_message}"
)
register_chatml_template(parsed_cfg.default_system_message)
else:
register_chatml_template()
elif parsed_cfg.chat_template == "llama3":
if parsed_cfg.default_system_message:
LOG.info(
f"LLaMA-3 set. Adding default system message: {parsed_cfg.default_system_message}"
)
register_llama3_template(parsed_cfg.default_system_message)
else:
register_llama3_template()
if not parsed_cfg.dataset_prepared_path:
msg = (
Fore.RED
@@ -50,11 +70,10 @@ def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs):
LOG.warning(msg)
parsed_cfg.dataset_prepared_path = DEFAULT_DATASET_PREPARED_PATH
with disable_datasets_caching():
if parsed_cfg.rl: # and parsed_cfg.rl != "orpo":
load_rl_datasets(cfg=parsed_cfg, cli_args=parsed_cli_args)
else:
load_datasets(cfg=parsed_cfg, cli_args=parsed_cli_args)
if parsed_cfg.rl: # and parsed_cfg.rl != "orpo":
load_rl_datasets(cfg=parsed_cfg, cli_args=parsed_cli_args)
else:
load_datasets(cfg=parsed_cfg, cli_args=parsed_cli_args)
if parsed_cli_args.download:
model_name = parsed_cfg.base_model

View File

@@ -19,6 +19,10 @@ from axolotl.cli import (
)
from axolotl.common.cli import TrainerCliArgs
from axolotl.integrations.base import PluginManager
from axolotl.prompt_strategies.sharegpt import (
register_chatml_template,
register_llama3_template,
)
from axolotl.train import train
LOG = logging.getLogger("axolotl.cli.train")
@@ -38,6 +42,21 @@ def do_train(cfg, cli_args) -> None:
print_axolotl_text_art()
check_accelerate_default_config()
check_user_token()
if cfg.chat_template == "chatml" and cfg.default_system_message:
LOG.info(
f"ChatML set. Adding default system message: {cfg.default_system_message}"
)
register_chatml_template(cfg.default_system_message)
else:
register_chatml_template()
if cfg.chat_template == "llama3" and cfg.default_system_message:
LOG.info(
f"LLaMA-3 set. Adding default system message: {cfg.default_system_message}"
)
register_llama3_template(cfg.default_system_message)
else:
register_llama3_template()
if cfg.rl: # and cfg.rl != "orpo":
dataset_meta = load_rl_datasets(cfg=cfg, cli_args=cli_args)

View File

@@ -23,7 +23,7 @@ class TrainerCliArgs:
debug: bool = field(default=False)
debug_text_only: bool = field(default=False)
debug_num_examples: int = field(default=0)
debug_num_examples: int = field(default=5)
inference: bool = field(default=False)
merge_lora: bool = field(default=False)
prompter: Optional[str] = field(default=None)

View File

@@ -1,34 +0,0 @@
"""
ChatML transformation functions for MessageContents
"""
from typing import Optional
from ..messages import MessageContents, Messages
from .shared import wrap_tools
def format_message(
    message: Messages,
    message_index: Optional[int] = None,  # pylint: disable=unused-argument
) -> Messages:
    """
    Surround a message's contents with ChatML role markers, in place.

    A message already flagged as chat formatted is returned unchanged.
    Otherwise a zero-weight `<|im_start|>{role}` header is put in front, an
    `<|im_end|>` marker carrying the message weight and a zero-weight newline
    are added at the end, tool contents are wrapped, and the message is
    flagged as formatted.
    """
    if not message.is_chat_formatted:
        header = MessageContents(
            type="text",
            value=f"<|im_start|>{message.role}\n",
            weight=0,
        )
        message.content[:0] = [header]
        message.content.extend(
            [
                MessageContents(
                    type="text", value="<|im_end|>", weight=message.weight
                ),
                MessageContents(type="text", value="\n", weight=0),
            ]
        )
        message = wrap_tools(message)
        message.is_chat_formatted = True
    return message

View File

@@ -1,45 +0,0 @@
"""
Llama 3.x chat formatting functions for MessageContents
"""
from typing import Optional
from ..messages import MessageContents, Messages
from .shared import wrap_tools
def format_message(message: Messages, message_index: Optional[int] = None) -> Messages:
    """
    Surround a message's contents with Llama 3.x header markers, in place.

    A message already flagged as chat formatted is returned unchanged. The
    `tool` role is written as `ipython` in the header. After tool contents
    are wrapped, the message at index 0 additionally gets a leading
    `<|begin_of_text|>` piece.
    """
    if message.is_chat_formatted:
        return message

    header_role = "ipython" if message.role == "tool" else message.role

    # zero-weight role header goes first, the end-of-turn marker goes last
    message.content[:0] = [
        MessageContents(
            type="text",
            value=f"<|start_header_id|>{header_role}<|end_header_id|>\n\n",
            weight=0,
        )
    ]
    message.content.append(
        MessageContents(type="text", value="<|eot_id|>", weight=message.weight)
    )

    message = wrap_tools(message)

    if message_index == 0:
        message.content[:0] = [
            MessageContents(
                type="text",
                value="<|begin_of_text|>",
                weight=0,
            )
        ]

    message.is_chat_formatted = True
    return message

View File

@@ -1,47 +0,0 @@
"""
shared functions for format transforms
"""
from axolotl.core.chat.messages import MessageContents, Messages
def wrap_tools(message: Messages):
    """
    Wrap every tool call / tool response in a message with its text tags.

    Each `tool_call` content is surrounded by `<tool_call>` / `</tool_call>`
    text pieces and each `tool_response` content by `<tool_response>` /
    `</tool_response>`; the tags carry the message weight. The wrapped content
    is also flagged with `has_newline` so it renders with a trailing newline.
    The message is modified in place and returned.
    """
    tag_pairs = (
        ("tool_call", "<tool_call>\n", "</tool_call>\n"),
        ("tool_response", "<tool_response>\n", "</tool_response>\n"),
    )

    # walk backwards so the pieces spliced in never shift an unvisited index
    position = len(message.content)
    while position > 0:
        position -= 1
        item = message.content[position]
        match = next((pair for pair in tag_pairs if item.type == pair[0]), None)
        if match is None:
            continue
        _, opening, closing = match
        # make sure the wrapped content itself ends with a newline
        item.has_newline = True
        message.content[position : position + 1] = [
            MessageContents(type="text", value=opening, weight=message.weight),
            item,
            MessageContents(type="text", value=closing, weight=message.weight),
        ]
    return message

View File

@@ -1,230 +0,0 @@
"""
internal message representations of chat messages
"""
import json
from enum import Enum
from typing import Any, Callable, List, Optional, Union
from pydantic import BaseModel
from transformers import PreTrainedTokenizer
class MessageRoles(str, Enum):
    """
    Known chat message roles: system, user, assistant, tool, and ipython
    """

    system = "system"  # pylint: disable=invalid-name
    user = "user"  # pylint: disable=invalid-name
    assistant = "assistant"  # pylint: disable=invalid-name
    tool = "tool"  # pylint: disable=invalid-name
    # for responses from builtin tools
    ipython = "ipython"  # pylint: disable=invalid-name
class MessageContentTypes(str, Enum):
    """
    Kinds of message content: special tokens, text, image, audio, tool calls,
    and tool responses
    """

    special_token = "special_token"  # pylint: disable=invalid-name  # nosec B105
    text = "text"  # pylint: disable=invalid-name
    image = "image"  # pylint: disable=invalid-name
    audio = "audio"  # pylint: disable=invalid-name
    # to differentiate regular responses from tool calls from the assistant
    tool_call = "tool_call"  # pylint: disable=invalid-name
    tool_response = "tool_response"  # pylint: disable=invalid-name
class SpecialToken(str, Enum):
    """
    Placeholders for the beginning-of-string and end-of-string special tokens
    """

    bos_token = "bos_token"  # pylint: disable=invalid-name  # nosec B105
    eos_token = "eos_token"  # pylint: disable=invalid-name  # nosec B105
class ToolCallFunction(BaseModel):
    """
    A callable tool function: its name plus string-to-string arguments
    """

    name: str
    arguments: dict[str, str]
class Tool(BaseModel):
    """
    A tool definition: description, the function it exposes, and parameters
    """

    description: str
    function: ToolCallFunction
    parameters: dict[str, str]  # .properties
class ToolCallContents(BaseModel):
    """
    Payload of a tool call: name, arguments, and an optional id
    """

    name: str
    arguments: dict[str, Union[str, int]]
    id: Optional[str] = None  # pylint: disable=invalid-name

    def __str__(self) -> str:
        # serialize as JSON; the "id" key is emitted only when an id is set
        payload = {
            "name": self.name,
            "arguments": self.arguments,
            **({"id": self.id} if self.id is not None else {}),
        }
        return json.dumps(payload)
class ToolResponseContents(BaseModel):
    """
    Payload of a tool response: name, content, and an optional id
    """

    name: str
    content: Union[str, dict[str, Union[str, int, float]]]
    id: Optional[str] = None  # pylint: disable=invalid-name

    def __str__(self) -> str:
        # serialize as JSON; the "id" key is emitted only when an id is set
        payload = {
            "name": self.name,
            "content": self.content,
            **({"id": self.id} if self.id is not None else {}),
        }
        return json.dumps(payload)
class MessageContents(BaseModel):
    """
    One piece of a message: its type and value, plus optional metadata,
    weight, a trailing-newline flag, and an end-of-contents flag
    """

    type: Union[str, MessageContentTypes]
    value: Union[str, ToolCallContents, ToolResponseContents, SpecialToken]
    meta: Optional[dict[str, Any]] = None  # support additional arbitrary metadata
    weight: Optional[Union[int, float]] = None
    has_newline: bool = False
    eoc: bool = False  # end of contents

    def __str__(self) -> str:
        rendered = str(self.value)
        # append a newline only when requested and not already present
        needs_newline = self.has_newline and not rendered.endswith("\n")
        return rendered + "\n" if needs_newline else rendered
class Messages(BaseModel):
    """
    A single chat message: role, ordered contents, optional metadata and
    weight, and a flag recording whether chat formatting was already applied
    """

    role: Union[MessageRoles, str]  # allows for arbitrary roles
    content: List["MessageContents"]
    meta: Optional[dict[str, Any]] = None  # support additional arbitrary metadata
    weight: Optional[Union[int, float]] = None
    is_chat_formatted: bool = False

    def __str__(self) -> str:
        return "".join(map(str, self.content))

    def tokenized(
        self, tokenizer: PreTrainedTokenizer, ignore_index=-100
    ) -> dict[str, List[int]]:
        """
        Tokenize the message piece by piece and build training labels.

        The string built from the contents seen so far is re-tokenized after
        every piece. The tokens of one piece are committed only once the next
        piece has been tokenized, using the ids found at that position in the
        newer tokenization. A piece's tokens become labels when both the
        message weight is truthy and the piece weight is not 0; otherwise
        they are replaced by `ignore_index`.

        Returns:
            dict with `input_ids`, `attention_mask` (all ones), and `labels`.
        """
        tokenizable_types = (
            MessageContentTypes.text.value,
            MessageContentTypes.tool_call.value,
            MessageContentTypes.tool_response.value,
        )
        input_ids: List[int] = []
        labels: List[int] = []

        def commit(token_ids, trainable):
            # commit ids and either expose them as labels or mask them out
            input_ids.extend(token_ids)
            labels.extend(token_ids if trainable else [ignore_index] * len(token_ids))

        pending_input_ids: List[int] = []
        pending_weight = self.weight
        running_content = ""
        for msg_content in self.content:
            # TODO also handle non-text content types
            if msg_content.type not in tokenizable_types:
                continue
            running_content += str(msg_content)
            tok_input_ids = tokenizer(running_content, add_special_tokens=False)[
                "input_ids"
            ]
            if pending_input_ids:
                # take the previous piece's ids from the fresh tokenization, in
                # case concatenation changed how that span tokenizes
                start = len(input_ids)
                commit(
                    tok_input_ids[start : start + len(pending_input_ids)],
                    pending_weight,
                )
            pending_input_ids = tok_input_ids[len(input_ids) :]
            pending_weight = self.weight and msg_content.weight not in [0, 0.0]

        commit(pending_input_ids, pending_weight)

        return {
            "input_ids": input_ids,
            "attention_mask": [1] * len(input_ids),
            "labels": labels,
        }
class Chats(BaseModel):
    """
    Top-level container for a chat conversation (an ordered list of messages)
    """

    conversation: List[Messages]

    def __str__(self) -> str:
        return "".join(map(str, self.conversation))

    def tokenized(
        self, tokenizer: Callable[[str], dict[str, List[int]]], ignore_index=-100
    ) -> dict[str, List[int]]:
        """
        Tokenize every message in order and concatenate the per-message
        `input_ids`, `attention_mask`, and `labels`.
        """
        merged: dict[str, List[int]] = {
            "input_ids": [],
            "attention_mask": [],
            "labels": [],
        }
        for msg in self.conversation:
            msg_results = msg.tokenized(tokenizer, ignore_index)
            for key, values in merged.items():
                values.extend(msg_results[key])
        return merged
class ChatFormattedChats(Chats):
    """
    Chats whose messages are run through a formatter on construction, with an
    optional switch to train on inputs
    """

    formatter: Callable  # [[Union[dict, Chats]], Chats]
    train_on_inputs: bool = False

    def model_post_init(self, __context):
        # format each message with its position; when training on inputs,
        # force the formatted message's weight to 1
        for i, msg in enumerate(self.conversation):
            formatted = self.formatter(msg, message_index=i)
            self.conversation[i] = formatted
            if self.train_on_inputs:
                formatted.weight = 1
class PreferenceChats(BaseModel):
    """
    Preference data for chat: a prompt plus a chosen and a rejected message
    """

    prompt: List[Messages]
    chosen: Messages
    rejected: Messages

View File

@@ -1,55 +0,0 @@
"""
chat dataset module
"""
import os
from typing import Callable, Optional, Union
from datasets import Dataset
from transformers import PreTrainedTokenizer
from axolotl.core.chat.messages import ChatFormattedChats
class TokenizedChatDataset(Dataset):
    """
    Tokenized chat dataset
    """

    def __init__(
        self,
        data: Dataset,
        model_transform: Union[PreTrainedTokenizer, Callable],
        *args,
        message_transform: Optional[Callable] = None,
        formatter=None,
        process_count: Optional[int] = None,
        keep_in_memory: Optional[bool] = False,
        **kwargs,
    ):
        """
        Map every row of `data` to tokenized chat features.

        Args:
            data: source dataset; all of its original columns are dropped
                from the mapped result.
            model_transform: passed to `ChatFormattedChats.tokenized` as the
                tokenizer.
            message_transform: optional callable applied to each row first.
            formatter: optional formatter forwarded to `ChatFormattedChats`.
            process_count: worker count for `map`; falls back to the CPU
                count. The value used is capped at 64.
            keep_in_memory: forwarded to `Dataset.map`.
        """

        def map_fn(ex):
            if message_transform is not None:
                ex = message_transform(ex)
            if formatter is not None:
                ex = ChatFormattedChats(
                    formatter=formatter,
                    **ex,
                )
            else:
                # NOTE(review): ChatFormattedChats declares `formatter` without
                # a default, so this branch presumably needs the row itself to
                # supply one -- confirm
                ex = ChatFormattedChats(
                    **ex,
                )
            return ex.tokenized(model_transform)

        # os.cpu_count() may return None; fall back to a single worker so the
        # min() below never compares an int with None
        process_or_cpu_count: int = process_count or os.cpu_count() or 1
        num_proc = min(64, process_or_cpu_count)
        # hand map() a concrete list of column names rather than a dict view
        features = list(data.features.keys())
        tokenized_data = data.map(
            map_fn,
            num_proc=num_proc,
            keep_in_memory=keep_in_memory,
            remove_columns=features,
            desc="Tokenizing Chats",
        )
        super().__init__(tokenized_data.data, *args, **kwargs)

View File

@@ -1,150 +0,0 @@
"""
This module contains a function that builds a transform that takes a row from the dataset and converts it to a Chat.
"""
from typing import Any, Mapping, Union
def chat_message_transform_builder(  # pylint: disable=dangerous-default-value
    train_on_inputs=False,
    conversations_field: str = "conversations",
    message_field_role: Union[str, list[str]] = ["role", "from"],  # commonly "role"
    message_field_content: Union[str, list[str]] = [
        "value",
        "text",
        "content",
    ],  # commonly "content"
    message_field_training: Union[str, list[str]] = [
        "train",
        "weight",
    ],  # commonly "weight"
):
    """Builds a transform that takes a row from the dataset and converts it to a Chat

    Args:
        train_on_inputs (bool, optional):
            If True, every role gets a default weight of 1. If False, only the
            assistant role does. Defaults to False.
        conversations_field (str, optional):
            The field name of the conversations. Defaults to "conversations".
        message_field_role (str | list[str], optional):
            Candidate field name(s) of the role; the first one present in the
            first message is used. Defaults to ["role", "from"].
        message_field_content (str | list[str], optional):
            Candidate field name(s) of the message content.
            Defaults to ["value", "text", "content"].
        message_field_training (str | list[str], optional):
            Candidate field name(s) of the per-message train/weight value.
            Defaults to ["train", "weight"].

    Returns:
        Callable:
            A function that takes a dataset row and returns
            {"conversation": [...]} with one dict per message.
    """

    def _as_list(names):
        # accept either a single field name or a list of candidates
        return [names] if isinstance(names, str) else names

    role_fields = _as_list(message_field_role)
    content_fields = _as_list(message_field_content)
    weight_fields = _as_list(message_field_training)

    role_value_mappings = {
        "system": "system",
        "user": "user",
        "human": "user",
        "assistant": "assistant",
        "gpt": "assistant",
        "tool": "tool",
        "ipython": "ipython",
    }
    if train_on_inputs:
        role_default_weights_mappings = {
            "system": 1,
            "user": 1,
            "assistant": 1,
            "tool": 1,
            "ipython": 1,
        }
    else:
        role_default_weights_mappings = {
            "system": 0,
            "user": 0,
            "assistant": 1,
            "tool": 0,
            "ipython": 0,
        }

    def _first_present(candidates, message):
        # first candidate field name that exists in the message, else None
        return next((name for name in candidates if name in message), None)

    def transform_builder(sample: Mapping[str, Any]):
        if conversations_field not in sample:
            raise ValueError(f"Field '{conversations_field}' not found in sample.")

        # field names are detected once, from the first message of the row
        first_message = sample[conversations_field][0]

        role_field = _first_present(role_fields, first_message)
        if role_field is None:
            raise ValueError("No role field found in message.")

        message_content_field = _first_present(content_fields, first_message)
        if message_content_field is None:
            raise ValueError("No message_content field found in message.")

        # search the normalized candidate list: iterating the raw argument
        # would scan a plain string character by character and never match
        message_weight_field = _first_present(weight_fields, first_message)

        messages = []
        for message in sample[conversations_field]:
            role = role_value_mappings[message[role_field]]
            weight = (
                int(message[message_weight_field])
                if message_weight_field
                else role_default_weights_mappings[role]
            )

            # TODO if "tool_calls" in message[message_content_field]: then convert tool call to ToolCallContents
            content = message[message_content_field]
            if isinstance(content, str):
                # plain strings become a single text content piece
                content = [
                    {
                        "type": "text",
                        "value": content,
                    }
                ]
            messages.append(
                {
                    "role": role,
                    "content": content,
                    "weight": weight,
                }
            )

        return {"conversation": messages}

    return transform_builder

View File

@@ -7,7 +7,6 @@ import abc
import gc
import importlib
import importlib.util
import inspect
import logging
import math
import os
@@ -28,6 +27,7 @@ from torch.optim.lr_scheduler import OneCycleLR
from torch.utils.data import BatchSampler, DataLoader, RandomSampler, SequentialSampler
from transformers import (
EarlyStoppingCallback,
PreTrainedModel,
Trainer,
TrainerCallback,
TrainingArguments,
@@ -43,12 +43,9 @@ from trl import (
KTOTrainer,
ORPOConfig,
ORPOTrainer,
RewardConfig,
RewardTrainer,
)
from trl.trainer.utils import RewardDataCollatorWithPadding, pad_to_length
from trl.trainer.utils import pad_to_length
from axolotl.integrations.base import PluginManager
from axolotl.monkeypatch.multipack import SUPPORTED_MULTIPACK_MODEL_TYPES
from axolotl.monkeypatch.relora import ReLoRACallback, ReLoRAScheduler
from axolotl.utils import is_comet_available, is_mlflow_available
@@ -64,7 +61,7 @@ from axolotl.utils.callbacks import (
log_prediction_callback_factory,
)
from axolotl.utils.callbacks.lisa import lisa_callback_factory
from axolotl.utils.chat_templates import get_chat_template
from axolotl.utils.chat_templates import chat_templates
from axolotl.utils.collators import (
BatchSamplerDataCollatorForSeq2Seq,
DataCollatorForSeq2Seq,
@@ -304,13 +301,6 @@ class AxolotlCPOConfig(AxolotlTrainingMixins, CPOConfig):
)
@dataclass
class AxolotlRewardConfig(AxolotlTrainingMixins, RewardConfig):
"""
Reward config for Reward training
"""
class SchedulerMixin(Trainer):
"""
Mixin class for scheduler setup in CausalTrainer.
@@ -408,10 +398,12 @@ class AxolotlTrainer(SchedulerMixin, Trainer):
def __init__(
self,
*_args,
num_epochs=1,
bench_data_collator=None,
eval_data_collator=None,
**kwargs,
):
self.num_epochs = num_epochs
self.bench_data_collator = bench_data_collator
self.eval_data_collator = eval_data_collator
super().__init__(*_args, **kwargs)
@@ -436,13 +428,7 @@ class AxolotlTrainer(SchedulerMixin, Trainer):
if (
self.args.loraplus_lr_ratio is None
and self.args.alternate_optimizer
not in [
"optimi_adamw",
"ao_adamw_8bit",
"ao_adamw_4bit",
"ao_adamw_fp8",
"adopt_adamw",
]
not in ["optimi_adamw", "ao_adamw_8bit", "ao_adamw_4bit", "ao_adamw_fp8"]
):
return super().create_optimizer()
@@ -511,14 +497,6 @@ class AxolotlTrainer(SchedulerMixin, Trainer):
self.optimizer = ( # pylint: disable=attribute-defined-outside-init
AdamWFp8(optimizer_grouped_parameters, **optimizer_kwargs)
)
elif self.args.alternate_optimizer == "adopt_adamw":
from axolotl.utils.optimizers.adopt import ADOPT
self.optimizer = ( # pylint: disable=attribute-defined-outside-init
ADOPT(
optimizer_grouped_parameters, decoupled=True, **optimizer_kwargs
)
)
if is_sagemaker_mp_enabled():
self.optimizer = smp.DistributedOptimizer( # pylint: disable=attribute-defined-outside-init
@@ -681,9 +659,7 @@ class AxolotlTrainer(SchedulerMixin, Trainer):
return DataLoader(bench_dataset, **dataloader_params)
# return self.accelerator.prepare(DataLoader(bench_dataset, **dataloader_params))
def compute_loss(
self, model, inputs, return_outputs=False, num_items_in_batch=None
):
def compute_loss(self, model, inputs, return_outputs=False):
# use one's weighted cross entropy loss calc
# if self.args.sample_packing:
# labels = inputs.pop("labels")
@@ -691,18 +667,8 @@ class AxolotlTrainer(SchedulerMixin, Trainer):
# loss = trainer_weighted_loss(outputs, labels, shift_labels=True)
# return (loss, outputs) if return_outputs else loss
if self.args.orpo_alpha:
return self.orpo_compute_loss(
model,
inputs,
return_outputs=return_outputs,
num_items_in_batch=num_items_in_batch,
)
return super().compute_loss(
model,
inputs,
return_outputs=return_outputs,
num_items_in_batch=num_items_in_batch,
)
return self.orpo_compute_loss(model, inputs, return_outputs=return_outputs)
return super().compute_loss(model, inputs, return_outputs=return_outputs)
@staticmethod
def orpo_concatenate_inputs(inputs, label_pad_token=-100, pad_token=0, device=None):
@@ -798,13 +764,7 @@ class AxolotlTrainer(SchedulerMixin, Trainer):
).squeeze(2)
return torch.mul(per_token_logps, mask).sum(dim=1) / mask.sum(dim=1)
def orpo_compute_loss(
self,
model,
inputs,
return_outputs=False,
num_items_in_batch=None, # pylint: disable=unused-argument
):
def orpo_compute_loss(self, model, inputs, return_outputs=False):
concat_inputs = AxolotlTrainer.orpo_concatenate_inputs(
inputs,
label_pad_token=-100,
@@ -910,13 +870,13 @@ class AxolotlTrainer(SchedulerMixin, Trainer):
for key, value in metrics.items():
self._stored_metrics[train_eval][key].append(value)
def _save_checkpoint(self, model, trial, **kwargs):
def _save_checkpoint(self, model, trial, metrics=None):
# make sure the checkpoint dir exists, since trainer is flakey
checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}"
run_dir = self._get_output_dir(trial=trial)
output_dir = os.path.join(run_dir, checkpoint_folder)
os.makedirs(output_dir, exist_ok=True)
return super()._save_checkpoint(model, trial, **kwargs)
return super()._save_checkpoint(model, trial, metrics=metrics)
class AxolotlMambaTrainer(AxolotlTrainer):
@@ -931,7 +891,6 @@ class AxolotlMambaTrainer(AxolotlTrainer):
model,
inputs,
return_outputs=False, # pylint: disable=unused-argument
num_items_in_batch=None, # pylint: disable=unused-argument
):
input_ids = inputs.pop("input_ids")
lm_logits = model(input_ids).logits
@@ -1038,46 +997,19 @@ class AxolotlDPOTrainer(SchedulerMixin, DPOTrainer):
return super().push_to_hub(*args, **kwargs)
@staticmethod
def tokenize_row(
features,
processing_class,
max_prompt_length,
max_completion_length,
add_special_tokens,
self, feature, model: Optional[Union[PreTrainedModel, torch.nn.Module]] = None
) -> Dict:
res = DPOTrainer.tokenize_row(
features,
processing_class,
max_prompt_length,
max_completion_length,
add_special_tokens,
)
# fix when the tokenizer doesn't have a bos_token_id, e.g. Qwen
if processing_class.bos_token is None and res["prompt_input_ids"][0] is None:
res = super().tokenize_row(feature, model=model)
if self.tokenizer.bos_token_id is None and res["prompt_input_ids"][0] is None:
for key in res.keys():
res[key] = res[key][1:]
if processing_class.bos_token and processing_class.bos_token_id is not None:
# dpo trainer may incorrectly prepend the bos_token_id to the dpo outputs
if res["chosen_input_ids"][0] == processing_class.bos_token_id:
res["chosen_input_ids"] = res["chosen_input_ids"][1:]
res["chosen_labels"] = res["chosen_labels"][1:]
res["chosen_attention_mask"] = res["chosen_attention_mask"][1:]
if res["rejected_input_ids"][0] == processing_class.bos_token_id:
res["rejected_input_ids"] = res["rejected_input_ids"][1:]
res["rejected_labels"] = res["rejected_labels"][1:]
res["rejected_attention_mask"] = res["rejected_attention_mask"][1:]
return res
def training_step(
self,
model: nn.Module,
inputs: Dict[str, Union[torch.Tensor, Any]],
num_items_in_batch=None,
self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]
) -> torch.Tensor:
loss: torch.Tensor = super().training_step(model, inputs, num_items_in_batch)
loss: torch.Tensor = super().training_step(model, inputs)
gc.collect()
torch.cuda.empty_cache()
return loss
@@ -1107,14 +1039,6 @@ class AxolotlCPOTrainer(SchedulerMixin, CPOTrainer):
tag_names = ["axolotl", "cpo"]
class AxolotlRewardTrainer(SchedulerMixin, RewardTrainer):
"""
Extend the base RewardTrainer for axolotl helpers
"""
tag_names = ["axolotl", "reward"]
class TrainerBuilderBase(abc.ABC):
"""
Base class for trainer builder
@@ -1175,28 +1099,17 @@ class TrainerBuilderBase(abc.ABC):
def get_callbacks(self) -> List[TrainerCallback]:
callbacks = []
plugin_manager = PluginManager.get_instance()
callbacks.extend(
plugin_manager.add_callbacks_pre_trainer(cfg=self.cfg, model=self.model)
)
if self.cfg.use_wandb:
callbacks.append(
SaveAxolotlConfigtoWandBCallback(self.cfg.axolotl_config_path)
)
if self.cfg.use_mlflow and is_mlflow_available():
from transformers.integrations.integration_utils import MLflowCallback
from axolotl.utils.callbacks.mlflow_ import (
SaveAxolotlConfigtoMlflowCallback,
)
callbacks.extend(
[
SaveAxolotlConfigtoMlflowCallback(self.cfg.axolotl_config_path),
MLflowCallback,
]
callbacks.append(
SaveAxolotlConfigtoMlflowCallback(self.cfg.axolotl_config_path)
)
if self.cfg.use_comet and is_comet_available():
from axolotl.utils.callbacks.comet_ import SaveAxolotlConfigtoCometCallback
@@ -1207,23 +1120,11 @@ class TrainerBuilderBase(abc.ABC):
return callbacks
@abstractmethod
def get_post_trainer_create_callbacks(self, trainer):
"""
Callbacks added after the trainer is created, usually b/c these need access to the trainer
"""
callbacks = []
if self.cfg.plugins:
plugin_manager = PluginManager.get_instance()
callbacks.extend(
[
cb
for cb in plugin_manager.add_callbacks_post_trainer(
self.cfg, trainer
)
if cb
]
)
return callbacks
def hook_pre_create_training_args(self, training_arguments_kwargs):
# TODO
@@ -1306,8 +1207,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
if self.cfg.lisa_step_interval and self.cfg.lisa_n_layers:
callbacks.append(lisa_callback_factory(trainer))
callbacks.extend(super().get_post_trainer_create_callbacks(trainer=trainer))
return callbacks
def _get_trainer_cls(self):
@@ -1315,8 +1214,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
return ReLoRATrainer
if self.cfg.model_config_type == "mamba":
return AxolotlMambaTrainer
if self.cfg.reward_model:
return AxolotlRewardTrainer
return AxolotlTrainer
def build(self, total_num_steps):
@@ -1425,15 +1322,17 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
if not self.cfg.test_datasets and self.cfg.val_set_size == 0:
# no eval set, so don't eval
training_arguments_kwargs["eval_strategy"] = "no"
training_arguments_kwargs["evaluation_strategy"] = "no"
elif self.cfg.eval_steps:
training_arguments_kwargs["eval_strategy"] = "steps"
training_arguments_kwargs["evaluation_strategy"] = "steps"
training_arguments_kwargs["eval_steps"] = self.cfg.eval_steps
elif self.cfg.eval_strategy:
training_arguments_kwargs["eval_strategy"] = self.cfg.eval_strategy
elif self.cfg.evaluation_strategy:
training_arguments_kwargs[
"evaluation_strategy"
] = self.cfg.evaluation_strategy
else:
# we have an eval set, but no steps defined, default to use epoch
training_arguments_kwargs["eval_strategy"] = "epoch"
training_arguments_kwargs["evaluation_strategy"] = "epoch"
if self.cfg.save_steps:
training_arguments_kwargs["save_strategy"] = "steps"
@@ -1546,12 +1445,9 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
report_to.append("comet_ml")
training_arguments_kwargs["report_to"] = report_to
if self.cfg.use_wandb:
training_arguments_kwargs["run_name"] = self.cfg.wandb_name
elif self.cfg.use_mlflow:
training_arguments_kwargs["run_name"] = self.cfg.mlflow_run_name
else:
training_arguments_kwargs["run_name"] = None
training_arguments_kwargs["run_name"] = (
self.cfg.wandb_name if self.cfg.use_wandb else None
)
training_arguments_kwargs["optim"] = (
self.cfg.optimizer if self.cfg.optimizer else "adamw_hf"
)
@@ -1640,9 +1536,8 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
training_arguments_kwargs["model_type"] = self.cfg.model_config_type
training_arguments_kwargs["pretraining"] = bool(self.cfg.pretraining_dataset)
if self.cfg.chat_template:
training_arguments_kwargs["chat_template"] = get_chat_template(
self.cfg.chat_template,
tokenizer=self.tokenizer,
training_arguments_kwargs["chat_template"] = chat_templates(
self.cfg.chat_template
)
if self.cfg.rl == "orpo":
@@ -1655,16 +1550,11 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
trainer_kwargs = {}
if self.cfg.reward_model:
trainer_kwargs["max_length"] = self.cfg.sequence_len
# pylint: disable=duplicate-code
if self.cfg.optimizer in [
"optimi_adamw",
"ao_adamw_4bit",
"ao_adamw_8bit",
"ao_adamw_fp8",
"adopt_adamw",
]:
# Set default so transformers doesn't throw
training_arguments_kwargs["optim"] = "adamw_hf"
@@ -1703,13 +1593,10 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
"accelerator_config"
] = self.cfg.accelerator_config
training_args_cls = (
AxolotlTrainingArguments
if not self.cfg.reward_model
else AxolotlRewardConfig
)
training_args = training_args_cls( # pylint: disable=unexpected-keyword-arg
**training_arguments_kwargs,
training_args = (
AxolotlTrainingArguments( # pylint: disable=unexpected-keyword-arg
**training_arguments_kwargs,
)
)
training_args = self.hook_post_create_training_args(training_args)
@@ -1731,37 +1618,27 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
# https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html
data_collator_kwargs["pad_to_multiple_of"] = 64
if self.cfg.reward_model:
data_collator_kwargs["max_length"] = self.cfg.sequence_len
trainer_cls = self._get_trainer_cls()
trainer_kwargs, trainer_cls = self.hook_pre_create_trainer(
trainer_kwargs, trainer_cls
)
if eval_data_collator := self.build_collator(
training_args, is_eval=True, **data_collator_kwargs
):
if not self.cfg.reward_model:
trainer_kwargs["eval_data_collator"] = eval_data_collator
if not self.cfg.reward_model:
trainer_kwargs["bench_data_collator"] = transformers.DataCollatorForSeq2Seq(
self.tokenizer,
return_tensors="pt",
**data_collator_kwargs,
)
sig = inspect.signature(trainer_cls)
if "processing_class" in sig.parameters.keys():
trainer_kwargs["processing_class"] = self.tokenizer
else:
trainer_kwargs["tokenizer"] = self.tokenizer
trainer = trainer_cls(
model=self.model,
train_dataset=self.train_dataset,
eval_dataset=self.eval_dataset,
args=training_args,
tokenizer=self.tokenizer,
data_collator=self.build_collator(training_args, **data_collator_kwargs),
eval_data_collator=self.build_collator(
training_args, is_eval=True, **data_collator_kwargs
),
bench_data_collator=transformers.DataCollatorForSeq2Seq(
self.tokenizer,
return_tensors="pt",
**data_collator_kwargs,
),
callbacks=self.get_callbacks(),
num_epochs=self.cfg.num_epochs,
**trainer_kwargs,
)
trainer = self.hook_post_create_trainer(trainer)
@@ -1795,14 +1672,9 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
V2BatchSamplerDataCollatorForSeq2Seq,
BatchSamplerDataCollatorForSeq2Seq,
DataCollatorForSeq2Seq,
RewardDataCollatorWithPadding,
]
]
if self.cfg.reward_model:
collator = RewardDataCollatorWithPadding
if "max_length" in kwargs:
kwargs.pop("max_length")
elif use_batch_sampler_collator:
if use_batch_sampler_collator:
if self.cfg.model_config_type in SUPPORTED_MULTIPACK_MODEL_TYPES:
collator = V2BatchSamplerDataCollatorForSeq2Seq
elif (
@@ -1839,7 +1711,7 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
return callbacks
def get_post_trainer_create_callbacks(self, trainer):
callbacks = super().get_post_trainer_create_callbacks(trainer=trainer)
callbacks = []
return callbacks
def build_training_arguments(self, total_num_steps):
@@ -1867,10 +1739,10 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
training_args_kwargs["save_safetensors"] = self.cfg.save_safetensors
if self.eval_dataset:
training_args_kwargs["eval_strategy"] = "steps"
training_args_kwargs["evaluation_strategy"] = "steps"
training_args_kwargs["eval_steps"] = self.cfg.eval_steps
else:
training_args_kwargs["eval_strategy"] = "no"
training_args_kwargs["evaluation_strategy"] = "no"
if self.cfg.bf16 or self.cfg.bfloat16:
training_args_kwargs["bf16"] = True
@@ -1925,18 +1797,17 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
# default to saving each epoch if not defined
training_args_kwargs["save_strategy"] = "epoch"
training_args_kwargs["dataset_num_proc"] = self.cfg.dataset_processes
if self.cfg.rl_beta:
training_args_kwargs["beta"] = self.cfg.rl_beta
if self.cfg.orpo_alpha:
# trl does some odd mapping of alpha to beta to reuse the beta parameter ???
training_args_kwargs["beta"] = self.cfg.orpo_alpha
training_args_kwargs["dataset_num_proc"] = self.cfg.dataset_processes
training_args_cls = AxolotlDPOConfig
if self.cfg.rpo_alpha is not None:
training_args_kwargs["rpo_alpha"] = self.cfg.rpo_alpha
training_args_cls = None
if self.cfg.rl == "simpo":
training_args_cls = AxolotlCPOConfig
training_args_kwargs["loss_type"] = "simpo"
@@ -1945,13 +1816,13 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
if self.cfg.cpo_alpha is not None:
training_args_kwargs["cpo_alpha"] = self.cfg.cpo_alpha
elif self.cfg.rl == "orpo":
if self.cfg.rl == "orpo":
training_args_cls = AxolotlORPOConfig
training_args_kwargs["max_length"] = self.cfg.sequence_len
if self.cfg.max_prompt_len:
training_args_kwargs["max_prompt_length"] = self.cfg.max_prompt_len
elif self.cfg.rl == "kto":
if self.cfg.rl == "kto":
training_args_cls = AxolotlKTOConfig
training_args_kwargs["desirable_weight"] = (
@@ -1966,17 +1837,6 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
if self.cfg.max_prompt_len:
training_args_kwargs["max_prompt_length"] = self.cfg.max_prompt_len
else:
training_args_cls = AxolotlDPOConfig
if self.cfg.rl == "ipo":
training_args_kwargs["loss_type"] = "ipo"
training_args_kwargs["max_length"] = self.cfg.sequence_len
training_args_kwargs["max_completion_length"] = None
training_args_kwargs["max_prompt_length"] = self.cfg.sequence_len
training_args_kwargs["generate_during_eval"] = self.cfg.use_wandb
if self.cfg.dpo_use_weighting is not None:
training_args_kwargs["use_weighting"] = self.cfg.dpo_use_weighting
training_args = training_args_cls( # pylint: disable=unexpected-keyword-arg
output_dir=self.cfg.output_dir,
per_device_train_batch_size=self.cfg.micro_batch_size,
@@ -1997,6 +1857,7 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
training_args = self.build_training_arguments(total_num_steps)
dpo_trainer_kwargs = {}
if self.cfg.rl == "ipo":
dpo_trainer_kwargs["loss_type"] = "ipo"
if self.cfg.dpo_label_smoothing:
dpo_trainer_kwargs["label_smoothing"] = self.cfg.dpo_label_smoothing
if self.eval_dataset:
@@ -2010,6 +1871,12 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
if self.cfg.rl in ["dpo", "ipo"]:
trainer_cls = AxolotlDPOTrainer
trainer_cls_args = [self.model, self.model_ref]
# these aren't used for the ORPO trainer
dpo_trainer_kwargs["max_length"] = self.cfg.sequence_len
dpo_trainer_kwargs["max_target_length"] = None
dpo_trainer_kwargs["max_prompt_length"] = self.cfg.sequence_len
dpo_trainer_kwargs["generate_during_eval"] = True
elif self.cfg.rl == "orpo":
trainer_cls = AxolotlORPOTrainer
trainer_cls_args = [self.model]
@@ -2021,17 +1888,11 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
trainer_cls_args = [self.model]
else:
raise ValueError(f"Unsupported RL: {self.cfg.rl}")
sig = inspect.signature(trainer_cls)
if "processing_class" in sig.parameters.keys():
dpo_trainer_kwargs["processing_class"] = self.tokenizer
else:
dpo_trainer_kwargs["tokenizer"] = self.tokenizer
dpo_trainer = trainer_cls(
*trainer_cls_args,
args=training_args,
train_dataset=self.train_dataset,
tokenizer=self.tokenizer,
callbacks=self.get_callbacks(),
**dpo_trainer_kwargs,
)
@@ -2053,11 +1914,11 @@ class HFPPOTrainerBuilder(TrainerBuilderBase):
"""
def get_callbacks(self):
callbacks = super().get_callbacks()
callbacks = []
return callbacks
def get_post_trainer_create_callbacks(self, trainer):
callbacks = super().get_post_trainer_create_callbacks(trainer=trainer)
callbacks = []
return callbacks
def build(self, total_num_steps):

View File

@@ -18,10 +18,9 @@ Plugins can be used to integrate third-party models, modify the training process
To create a new plugin, you need to inherit from the BasePlugin class and implement the required methods.
"""
import collections
import importlib
import logging
from typing import OrderedDict
from typing import List
class BasePlugin:
@@ -48,7 +47,7 @@ class BasePlugin:
Initializes the BasePlugin.
"""
def register(self, cfg): # pylint: disable=unused-argument
def register(self, cfg):
"""
Registers the plugin with the given configuration.
@@ -64,7 +63,7 @@ class BasePlugin:
Returns a pydantic model for the plugin's input arguments.
"""
def pre_model_load(self, cfg): # pylint: disable=unused-argument
def pre_model_load(self, cfg):
"""
Performs actions before the model is loaded.
@@ -75,7 +74,7 @@ class BasePlugin:
None
"""
def post_model_load(self, cfg, model): # pylint: disable=unused-argument
def post_model_load(self, cfg, model):
"""
Performs actions after the model is loaded.
@@ -87,7 +86,7 @@ class BasePlugin:
None
"""
def pre_lora_load(self, cfg, model): # pylint: disable=unused-argument
def pre_lora_load(self, cfg, model):
"""
Performs actions before LoRA weights are loaded.
@@ -99,7 +98,7 @@ class BasePlugin:
None
"""
def post_lora_load(self, cfg, model): # pylint: disable=unused-argument
def post_lora_load(self, cfg, model):
"""
Performs actions after LoRA weights are loaded.
@@ -111,7 +110,7 @@ class BasePlugin:
None
"""
def create_optimizer(self, cfg, trainer): # pylint: disable=unused-argument
def create_optimizer(self, cfg, trainer):
"""
Creates and returns an optimizer for training.
@@ -123,9 +122,7 @@ class BasePlugin:
object: The created optimizer.
"""
def create_lr_scheduler(
self, cfg, trainer, optimizer
): # pylint: disable=unused-argument
def create_lr_scheduler(self, cfg, trainer, optimizer):
"""
Creates and returns a learning rate scheduler.
@@ -138,9 +135,9 @@ class BasePlugin:
object: The created learning rate scheduler.
"""
def add_callbacks_pre_trainer(self, cfg, model): # pylint: disable=unused-argument
def add_callbacks_pre_trainer(self, cfg, model):
"""
setup callbacks before creating the trainer.
Adds callbacks to the trainer before training.
Parameters:
cfg (dict): The configuration for the plugin.
@@ -149,25 +146,20 @@ class BasePlugin:
Returns:
List[callable]: A list of callback functions to be added to the TrainingArgs
"""
return []
def add_callbacks_post_trainer(
self, cfg, trainer
): # pylint: disable=unused-argument
def add_callbacks_post_trainer(self, cfg, trainer):
"""
Adds callbacks to the trainer after creating the trainer.
This is useful for callbacks that require access to the model or trainer.
Adds callbacks to the trainer after training.
Parameters:
cfg (dict): The configuration for the plugin.
trainer (object): The trainer object for training.
Returns:
List[callable]: A list of callback functions to be added
List[callable]: A list of callback functions to be added to the TrainingArgs
"""
return []
def post_train(self, cfg, model): # pylint: disable=unused-argument
def post_train(self, cfg, model):
"""
Performs actions after training is complete.
@@ -179,7 +171,7 @@ class BasePlugin:
None
"""
def post_train_unload(self, cfg): # pylint: disable=unused-argument
def post_train_unload(self, cfg):
"""
Performs actions after training is complete and the model is unloaded.
@@ -235,7 +227,7 @@ class PluginManager:
pre_model_load(cfg): Calls the pre_model_load method of all registered plugins.
"""
plugins: OrderedDict[str, BasePlugin] = collections.OrderedDict()
plugins: List[BasePlugin] = []
_instance = None
@@ -245,7 +237,7 @@ class PluginManager:
"""
if cls._instance is None:
cls._instance = super(PluginManager, cls).__new__(cls)
cls._instance.plugins = collections.OrderedDict()
cls._instance.plugins: List[BasePlugin] = []
return cls._instance
@staticmethod
@@ -273,7 +265,7 @@ class PluginManager:
"""
try:
plugin = load_plugin(plugin_name)
self.plugins[plugin_name] = plugin
self.plugins.append(plugin)
except ImportError:
logging.error(f"Failed to load plugin: {plugin_name}")
@@ -285,7 +277,7 @@ class PluginManager:
list[str]: A list of Pydantic classes for all registered plugins' input arguments.'
"""
input_args = []
for plugin in self.plugins.values():
for plugin in self.plugins:
input_args_from_plugin = plugin.get_input_args()
if input_args_from_plugin is not None:
input_args.append(input_args_from_plugin)
@@ -301,7 +293,7 @@ class PluginManager:
Returns:
None
"""
for plugin in self.plugins.values():
for plugin in self.plugins:
plugin.pre_model_load(cfg)
def post_model_load(self, cfg, model):
@@ -315,7 +307,7 @@ class PluginManager:
Returns:
None
"""
for plugin in self.plugins.values():
for plugin in self.plugins:
plugin.post_model_load(cfg, model)
def pre_lora_load(self, cfg, model):
@@ -329,7 +321,7 @@ class PluginManager:
Returns:
None
"""
for plugin in self.plugins.values():
for plugin in self.plugins:
plugin.pre_lora_load(cfg, model)
def post_lora_load(self, cfg, model):
@@ -343,7 +335,7 @@ class PluginManager:
Returns:
None
"""
for plugin in self.plugins.values():
for plugin in self.plugins:
plugin.post_lora_load(cfg, model)
def create_optimizer(self, cfg, trainer):
@@ -357,7 +349,7 @@ class PluginManager:
Returns:
object: The created optimizer, or None if none was found.
"""
for plugin in self.plugins.values():
for plugin in self.plugins:
optimizer = plugin.create_optimizer(cfg, trainer)
if optimizer is not None:
return optimizer
@@ -375,7 +367,7 @@ class PluginManager:
Returns:
object: The created learning rate scheduler, or None if none was found.
"""
for plugin in self.plugins.values():
for plugin in self.plugins:
scheduler = plugin.create_lr_scheduler(cfg, trainer, optimizer)
if scheduler is not None:
return scheduler
@@ -393,10 +385,8 @@ class PluginManager:
List[callable]: A list of callback functions to be added to the TrainingArgs.
"""
callbacks = []
for plugin in self.plugins.values():
plugin_callbacks = plugin.add_callbacks_pre_trainer(cfg, model)
if plugin_callbacks: # if the plugin returned a list of callbacks
callbacks.extend(plugin_callbacks)
for plugin in self.plugins:
callbacks.extend(plugin.add_callbacks_pre_trainer(cfg, model))
return callbacks
def add_callbacks_post_trainer(self, cfg, trainer):
@@ -411,10 +401,8 @@ class PluginManager:
List[callable]: A list of callback functions to be added to the TrainingArgs.
"""
callbacks = []
for plugin in self.plugins.values():
plugin_callbacks = plugin.add_callbacks_post_trainer(cfg, trainer)
if plugin_callbacks:
callbacks.extend(plugin_callbacks)
for plugin in self.plugins:
callbacks.extend(plugin.add_callbacks_post_trainer(cfg, trainer))
return callbacks
def post_train_unload(self, cfg):
@@ -428,5 +416,5 @@ class PluginManager:
Returns:
None
"""
for plugin in self.plugins.values():
for plugin in self.plugins:
plugin.post_train_unload(cfg)

View File

@@ -1,21 +0,0 @@
MIT License
Copyright (c) 2024 Jaerin Lee, Bong Gyun Kang, Kihoon Kim, Kyoung Mu Lee
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@@ -1,13 +0,0 @@
# Grokfast Optimizer
See https://github.com/ironjr/grokfast
### Usage
```yaml
plugins:
- axolotl.integrations.grokfast.GrokfastPlugin
grokfast_alpha: 0.98
grokfast_lamb: 2.0
```

View File

@@ -1,50 +0,0 @@
"""
Grokfast plugin for Axolotl
"""
import logging
from transformers.trainer_callback import TrainerCallback
from ..base import BasePlugin
from .args import GrokfastArgs # pylint: disable=unused-import. # noqa: F401
from .optimizer import gradfilter_ema
LOG = logging.getLogger("axolotl.integrations.grokfast")
class GrokfastCallbackHandler(TrainerCallback):
    """
    Trainer callback that runs the Grokfast EMA gradient filter before each
    optimizer step and carries the filter state between steps.
    """

    def __init__(self, *args_, alpha=0.98, lamb=2.0, **kwargs):
        super().__init__(*args_, **kwargs)
        # hyperparameters forwarded unchanged to `gradfilter_ema`
        self.alpha, self.lamb = alpha, lamb
        # filter state; None until the first optimizer step of a run
        self.grads = None

    def on_train_begin(self, *args_, **kwargs):  # pylint: disable=unused-argument
        # drop any state left over from a previous training run
        self.grads = None

    def on_pre_optimizer_step(
        self, args_, state, control, **kwargs
    ):  # pylint: disable=unused-argument
        # the model is expected as a keyword argument; a missing "model"
        # raises KeyError
        updated_state = gradfilter_ema(
            kwargs.pop("model"),
            self.grads,
            alpha=self.alpha,
            lamb=self.lamb,
        )
        self.grads = updated_state
        return control
class GrokfastPlugin(BasePlugin):
    """
    Axolotl plugin that hooks the Grokfast gradient filter into training.
    """

    def get_input_args(self):
        # dotted path naming the args class for this plugin
        input_args_path = "axolotl.integrations.grokfast.GrokfastArgs"
        return input_args_path

    def add_callbacks_post_trainer(self, cfg, trainer):
        """
        Build the Grokfast callback from `cfg.grokfast_alpha` and
        `cfg.grokfast_lamb`; `trainer` is not used here.
        """
        LOG.info("Adding Grokfast callback to the trainer")
        return [
            GrokfastCallbackHandler(
                alpha=cfg.grokfast_alpha,
                lamb=cfg.grokfast_lamb,
            )
        ]

View File

@@ -1,15 +0,0 @@
"""
config args for grokfast plugin
"""
from typing import Optional
from pydantic import BaseModel
class GrokfastArgs(BaseModel):
    """
    Input args for Grokfast optimizer.

    Both values are read from the config by the Grokfast plugin and handed to
    the callback that applies the EMA gradient filter.
    """

    # passed as `alpha` to the gradient filter (weight on the previous EMA value)
    grokfast_alpha: Optional[float] = 0.98
    # passed as `lamb` to the gradient filter (scale of the EMA term added to the gradient)
    grokfast_lamb: Optional[float] = 2.0

View File

@@ -1,63 +0,0 @@
# Copyright: MIT License (c) 2024 Jaerin Lee, Bong Gyun Kang, Kihoon Kim, Kyoung Mu Lee
# Reference: https://github.com/ironjr/grokfast
# pylint: skip-file
from collections import deque
from typing import Dict, Literal, Optional
import torch
import torch.nn as nn
def gradfilter_ma(
    m: nn.Module,
    grads: Optional[Dict[str, deque]] = None,
    window_size: int = 100,
    lamb: float = 5.0,
    filter_type: Literal["mean", "sum"] = "mean",
    warmup: bool = True,
    trigger: bool = False,  # For ablation study.
) -> Dict[str, deque]:
    """Moving-window Grokfast gradient filter; modifies `p.grad` in place.

    For every parameter of `m` that requires grad and currently has a
    gradient, the gradient is appended to a per-parameter window and, when the
    filter is active, `lamb` times the window mean (or sum) is added to it.

    Args:
        m: module whose parameter gradients are filtered.
        grads: window state returned by the previous call, or None to start
            fresh. Parameters missing from it get a new window on first use.
        window_size: maximum number of gradients kept per parameter.
        lamb: scale applied to the window statistic before it is added.
        filter_type: "mean" or "sum" of the window.
        warmup: if True, the filter is applied only once a window is full.
        trigger: only consulted when `warmup` is True; if True it disables
            the filter.

    Returns:
        The (possibly newly created) dict mapping parameter name to its window.

    Raises:
        ValueError: if `filter_type` is neither "mean" nor "sum".
    """
    if grads is None:
        grads = {}

    for n, p in m.named_parameters():
        if p.requires_grad and p.grad is not None:
            # Create the window lazily so a parameter that first receives a
            # gradient on a later call does not raise KeyError.
            if n not in grads:
                grads[n] = deque(maxlen=window_size)
            window = grads[n]
            window.append(p.grad.data.detach())  # .cpu())

            # Modify the gradients.
            # Parentheses spell out the existing precedence: `and` binds
            # tighter than `or`, so `trigger` has no effect when warmup=False.
            if not warmup or (len(window) == window_size and not trigger):
                if filter_type == "mean":
                    avg = sum(window) / len(window)
                elif filter_type == "sum":
                    avg = sum(window)
                else:
                    raise ValueError(f"Unrecognized filter_type {filter_type}")
                p.grad.data = p.grad.data + avg * lamb

    return grads
def gradfilter_ema(
    m: nn.Module,
    grads: Optional[Dict[str, torch.Tensor]] = None,
    alpha: float = 0.98,
    lamb: float = 2.0,
) -> Dict[str, torch.Tensor]:
    """Exponential-moving-average Grokfast gradient filter; modifies `p.grad` in place.

    For every parameter of `m` that requires grad and currently has a
    gradient, the stored average is updated as
    `ema = ema * alpha + grad * (1 - alpha)` and `ema * lamb` is added to the
    gradient.

    Args:
        m: module whose parameter gradients are filtered.
        grads: EMA state returned by the previous call, or None to start
            fresh. A parameter missing from it is seeded with its current
            gradient, exactly as on the very first call.
        alpha: weight of the previous average in the EMA update.
        lamb: scale applied to the average before it is added to the gradient.

    Returns:
        The (possibly newly created) dict mapping parameter name to its EMA.
    """
    if grads is None:
        grads = {}

    for n, p in m.named_parameters():
        if p.requires_grad and p.grad is not None:
            current = p.grad.data.detach()
            # Seed lazily so a parameter that first receives a gradient on a
            # later call does not raise KeyError.
            previous = grads[n] if n in grads else current
            grads[n] = previous * alpha + current * (1 - alpha)
            p.grad.data = p.grad.data + grads[n] * lamb

    return grads

View File

@@ -18,24 +18,20 @@ Module for the Plugin for LIGER integraton with Axolotl.
Liger Kernel is the collection of Triton-native kernels for LLM Training.
It is designed to be performant, correct, and light-weight.
"""
import inspect
import logging
import sys
from functools import partial
from liger_kernel.transformers.cross_entropy import LigerCrossEntropyLoss
from liger_kernel.transformers.functional import liger_cross_entropy
from liger_kernel.transformers.monkey_patch import MODEL_TYPE_TO_APPLY_LIGER_FN
from liger_kernel.transformers.geglu import LigerGEGLUMLP
from liger_kernel.transformers.rms_norm import LigerRMSNorm
from liger_kernel.transformers.rope import liger_rotary_pos_emb
from liger_kernel.transformers.swiglu import LigerSwiGLUMLP
from axolotl.integrations.base import BasePlugin
from ...utils.distributed import zero_only
from .args import LigerArgs # pylint: disable=unused-import. # noqa: F401
LOG = logging.getLogger("axolotl.integrations.liger")
class LigerPlugin(BasePlugin):
"""
@@ -46,31 +42,59 @@ class LigerPlugin(BasePlugin):
return "axolotl.integrations.liger.LigerArgs"
def pre_model_load(self, cfg):
if cfg.model_config_type in MODEL_TYPE_TO_APPLY_LIGER_FN:
apply_liger_fn = MODEL_TYPE_TO_APPLY_LIGER_FN[cfg.model_config_type]
liger_fn_sig = inspect.signature(apply_liger_fn)
kwargs = {}
if "rope" in liger_fn_sig.parameters:
kwargs["rope"] = cfg.liger_rope
if "cross_entropy" in liger_fn_sig.parameters:
kwargs["cross_entropy"] = cfg.liger_cross_entropy
if "fused_linear_cross_entropy" in liger_fn_sig.parameters:
kwargs[
"fused_linear_cross_entropy"
] = cfg.liger_fused_linear_cross_entropy
if "rms_norm" in liger_fn_sig.parameters:
kwargs["rms_norm"] = cfg.liger_rms_norm
if "layer_norm" in liger_fn_sig.parameters:
kwargs["layer_norm"] = cfg.liger_layer_norm
if "geglu" in liger_fn_sig.parameters:
kwargs["geglu"] = cfg.liger_glu_activation
elif "swiglu" in liger_fn_sig.parameters:
kwargs["swiglu"] = cfg.liger_glu_activation
with zero_only():
LOG.info(
f"Applying LIGER to {cfg.model_config_type} with kwargs: {kwargs}"
if cfg.model_config_type == "llama":
from liger_kernel.transformers.model.llama import (
lce_forward as llama_lce_forward,
)
from transformers.models.llama import modeling_llama
if cfg.liger_rope:
modeling_llama.apply_rotary_pos_emb = liger_rotary_pos_emb
if cfg.liger_rms_norm:
modeling_llama.LlamaRMSNorm = LigerRMSNorm
if cfg.liger_swiglu:
modeling_llama.LlamaMLP = LigerSwiGLUMLP
if cfg.liger_cross_entropy:
modeling_llama.CrossEntropyLoss = LigerCrossEntropyLoss
elif cfg.liger_fused_linear_cross_entropy:
modeling_llama.LlamaForCausalLM.forward = llama_lce_forward
elif cfg.model_config_type == "mistral":
from liger_kernel.transformers.model.mistral import (
lce_forward as mistral_lce_forward,
)
from transformers.models.mistral import modeling_mistral
if cfg.liger_rope:
modeling_mistral.apply_rotary_pos_emb = liger_rotary_pos_emb
if cfg.liger_rms_norm:
modeling_mistral.MistralRMSNorm = LigerRMSNorm
if cfg.liger_swiglu:
modeling_mistral.MistralMLP = LigerSwiGLUMLP
if cfg.liger_cross_entropy:
modeling_mistral.CrossEntropyLoss = LigerCrossEntropyLoss
if cfg.liger_fused_linear_cross_entropy:
modeling_mistral.MistralForCausalLM.forward = mistral_lce_forward
elif cfg.model_config_type == "gemma":
from liger_kernel.transformers.model.gemma import (
lce_forward as gemma_lce_forward,
)
from transformers.models.gemma import modeling_gemma
if cfg.liger_rope:
modeling_gemma.apply_rotary_pos_emb = liger_rotary_pos_emb
if cfg.liger_rms_norm:
modeling_gemma.GemmaRMSNorm = partial(
LigerRMSNorm, offset=1.0, init_fn="zeros", casting_mode="gemma"
)
apply_liger_fn(**kwargs)
if cfg.liger_swiglu:
modeling_gemma.GemmaMLP = LigerGEGLUMLP
if cfg.liger_cross_entropy:
modeling_gemma.CrossEntropyLoss = LigerCrossEntropyLoss
if cfg.liger_fused_linear_cross_entropy:
modeling_gemma.GemmaForCausalLM.forward = gemma_lce_forward
elif cfg.model_config_type == "jamba":
from transformers.models.jamba import modeling_jamba
@@ -80,14 +104,30 @@ class LigerPlugin(BasePlugin):
modeling_jamba.apply_rotary_pos_emb = liger_rotary_pos_emb
if cfg.liger_rms_norm:
modeling_jamba.JambaRMSNorm = LigerRMSNorm
if cfg.liger_glu_activation:
if cfg.liger_swiglu:
modeling_jamba.JambaMLP = LigerSwiGLUMLP
if cfg.liger_cross_entropy:
from transformers.loss.loss_utils import nn
nn.functional.cross_entropy = liger_cross_entropy
modeling_jamba.CrossEntropyLoss = LigerCrossEntropyLoss
if cfg.liger_fused_linear_cross_entropy:
modeling_jamba.JambaForCausalLM.forward = jamba_lce_forward
elif cfg.model_config_type == "qwen2":
from liger_kernel.transformers.model.qwen2 import (
lce_forward as qwen2_lce_forward,
)
from transformers.models.qwen2 import modeling_qwen2
if cfg.liger_rope:
modeling_qwen2.apply_rotary_pos_emb = liger_rotary_pos_emb
if cfg.liger_rms_norm:
modeling_qwen2.Qwen2RMSNorm = LigerRMSNorm
if cfg.liger_swiglu:
modeling_qwen2.Qwen2MLP = LigerSwiGLUMLP
if cfg.liger_cross_entropy:
modeling_qwen2.CrossEntropyLoss = LigerCrossEntropyLoss
if cfg.liger_fused_linear_cross_entropy:
modeling_qwen2.Qwen2ForCausalLM.forward = qwen2_lce_forward
elif cfg.model_config_type == "deepseek_v2":
from accelerate import init_empty_weights
from transformers import AutoModelForCausalLM
@@ -106,11 +146,44 @@ class LigerPlugin(BasePlugin):
logging.warning("Fused liger_rope is not supported for DeepseekV2.")
if cfg.liger_rms_norm:
modeling_mod.DeepseekV2RMSNorm = LigerRMSNorm
if cfg.liger_glu_activation:
if cfg.liger_swiglu:
modeling_mod.DeepseekV2MLP.forward = LigerSwiGLUMLP.forward
if cfg.liger_cross_entropy:
# We do not patch `nn.functional.cross_entropy` for DeepseekV2 as it still uses
# nn.CrossEntropyLoss in the forward method.
modeling_mod.CrossEntropyLoss = LigerCrossEntropyLoss
if cfg.liger_fused_linear_cross_entropy:
modeling_mod.DeepseekV2ForCausalLM.forward = deepseekv2_lce_forward
elif cfg.model_config_type == "gemma2":
from transformers.models.gemma2 import modeling_gemma2
if cfg.liger_rope:
modeling_gemma2.apply_rotary_pos_emb = liger_rotary_pos_emb
if cfg.liger_rms_norm:
modeling_gemma2.Gemma2RMSNorm = partial(
LigerRMSNorm, offset=1.0, init_fn="zeros", casting_mode="gemma"
)
if cfg.liger_swiglu:
modeling_gemma2.Gemma2MLP = LigerGEGLUMLP
if cfg.liger_cross_entropy:
modeling_gemma2.CrossEntropyLoss = LigerCrossEntropyLoss
if cfg.liger_fused_linear_cross_entropy:
logging.warning(
"Fused linear cross entropy is not supported for Gemma 2."
)
elif cfg.model_config_type == "phi3":
from liger_kernel.transformers.model.phi3 import (
lce_forward as phi3_lce_forward,
)
from transformers.models.phi3 import modeling_phi3
if cfg.liger_rope:
modeling_phi3.apply_rotary_pos_emb = liger_rotary_pos_emb
if cfg.liger_rms_norm:
modeling_phi3.Phi3RMSNorm = LigerRMSNorm
if cfg.liger_swiglu:
modeling_phi3.Phi3MLP = LigerSwiGLUMLP
if cfg.liger_cross_entropy:
modeling_phi3.CrossEntropyLoss = LigerCrossEntropyLoss
if cfg.liger_fused_linear_cross_entropy:
modeling_phi3.Phi3ForCausalLM.forward = phi3_lce_forward

View File

@@ -15,12 +15,9 @@
"""
Module for handling LIGER input arguments.
"""
import logging
from typing import Optional
from pydantic import BaseModel, model_validator
LOG = logging.getLogger("axolotl.integrations.liger.args")
from pydantic import BaseModel
class LigerArgs(BaseModel):
@@ -30,24 +27,6 @@ class LigerArgs(BaseModel):
liger_rope: Optional[bool] = None
liger_rms_norm: Optional[bool] = None
liger_layer_norm: Optional[bool] = None
liger_swiglu: Optional[bool] = None
liger_glu_activation: Optional[bool] = None
liger_cross_entropy: Optional[bool] = None
liger_fused_linear_cross_entropy: Optional[bool] = None
@model_validator(mode="before")
@classmethod
def check_deprecated_swiglu(cls, data):
if data.get("liger_swiglu") is not None:
if data.get("liger_glu_activation") is not None:
raise ValueError(
"You cannot have both `liger_swiglu` and `liger_glu_activation` set."
)
LOG.warning(
"The 'liger_swiglu' argument is deprecated and will be removed in a future release. "
"Please use 'liger_glu_activation' instead."
)
data["liger_glu_activation"] = data.pop("liger_swiglu")
return data

View File

@@ -1,361 +0,0 @@
"""
Copyright (c) 2024 by SageAttention team.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
from typing import Any, Optional
import torch
from torch.autograd import Function
from .triton.attn_qk_int8_per_block_causal_varlen import (
backward as sageattn_varlen_backward,
)
from .triton.attn_qk_int8_per_block_causal_varlen import forward as attn_true_varlen
from .triton.quant_per_block_varlen import (
per_block_int8 as per_block_int8_varlen_triton,
)
def get_cuda_arch_versions():
    """
    Return one ``"sm<major><minor>"`` string per visible CUDA device, in device
    index order. The list is empty when ``torch.cuda.device_count()`` is 0.
    """
    capabilities = (
        torch.cuda.get_device_capability(index)
        for index in range(torch.cuda.device_count())
    )
    return [f"sm{major}{minor}" for major, minor in capabilities]
def sageattn_varlen(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    cu_seqlens_q: torch.Tensor,
    cu_seqlens_k: torch.Tensor,
    max_seqlen_q: int,
    max_seqlen_k: int,
    sm_scale: Optional[float] = None,
    smooth_k: bool = True,
    **kwargs: Any,
) -> torch.Tensor:
    """
    Variable-length attention with per-block INT8 quantized Q/K.

    Parameters
    ----------
    q : torch.Tensor
        The query tensor, shape: ``[cu_seqlens_q[-1], num_qo_heads, head_dim]``.
    k : torch.Tensor
        The key tensor, shape: ``[cu_seqlens_k[-1], num_kv_heads, head_dim]``.
        It is not modified; smoothing works on a copy.
    v : torch.Tensor
        The value tensor, shape: ``[cu_seqlens_k[-1], num_kv_heads, head_dim]``.
    cu_seqlens_q : torch.Tensor
        The cumulative sequence lengths for the query sequences in the batch, used to index into `q`.
        Shape: ``[batch_size + 1]``.
    cu_seqlens_k : torch.Tensor
        The cumulative sequence lengths for the key and value sequences in the batch, used to index
        into `k` and `v`. Shape: ``[batch_size + 1]``.
    max_seqlen_q : int
        The maximum sequence length for the query tensor in the batch.
    max_seqlen_k : int
        The maximum sequence length for the key and value tensors in the batch.
    sm_scale : Optional[float]
        The scale used in softmax; when ``None`` the quantization step falls back to
        ``head_dim ** -0.5``.
    smooth_k : bool
        Whether to smooth the key tensor by subtracting its mean over dim 0.
        Default: True.
    **kwargs : Any
        Accepted and ignored.

    Returns
    -------
    torch.Tensor
        The attention output in the dtype of ``q``.

    Note
    ----
    - There is no ``is_causal`` switch: the kernel imported here comes from the
      ``attn_qk_int8_per_block_causal_varlen`` module.
    - The tensors `q`, `k`, and `v` must share one dtype, either ``torch.float16`` or
      ``torch.bfloat16`` (enforced by the asserts below).
    - ``head_dim`` must be 64 or 128.
    - All tensors must be on the same cuda device.
    - `smooth_k` will introduce slight overhead but will improve the accuracy under most circumstances.
    """
    dtype = q.dtype
    assert q.is_cuda, "Input tensors must be on cuda."
    assert dtype in [
        torch.float16,
        torch.bfloat16,
    ], "Input tensors must be in dtype of torch.float16 or torch.bfloat16"
    assert q.device == k.device == v.device, "All tensors must be on the same device."
    assert q.dtype == k.dtype == v.dtype, "All tensors must have the same dtype."

    head_dim = q.size(-1)
    assert head_dim in [64, 128], "varlen only support head_dim [64, 128]."

    assert (
        q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1
    ), "Last dim of qkv must be contiguous."
    assert (
        cu_seqlens_q.is_contiguous() and cu_seqlens_k.is_contiguous()
    ), "cu_seqlens_q and cu_seqlens_k must be contiguous."

    # The float32 arm is only reachable when asserts are disabled (python -O).
    if dtype == torch.bfloat16 or dtype == torch.float32:
        v = v.to(torch.float16)

    if smooth_k:
        km = k.mean(
            dim=0, keepdim=True
        )  # ! km is calculated on the all the batches. Calculate over each individual sequence requires dedicated kernel.
        # Out-of-place on purpose: `k -= km` would overwrite the caller's key
        # tensor (and is an in-place op on a tensor that may be tracked by
        # autograd).
        k = k - km

    (
        q_int8,
        q_scale,
        k_int8,
        k_scale,
        cu_seqlens_q_scale,
        cu_seqlens_k_scale,
    ) = per_block_int8_varlen_triton(
        q, k, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, sm_scale=sm_scale
    )

    o = attn_true_varlen(
        q_int8,
        k_int8,
        v,
        cu_seqlens_q,
        cu_seqlens_k,
        max_seqlen_q,
        q_scale,
        k_scale,
        cu_seqlens_q_scale,
        cu_seqlens_k_scale,
        output_dtype=dtype,
    )

    return o
# Autograd wrapper that routes a scaled-dot-product-attention call through the
# INT8 per-block quantization helper and the varlen attention kernels imported
# at the top of this module.
# NOTE(review): several points below look inconsistent with the kernel wrappers
# and should be confirmed before this class is relied on.
class SageAttentionFunction(Function):
@staticmethod
def forward(
ctx,
query,
key,
value,
attn_mask=None,
dropout_p=0.0,
is_causal=False,
scale=None,
):
"""
query: Tensor of shape [batch_size, num_heads, seq_len_q, head_dim]
key: Tensor of shape [batch_size, num_heads, seq_len_k, head_dim]
value: Tensor of shape [batch_size, num_heads, seq_len_k, head_dim]
attn_mask: Optional[Tensor], mask tensor
dropout_p: float, dropout probability
is_causal: bool, whether to apply causal masking
scale: Optional[float], scaling factor for attention scores
"""
# Ensure inputs are contiguous
query = query.contiguous()
key = key.contiguous()
value = value.contiguous()
# Handle default scale
if scale is None:
scale = 1.0 / (query.size(-1) ** 0.5)
# Save parameters needed for backward
# (attn_mask and dropout_p are only stored here; nothing in this method
# applies a mask or dropout.)
ctx.scale = scale
ctx.is_causal = is_causal
ctx.dropout_p = dropout_p
ctx.attn_mask = attn_mask
# Prepare cumulative sequence lengths and max sequence lengths
# Assuming batch sizes are consistent across query, key, and value
batch_size, num_heads, seq_len_q, head_dim = query.shape
seq_len_k = key.shape[2]
# Flatten batch and head dimensions
# NOTE(review): sageattn_varlen documents its inputs as
# [total_tokens, num_heads, head_dim]; the views below are
# [batch * heads, seq_len, head_dim] - confirm the layout the kernels expect.
q = query.view(
-1, seq_len_q, head_dim
) # [batch_size * num_heads, seq_len_q, head_dim]
k = key.view(-1, seq_len_k, head_dim)
v = value.view(-1, seq_len_k, head_dim)
# Create cumulative sequence lengths
# One entry per (batch, head) pair plus the leading 0, stepping by seq_len.
cu_seqlens_q = torch.arange(
0,
(batch_size * num_heads + 1) * seq_len_q,
seq_len_q,
dtype=torch.int32,
device=query.device,
)
cu_seqlens_k = torch.arange(
0,
(batch_size * num_heads + 1) * seq_len_k,
seq_len_k,
dtype=torch.int32,
device=key.device,
)
max_seqlen_q = seq_len_q
max_seqlen_k = seq_len_k
# Call your custom per-block int8 quantization function
(
q_int8,
q_scale,
k_int8,
k_scale,
cu_seqlens_q_scale,
cu_seqlens_k_scale,
) = per_block_int8_varlen_triton(
q, k, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, sm_scale=scale
)
# Call your custom attention function
# Only the causal path exists; the non-causal error is raised after the
# quantization above has already run.
if is_causal:
output = attn_true_varlen(
q_int8,
k_int8,
v,
cu_seqlens_q,
cu_seqlens_k,
max_seqlen_q,
q_scale,
k_scale,
cu_seqlens_q_scale,
cu_seqlens_k_scale,
output_dtype=query.dtype,
)
else:
raise NotImplementedError("Non-causal attention is not implemented yet.")
# Reshape output to match the expected shape
output = output.view(batch_size, num_heads, seq_len_q, head_dim)
# Save tensors for backward
ctx.save_for_backward(
query,
key,
value,
q_int8,
k_int8,
q_scale,
k_scale,
cu_seqlens_q,
cu_seqlens_k,
cu_seqlens_q_scale,
cu_seqlens_k_scale,
output,
)
return output
@staticmethod
def backward(ctx, grad_output):
# Unpack in the same order used by save_for_backward in forward().
(
query,
key,
value,
q_int8,
k_int8,
q_scale,
k_scale,
cu_seqlens_q,
cu_seqlens_k,
cu_seqlens_q_scale,
cu_seqlens_k_scale,
output,
) = ctx.saved_tensors
scale = ctx.scale
is_causal = ctx.is_causal
# dropout_p and attn_mask are read back but not used below.
dropout_p = ctx.dropout_p
attn_mask = ctx.attn_mask
# Flatten batch and head dimensions
batch_size, num_heads, seq_len_q, head_dim = query.shape
seq_len_k = key.shape[2]
grad_output = grad_output.contiguous()
do = grad_output.view(-1, seq_len_q, head_dim)
# Compute gradients w.r.t. q, k, v
# NOTE(review): 16 positional arguments are passed here. The `backward`
# wrapper visible in the Triton module of this diff takes
# (do, q, k, v, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, q_scale, k_scale,
# cu_seqlens_q_scale, cu_seqlens_k_scale, l, m, output_dtype) - confirm that
# this call matches the function actually imported as sageattn_varlen_backward.
dq, dk, dv = sageattn_varlen_backward(
do,
query.view(-1, seq_len_q, head_dim),
key.view(-1, seq_len_k, head_dim),
value.view(-1, seq_len_k, head_dim),
cu_seqlens_q,
cu_seqlens_k,
seq_len_q,
seq_len_k,
q_int8,
k_int8,
q_scale,
k_scale,
cu_seqlens_q_scale,
cu_seqlens_k_scale,
scale,
is_causal,
)
# Reshape gradients to match the input shapes
dq = dq.view(batch_size, num_heads, seq_len_q, head_dim)
dk = dk.view(batch_size, num_heads, seq_len_k, head_dim)
dv = dv.view(batch_size, num_heads, seq_len_k, head_dim)
# Handle optional arguments
d_attn_mask = None # Assuming attn_mask does not require gradients
d_dropout_p = (
None # Dropout probability is a hyperparameter, typically not optimized
)
d_is_causal = None # Not differentiable
d_scale = None # If scale is a tensor and requires grad, compute its gradient
# One return slot per forward() argument after ctx.
return dq, dk, dv, d_attn_mask, d_dropout_p, d_is_causal, d_scale
def scaled_dot_product_attention(
    query,
    key,
    value,
    attn_mask=None,
    dropout_p=0.0,
    is_causal=False,
    scale=None,
):
    """
    Replacement with the same parameters as this module's attention function;
    forwards every argument, positionally and in order, to
    ``SageAttentionFunction.apply``.
    """
    forwarded = (query, key, value, attn_mask, dropout_p, is_causal, scale)
    return SageAttentionFunction.apply(*forwarded)
def monkeypatch_sdp_w_sage_attention():
    """
    Point ``torch.nn.functional.scaled_dot_product_attention`` at this module's
    ``scaled_dot_product_attention``. The assignment is process-wide and is not
    undone by this function.
    """
    functional = torch.nn.functional
    setattr(functional, "scaled_dot_product_attention", scaled_dot_product_attention)

View File

@@ -1,622 +0,0 @@
"""
Copyright (c) 2024 by SageAttention team.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import math
import torch
import triton
import triton.language as tl
# Forward inner loop: walks a range of key/value blocks for one block of
# queries and updates the running row maximum `m_i`, the running row sum `l_i`
# and the output accumulator `acc`, which are returned to the caller.
# `H` is used as the stride (in scale entries) between consecutive key blocks
# in the K scale buffer; the caller passes `H // num_kv_groups`.
@triton.jit
def _attn_fwd_inner(
acc,
l_i,
m_i,
q,
q_scale,
kv_len,
K_ptrs,
K_scale_ptr,
V_ptrs,
stride_kn,
stride_vn,
start_m,
H: tl.constexpr,
BLOCK_M: tl.constexpr,
HEAD_DIM: tl.constexpr,
BLOCK_N: tl.constexpr,
STAGE: tl.constexpr,
offs_m: tl.constexpr,
offs_n: tl.constexpr,
):
# STAGE 1: key positions [0, start_m * BLOCK_M), i.e. everything before this
# query block. STAGE 2: the BLOCK_M-wide range starting at this query block,
# where the per-element mask below is applied.
# NOTE(review): `lo`/`hi` are only assigned for STAGE 1 and 2. Indentation is
# lost in this view, so whether the pointer advances below belong to the
# STAGE == 2 branch cannot be told from here - confirm against the real file.
if STAGE == 1:
lo, hi = 0, start_m * BLOCK_M
elif STAGE == 2:
lo, hi = start_m * BLOCK_M, (start_m + 1) * BLOCK_M
lo = tl.multiple_of(lo, BLOCK_M)
K_scale_ptr += (lo // BLOCK_N) * H
K_ptrs += stride_kn * lo
V_ptrs += stride_vn * lo
for start_n in range(lo, hi, BLOCK_N):
start_n = tl.multiple_of(start_n, BLOCK_N)
# Keys at or beyond kv_len are masked out of the load.
k_mask = offs_n[None, :] < (kv_len - start_n)
k = tl.load(K_ptrs, mask=k_mask)
k_scale = tl.load(K_scale_ptr)
# Dot product of the quantized tiles, rescaled by both block scales.
qk = tl.dot(q, k).to(tl.float32) * q_scale * k_scale
if STAGE == 2:
# Keep entries where the query index is >= the key index; the rest get
# a large negative bias (-1.0e6) rather than -inf.
mask = offs_m[:, None] >= (start_n + offs_n[None, :])
qk = qk + tl.where(mask, 0, -1.0e6)
m_ij = tl.maximum(m_i, tl.max(qk, 1))
qk -= m_ij[:, None]
else:
m_ij = tl.maximum(m_i, tl.max(qk, 1))
qk = qk - m_ij[:, None]
# Base-2 exponent; presumably the log2(e) factor is folded into the scales
# upstream - TODO confirm.
p = tl.math.exp2(qk)
l_ij = tl.sum(p, 1)
# Rescale the previous sum and accumulator to the new running maximum.
alpha = tl.math.exp2(m_i - m_ij)
l_i = l_i * alpha + l_ij
acc = acc * alpha[:, None]
v = tl.load(V_ptrs, mask=offs_n[:, None] < (kv_len - start_n))
# Probabilities are cast to fp16 and the P @ V product is requested in fp16.
p = p.to(tl.float16)
acc += tl.dot(p, v, out_dtype=tl.float16)
m_i = m_ij
# Advance to the next key/value block and its scale entry.
K_ptrs += BLOCK_N * stride_kn
K_scale_ptr += H
V_ptrs += BLOCK_N * stride_vn
return acc, l_i, m_i
# Forward attention kernel. Grid axis 0 indexes the query block, axis 1 the
# query head and axis 2 the sequence within the packed batch. Each program
# loads one block of queries, runs `_attn_fwd_inner` twice and stores the
# normalized result into `Out`.
@triton.jit
def _attn_fwd(
Q,
K,
V,
cu_seqlens_q,
cu_seqlens_k,
Q_scale,
K_scale,
cu_seqlens_q_scale,
cu_seqlens_k_scale,
Out,
stride_qh,
stride_qn,
stride_kh,
stride_kn,
stride_vh,
stride_vn,
stride_oh,
stride_on,
H: tl.constexpr,
num_kv_groups: tl.constexpr,
HEAD_DIM: tl.constexpr,
BLOCK_M: tl.constexpr,
BLOCK_N: tl.constexpr,
STAGE: tl.constexpr,
):
start_m = tl.program_id(0)
off_z = tl.program_id(2).to(tl.int64)
off_h = tl.program_id(1).to(tl.int64)
# Query length of this sequence from the cumulative offsets.
cu_seqlens_q_start = tl.load(cu_seqlens_q + off_z)
cu_seqlens_q_end = tl.load(cu_seqlens_q + off_z + 1)
qo_len = cu_seqlens_q_end - cu_seqlens_q_start
# The grid is sized from the longest sequence, so programs whose query block
# starts past this sequence's end have nothing to do.
if (start_m * BLOCK_M) >= qo_len:
return
# Offsets into the per-block scale buffers (one entry per block and head).
cu_seq_lens_q_scale_start = tl.load(cu_seqlens_q_scale + off_z)
cu_seq_lens_k_scale_start = tl.load(cu_seqlens_k_scale + off_z)
q_scale_offset = cu_seq_lens_q_scale_start * H + off_h + start_m * H
# Query head `off_h` maps to key/value head `off_h // num_kv_groups`.
k_scale_offset = (
cu_seq_lens_k_scale_start * (H // num_kv_groups) + off_h // num_kv_groups
)
cu_seqlens_k_start = tl.load(cu_seqlens_k + off_z)
cu_seqlens_k_end = tl.load(cu_seqlens_k + off_z + 1)
kv_len = cu_seqlens_k_end - cu_seqlens_k_start
offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
offs_n = tl.arange(0, BLOCK_N)
offs_k = tl.arange(0, HEAD_DIM)
# Q tile pointers: rows are query positions, columns are head-dim elements.
Q_ptrs = (
Q
+ (cu_seqlens_q_start * stride_qn + off_h * stride_qh)
+ offs_m[:, None] * stride_qn
+ offs_k[None, :]
)
Q_scale_ptr = Q_scale + q_scale_offset
# K tile pointers are laid out [HEAD_DIM, BLOCK_N] so tl.dot(q, k) needs no
# explicit transpose.
K_ptrs = (
K
+ (cu_seqlens_k_start * stride_kn + (off_h // num_kv_groups) * stride_kh)
+ offs_n[None, :] * stride_kn
+ offs_k[:, None]
)
K_scale_ptr = K_scale + k_scale_offset
V_ptrs = (
V
+ (cu_seqlens_k_start * stride_vn + (off_h // num_kv_groups) * stride_vh)
+ offs_n[:, None] * stride_vn
+ offs_k[None, :]
)
O_block_ptr = (
Out
+ (cu_seqlens_q_start * stride_on + off_h * stride_oh)
+ offs_m[:, None] * stride_on
+ offs_k[None, :]
)
# Running maximum starts at -inf, running sum at 1.0, accumulator at zero.
m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
l_i = tl.zeros([BLOCK_M], dtype=tl.float32) + 1.0
acc = tl.zeros([BLOCK_M, HEAD_DIM], dtype=tl.float32)
q = tl.load(Q_ptrs, mask=offs_m[:, None] < qo_len)
q_scale = tl.load(Q_scale_ptr)
# First pass: inner stage `4 - STAGE` (1 when the host passes STAGE=3), i.e.
# the key blocks before this query block.
acc, l_i, m_i = _attn_fwd_inner(
acc,
l_i,
m_i,
q,
q_scale,
kv_len,
K_ptrs,
K_scale_ptr,
V_ptrs,
stride_kn,
stride_vn,
start_m,
H // num_kv_groups,
BLOCK_M,
HEAD_DIM,
BLOCK_N,
4 - STAGE,
offs_m,
offs_n,
)
# Second pass: inner stage 2, the masked range that starts at this query block.
acc, l_i, _ = _attn_fwd_inner(
acc,
l_i,
m_i,
q,
q_scale,
kv_len,
K_ptrs,
K_scale_ptr,
V_ptrs,
stride_kn,
stride_vn,
start_m,
H // num_kv_groups,
BLOCK_M,
HEAD_DIM,
BLOCK_N,
2,
offs_m,
offs_n,
)
# Normalize by the running sum and write only the rows inside the sequence.
acc = acc / l_i[:, None]
tl.store(O_block_ptr, acc.to(Out.type.element_ty), mask=(offs_m[:, None] < qo_len))
# Backward inner loop: for one block of queries, walks a range of key/value
# blocks, recomputes the scores and accumulates contributions into `dq_acc`,
# `dk_acc` and `dv_acc`. `k`, `v` and `k_scale` are pointers that are advanced
# block by block; `H` is the K-scale stride between consecutive key blocks.
# NOTE(review): this routine looks unfinished - see the notes below before
# trusting its results.
@triton.jit
def _attn_bwd_inner(
dq_acc,
dk_acc,
dv_acc,
l_i,
m_i,
q,
k,
v,
do,
q_scale,
k_scale,
kv_len,
stride_kn,
stride_vn,
start_m,
H,
BLOCK_M: tl.constexpr,
HEAD_DIM: tl.constexpr,
BLOCK_N: tl.constexpr,
STAGE: tl.constexpr,
offs_m: tl.constexpr,
offs_n: tl.constexpr,
):
# Same key ranges as the forward inner loop: STAGE 1 is everything before the
# query block, STAGE 2 the masked range starting at the query block.
if STAGE == 1:
lo, hi = 0, start_m * BLOCK_M
elif STAGE == 2:
lo, hi = start_m * BLOCK_M, (start_m + 1) * BLOCK_M
lo = tl.multiple_of(lo, BLOCK_M)
# NOTE(review): unlike the forward loop, `k_scale` is not advanced by
# (lo // BLOCK_N) * H here - confirm whether that is intended.
k += stride_kn * lo
v += stride_vn * lo
for start_n in range(lo, hi, BLOCK_N):
start_n = tl.multiple_of(start_n, BLOCK_N)
k_mask = offs_n[None, :] < (kv_len - start_n)
k_curr = tl.load(k, mask=k_mask)
# NOTE(review): the same [1, BLOCK_N] mask is reused for V, while the caller
# builds V pointers with key positions along the first axis - confirm.
v_curr = tl.load(v, mask=k_mask)
k_scale_curr = tl.load(k_scale)
# NOTE(review): uses the `trans_b` keyword of tl.dot; check that the Triton
# version in use accepts it and that it matches the K pointer layout.
s = tl.dot(q, k_curr, trans_b=True).to(tl.float32) * q_scale * k_scale_curr
if STAGE == 2:
# Masked entries get -inf here (the forward loop uses -1.0e6).
mask = offs_m[:, None] >= (start_n + offs_n[None, :])
s = s + tl.where(mask, 0.0, -float("inf"))
m_ij = tl.maximum(m_i, tl.max(s, 1))
s = s - m_ij[:, None]
else:
m_ij = tl.maximum(m_i, tl.max(s, 1))
s = s - m_ij[:, None]
p = tl.math.exp2(s)
l_ij = tl.sum(p, 1)
alpha = tl.math.exp2(m_i - m_ij)
# The running statistics passed in are updated again while iterating.
l_i = l_i * alpha + l_ij
m_i = m_ij
p = p / l_i[:, None] # Normalize probabilities
# Compute gradients
# Compute softmax gradient
# `do` is divided by l_i in addition to the normalization of `p` above.
do_scaled = do / l_i[:, None]
dv_contrib = tl.dot(p.to(tl.float16).T, do_scaled.to(tl.float16))
dv_acc += dv_contrib
dp = tl.dot(do_scaled.to(tl.float16), v_curr.to(tl.float16).T)
# Compute ds (gradient w.r.t. logits s)
p_dp = p * dp
sum_p_dp = tl.sum(p_dp, axis=1)
ds = (p_dp - p * sum_p_dp[:, None]) * tl.math.log(2.0) # Adjust for exp2
# Compute gradients w.r.t q and k
dq_contrib = tl.dot(ds.to(tl.float16), k_curr.to(tl.float16))
dk_contrib = tl.dot(ds.to(tl.float16).T, q.to(tl.float16))
# Both contributions are multiplied by the product of the two block scales.
dq_acc += dq_contrib * (q_scale * k_scale_curr)
dk_acc += dk_contrib * (q_scale * k_scale_curr)
# Advance to the next key/value block and its scale entry.
k += BLOCK_N * stride_kn
k_scale += H
v += BLOCK_N * stride_vn
return dq_acc, dk_acc, dv_acc, l_i, m_i
# Backward attention kernel. Uses the same grid mapping as the forward kernel
# (axis 0: query block, axis 1: query head, axis 2: sequence). Each program
# loads its query / output-gradient block and the saved row statistics `L` and
# `M`, runs `_attn_bwd_inner` twice and stores DQ, DK and DV.
@triton.jit
def _attn_bwd(
DO,
Q,
K,
V,
cu_seqlens_q,
cu_seqlens_k,
Q_scale,
K_scale,
cu_seqlens_q_scale,
cu_seqlens_k_scale,
L,
M,
DQ,
DK,
DV,
stride_qh,
stride_qn,
stride_kh,
stride_kn,
stride_vh,
stride_vn,
H: tl.constexpr,
num_kv_groups: tl.constexpr,
HEAD_DIM: tl.constexpr,
BLOCK_M: tl.constexpr,
BLOCK_N: tl.constexpr,
STAGE: tl.constexpr,
):
start_m = tl.program_id(0)
off_z = tl.program_id(2).to(tl.int64)
off_h = tl.program_id(1).to(tl.int64)
cu_seqlens_q_start = tl.load(cu_seqlens_q + off_z)
cu_seqlens_q_end = tl.load(cu_seqlens_q + off_z + 1)
qo_len = cu_seqlens_q_end - cu_seqlens_q_start
# Programs whose query block starts past this sequence's end exit early.
if (start_m * BLOCK_M) >= qo_len:
return
cu_seq_lens_q_scale_start = tl.load(cu_seqlens_q_scale + off_z)
cu_seq_lens_k_scale_start = tl.load(cu_seqlens_k_scale + off_z)
q_scale_offset = cu_seq_lens_q_scale_start * H + off_h + start_m * H
k_scale_offset = (
cu_seq_lens_k_scale_start * (H // num_kv_groups) + off_h // num_kv_groups
)
cu_seqlens_k_start = tl.load(cu_seqlens_k + off_z)
cu_seqlens_k_end = tl.load(cu_seqlens_k + off_z + 1)
kv_len = cu_seqlens_k_end - cu_seqlens_k_start
offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
offs_n = tl.arange(0, BLOCK_N)
offs_k = tl.arange(0, HEAD_DIM)
Q_ptrs = (
Q
+ (cu_seqlens_q_start * stride_qn + off_h * stride_qh)
+ offs_m[:, None] * stride_qn
+ offs_k[None, :]
)
# DO and DQ are addressed with Q's strides, so they are assumed to share Q's
# memory layout - TODO confirm at the call site.
DO_ptrs = (
DO
+ (cu_seqlens_q_start * stride_qn + off_h * stride_qh)
+ offs_m[:, None] * stride_qn
+ offs_k[None, :]
)
Q_scale_ptr = Q_scale + q_scale_offset
K_ptrs = (
K
+ (cu_seqlens_k_start * stride_kn + (off_h // num_kv_groups) * stride_kh)
+ offs_n[None, :] * stride_kn
+ offs_k[:, None]
)
K_scale_ptr = K_scale + k_scale_offset
V_ptrs = (
V
+ (cu_seqlens_k_start * stride_vn + (off_h // num_kv_groups) * stride_vh)
+ offs_n[:, None] * stride_vn
+ offs_k[None, :]
)
DQ_ptrs = (
DQ
+ (cu_seqlens_q_start * stride_qn + off_h * stride_qh)
+ offs_m[:, None] * stride_qn
+ offs_k[None, :]
)
# NOTE(review): the DK/DV pointers do not depend on `start_m` or on the key
# block being processed, and DK_ptrs is laid out [HEAD_DIM, BLOCK_N] while
# `dk_acc` is [BLOCK_N, HEAD_DIM]. With the plain stores at the end, it looks
# like different programs overwrite the same first BLOCK_N key rows - confirm.
DK_ptrs = (
DK
+ (cu_seqlens_k_start * stride_kn + (off_h // num_kv_groups) * stride_kh)
+ offs_n[None, :] * stride_kn
+ offs_k[:, None]
)
DV_ptrs = (
DV
+ (cu_seqlens_k_start * stride_vn + (off_h // num_kv_groups) * stride_vh)
+ offs_n[:, None] * stride_vn
+ offs_k[None, :]
)
# Row statistics are indexed by packed query position only (no head offset).
L_ptrs = L + (cu_seqlens_q_start + offs_m)
M_ptrs = M + (cu_seqlens_q_start + offs_m)
m_i = tl.load(M_ptrs, mask=offs_m < qo_len, other=float("-inf"))
l_i = tl.load(L_ptrs, mask=offs_m < qo_len, other=1.0)
dq_acc = tl.zeros([BLOCK_M, HEAD_DIM], dtype=tl.float32)
dk_acc = tl.zeros([BLOCK_N, HEAD_DIM], dtype=tl.float32)
dv_acc = tl.zeros([BLOCK_N, HEAD_DIM], dtype=tl.float32)
q = tl.load(Q_ptrs, mask=offs_m[:, None] < qo_len)
do = tl.load(DO_ptrs, mask=offs_m[:, None] < qo_len)
q_scale = tl.load(Q_scale_ptr)
# First pass: inner stage `4 - STAGE` (1 when the host passes STAGE=3).
# K_ptrs / V_ptrs / K_scale_ptr are handed over as the inner loop's
# `k` / `v` / `k_scale` pointer arguments.
dq_acc, dk_acc, dv_acc, l_i, m_i = _attn_bwd_inner(
dq_acc,
dk_acc,
dv_acc,
l_i,
m_i,
q,
K_ptrs,
V_ptrs,
do,
q_scale,
K_scale_ptr,
kv_len,
stride_kn,
stride_vn,
start_m,
H // num_kv_groups,
BLOCK_M,
HEAD_DIM,
BLOCK_N,
4 - STAGE,
offs_m,
offs_n,
)
# Second pass: inner stage 2, the masked range starting at this query block.
dq_acc, dk_acc, dv_acc, l_i, m_i = _attn_bwd_inner(
dq_acc,
dk_acc,
dv_acc,
l_i,
m_i,
q,
K_ptrs,
V_ptrs,
do,
q_scale,
K_scale_ptr,
kv_len,
stride_kn,
stride_vn,
start_m,
H // num_kv_groups,
BLOCK_M,
HEAD_DIM,
BLOCK_N,
2,
offs_m,
offs_n,
)
# NOTE(review): the forward kernel spells this `Out.type.element_ty`; here it
# is `.dtype.element_ty` - confirm which attribute Triton provides.
tl.store(DQ_ptrs, dq_acc.to(DQ.dtype.element_ty), mask=offs_m[:, None] < qo_len)
tl.store(DK_ptrs, dk_acc.to(DK.dtype.element_ty), mask=offs_n[None, :] < kv_len)
tl.store(DV_ptrs, dv_acc.to(DV.dtype.element_ty), mask=offs_n[:, None] < kv_len)
def forward(
    q,
    k,
    v,
    cu_seqlens_q,
    cu_seqlens_k,
    max_seqlen_q,
    q_scale,
    k_scale,
    cu_seqlens_q_scale,
    cu_seqlens_k_scale,
    output_dtype=torch.float16,
):
    """
    Host-side launcher for the ``_attn_fwd`` kernel.

    Allocates an output with the shape of ``q`` and dtype ``output_dtype``,
    launches one program per (query block of 128 rows, query head, sequence)
    and returns the output. ``q`` and ``k`` must be 3-D; their second dimension
    is read as the head count and the kv-group factor is
    ``q_heads // kv_heads``.
    """
    _, num_q_heads, head_dim = q.shape
    _, num_kv_heads, _ = k.shape
    num_seqs = cu_seqlens_q.shape[0] - 1

    out = torch.empty(q.shape, dtype=output_dtype, device=q.device)

    block_m, block_n = 128, 64
    launch_grid = (triton.cdiv(max_seqlen_q, block_m), num_q_heads, num_seqs)

    # 4 warps for head_dim 64, 8 otherwise.
    warps = 8
    if head_dim == 64:
        warps = 4

    _attn_fwd[launch_grid](
        q,
        k,
        v,
        cu_seqlens_q,
        cu_seqlens_k,
        q_scale,
        k_scale,
        cu_seqlens_q_scale,
        cu_seqlens_k_scale,
        out,
        q.stride(1),
        q.stride(0),
        k.stride(1),
        k.stride(0),
        v.stride(1),
        v.stride(0),
        out.stride(1),
        out.stride(0),
        num_q_heads,
        num_q_heads // num_kv_heads,
        BLOCK_M=block_m,
        BLOCK_N=block_n,
        HEAD_DIM=head_dim,
        STAGE=3,
        num_warps=warps,
        num_stages=4,
    )
    return out
def backward(
    do,
    q,
    k,
    v,
    cu_seqlens_q,
    cu_seqlens_k,
    max_seqlen_q,
    q_scale,
    k_scale,
    cu_seqlens_q_scale,
    cu_seqlens_k_scale,
    l,
    m,
    output_dtype=torch.float16,
):
    """
    Host-side launcher for the ``_attn_bwd`` kernel.

    Args:
        do: Gradient of the attention output; the kernel addresses it with the
            strides of ``q``.
        q, k, v: Tensors handed to the kernel; ``q`` and ``k`` must be 3-D and
            their second dimension is read as the head count.
        cu_seqlens_q, cu_seqlens_k: Cumulative sequence offsets; the number of
            sequences is ``len(cu_seqlens_q) - 1``.
        max_seqlen_q: Longest query sequence, used to size grid axis 0.
        q_scale, k_scale: Per-block quantization scales.
        cu_seqlens_q_scale, cu_seqlens_k_scale: Cumulative offsets into the
            scale buffers.
        l, m: Per-row statistics loaded by the kernel as its ``L`` and ``M``.
        output_dtype: dtype of the returned gradient buffers.

    Returns:
        ``(dq, dk, dv)`` - zero-initialized buffers shaped like ``q``, ``k``
        and ``v`` that the kernel writes into.
    """
    BLOCK_M = 128
    BLOCK_N = 64
    stage = 3

    b = cu_seqlens_q.shape[0] - 1
    _, h_qo, head_dim = q.shape
    _, h_kv, _ = k.shape
    num_kv_groups = h_qo // h_kv

    dq = torch.zeros_like(q, dtype=output_dtype)
    dk = torch.zeros_like(k, dtype=output_dtype)
    dv = torch.zeros_like(v, dtype=output_dtype)

    # One program per (query block, query head, sequence), as in the forward launch.
    grid = (triton.cdiv(max_seqlen_q, BLOCK_M), h_qo, b)
    _attn_bwd[grid](
        do,
        q,
        k,
        v,
        cu_seqlens_q,
        cu_seqlens_k,
        q_scale,
        k_scale,
        cu_seqlens_q_scale,
        cu_seqlens_k_scale,
        l,
        m,
        dq,
        dk,
        dv,
        q.stride(1),
        q.stride(0),
        k.stride(1),
        k.stride(0),
        v.stride(1),
        v.stride(0),
        h_qo,
        num_kv_groups,
        HEAD_DIM=head_dim,
        BLOCK_M=BLOCK_M,
        BLOCK_N=BLOCK_N,
        STAGE=stage,
        num_warps=4 if head_dim == 64 else 8,
        num_stages=4,
    )
    return dq, dk, dv
# class TritonAttentionFunction(torch.autograd.Function):
# @staticmethod
# def forward(ctx, q, k, v, cu_seqlens_q, cu_seqlens_k, q_scale, k_scale, cu_seqlens_q_scale, cu_seqlens_k_scale):
# l = torch.zeros(q.shape[0], device=q.device, dtype=torch.float32)
# m = torch.zeros(q.shape[0], device=q.device, dtype=torch.float32)
# output = forward(q, k, v, cu_seqlens_q, cu_seqlens_k, q.shape[0], q_scale, k_scale, cu_seqlens_q_scale, cu_seqlens_k_scale, l, m)
# ctx.save_for_backward(q, k, v, cu_seqlens_q, cu_seqlens_k, q_scale, k_scale, cu_seqlens_q_scale, cu_seqlens_k_scale, l, m)
# return output
#
# @staticmethod
# def backward(ctx, do):
# q, k, v, cu_seqlens_q, cu_seqlens_k, q_scale, k_scale, cu_seqlens_q_scale, cu_seqlens_k_scale, l, m = ctx.saved_tensors
# dq, dk, dv = backward(
# do, q, k, v,
# cu_seqlens_q, cu_seqlens_k,
# q.shape[0], q_scale, k_scale,
# cu_seqlens_q_scale, cu_seqlens_k_scale,
# l, m,
# )
# return dq, dk, dv, None, None, None, None, None, None

View File

@@ -1,158 +0,0 @@
"""
Copyright (c) 2024 by SageAttention team.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import torch
import triton
import triton.language as tl
# Per-block INT8 quantization kernel. Grid axis 0 indexes the block of BLK rows
# inside a sequence, axis 1 the head and axis 2 the sequence. Each program
# scales its tile by `sm_scale`, derives one scale (max |x| / 127) for the whole
# tile, writes the rounded int8 tile to `Output` and the scale to `Scale`.
@triton.jit
def quant_per_block_int8_kernel(
Input,
Output,
Scale,
cu_seqlens_input,
cu_seqlens_scale,
stride_ih,
stride_in,
stride_oh,
stride_on,
sm_scale,
H: tl.constexpr,
C: tl.constexpr,
BLK: tl.constexpr,
):
off_blk = tl.program_id(0)
off_h = tl.program_id(1)
off_b = tl.program_id(2)
# Length of this sequence from the cumulative offsets.
cu_seqlens_input_start = tl.load(cu_seqlens_input + off_b)
cu_seqlens_input_end = tl.load(cu_seqlens_input + off_b + 1)
L = cu_seqlens_input_end - cu_seqlens_input_start
# Blocks that start past the end of this sequence have nothing to quantize.
if (off_blk * BLK) >= L:
return
cu_seqlens_scale_start = tl.load(cu_seqlens_scale + off_b)
offs_n = off_blk * BLK + tl.arange(0, BLK)
offs_k = tl.arange(0, C)
input_ptrs = (
Input
+ cu_seqlens_input_start * stride_in
+ off_h * stride_ih
+ offs_n[:, None] * stride_in
+ offs_k[None, :]
)
output_ptrs = (
Output
+ cu_seqlens_input_start * stride_on
+ off_h * stride_oh
+ offs_n[:, None] * stride_on
+ offs_k[None, :]
)
# Scale entries are laid out block-major: H entries per block.
scale_ptrs = Scale + cu_seqlens_scale_start * H + off_h + off_blk * H
# NOTE(review): the masked load has no `other=`; presumably masked rows read
# as zero so they do not affect the max below - confirm.
x = tl.load(input_ptrs, mask=offs_n[:, None] < L)
x = x.to(tl.float32)
x *= sm_scale
# NOTE(review): an all-zero tile gives scale == 0 and a division by zero on
# the next line - confirm whether that can occur.
scale = tl.max(tl.abs(x)) / 127.0
x_int8 = x / scale
# Add +/-0.5 before the int8 cast; presumably the cast truncates, making this
# round-half-away-from-zero - TODO confirm.
x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1)
x_int8 = x_int8.to(tl.int8)
tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L)
tl.store(scale_ptrs, scale)
def per_block_int8(
    q,
    k,
    cu_seqlens_q,
    cu_seqlens_k,
    max_seqlen_q,
    max_seqlen_k,
    BLKQ=128,
    BLKK=64,
    sm_scale=None,
):
    """Quantize packed query and key tensors to int8, one scale per block per head.

    Args:
        q: query tensor; dimension 1 is used as the head count and the last
            dimension as the head size.
            # assumes layout (total_tokens, heads, head_dim) - TODO confirm
        k: key tensor, same layout convention as ``q``.
        cu_seqlens_q: cumulative sequence offsets for ``q`` (length batch + 1).
        cu_seqlens_k: cumulative sequence offsets for ``k``.
        max_seqlen_q: longest query sequence, used to size the launch grid.
        max_seqlen_k: longest key sequence, used to size the launch grid.
        BLKQ: rows per quantization block for ``q``.
        BLKK: rows per quantization block for ``k``.
        sm_scale: softmax scale folded into ``q``; defaults to ``head_dim**-0.5``.

    Returns:
        ``(q_int8, q_scale, k_int8, k_scale, cu_seqlens_q_scale,
        cu_seqlens_k_scale)`` where the ``*_scale`` tensors are float32 with
        one row per block and one column per head, and the ``cu_seqlens_*_scale``
        tensors are cumulative block counts per sequence with a leading zero.
    """

    def _cumulative_block_counts(cu_seqlens, block):
        # Per-sequence length -> ceil-divided block count -> running total
        # with a leading zero.
        seq_lens = cu_seqlens[1:] - cu_seqlens[:-1]
        blocks_per_seq = (seq_lens + block - 1) // block
        return torch.nn.functional.pad(
            torch.cumsum(blocks_per_seq, dim=0), (1, 0), value=0
        )

    head_dim = q.shape[-1]
    batch = cu_seqlens_q.shape[0] - 1
    if sm_scale is None:
        sm_scale = head_dim**-0.5

    cu_seqlens_q_scale = _cumulative_block_counts(cu_seqlens_q, BLKQ)
    cu_seqlens_k_scale = _cumulative_block_counts(cu_seqlens_k, BLKK)

    # (tensor, offsets, scale offsets, max length, block size, pre-scale)
    # NOTE(review): 1.44269504 looks like log2(e), presumably so the attention
    # kernel can use exp2 - confirm against the consumer.
    launches = (
        (
            q,
            cu_seqlens_q,
            cu_seqlens_q_scale,
            max_seqlen_q,
            BLKQ,
            sm_scale * 1.44269504,
        ),
        (k, cu_seqlens_k, cu_seqlens_k_scale, max_seqlen_k, BLKK, 1.0),
    )

    results = []
    for tensor, cu_seqlens, cu_scale, max_seqlen, block, pre_scale in launches:
        n_heads = tensor.shape[1]
        tensor_int8 = torch.empty(
            tensor.shape, dtype=torch.int8, device=tensor.device
        )
        scales = torch.empty(
            (cu_scale[-1], n_heads), device=tensor.device, dtype=torch.float32
        )
        grid = ((max_seqlen + block - 1) // block, n_heads, batch)
        quant_per_block_int8_kernel[grid](
            tensor,
            tensor_int8,
            scales,
            cu_seqlens,
            cu_scale,
            tensor.stride(1),
            tensor.stride(0),
            tensor_int8.stride(1),
            tensor_int8.stride(0),
            sm_scale=pre_scale,
            H=n_heads,
            C=head_dim,
            BLK=block,
        )
        results.append((tensor_int8, scales))

    (q_int8, q_scale), (k_int8, k_scale) = results
    return q_int8, q_scale, k_int8, k_scale, cu_seqlens_q_scale, cu_seqlens_k_scale

View File

@@ -0,0 +1,231 @@
"""
monkeypatch to add a get_turns method
"""
import logging
from typing import Generator, Tuple
from fastchat.conversation import SeparatorStyle
LOG = logging.getLogger("axolotl.monkeypatch.fastchat_conversation_turns")
def get_prompt(self) -> str:
    """Return the full prompt: every ``(role, message)`` turn concatenated in order."""
    return "".join(prefix + text for prefix, text in self.get_turns())
def get_turns(  # pylint: disable=too-many-return-statements
    self,
) -> Generator[Tuple[str, str], None, None]:
    """Yield the conversation prompt as ``(role_prefix, message_text)`` pairs.

    Concatenating ``role_prefix + message_text`` over all yielded pairs gives
    the full prompt string (that is what ``get_prompt`` does). The layout of
    each pair depends on ``self.sep_style``.

    Yields:
        Two-item tuples of strings. The system prompt, when emitted, is
        yielded with an empty role prefix. A turn whose message is empty
        yields an empty message text.

    Raises:
        ValueError: if ``self.sep_style`` is not a handled style, or if the
            GEMMA style is used with a system message.

    Note:
        The LLAMA2 branches remove the first message from ``self.messages``
        when it is merged into the system prompt, so this generator mutates
        the conversation in that case.
    """
    system_prompt = self.system_template.format(system_message=self.system_message)
    if self.sep_style == SeparatorStyle.ADD_COLON_SINGLE:
        yield "", system_prompt + self.sep
        for role, message in self.messages:
            if message:
                yield role + ": ", message + self.sep
            else:
                yield role + ":", ""
        return
    if self.sep_style == SeparatorStyle.ADD_COLON_TWO:
        seps = [self.sep, self.sep2]
        yield "", system_prompt + seps[0]
        for i, (role, message) in enumerate(self.messages):
            if message:
                yield role + ": ", message + seps[i % 2]
            else:
                yield role + ":", ""
        return
    if self.sep_style == SeparatorStyle.ADD_COLON_SPACE_SINGLE:
        yield "", system_prompt + self.sep
        for role, message in self.messages:
            if message:
                yield role + ": ", message + self.sep
            else:
                yield role + ": ", ""  # must end with a space
        return
    if self.sep_style == SeparatorStyle.ADD_NEW_LINE_SINGLE:
        yield "", "" if system_prompt == "" else system_prompt + self.sep
        for role, message in self.messages:
            if message:
                yield role + "\n", message + self.sep
            else:
                yield role + "\n", ""
        return
    if self.sep_style == SeparatorStyle.NO_COLON_SINGLE:
        yield "", system_prompt
        for role, message in self.messages:
            if message:
                yield role, message + self.sep
            else:
                yield role, ""
        return
    if self.sep_style == SeparatorStyle.NO_COLON_TWO:
        seps = [self.sep, self.sep2]
        yield "", system_prompt
        for i, (role, message) in enumerate(self.messages):
            if message:
                yield role, message + seps[i % 2]
            else:
                yield role, ""
        return
    if self.sep_style == SeparatorStyle.RWKV:
        yield "", system_prompt
        for i, (role, message) in enumerate(self.messages):
            if message:
                yield role + ": ", message.replace("\r\n", "\n").replace(
                    "\n\n", "\n"
                ) + "\n\n"
            else:
                yield role + ":", ""
        return
    if self.sep_style == SeparatorStyle.LLAMA2 and self.name != "mistral":
        if self.system_message:
            if self.messages:
                # For llama, the system message is incorporated into the first human instruction
                first_role, first_msg = self.messages[0]
                if first_role == self.roles[0]:
                    system_prompt += first_msg
                    self.messages.pop(0)
            yield "", system_prompt
        for i, (role, message) in enumerate(self.messages):
            if message:
                if (i % 2 == 0 and not self.system_message) or (
                    i % 2 != 0 and self.system_message
                ):
                    role = "<s> " + role
                yield role + " ", message
            else:
                yield role, ""
        return
    if self.sep_style == SeparatorStyle.LLAMA2 and self.name == "mistral":
        contains_sys_msg = False
        if self.system_message:
            contains_sys_msg = True
            if self.messages:
                # There is no clear guidance on how to handle system messages in
                # Mistral, so we just prepend it to the first human instruction
                # separated by a newline
                first_role, first_msg = self.messages[0]
                if first_role == self.roles[0]:
                    system_prompt = self.system_template.format(
                        system_message=" " + self.system_message
                    )
                    system_prompt += first_msg
                    self.messages.pop(0)
            yield "", system_prompt
        for i, (role, message) in enumerate(self.messages):
            if message and i == 0 and not contains_sys_msg:
                # If there is no system message, we need to make sure there is
                # a `<s> [INST]` at the beginning of the first instruction.
                yield "", system_prompt.strip() + " " + message
            elif message:
                yield role + " ", message
            else:
                yield role, ""
        return
    if self.sep_style == SeparatorStyle.LLAMA3:
        if self.system_message:
            # For llama3, the system message is NOT incorporated into the first human instruction
            # All messages follow <|start_header_id|>' + role + '<|end_header_id|>\n\n'+ message + '<|eot_id|>
            yield "", system_prompt
        for i, (role, message) in enumerate(self.messages):
            if message:
                yield f"<|start_header_id|>{role}<|end_header_id|>\n\n", f"{message.strip()}<|eot_id|>"
            else:
                yield f"<|start_header_id|>{role}<|end_header_id|>\n\n", ""
        return
    if self.sep_style == SeparatorStyle.GEMMA:
        if self.system_message:
            raise ValueError("Gemma chat template does not support system messages")
        for i, (role, message) in enumerate(self.messages):
            prefix = "<bos>" if i == 0 else ""
            message_str = message if message else ""
            yield prefix + "<start_of_turn>" + role + "\n", message_str + "<end_of_turn>\n"
        return
    if self.sep_style == SeparatorStyle.CHATGLM:
        # source: https://huggingface.co/THUDM/chatglm-6b/blob/1d240ba371910e9282298d4592532d7f0f3e9f3e/modeling_chatglm.py#L1302-L1308
        # source2: https://huggingface.co/THUDM/chatglm2-6b/blob/e186c891cf64310ac66ef10a87e6635fa6c2a579/modeling_chatglm.py#L926
        round_add_n = 1 if self.name == "chatglm2" else 0
        if system_prompt:
            yield "", system_prompt + self.sep
        for i, (role, message) in enumerate(self.messages):
            if i % 2 == 0:
                yield "", f"[Round {i//2 + round_add_n}]{self.sep}"
            if message:
                yield f"{role}", f"{message}{self.sep}"
            else:
                yield f"{role}", ""
        return
    if self.sep_style == SeparatorStyle.CHATML:
        yield "", "" if system_prompt == "" else system_prompt + self.sep + "\n"
        for role, message in self.messages:
            if message:
                yield role + "\n", message + self.sep + "\n"
            else:
                yield role + "\n", ""
        return
    if self.sep_style == SeparatorStyle.CHATGLM3:
        if self.system_message:
            yield "", system_prompt
        for role, message in self.messages:
            if message:
                yield role + "\n", " " + message
            else:
                # Must be a (role, message) pair like every other branch; a
                # bare `yield role` breaks the tuple unpacking in get_prompt.
                yield role, ""
        return
    if self.sep_style == SeparatorStyle.CHATINTERN:
        # source: https://huggingface.co/internlm/internlm-chat-7b-8k/blob/bd546fa984b4b0b86958f56bf37f94aa75ab8831/modeling_internlm.py#L771
        seps = [self.sep, self.sep2]
        yield "", system_prompt
        for i, (role, message) in enumerate(self.messages):
            prefix = "<s>" if i % 2 == 0 else ""
            if message:
                yield prefix + role + ":", message + seps[i % 2] + "\n"
            else:
                yield role + ":", ""
        return
    if self.sep_style == SeparatorStyle.DOLLY:
        seps = [self.sep, self.sep2]
        yield "", system_prompt
        for i, (role, message) in enumerate(self.messages):
            if message:
                suffix = "\n\n" if i % 2 == 1 else ""
                yield role + ":\n", message + seps[i % 2] + suffix
            else:
                yield role + ":\n", ""
        return
    if self.sep_style == SeparatorStyle.PHOENIX:
        yield "", system_prompt
        for role, message in self.messages:
            if message:
                yield role + ": ", "<s>" + message + "</s>"
            else:
                yield role + ": " + "<s>", ""
        return
    if self.sep_style == SeparatorStyle.ROBIN:
        yield "", system_prompt + self.sep
        for role, message in self.messages:
            if message:
                yield role + ":\n", message + self.sep
            else:
                yield role + ":\n", ""
        return
    if self.sep_style == SeparatorStyle.FALCON_CHAT:
        if self.system_message:
            yield "", system_prompt + self.sep
        for role, message in self.messages:
            if message:
                yield role + ": ", message + self.sep
            else:
                yield role + ":", ""
    else:
        raise ValueError(f"Invalid style: {self.sep_style}")
def add_get_turns_to_conversation():
    """Install this module's ``get_turns`` and ``get_prompt`` on fastchat's ``Conversation`` class."""
    # Imported lazily so fastchat is only required when the patch is applied.
    from fastchat.conversation import Conversation

    Conversation.get_turns = get_turns
    Conversation.get_prompt = get_prompt

View File

@@ -22,6 +22,7 @@ from transformers.models.llama.modeling_llama import (
apply_rotary_pos_emb,
repeat_kv,
)
from xformers.ops import SwiGLU
from axolotl.monkeypatch.utils import get_cu_seqlens_from_pos_ids, set_module_name
@@ -43,19 +44,7 @@ except ImportError:
LOG = logging.getLogger("axolotl")
def is_xformers_available() -> bool:
try:
import xformers # pylint: disable=unused-import # noqa: F401
return True
except ImportError:
return False
def is_xformers_swiglu_available() -> bool:
if not is_xformers_available():
return False
from xformers.ops.common import get_xformers_operator
try:
@@ -68,11 +57,6 @@ def is_xformers_swiglu_available() -> bool:
def replace_llama_mlp_with_swiglu(model):
if is_xformers_swiglu_available():
from axolotl.monkeypatch.xformers_ import FusedMLP
else:
raise RuntimeError("xformers SwiGLU not available for this environment")
for name, module in model.named_modules():
if isinstance(module, LlamaMLP):
mlp = FusedMLP(
@@ -197,6 +181,49 @@ class FusedAttention(LlamaAttention):
set_module_name(model, name, new_attn)
class FusedMLP(torch.nn.Module):
    """
    Llama MLP replacement that runs gate/up/down projections through a single
    packed xformers ``SwiGLU`` op.
    """

    def __init__(
        self,
        config,
        gate_proj: torch.nn.Linear,
        up_proj: torch.nn.Linear,
        down_proj: torch.nn.Linear,
    ):
        """Build the packed SwiGLU and seed it with the given projection weights."""
        super().__init__()
        self.config = config

        fused = SwiGLU(
            in_features=config.hidden_size,
            hidden_features=config.intermediate_size,
            bias=False,
            _pack_weights=True,
        )
        # Replace the fresh initialization with the pretrained weights:
        # w12 holds the gate rows followed by the up rows, w3 is down_proj.
        fused.w12.weight.data = torch.cat(
            [gate_proj.weight.data, up_proj.weight.data], dim=0
        )
        fused.w3.weight.data = down_proj.weight.data
        self.swiglu = fused

    def _post_training(self, model, name):
        """Unpack the fused weights into a new ``LlamaMLP`` and hand it to ``set_module_name``."""
        gate_weight, up_weight = self.swiglu.w12.weight.data.split(
            self.config.intermediate_size, dim=0
        )

        # Copy the unpacked weights onto a standard MLP layer.
        restored = LlamaMLP(self.config)
        restored.gate_proj.weight.data = gate_weight
        restored.up_proj.weight.data = up_weight
        restored.down_proj.weight.data = self.swiglu.w3.weight.data

        # presumably swaps the module registered under `name` - verify in set_module_name
        set_module_name(model, name, restored)

    def forward(self, x: torch.Tensor) -> torch.Tensor:  # pylint: disable=invalid-name
        """Apply the fused SwiGLU to ``x``."""
        return self.swiglu(x)
# Disable the transformation of the attention mask in LlamaModel as the flash attention
# requires the attention mask to be the same as the key_padding_mask
def _prepare_decoder_attention_mask(

View File

@@ -1,5 +1,4 @@
"""multipack patching for v2 of sample packing"""
import importlib
import transformers
@@ -28,28 +27,71 @@ SUPPORTED_MULTIPACK_MODEL_TYPES = [
]
def patch_for_multipack(model_type, model_name=None, has_remote_code=False):
if has_remote_code:
patch_remote(model_name)
elif hasattr(transformers, "modeling_flash_attention_utils"):
def patch_for_multipack(model_type, model_name=None, is_remote_code=False):
if model_type == "gemmoe":
patch_remote(model_name, ".configuration_gemmoe", ".modeling_gemmoe")
elif model_type == "deepseek_v2":
patch_remote(model_name, ".configuration_deepseek", ".modeling_deepseek")
elif hasattr(transformers, "modeling_flash_attention_utils") and not is_remote_code:
transformers.modeling_flash_attention_utils._get_unpad_data = ( # pylint: disable=protected-access
get_unpad_data
)
if model_type == "mixtral" and is_deepspeed_zero3_enabled():
patch_mixtral_moe_forward_zero3()
return
if model_type == "mixtral" and is_deepspeed_zero3_enabled():
patch_mixtral_moe_forward_zero3()
# retain for legacy
if model_type == "mixtral":
transformers.models.mixtral.modeling_mixtral._get_unpad_data = ( # pylint: disable=protected-access
get_unpad_data
)
if is_deepspeed_zero3_enabled():
patch_mixtral_moe_forward_zero3()
elif model_type == "llama":
if hasattr(transformers.models.llama.modeling_llama, "_get_unpad_data"):
transformers.models.llama.modeling_llama._get_unpad_data = ( # pylint: disable=protected-access
get_unpad_data
)
elif model_type == "mistral":
if hasattr(transformers.models.mistral.modeling_mistral, "_get_unpad_data"):
transformers.models.llama.modeling_llama._get_unpad_data = ( # pylint: disable=protected-access
get_unpad_data
)
elif model_type == "qwen2":
transformers.models.qwen2.modeling_qwen2._get_unpad_data = ( # pylint: disable=protected-access
get_unpad_data
)
elif model_type == "qwen2_moe":
transformers.models.qwen2_moe.modeling_qwen2_moe._get_unpad_data = ( # pylint: disable=protected-access
get_unpad_data
)
elif model_type == "falcon":
transformers.models.falcon.modeling_falcon._get_unpad_data = ( # pylint: disable=protected-access
get_unpad_data
)
elif model_type == "phi":
transformers.models.phi.modeling_phi._get_unpad_data = ( # pylint: disable=protected-access
get_unpad_data
)
elif model_type == "gemma":
transformers.models.gemma.modeling_gemma._get_unpad_data = ( # pylint: disable=protected-access
get_unpad_data
)
elif model_type == "gemma2":
transformers.models.gemma2.modeling_gemma2._get_unpad_data = ( # pylint: disable=protected-access
get_unpad_data
)
elif model_type == "starcoder2":
transformers.models.starcoder2.modeling_starcoder2._get_unpad_data = ( # pylint: disable=protected-access
get_unpad_data
)
def patch_remote(model_name):
def patch_remote(model_name, config_name, modeling_name):
model_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
# we need to load the model here in order for modeling_* to be available
with init_empty_weights():
AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
parts = model_config.__class__.__module__.split(".")
parts[-1] = parts[-1].replace("configuration_", "modeling_", 1)
module_name = ".".join(parts)
module_name = model_config.__class__.__module__.replace(config_name, modeling_name)
modeling_arch = importlib.import_module(module_name)
if hasattr(modeling_arch, "_get_unpad_data"):
modeling_arch._get_unpad_data = ( # pylint: disable=protected-access
get_unpad_data
)
modeling_arch._get_unpad_data = get_unpad_data # pylint: disable=protected-access

View File

@@ -16,6 +16,26 @@ from transformers.models.llama.modeling_llama import (
LOG = get_logger("axolotl.monkeypatch.unsloth")
ORIGINAL_CEL_CODE = """# Shift so that tokens < n predict n
shift_logits = logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
# Flatten the tokens
loss_fct = CrossEntropyLoss()
shift_logits = shift_logits.view(-1, self.config.vocab_size)
shift_labels = shift_labels.view(-1)
# Enable model parallelism
shift_labels = shift_labels.to(shift_logits.device)
loss = loss_fct(shift_logits, shift_labels)
"""
PATCHED_CEL_CODE = """shift_logits = logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
loss = fast_cross_entropy_loss(
logits = shift_logits,
labels = shift_labels,
)
"""
ORIGINAL_QKV_CODE = """
query_states = self.q_proj(hidden_states)
key_states = self.k_proj(hidden_states)
@@ -60,6 +80,12 @@ def get_forward_code() -> str:
return forward
def check_cel_is_patchable() -> bool:
forward = get_forward_code()
forward, _ = detab_code(forward)
return ORIGINAL_CEL_CODE in forward
def get_self_attn_code() -> str:
forward = inspect.getsource(LlamaFlashAttention2.forward)
return forward
@@ -72,31 +98,48 @@ def check_self_attn_is_patchable() -> bool:
def integrate_cross_entropy_loss_patch(model_type: str = "llama") -> None:
from unsloth.kernels.cross_entropy_loss import fast_cross_entropy_loss
def UnslothForCausalLMLoss( # pylint: disable=invalid-name
logits,
labels,
vocab_size: int, # pylint: disable=unused-argument
num_items_in_batch: int = None,
ignore_index: int = -100, # pylint: disable=unused-argument
**kwargs, # pylint: disable=unused-argument
):
# Upcast to float if we need to compute the loss to avoid potential precision issues
logits = logits.float()
# Shift so that tokens < n predict n
shift_logits = logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
loss = fast_cross_entropy_loss(
logits=shift_logits, labels=shift_labels, n_items=num_items_in_batch
)
return loss
if model_type == "llama":
from transformers.loss import loss_utils
forward = get_forward_code()
LlamaForCausalLM._original_forward = forward # pylint: disable=protected-access
forward, _ = detab_code(forward)
assert ORIGINAL_CEL_CODE in forward, "Original forward code not found"
loss_utils.ForCausalLMLoss = UnslothForCausalLMLoss # type: ignore[assignment]
forward = forward.replace(
"@add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)", ""
)
forward = forward.replace(
"@replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)",
"",
)
forward = forward.replace(ORIGINAL_CEL_CODE, PATCHED_CEL_CODE)
forward = forward.replace(
"def forward(",
"def fast_cross_entropy_loss_forward(",
1,
)
# load imports necessary
import transformers.models.llama.modeling_llama
items_to_import = []
for item in dir(transformers.models.llama.modeling_llama):
if item in forward:
items_to_import.append(item)
exec( # pylint: disable=exec-used # nosec B102
"from unsloth.kernels.cross_entropy_loss import fast_cross_entropy_loss",
globals(),
)
exec( # pylint: disable=exec-used # nosec B102
"from transformers.models.llama.modeling_llama import ("
+ ", ".join(x for x in items_to_import)
+ ")",
globals(),
)
exec(forward, globals()) # pylint: disable=exec-used # nosec B102
LOG.info("patching unsloth fast_cross_entropy_loss", main_process_only=True)
LlamaForCausalLM.forward = fast_cross_entropy_loss_forward # pylint: disable=undefined-variable # noqa: F821
else:
raise ValueError("Unsupported model type")
@@ -188,7 +231,7 @@ def integrate_lora_mlp_patch(peft_model: PeftModelForCausalLM):
for module in layer_modules
)
mlp_not_dora = all(
len(getattr(module, "lora_magnitude_vector", []) or []) == 0
getattr(module, "lora_magnitude_vector", None) is None
for module in layer_modules
)
@@ -213,7 +256,7 @@ def integrate_lora_patch(peft_model: PeftModelForCausalLM, cfg):
for module in layer_modules
)
qkv_not_dora = all(
len(getattr(module, "lora_magnitude_vector", []) or []) == 0
getattr(module, "lora_magnitude_vector", None) is None
for module in layer_modules
)
@@ -232,7 +275,7 @@ def integrate_lora_patch(peft_model: PeftModelForCausalLM, cfg):
for module in layer_modules
)
o_not_dora = all(
len(getattr(module, "lora_magnitude_vector", []) or []) == 0
getattr(module, "lora_magnitude_vector", None) is None
for module in layer_modules
)

View File

@@ -1,51 +0,0 @@
"""
Fused MLP layer for incrementally improved training efficiency
"""
import torch
from transformers.models.llama.modeling_llama import LlamaMLP
from xformers.ops import SwiGLU
from axolotl.monkeypatch.utils import set_module_name
class FusedMLP(torch.nn.Module):
    """
    MLP block that evaluates the Llama gate/up/down projections with one
    weight-packed xformers ``SwiGLU`` module.
    """

    def __init__(
        self,
        config,
        gate_proj: torch.nn.Linear,
        up_proj: torch.nn.Linear,
        down_proj: torch.nn.Linear,
    ):
        """Create the SwiGLU module and load the pretrained projection weights into it."""
        super().__init__()
        self.config = config
        self.swiglu = SwiGLU(
            in_features=config.hidden_size,
            hidden_features=config.intermediate_size,
            bias=False,
            _pack_weights=True,
        )

        # Stack gate over up along dim 0 to form the packed w12 weight, then
        # overwrite the randomly initialized parameters.
        packed_gate_up = torch.cat(
            (gate_proj.weight.data, up_proj.weight.data), dim=0
        )
        self.swiglu.w12.weight.data = packed_gate_up
        self.swiglu.w3.weight.data = down_proj.weight.data

    def _post_training(self, model, name):
        """Split the packed weights into a plain ``LlamaMLP`` and pass it to ``set_module_name``."""
        packed = self.swiglu.w12.weight.data
        chunk_rows = self.config.intermediate_size
        gate_part, up_part = torch.split(packed, chunk_rows, dim=0)

        plain_mlp = LlamaMLP(self.config)
        # First chunk is the gate projection, second is the up projection
        # (the same order they were concatenated in __init__).
        plain_mlp.gate_proj.weight.data = gate_part
        plain_mlp.up_proj.weight.data = up_part
        plain_mlp.down_proj.weight.data = self.swiglu.w3.weight.data
        set_module_name(model, name, plain_mlp)

    def forward(self, x: torch.Tensor) -> torch.Tensor:  # pylint: disable=invalid-name
        """Run ``x`` through the fused SwiGLU."""
        return self.swiglu(x)

View File

@@ -11,10 +11,6 @@ LOG = logging.getLogger("axolotl.prompt_strategies")
def load(strategy, tokenizer, cfg, ds_cfg, processor=None):
try:
if strategy == "messages":
from .messages import load as messages_load
return messages_load(tokenizer, cfg, ds_cfg, processor=processor)
load_fn = "load"
if strategy.split(".")[-1].startswith("load_"):
load_fn = strategy.split(".")[-1]
@@ -35,5 +31,4 @@ def load(strategy, tokenizer, cfg, ds_cfg, processor=None):
return None
except Exception as exc: # pylint: disable=broad-exception-caught
LOG.error(f"Failed to load prompt strategy `{strategy}`: {str(exc)}")
raise exc
return None
return None

View File

@@ -1,10 +0,0 @@
### example yaml
```yaml
chat_template: gemma
datasets:
- path: argilla/distilabel-intel-orca-dpo-pairs
type: bradley_terry.chat_template
val_set_size: 0.0
output_dir: ./outputs/out
```

View File

@@ -1,35 +0,0 @@
"""Module to load prompt strategies."""
import importlib
import inspect
import logging
from axolotl.prompt_strategies.user_defined import UserDefinedDatasetConfig
LOG = logging.getLogger("axolotl.prompt_strategies.bradley_terry")
def load(strategy, tokenizer, cfg, ds_cfg):
    """Resolve and invoke a Bradley-Terry prompt-strategy loader by name.

    ``strategy`` is a dotted module path relative to
    ``axolotl.prompt_strategies.bradley_terry``. If its last component starts
    with ``load_``, that component names the loader function and the rest is
    the module; otherwise the module's ``load`` function is used.

    Returns:
        Whatever the loader returns, or ``None`` when the module cannot be
        found or any other exception is raised (the latter is logged).
    """
    # pylint: disable=duplicate-code
    try:
        *module_parts, last_part = strategy.split(".")
        if last_part.startswith("load_"):
            load_fn = last_part
            strategy = ".".join(module_parts)
        else:
            load_fn = "load"

        module = importlib.import_module(
            f".{strategy}", "axolotl.prompt_strategies.bradley_terry"
        )
        loader = getattr(module, load_fn)

        extra_kwargs = {}
        if strategy == "user_defined":
            extra_kwargs["ds_cfg"] = UserDefinedDatasetConfig(**ds_cfg)
        elif "ds_cfg" in inspect.signature(loader).parameters:
            # Only forward the dataset config to loaders that accept it.
            extra_kwargs["ds_cfg"] = ds_cfg
        return loader(tokenizer, cfg, **extra_kwargs)
    except ModuleNotFoundError:
        return None
    except Exception as exc:  # pylint: disable=broad-exception-caught
        LOG.error(f"Failed to load prompt strategy `{strategy}`: {str(exc)}")
        return None

View File

@@ -1,102 +0,0 @@
"""
Bradley-Terry model with chat template prompt strategy.
"""
import logging
from typing import Any, Dict, Optional
from axolotl.prompt_strategies.chat_template import (
ChatTemplatePrompter,
ChatTemplateStrategy,
)
from axolotl.utils.chat_templates import get_chat_template_from_config
# Configure the logger
LOG = logging.getLogger("axolotl.prompt_strategies.bradley_terry.chat_template")
LOG.setLevel(logging.INFO)
class BTChatTemplateStrategy(ChatTemplateStrategy):
    """
    Bradley-Terry reward model pairwise chat template prompt strategy.
    """

    def _tokenize_candidate(self, prompt, messages_field, response_field):
        """Build system/user/assistant messages for one response and tokenize them.

        The message list is written to ``prompt[messages_field]`` and
        ``self.messages`` is set to ``messages_field`` before delegating to
        the parent ``tokenize_prompt``.
        """
        self.messages = messages_field
        # pylint: disable=duplicate-code
        turns = []
        # Rows without a "system" column (or with an empty one) simply get no
        # system turn, matching the guard used by the `icr` transform.
        if "system" in prompt and prompt["system"]:
            turns.append({"role": "system", "content": prompt["system"]})
        turns.append({"role": "user", "content": prompt["input"]})
        turns.append({"role": "assistant", "content": prompt[response_field]})
        prompt[self.messages] = turns
        return super().tokenize_prompt(prompt)

    def tokenize_prompt(self, prompt):
        """
        Tokenize the chosen and rejected completions of one preference row.

        :param prompt: the actual row of data from the underlying dataset;
            read keys are ``input``, ``chosen``, ``rejected`` and, optionally,
            ``system``
        :return: dict with ``input_ids_*`` / ``attention_mask_*`` for the
            chosen and rejected conversations, plus constant labels
            (1.0 for chosen, 0.0 for rejected)
        """
        chosen_tokenized = self._tokenize_candidate(
            prompt, "chosen_messages", "chosen"
        )
        rejected_tokenized = self._tokenize_candidate(
            prompt, "rejected_messages", "rejected"
        )

        return {
            "input_ids_chosen": chosen_tokenized["input_ids"],
            "attention_mask_chosen": chosen_tokenized["attention_mask"],
            "labels_chosen": 1.0,
            "input_ids_rejected": rejected_tokenized["input_ids"],
            "attention_mask_rejected": rejected_tokenized["attention_mask"],
            "labels_rejected": 0.0,
        }
def load(tokenizer, cfg, ds_cfg: Optional[Dict[str, Any]] = None):
    """Build a ``BTChatTemplateStrategy`` from the global and per-dataset config.

    Args:
        tokenizer: tokenizer handed to both the prompter and the strategy.
        cfg: global config; ``sequence_len``, ``reward_model`` and
            ``train_on_inputs`` are read here.
        ds_cfg: optional per-dataset overrides (message field names, roles,
            ``field_messages`` ...). ``None`` is treated as an empty dict.

    Returns:
        The configured strategy instance.
    """
    ds_cfg = ds_cfg or {}

    chat_template_string = get_chat_template_from_config(
        cfg=cfg, ds_cfg=ds_cfg, tokenizer=tokenizer
    )

    message_field_role = ds_cfg.get("message_field_role", "role")
    message_field_content = ds_cfg.get("message_field_content", "content")
    message_field_training = ds_cfg.get("message_field_training", None)
    message_field_training_detail = ds_cfg.get("message_field_training_detail", None)
    roles = ds_cfg.get("roles")
    drop_system_message = ds_cfg.get("drop_system_message", False)

    # One extra token is allowed so that sequences exceeding the
    # `sequence_len` limit can be detected - except for reward models.
    if cfg.reward_model:
        max_length = cfg.sequence_len
    else:
        max_length = cfg.sequence_len + 1

    prompter = ChatTemplatePrompter(
        tokenizer=tokenizer,
        chat_template=chat_template_string,
        message_field_role=message_field_role,
        message_field_content=message_field_content,
        message_field_training=message_field_training,
        message_field_training_detail=message_field_training_detail,
        roles=roles,
        drop_system_message=drop_system_message,
        max_length=max_length,
    )

    strategy = BTChatTemplateStrategy(
        prompter,
        tokenizer=tokenizer,
        train_on_inputs=cfg.train_on_inputs,
        sequence_len=cfg.sequence_len,
        roles_to_train=ds_cfg.get("roles_to_train", []),
        train_on_eos=ds_cfg.get("train_on_eos", None),
    )

    # Honour a custom messages field only when the strategy exposes one.
    if "field_messages" in ds_cfg and hasattr(strategy, "messages"):
        strategy.messages = ds_cfg["field_messages"]

    return strategy

View File

@@ -1,27 +0,0 @@
"""
transforms for datasets with system, input, chosen, rejected fields that format samples with the llama3 chat template
"""
def icr(
    cfg,
    **kwargs,
):  # pylint: disable=possibly-unused-variable,unused-argument
    """
    Build a transform for datasets with system, input, chosen, rejected fields.

    The returned function prefixes ``chosen`` and ``rejected`` with the
    llama3-formatted prompt (optional system turn, then the user turn and an
    open assistant header) and appends ``<|eot_id|>`` to each.
    ex. https://huggingface.co/datasets/argilla/distilabel-intel-orca-dpo-pairs
    """

    def transform_fn(sample):
        # The system turn is only emitted when the field exists and is non-empty.
        has_system = "system" in sample and sample["system"]
        system_turn = (
            f"<|start_header_id|>system<|end_header_id|>\n\n{sample['system']}<|eot_id|>"
            if has_system
            else ""
        )
        user_turn = f"<|start_header_id|>user<|end_header_id|>\n\n{sample['input']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
        prefix = system_turn + user_turn

        for field in ("chosen", "rejected"):
            sample[field] = prefix + f"{sample[field]}<|eot_id|>"
        return sample

    return transform_fn

View File

@@ -9,7 +9,7 @@ from transformers import ProcessorMixin
from axolotl.prompt_tokenizers import PromptTokenizingStrategy
from axolotl.prompters import IGNORE_TOKEN_ID, Prompter
from axolotl.utils.chat_templates import get_chat_template_from_config
from axolotl.utils.chat_templates import chat_templates
# Configure the logger
LOG = logging.getLogger("axolotl")
@@ -403,16 +403,11 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
def load(tokenizer, cfg, ds_cfg: Optional[Dict[str, Any]] = None, processor=None):
# pylint: disable=duplicate-code
ds_cfg = ds_cfg or {}
chat_template_string = get_chat_template_from_config(
cfg=cfg, ds_cfg=ds_cfg, tokenizer=tokenizer
)
LOG.info(f"Using chat template:\n---\n{chat_template_string!s}\n---")
prompter_params = {
"tokenizer": tokenizer,
"chat_template": chat_template_string,
"chat_template": chat_templates(ds_cfg.get("chat_template", "chatml")),
"message_field_role": ds_cfg.get("message_field_role", "role"),
"message_field_content": ds_cfg.get("message_field_content", "content"),
"message_field_training": ds_cfg.get("message_field_training", None),

View File

@@ -2,16 +2,15 @@
DPO prompt strategies for using tokenizer chat templates.
"""
from axolotl.utils.chat_templates import extract_chat_template_args, get_chat_template
from axolotl.utils.chat_templates import chat_templates
def default(
cfg, dataset_idx=0, **kwargs
): # pylint: disable=possibly-unused-variable,unused-argument
ds_cfg = cfg["datasets"][dataset_idx]
chat_template_choice, chat_template_jinja = extract_chat_template_args(
cfg=cfg, ds_cfg=ds_cfg
)
chat_template_str = chat_templates(cfg.chat_template)
field_messages = ds_cfg.get("field_messages", "messages")
field_chosen = ds_cfg.get("field_chosen", "chosen")
field_rejected = ds_cfg.get("field_rejected", "rejected")
@@ -31,12 +30,6 @@ def default(
role_map[source] = target
def transform_fn(sample, tokenizer=None):
chat_template_string = get_chat_template(
user_choice=chat_template_choice,
jinja_template=chat_template_jinja,
tokenizer=tokenizer,
)
messages = sample[field_messages]
messages = [
{
@@ -53,29 +46,28 @@ def default(
"role": role_map[sample[field_rejected][field_message_role]],
"content": sample[field_rejected][field_message_content],
}
dummy_user_message = {"role": "user", "content": "[[dummy_message]]"}
result = {}
result["prompt"] = tokenizer.apply_chat_template(
messages,
add_generation_prompt=True,
chat_template=chat_template_string,
chat_template=chat_template_str,
tokenize=False,
)
result["chosen"] = tokenizer.apply_chat_template(
[dummy_user_message, chosen],
[chosen],
add_generation_prompt=False,
chat_template=chat_template_string,
chat_template=chat_template_str,
tokenize=False,
)
chosen_strip_index = result["chosen"].find(chosen["content"])
result["chosen"] = result["chosen"][chosen_strip_index:].rstrip()
result["rejected"] = tokenizer.apply_chat_template(
[dummy_user_message, rejected],
[rejected],
add_generation_prompt=False,
chat_template=chat_template_string,
chat_template=chat_template_str,
tokenize=False,
)
rejected_strip_index = result["rejected"].find(rejected["content"])

View File

@@ -0,0 +1,33 @@
"""Module containing the InstructShareGPTPromptTokenizingStrategy class"""
from typing import Any, Dict, Optional
from axolotl.prompt_tokenizers import ShareGPTPromptTokenizingStrategy
from axolotl.prompters import ShareGPTPrompterV2
def load(tokenizer, cfg, ds_cfg: Optional[Dict[str, Any]] = None):
    """Create the instruct-style ShareGPT tokenizing strategy.

    Args:
        tokenizer: tokenizer passed through to the strategy.
        cfg: global config; ``train_on_inputs`` and ``sequence_len`` are read.
        ds_cfg: optional dataset config; its ``conversation`` entry, when
            present, is forwarded to the prompter.
    """
    conversation = None
    if ds_cfg and "conversation" in ds_cfg:
        conversation = ds_cfg["conversation"]

    # pylint: disable=duplicate-code
    prompter = ShareGPTPrompterV2(conversation=conversation)
    return InstructShareGPTPromptTokenizingStrategy(
        prompter,
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
    )
class InstructShareGPTPromptTokenizingStrategy(ShareGPTPromptTokenizingStrategy):
    """
    ShareGPT strategy for instruction/output rows: each row becomes a
    two-turn human -> gpt conversation.
    """

    def get_conversation_thread(self, prompt):
        """Map a row's ``instruction`` and ``output`` fields to ShareGPT turns."""
        human_turn = {"from": "human", "value": prompt["instruction"]}
        gpt_turn = {"from": "gpt", "value": prompt["output"]}
        return [human_turn, gpt_turn]

Some files were not shown because too many files have changed in this diff Show More