Compare commits
7 Commits
lisa
...
scatter_mo
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
9c221a6761 | ||
|
|
301cc4c006 | ||
|
|
035e680631 | ||
|
|
26fc10df01 | ||
|
|
1bc008e901 | ||
|
|
3f7ed6a784 | ||
|
|
feea977923 |
11
.github/workflows/base.yml
vendored
11
.github/workflows/base.yml
vendored
@@ -16,22 +16,17 @@ jobs:
|
||||
cuda_version: 11.8.0
|
||||
python_version: "3.10"
|
||||
pytorch: 2.1.2
|
||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 9.0+PTX"
|
||||
- cuda: "121"
|
||||
cuda_version: 12.1.0
|
||||
python_version: "3.10"
|
||||
pytorch: 2.1.2
|
||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 9.0+PTX"
|
||||
- cuda: "121"
|
||||
cuda_version: 12.1.0
|
||||
python_version: "3.11"
|
||||
pytorch: 2.1.2
|
||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||
- cuda: "121"
|
||||
cuda_version: 12.1.0
|
||||
python_version: "3.11"
|
||||
pytorch: 2.2.1
|
||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 9.0+PTX"
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
|
||||
31
.github/workflows/docs.yml
vendored
31
.github/workflows/docs.yml
vendored
@@ -1,31 +0,0 @@
|
||||
name: Publish Docs
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
|
||||
permissions:
|
||||
contents: write
|
||||
pages: write
|
||||
|
||||
jobs:
|
||||
build-deploy:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Check out repository
|
||||
uses: actions/checkout@v4
|
||||
- name: Set up Quarto
|
||||
uses: quarto-dev/quarto-actions/setup@v2
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v3
|
||||
with:
|
||||
python-version: '3.10'
|
||||
- name: install dependencies
|
||||
run: |
|
||||
python3 -m pip install jupyter
|
||||
- name: Publish to GitHub Pages (and render)
|
||||
uses: quarto-dev/quarto-actions/publish@v2
|
||||
with:
|
||||
target: gh-pages
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
8
.github/workflows/main.yml
vendored
8
.github/workflows/main.yml
vendored
@@ -28,7 +28,7 @@ jobs:
|
||||
- cuda: 121
|
||||
cuda_version: 12.1.0
|
||||
python_version: "3.11"
|
||||
pytorch: 2.2.1
|
||||
pytorch: 2.1.2
|
||||
axolotl_extras:
|
||||
runs-on: axolotl-gpu-runner
|
||||
steps:
|
||||
@@ -63,7 +63,7 @@ jobs:
|
||||
${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }}
|
||||
labels: ${{ steps.metadata.outputs.labels }}
|
||||
|
||||
build-axolotl-cloud:
|
||||
build-axolotl-runpod:
|
||||
needs: build-axolotl
|
||||
if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'OpenAccess-AI-Collective' }}
|
||||
# this job needs to be run on self-hosted GPU runners...
|
||||
@@ -84,7 +84,7 @@ jobs:
|
||||
- cuda: 121
|
||||
cuda_version: 12.1.0
|
||||
python_version: "3.11"
|
||||
pytorch: 2.2.1
|
||||
pytorch: 2.1.2
|
||||
axolotl_extras:
|
||||
runs-on: axolotl-gpu-runner
|
||||
steps:
|
||||
@@ -113,5 +113,7 @@ jobs:
|
||||
push: ${{ github.event_name != 'pull_request' }}
|
||||
tags: |
|
||||
${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
|
||||
winglian/axolotl-runpod:main-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
|
||||
${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }}
|
||||
${{ (matrix.is_latest) && format('{0}-latest', 'winglian/axolotl-runpod:main') || '' }}
|
||||
labels: ${{ steps.metadata.outputs.labels }}
|
||||
|
||||
118
.github/workflows/nightlies.yml
vendored
118
.github/workflows/nightlies.yml
vendored
@@ -1,118 +0,0 @@
|
||||
name: docker-nightlies
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
- cron: '0 0 * * *' # Runs at 00:00 UTC every day
|
||||
|
||||
jobs:
|
||||
build-axolotl:
|
||||
if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'OpenAccess-AI-Collective' }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- cuda: 118
|
||||
cuda_version: 11.8.0
|
||||
python_version: "3.10"
|
||||
pytorch: 2.1.2
|
||||
axolotl_extras:
|
||||
axolotl_args: "--extra-index-url https://download.pytorch.org/whl/cu118"
|
||||
is_latest: true
|
||||
- cuda: 121
|
||||
cuda_version: 12.1.0
|
||||
python_version: "3.10"
|
||||
pytorch: 2.1.2
|
||||
axolotl_extras:
|
||||
- cuda: 121
|
||||
cuda_version: 12.1.0
|
||||
python_version: "3.11"
|
||||
pytorch: 2.2.1
|
||||
axolotl_extras:
|
||||
runs-on: axolotl-gpu-runner
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
- name: Docker metadata
|
||||
id: metadata
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: winglian/axolotl
|
||||
tags: |
|
||||
type=raw,value={{ branch }}-{{ date 'YYYYMMDD' }}
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
||||
# guidance for testing before pushing: https://docs.docker.com/build/ci/github-actions/test-before-push/
|
||||
- name: Build and export to Docker
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
build-args: |
|
||||
BASE_TAG=${{ github.ref_name }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
|
||||
CUDA=${{ matrix.cuda }}
|
||||
PYTORCH_VERSION=${{ matrix.pytorch }}
|
||||
AXOLOTL_ARGS=${{ matrix.axolotl_args }}
|
||||
file: ./docker/Dockerfile
|
||||
push: ${{ github.event_name != 'pull_request' }}
|
||||
tags: |
|
||||
${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
|
||||
labels: ${{ steps.metadata.outputs.labels }}
|
||||
|
||||
build-axolotl-cloud:
|
||||
needs: build-axolotl
|
||||
if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'OpenAccess-AI-Collective' }}
|
||||
# this job needs to be run on self-hosted GPU runners...
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- cuda: 118
|
||||
cuda_version: 11.8.0
|
||||
python_version: "3.10"
|
||||
pytorch: 2.1.2
|
||||
axolotl_extras:
|
||||
is_latest: true
|
||||
- cuda: 121
|
||||
cuda_version: 12.1.0
|
||||
python_version: "3.10"
|
||||
pytorch: 2.1.2
|
||||
axolotl_extras:
|
||||
- cuda: 121
|
||||
cuda_version: 12.1.0
|
||||
python_version: "3.11"
|
||||
pytorch: 2.2.1
|
||||
axolotl_extras:
|
||||
runs-on: axolotl-gpu-runner
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
- name: Docker metadata
|
||||
id: metadata
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: winglian/axolotl-cloud
|
||||
tags: |
|
||||
type=raw,value={{ branch }}-{{ date 'YYYYMMDD' }}
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v2
|
||||
- name: Build
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
build-args: |
|
||||
BASE_TAG=${{ github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
|
||||
CUDA=${{ matrix.cuda }}
|
||||
file: ./docker/Dockerfile-cloud
|
||||
push: ${{ github.event_name != 'pull_request' }}
|
||||
tags: |
|
||||
${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
|
||||
labels: ${{ steps.metadata.outputs.labels }}
|
||||
2
.github/workflows/pypi.yml
vendored
2
.github/workflows/pypi.yml
vendored
@@ -25,7 +25,7 @@ jobs:
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
pip3 install wheel packaging
|
||||
pip3 install wheel
|
||||
pip3 install -e .
|
||||
pip3 install -r requirements-tests.txt
|
||||
|
||||
|
||||
9
.github/workflows/tests.yml
vendored
9
.github/workflows/tests.yml
vendored
@@ -34,7 +34,7 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
python_version: ["3.10", "3.11"]
|
||||
timeout-minutes: 20
|
||||
timeout-minutes: 10
|
||||
|
||||
steps:
|
||||
- name: Check out repository code
|
||||
@@ -48,8 +48,6 @@ jobs:
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
pip3 install --upgrade pip
|
||||
pip3 install --upgrade packaging
|
||||
pip3 install -U -e .
|
||||
pip3 install -r requirements-tests.txt
|
||||
|
||||
@@ -79,11 +77,6 @@ jobs:
|
||||
python_version: "3.10"
|
||||
pytorch: 2.1.2
|
||||
num_gpus: 1
|
||||
- cuda: 121
|
||||
cuda_version: 12.1.0
|
||||
python_version: "3.11"
|
||||
pytorch: 2.2.1
|
||||
num_gpus: 1
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
3
.gitignore
vendored
3
.gitignore
vendored
@@ -2,7 +2,6 @@
|
||||
configs
|
||||
last_run_prepared/
|
||||
.vscode
|
||||
_site/
|
||||
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
@@ -173,5 +172,3 @@ wandb
|
||||
lora-out/*
|
||||
qlora-out/*
|
||||
mlruns/*
|
||||
|
||||
/.quarto/
|
||||
|
||||
88
README.md
88
README.md
@@ -13,9 +13,6 @@ Features:
|
||||
- Log results and optionally checkpoints to wandb or mlflow
|
||||
- And more!
|
||||
|
||||
<a href="https://www.phorm.ai/query?projectId=e315ba4a-4e14-421f-ab05-38a1f9076f25">
|
||||
<img alt="phorm.ai" src="https://img.shields.io/badge/Phorm-Ask_AI-%23F2777A.svg?&logo=data:image/svg+xml;base64,PHN2ZyB3aWR0aD0iNSIgaGVpZ2h0PSI0IiBmaWxsPSJub25lIiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciPgogIDxwYXRoIGQ9Ik00LjQzIDEuODgyYTEuNDQgMS40NCAwIDAgMS0uMDk4LjQyNmMtLjA1LjEyMy0uMTE1LjIzLS4xOTIuMzIyLS4wNzUuMDktLjE2LjE2NS0uMjU1LjIyNmExLjM1MyAxLjM1MyAwIDAgMS0uNTk1LjIxMmMtLjA5OS4wMTItLjE5Mi4wMTQtLjI3OS4wMDZsLTEuNTkzLS4xNHYtLjQwNmgxLjY1OGMuMDkuMDAxLjE3LS4xNjkuMjQ2LS4xOTFhLjYwMy42MDMgMCAwIDAgLjItLjEwNi41MjkuNTI5IDAgMCAwIC4xMzgtLjE3LjY1NC42NTQgMCAwIDAgLjA2NS0uMjRsLjAyOC0uMzJhLjkzLjkzIDAgMCAwLS4wMzYtLjI0OS41NjcuNTY3IDAgMCAwLS4xMDMtLjIuNTAyLjUwMiAwIDAgMC0uMTY4LS4xMzguNjA4LjYwOCAwIDAgMC0uMjQtLjA2N0wyLjQzNy43MjkgMS42MjUuNjcxYS4zMjIuMzIyIDAgMCAwLS4yMzIuMDU4LjM3NS4zNzUgMCAwIDAtLjExNi4yMzJsLS4xMTYgMS40NS0uMDU4LjY5Ny0uMDU4Ljc1NEwuNzA1IDRsLS4zNTctLjA3OUwuNjAyLjkwNkMuNjE3LjcyNi42NjMuNTc0LjczOS40NTRhLjk1OC45NTggMCAwIDEgLjI3NC0uMjg1Ljk3MS45NzEgMCAwIDEgLjMzNy0uMTRjLjExOS0uMDI2LjIyNy0uMDM0LjMyNS0uMDI2TDMuMjMyLjE2Yy4xNTkuMDE0LjMzNi4wMy40NTkuMDgyYTEuMTczIDEuMTczIDAgMCAxIC41NDUuNDQ3Yy4wNi4wOTQuMTA5LjE5Mi4xNDQuMjkzYTEuMzkyIDEuMzkyIDAgMCAxIC4wNzguNThsLS4wMjkuMzJaIiBmaWxsPSIjRjI3NzdBIi8+CiAgPHBhdGggZD0iTTQuMDgyIDIuMDA3YTEuNDU1IDEuNDU1IDAgMCAxLS4wOTguNDI3Yy0uMDUuMTI0LS4xMTQuMjMyLS4xOTIuMzI0YTEuMTMgMS4xMyAwIDAgMS0uMjU0LjIyNyAxLjM1MyAxLjM1MyAwIDAgMS0uNTk1LjIxNGMtLjEuMDEyLS4xOTMuMDE0LS4yOC4wMDZsLTEuNTYtLjEwOC4wMzQtLjQwNi4wMy0uMzQ4IDEuNTU5LjE1NGMuMDkgMCAuMTczLS4wMS4yNDgtLjAzM2EuNjAzLjYwMyAwIDAgMCAuMi0uMTA2LjUzMi41MzIgMCAwIDAgLjEzOS0uMTcyLjY2LjY2IDAgMCAwIC4wNjQtLjI0MWwuMDI5LS4zMjFhLjk0Ljk0IDAgMCAwLS4wMzYtLjI1LjU3LjU3IDAgMCAwLS4xMDMtLjIwMi41MDIuNTAyIDAgMCAwLS4xNjgtLjEzOC42MDUuNjA1IDAgMCAwLS4yNC0uMDY3TDEuMjczLjgyN2MtLjA5NC0uMDA4LS4xNjguMDEtLjIyMS4wNTUtLjA1My4wNDUtLjA4NC4xMTQtLjA5Mi4yMDZMLjcwNSA0IDAgMy45MzhsLjI1NS0yLjkxMUExLjAxIDEuMDEgMCAwIDEgLjM5My41NzIuOTYyLjk2MiAwIDAgMSAuNjY2LjI4NmEuOTcuOTcgMCAwIDEgLjMzOC0uMTRDMS4xMjIuMTIgMS4yMy4xMSAxLjMyOC4xMTlsMS41OTMuMTRjLjE2LjAxNC4zLjA0Ny40MjMuMWExLjE
3IDEuMTcgMCAwIDEgLjU0NS40NDhjLjA2MS4wOTUuMTA5LjE5My4xNDQuMjk1YTEuNDA2IDEuNDA2IDAgMCAxIC4wNzcuNTgzbC0uMDI4LjMyMloiIGZpbGw9IndoaXRlIi8+CiAgPHBhdGggZD0iTTQuMDgyIDIuMDA3YTEuNDU1IDEuNDU1IDAgMCAxLS4wOTguNDI3Yy0uMDUuMTI0LS4xMTQuMjMyLS4xOTIuMzI0YTEuMTMgMS4xMyAwIDAgMS0uMjU0LjIyNyAxLjM1MyAxLjM1MyAwIDAgMS0uNTk1LjIxNGMtLjEuMDEyLS4xOTMuMDE0LS4yOC4wMDZsLTEuNTYtLjEwOC4wMzQtLjQwNi4wMy0uMzQ4IDEuNTU5LjE1NGMuMDkgMCAuMTczLS4wMS4yNDgtLjAzM2EuNjAzLjYwMyAwIDAgMCAuMi0uMTA2LjUzMi41MzIgMCAwIDAgLjEzOS0uMTcyLjY2LjY2IDAgMCAwIC4wNjQtLjI0MWwuMDI5LS4zMjFhLjk0Ljk0IDAgMCAwLS4wMzYtLjI1LjU3LjU3IDAgMCAwLS4xMDMtLjIwMi41MDIuNTAyIDAgMCAwLS4xNjgtLjEzOC42MDUuNjA1IDAgMCAwLS4yNC0uMDY3TDEuMjczLjgyN2MtLjA5NC0uMDA4LS4xNjguMDEtLjIyMS4wNTUtLjA1My4wNDUtLjA4NC4xMTQtLjA5Mi4yMDZMLjcwNSA0IDAgMy45MzhsLjI1NS0yLjkxMUExLjAxIDEuMDEgMCAwIDEgLjM5My41NzIuOTYyLjk2MiAwIDAgMSAuNjY2LjI4NmEuOTcuOTcgMCAwIDEgLjMzOC0uMTRDMS4xMjIuMTIgMS4yMy4xMSAxLjMyOC4xMTlsMS41OTMuMTRjLjE2LjAxNC4zLjA0Ny40MjMuMWExLjE3IDEuMTcgMCAwIDEgLjU0NS40NDhjLjA2MS4wOTUuMTA5LjE5My4xNDQuMjk1YTEuNDA2IDEuNDA2IDAgMCAxIC4wNzcuNTgzbC0uMDI4LjMyMloiIGZpbGw9IndoaXRlIi8+Cjwvc3ZnPgo=">
|
||||
</a>
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
@@ -31,8 +28,6 @@ Features:
|
||||
- [Cloud GPU](#cloud-gpu) - Latitude.sh, JarvisLabs, RunPod
|
||||
- [Bare Metal Cloud GPU](#bare-metal-cloud-gpu)
|
||||
- [Windows](#windows)
|
||||
- [Mac](#mac)
|
||||
- [Google Colab](#google-colab)
|
||||
- [Launching on public clouds via SkyPilot](#launching-on-public-clouds-via-skypilot)
|
||||
- [Dataset](#dataset)
|
||||
- [How to Add Custom Prompts](#how-to-add-custom-prompts)
|
||||
@@ -43,8 +38,8 @@ Features:
|
||||
- [Merge LORA to Base](#merge-lora-to-base)
|
||||
- [Special Tokens](#special-tokens)
|
||||
- Advanced Topics
|
||||
- [Multipack](./docs/multipack.qmd)<svg width="24" height="24" viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg"><path d="M17 13.5v6H5v-12h6m3-3h6v6m0-6-9 9" class="icon_svg-stroke" stroke="#666" stroke-width="1.5" fill="none" fill-rule="evenodd" stroke-linecap="round" stroke-linejoin="round"></path></svg>
|
||||
- [RLHF & DPO](./docs/rlhf.qmd)<svg width="24" height="24" viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg"><path d="M17 13.5v6H5v-12h6m3-3h6v6m0-6-9 9" class="icon_svg-stroke" stroke="#666" stroke-width="1.5" fill="none" fill-rule="evenodd" stroke-linecap="round" stroke-linejoin="round"></path></svg>
|
||||
- [Multipack](./docs/multipack.md)<svg width="24" height="24" viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg"><path d="M17 13.5v6H5v-12h6m3-3h6v6m0-6-9 9" class="icon_svg-stroke" stroke="#666" stroke-width="1.5" fill="none" fill-rule="evenodd" stroke-linecap="round" stroke-linejoin="round"></path></svg>
|
||||
- [RLHF & DPO](./docs/rlhf.md)<svg width="24" height="24" viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg"><path d="M17 13.5v6H5v-12h6m3-3h6v6m0-6-9 9" class="icon_svg-stroke" stroke="#666" stroke-width="1.5" fill="none" fill-rule="evenodd" stroke-linecap="round" stroke-linejoin="round"></path></svg>
|
||||
- [Common Errors](#common-errors-)
|
||||
- [Tokenization Mismatch b/w Training & Inference](#tokenization-mismatch-bw-inference--training)
|
||||
- [Debugging Axolotl](#debugging-axolotl)
|
||||
@@ -104,14 +99,24 @@ Get started with Axolotl in just a few steps! This quickstart guide will walk yo
|
||||
|
||||
**Requirements**: Python >=3.10 and Pytorch >=2.1.1.
|
||||
|
||||
### For developers
|
||||
```bash
|
||||
git clone https://github.com/OpenAccess-AI-Collective/axolotl
|
||||
cd axolotl
|
||||
|
||||
pip3 install packaging
|
||||
```
|
||||
|
||||
General case:
|
||||
```
|
||||
pip3 install -e '.[flash-attn,deepspeed]'
|
||||
```
|
||||
|
||||
Mac: see https://github.com/OpenAccess-AI-Collective/axolotl/blob/13199f678b9aab39e92961323bdbce3234ee4b2b/docs/mac.md
|
||||
```
|
||||
pip3 install -e '.'
|
||||
```
|
||||
|
||||
### Usage
|
||||
```bash
|
||||
# preprocess datasets - optional but recommended
|
||||
@@ -150,7 +155,7 @@ accelerate launch -m axolotl.cli.train https://raw.githubusercontent.com/OpenAcc
|
||||
```
|
||||
|
||||
>[!Tip]
|
||||
> If you want to debug axolotl or prefer to use Docker as your development environment, see the [debugging guide's section on Docker](docs/debugging.qmd#debugging-with-docker).
|
||||
> If you want to debug axolotl or prefer to use Docker as your development environment, see the [debugging guide's section on Docker](docs/debugging.md#debugging-with-docker).
|
||||
|
||||
<details>
|
||||
|
||||
@@ -244,35 +249,9 @@ For cloud GPU providers that support docker images, use [`winglian/axolotl-cloud
|
||||
```
|
||||
</details>
|
||||
|
||||
##### GCP
|
||||
|
||||
<details>
|
||||
|
||||
<summary>Click to Expand</summary>
|
||||
|
||||
Use a Deep Learning Linux OS with CUDA and PyTorch installed, then follow the instructions in the quickstart.
|
||||
|
||||
Make sure to run the command below to uninstall XLA.
|
||||
```bash
|
||||
pip uninstall -y torch_xla[tpu]
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
#### Windows
|
||||
Please use WSL or Docker!
|
||||
|
||||
#### Mac
|
||||
|
||||
Use the below instead of the install method in QuickStart.
|
||||
```
|
||||
pip3 install -e '.'
|
||||
```
|
||||
More info: [mac.md](/docs/mac.qmd)
|
||||
|
||||
#### Google Colab
|
||||
|
||||
Please use this example [notebook](examples/colab-notebooks/colab-axolotl-example.ipynb).
|
||||
|
||||
#### Launching on public clouds via SkyPilot
|
||||
To launch on GPU instances (both on-demand and spot instances) on 7+ clouds (GCP, AWS, Azure, OCI, and more), you can use [SkyPilot](https://skypilot.readthedocs.io/en/latest/index.html):
|
||||
@@ -414,7 +393,7 @@ pretraining_dataset: # hf path only
|
||||
{"segments": [{"label": true|false, "text": "..."}]}
|
||||
```
|
||||
|
||||
This is a special format that allows you to construct prompts without using templates. This is for advanced users who want more freedom with prompt construction. See [these docs](docs/input_output.qmd) for more details.
|
||||
This is a special format that allows you to construct prompts without using templates. This is for advanced users who want more freedom with prompt construction. See [these docs](docs/input_output.md) for more details.
|
||||
|
||||
##### Conversation
|
||||
|
||||
@@ -656,13 +635,9 @@ datasets:
|
||||
train_on_split: train # Optional[str] name of dataset split to load from
|
||||
|
||||
# Optional[str] fastchat conversation type, only used with type: sharegpt
|
||||
conversation: # Options (see Conversation 'name'): https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
|
||||
conversation: # Options (see Conversation 'name'): https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
|
||||
field_human: # Optional[str]. Human key to use for conversation.
|
||||
field_model: # Optional[str]. Assistant key to use for conversation.
|
||||
# Add additional keys from your dataset as input or output roles
|
||||
roles:
|
||||
input: # Optional[List[str]]. These will be masked based on train_on_input
|
||||
output: # Optional[List[str]].
|
||||
|
||||
# Custom user instruction prompt
|
||||
- path: repo
|
||||
@@ -687,10 +662,6 @@ datasets:
|
||||
# For `completion` datasets only, uses the provided field instead of the `text` column
|
||||
field:
|
||||
|
||||
# If false, the datasets will not be shuffled and will keep their original order in `datasets`.
|
||||
# The same applies to the `test_datasets` option and the `pretraining_dataset` option. Default is true.
|
||||
shuffle_merged_datasets: true
|
||||
|
||||
# A list of one or more datasets to eval the model with.
|
||||
# You can use either test_datasets, or val_set_size, but not both.
|
||||
test_datasets:
|
||||
@@ -872,7 +843,7 @@ group_by_length: false
|
||||
gradient_checkpointing: false
|
||||
# additional kwargs to pass to the trainer for gradient checkpointing
|
||||
# gradient_checkpointing_kwargs:
|
||||
# use_reentrant: true
|
||||
# use_reentrant: false
|
||||
|
||||
# Stop training after this many evaluation losses have increased in a row
|
||||
# https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
|
||||
@@ -912,26 +883,7 @@ lr_div_factor: # Learning rate div factor
|
||||
# - paged_adamw_8bit
|
||||
# - paged_lion_32bit
|
||||
# - paged_lion_8bit
|
||||
# - galore_adamw
|
||||
# - galore_adamw_8bit
|
||||
# - galore_adafactor
|
||||
# - galore_adamw_layerwise
|
||||
# - galore_adamw_8bit_layerwise
|
||||
# - galore_adafactor_layerwise
|
||||
optimizer:
|
||||
# Dictionary of arguments to pass to the optimizer
|
||||
optim_args:
|
||||
# For Galore Optimizers the following optim_args are available
|
||||
# rank: # type: int
|
||||
# update_proj_gap # type: int
|
||||
# scale # type: float
|
||||
# proj_type: # type: str, default = std
|
||||
|
||||
# The target modules to optimize, i.e. the module names that you would like to train, right now this is used only for GaLore algorithm
|
||||
optim_target_modules:
|
||||
# - self_attn # for llama
|
||||
# - mlp
|
||||
|
||||
# Specify weight decay
|
||||
weight_decay:
|
||||
# adamw hyperparams
|
||||
@@ -1130,7 +1082,7 @@ fsdp_config:
|
||||
|
||||
##### FSDP + QLoRA
|
||||
|
||||
Axolotl supports training with FSDP and QLoRA, see [these docs](docs/fsdp_qlora.qmd) for more information.
|
||||
Axolotl supports training with FSDP and QLoRA, see [these docs](docs/fsdp_qlora.md) for more information.
|
||||
|
||||
##### Weights & Biases Logging
|
||||
|
||||
@@ -1209,7 +1161,7 @@ although this will be very slow, and using the config options above are recommen
|
||||
|
||||
## Common Errors 🧰
|
||||
|
||||
See also the [FAQ's](./docs/faq.qmd) and [debugging guide](docs/debugging.qmd).
|
||||
See also the [FAQ's](./docs/faq.md) and [debugging guide](docs/debugging.md).
|
||||
|
||||
> If you encounter a 'Cuda out of memory' error, it means your GPU ran out of memory during the training process. Here's how to resolve it:
|
||||
|
||||
@@ -1243,7 +1195,7 @@ It's safe to ignore it.
|
||||
|
||||
> NCCL Timeouts during training
|
||||
|
||||
See the [NCCL](docs/nccl.qmd) guide.
|
||||
See the [NCCL](docs/nccl.md) guide.
|
||||
|
||||
|
||||
### Tokenization Mismatch b/w Inference & Training
|
||||
@@ -1261,7 +1213,7 @@ Having misalignment between your prompts during training and inference can cause
|
||||
|
||||
## Debugging Axolotl
|
||||
|
||||
See [this debugging guide](docs/debugging.qmd) for tips on debugging Axolotl, along with an example configuration for debugging with VSCode.
|
||||
See [this debugging guide](docs/debugging.md) for tips on debugging Axolotl, along with an example configuration for debugging with VSCode.
|
||||
|
||||
## Need help? 🙋
|
||||
|
||||
|
||||
51
_quarto.yml
51
_quarto.yml
@@ -1,51 +0,0 @@
|
||||
project:
|
||||
type: website
|
||||
|
||||
website:
|
||||
title: "Axolotl"
|
||||
description: "Fine-tuning"
|
||||
favicon: favicon.jpg
|
||||
navbar:
|
||||
title: Axolotl
|
||||
background: dark
|
||||
pinned: false
|
||||
collapse: false
|
||||
tools:
|
||||
- icon: twitter
|
||||
href: https://twitter.com/axolotl_ai
|
||||
- icon: github
|
||||
href: https://github.com/OpenAccess-AI-Collective/axolotl/
|
||||
- icon: discord
|
||||
href: https://discord.gg/7m9sfhzaf3
|
||||
|
||||
sidebar:
|
||||
pinned: true
|
||||
collapse-level: 2
|
||||
style: docked
|
||||
contents:
|
||||
- text: Home
|
||||
href: index.qmd
|
||||
- section: "How-To Guides"
|
||||
contents:
|
||||
# TODO Edit folder structure after we have more docs.
|
||||
- docs/debugging.qmd
|
||||
- docs/multipack.qmd
|
||||
- docs/fdsp_qlora.qmd
|
||||
- docs/input_output.qmd
|
||||
- docs/rlhf.qmd
|
||||
- docs/nccl.qmd
|
||||
- docs/mac.qmd
|
||||
- docs/multi-node.qmd
|
||||
- section: "Reference"
|
||||
contents:
|
||||
- docs/config.qmd
|
||||
- docs/faq.qmd
|
||||
|
||||
|
||||
|
||||
|
||||
format:
|
||||
html:
|
||||
theme: materia
|
||||
css: styles.css
|
||||
toc: true
|
||||
@@ -22,11 +22,10 @@ RUN git fetch origin +$GITHUB_REF && \
|
||||
git checkout FETCH_HEAD
|
||||
|
||||
# If AXOLOTL_EXTRAS is set, append it in brackets
|
||||
RUN pip install causal_conv1d
|
||||
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
|
||||
pip install -e .[deepspeed,flash-attn,mamba-ssm,galore,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
|
||||
pip install -e .[deepspeed,flash-attn,mamba-ssm,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
|
||||
else \
|
||||
pip install -e .[deepspeed,flash-attn,mamba-ssm,galore] $AXOLOTL_ARGS; \
|
||||
pip install -e .[deepspeed,flash-attn,mamba-ssm] $AXOLOTL_ARGS; \
|
||||
fi
|
||||
|
||||
# So we can test the Docker image
|
||||
|
||||
@@ -1 +1 @@
|
||||
This directory contains example config files that might be useful for debugging. Please see [docs/debugging.qmd](../docs/debugging.qmd) for more information.
|
||||
This directory contains example config files that might be useful for debugging. Please see [docs/debugging.md](../docs/debugging.md) for more information.
|
||||
|
||||
@@ -20,11 +20,10 @@ RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git
|
||||
WORKDIR /workspace/axolotl
|
||||
|
||||
# If AXOLOTL_EXTRAS is set, append it in brackets
|
||||
RUN pip install causal_conv1d
|
||||
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
|
||||
pip install -e .[deepspeed,flash-attn,mamba-ssm,galore,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
|
||||
pip install -e .[deepspeed,flash-attn,mamba-ssm,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
|
||||
else \
|
||||
pip install -e .[deepspeed,flash-attn,mamba-ssm,galore] $AXOLOTL_ARGS; \
|
||||
pip install -e .[deepspeed,flash-attn,mamba-ssm] $AXOLOTL_ARGS; \
|
||||
fi
|
||||
|
||||
# So we can test the Docker image
|
||||
|
||||
2
docs/.gitignore
vendored
2
docs/.gitignore
vendored
@@ -1,2 +0,0 @@
|
||||
/.quarto/
|
||||
_site/
|
||||
@@ -1,17 +0,0 @@
|
||||
---
|
||||
title: Config options
|
||||
description: A complete list of all configuration options.
|
||||
---
|
||||
|
||||
```{python}
|
||||
#|echo: false
|
||||
#|output: asis
|
||||
import re
|
||||
# Regex pattern to match the YAML block including its code fence
|
||||
pattern = r'<details[^>]*id="all-yaml-options"[^>]*>.*?<summary>All yaml options.*?```yaml(.*?)```.*?</details>'
|
||||
|
||||
with open('../README.md', 'r') as f:
|
||||
doc = f.read()
|
||||
match = re.search(pattern, doc, re.DOTALL)
|
||||
print("```yaml", match.group(1).strip(), "```", sep="\n")
|
||||
```
|
||||
@@ -1,8 +1,4 @@
|
||||
---
|
||||
title: Debugging
|
||||
description: How to debug Axolotl
|
||||
---
|
||||
|
||||
# Debugging Axolotl
|
||||
|
||||
This document provides some tips and tricks for debugging Axolotl. It also provides an example configuration for debugging with VSCode. A good debugging setup is essential to understanding how Axolotl code works behind the scenes.
|
||||
|
||||
18
docs/faq.md
Normal file
18
docs/faq.md
Normal file
@@ -0,0 +1,18 @@
|
||||
# Axolotl FAQ's
|
||||
|
||||
|
||||
> The trainer stopped and hasn't progressed in several minutes.
|
||||
|
||||
Usually an issue with the GPUs communicating with each other. See the [NCCL doc](../docs/nccl.md).
|
||||
|
||||
> Exitcode -9
|
||||
|
||||
This usually happens when you run out of system RAM.
|
||||
|
||||
> Exitcode -7 while using deepspeed
|
||||
|
||||
Try upgrading deepspeed with: `pip install -U deepspeed`
|
||||
|
||||
> AttributeError: 'DummyOptim' object has no attribute 'step'
|
||||
|
||||
You may be using deepspeed with a single GPU. Please don't set `deepspeed:` in yaml or cli.
|
||||
21
docs/faq.qmd
21
docs/faq.qmd
@@ -1,21 +0,0 @@
|
||||
---
|
||||
title: FAQ
|
||||
description: Frequently asked questions
|
||||
---
|
||||
|
||||
|
||||
**Q: The trainer stopped and hasn't progressed in several minutes.**
|
||||
|
||||
> A: Usually an issue with the GPUs communicating with each other. See the [NCCL doc](nccl.qmd)
|
||||
|
||||
**Q: Exitcode -9**
|
||||
|
||||
> A: This usually happens when you run out of system RAM.
|
||||
|
||||
**Q: Exitcode -7 while using deepspeed**
|
||||
|
||||
> A: Try upgrading deepspeed w: `pip install -U deepspeed`
|
||||
|
||||
**Q: AttributeError: 'DummyOptim' object has no attribute 'step'**
|
||||
|
||||
> A: You may be using deepspeed with single gpu. Please don't set `deepspeed:` in yaml or cli.
|
||||
@@ -1,10 +1,4 @@
|
||||
---
|
||||
title: FSDP + QLoRA
|
||||
description: Use FSDP with QLoRA to fine-tune large LLMs on consumer GPUs.
|
||||
format:
|
||||
html:
|
||||
toc: true
|
||||
---
|
||||
# FSDP + QLoRA
|
||||
|
||||
## Background
|
||||
|
||||
@@ -1,7 +1,4 @@
|
||||
---
|
||||
title: Template-free prompt construction
|
||||
description: "Template-free prompt construction with the `input_output` format"
|
||||
---
|
||||
# Template-free prompt construction with the `input_output` format
|
||||
|
||||
<!-- TOC -->
|
||||
|
||||
@@ -1,12 +1,8 @@
|
||||
---
|
||||
title: Mac M-series
|
||||
description: Mac M-series support
|
||||
---
|
||||
# Mac M series support
|
||||
|
||||
Currently, Axolotl on Mac is only partially usable: many of Axolotl's dependencies, including PyTorch, do not support MPS or have incomplete support.
|
||||
|
||||
Current support:
|
||||
|
||||
- [x] Support for all models
|
||||
- [x] Full training of models
|
||||
- [x] LoRA training
|
||||
@@ -1,7 +1,4 @@
|
||||
---
|
||||
title: Multi Node
|
||||
description: How to use Axolotl on multiple machines
|
||||
---
|
||||
# Multi Node
|
||||
|
||||
You will need to create a configuration for accelerate, either by running `accelerate config` and following the instructions, or by using one of the presets below:
|
||||
|
||||
@@ -1,7 +1,4 @@
|
||||
---
|
||||
title: Multipack (Sample Packing)
|
||||
description: Multipack is a technique to pack multiple sequences into a single batch to increase training throughput.
|
||||
---
|
||||
# Multipack (Sample Packing)
|
||||
|
||||
## Visualization of Multipack with Flash Attention
|
||||
|
||||
@@ -1,7 +1,4 @@
|
||||
---
|
||||
title: NCCL
|
||||
description: Troubleshooting NCCL issues
|
||||
---
|
||||
# NCCL
|
||||
|
||||
NVIDIA NCCL is a library to facilitate and optimize multi-GPU communication operations, such as broadcast, all-gather, reduce, all-reduce, etc. Broadly, NCCL configuration is highly environment-specific and is configured via several [environment variables](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html). A common NCCL-related problem occurs when a long-running operation times out causing the training process to abort:
|
||||
|
||||
@@ -1,7 +1,4 @@
|
||||
---
|
||||
title: "RLHF (Beta)"
|
||||
description: "Reinforcement Learning from Human Feedback is a method whereby a language model is optimized from data using human feedback."
|
||||
---
|
||||
# RLHF (Beta)
|
||||
|
||||
### Overview
|
||||
|
||||
@@ -37,21 +34,6 @@ datasets:
|
||||
rl: ipo
|
||||
```
|
||||
|
||||
#### ORPO
|
||||
|
||||
Paper: https://arxiv.org/abs/2403.07691
|
||||
|
||||
```yaml
|
||||
rl: orpo
|
||||
orpo_alpha: 0.1
|
||||
remove_unused_columns: false
|
||||
|
||||
chat_template: chatml
|
||||
datasets:
|
||||
- path: argilla/ultrafeedback-binarized-preferences-cleaned
|
||||
type: orpo.chat_template
|
||||
```
|
||||
|
||||
#### Using local dataset files
|
||||
```yaml
|
||||
datasets:
|
||||
@@ -21,8 +21,7 @@ lora_dropout: 0.05
|
||||
lora_target_linear: true
|
||||
|
||||
sequence_len: 4096
|
||||
sample_packing: true
|
||||
eval_sample_packing: false
|
||||
sample_packing: false
|
||||
pad_to_sequence_len: true
|
||||
|
||||
wandb_project:
|
||||
|
||||
@@ -1,10 +0,0 @@
|
||||
# Jamba
|
||||
|
||||
- ✅ qlora w/ deepspeed Zero-2 needs at least 2x GPUs and
|
||||
- 35GiB VRAM per GPU w minimal context length
|
||||
- 56GiB VRAM per GPU (w multipack enabled)
|
||||
- ✅ qlora w/ deepspeed Zero-3 needs at least 2x GPUs and 67GiB VRAM (wtf?)
|
||||
- ✅ qlora single-gpu, ~51GiB VRAM
|
||||
- ✅ multipack
|
||||
- ❓ FSDP
|
||||
- ❓ 8-bit LoRA
|
||||
@@ -1,62 +0,0 @@
|
||||
base_model: ai21labs/Jamba-v0.1
|
||||
trust_remote_code: true
|
||||
|
||||
load_in_8bit: false
|
||||
load_in_4bit: true
|
||||
strict: false
|
||||
|
||||
datasets:
|
||||
- path: mhenrichsen/alpaca_2k_test
|
||||
type: alpaca
|
||||
dataset_prepared_path:
|
||||
val_set_size: 0.0
|
||||
output_dir: ./out
|
||||
|
||||
sequence_len: 4096
|
||||
sample_packing: false
|
||||
pad_to_sequence_len: false
|
||||
eval_sample_packing: false
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
adapter: qlora
|
||||
lora_r: 8
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_linear: true
|
||||
|
||||
low_cpu_mem_usage: true
|
||||
gradient_accumulation_steps: 4
|
||||
micro_batch_size: 1
|
||||
num_epochs: 2
|
||||
optimizer: paged_adamw_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.00001
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 10
|
||||
evals_per_epoch:
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.0
|
||||
special_tokens:
|
||||
@@ -1,62 +0,0 @@
|
||||
base_model: ai21labs/Jamba-v0.1
|
||||
trust_remote_code: true
|
||||
|
||||
load_in_8bit: false
|
||||
load_in_4bit: true
|
||||
strict: false
|
||||
|
||||
datasets:
|
||||
- path: mhenrichsen/alpaca_2k_test
|
||||
type: alpaca
|
||||
dataset_prepared_path:
|
||||
val_set_size: 0.0
|
||||
output_dir: ./out
|
||||
|
||||
sequence_len: 4096
|
||||
sample_packing: false
|
||||
pad_to_sequence_len: false
|
||||
eval_sample_packing: false
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
adapter: qlora
|
||||
lora_r: 8
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_linear: true
|
||||
|
||||
low_cpu_mem_usage: true
|
||||
gradient_accumulation_steps: 4
|
||||
micro_batch_size: 1
|
||||
num_epochs: 2
|
||||
optimizer: paged_adamw_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.00001
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 10
|
||||
evals_per_epoch:
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed: deepspeed_configs/zero2.json
|
||||
weight_decay: 0.0
|
||||
special_tokens:
|
||||
@@ -1,75 +0,0 @@
|
||||
base_model: NousResearch/Llama-2-7b-hf
|
||||
model_type: LlamaForCausalLM
|
||||
tokenizer_type: LlamaTokenizer
|
||||
|
||||
load_in_8bit: false
|
||||
load_in_4bit: false
|
||||
strict: false
|
||||
|
||||
datasets:
|
||||
- path: teknium/GPT4-LLM-Cleaned
|
||||
type: alpaca
|
||||
dataset_prepared_path: last_run_prepared
|
||||
val_set_size: 0.05
|
||||
output_dir: ./lisa-out
|
||||
|
||||
sequence_len: 4096
|
||||
sample_packing: true
|
||||
pad_to_sequence_len: true
|
||||
|
||||
adapter:
|
||||
lora_model_dir:
|
||||
lora_r:
|
||||
lora_alpha:
|
||||
lora_dropout:
|
||||
lora_target_linear:
|
||||
lora_fan_in_fan_out:
|
||||
|
||||
lisa_n_layers: 2
|
||||
lisa_step_interval: 20
|
||||
lisa_layers_attribute: model.layers
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
gradient_accumulation_steps: 1
|
||||
micro_batch_size: 1
|
||||
num_epochs: 1
|
||||
optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 5e-5 # recommendation from lisa paper for 7b
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
flash_attn_cross_entropy: false
|
||||
flash_attn_rms_norm: true
|
||||
flash_attn_fuse_qkv: false
|
||||
flash_attn_fuse_mlp: true
|
||||
|
||||
warmup_steps: 100
|
||||
evals_per_epoch: 4
|
||||
eval_table_size:
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.1
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
bos_token: "<s>"
|
||||
eos_token: "</s>"
|
||||
unk_token: "<unk>"
|
||||
@@ -36,7 +36,7 @@ wandb_log_model:
|
||||
gradient_accumulation_steps: 4
|
||||
micro_batch_size: 4
|
||||
num_epochs: 4
|
||||
optimizer: adamw_torch
|
||||
optimizer: paged_adamw_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.00001
|
||||
|
||||
@@ -66,11 +66,5 @@ weight_decay: 0.0
|
||||
fsdp:
|
||||
- full_shard
|
||||
fsdp_config:
|
||||
fsdp_limit_all_gathers: true
|
||||
fsdp_sync_module_states: true
|
||||
fsdp_offload_params: true
|
||||
fsdp_use_orig_params: false
|
||||
fsdp_cpu_ram_efficient_loading: true
|
||||
fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
|
||||
fsdp_state_dict_type: SHARDED_STATE_DICT
|
||||
special_tokens:
|
||||
|
||||
12
examples/mistral/Mistral-7b-example/README.md
Normal file
12
examples/mistral/Mistral-7b-example/README.md
Normal file
@@ -0,0 +1,12 @@
|
||||
# Description
|
||||
This repository presents an in-depth guide for fine-tuning Mistral-7b or any other compatible model using Axolotl, tailored specifically for chatbot development. It streamlines the process of fine-tuning and uploading the enhanced model to HuggingFace 🤗, thereby serving as an invaluable tool for developers in the AI and chatbot domain.
|
||||
|
||||
**What’s Inside:**
|
||||
|
||||
Beginner-Friendly Instructions: Comprehensive steps to guide you through fine-tuning your chosen model, including details on the data structure (jsonl), configuration, and the code itself.
|
||||
|
||||
Hardware Utilized: For reference, the fine-tuning in this guide was performed on a rented machine with 4x NVIDIA GeForce RTX 3090 GPUs, using the `2.1.2-cuda12.1-cudnn8-devel` image.
|
||||
|
||||
**Uploading to HuggingFace 🤗:**
|
||||
To upload your fine-tuned model to Hugging Face, include the following files:
|
||||

|
||||
970
examples/mistral/Mistral-7b-example/code.ipynb
Normal file
970
examples/mistral/Mistral-7b-example/code.ipynb
Normal file
@@ -0,0 +1,970 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "3fe31229-8f6b-48bc-a86d-af8e5466d11c",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"GPU available? True\n",
|
||||
"BF16 is supported? True\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Check if GPU is available I used 4x NVIDIA GeForce RTX 3090 (rented 2.1.2-cuda12.1-cudnn8-devel)\n",
|
||||
"import torch\n",
|
||||
"print('GPU available?', torch.cuda.is_available())\n",
|
||||
"print('BF16 is supported?', torch.cuda.is_bf16_supported())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "1dee845b-f3cb-4b1e-bdd9-1a918eac140b",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Collecting huggingface_hub\n",
|
||||
" Downloading huggingface_hub-0.20.1-py3-none-any.whl.metadata (12 kB)\n",
|
||||
"Requirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (3.9.0)\n",
|
||||
"Requirement already satisfied: fsspec>=2023.5.0 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (2023.10.0)\n",
|
||||
"Requirement already satisfied: requests in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (2.31.0)\n",
|
||||
"Requirement already satisfied: tqdm>=4.42.1 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (4.65.0)\n",
|
||||
"Requirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (6.0.1)\n",
|
||||
"Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (4.7.1)\n",
|
||||
"Requirement already satisfied: packaging>=20.9 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (23.1)\n",
|
||||
"Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (2.0.4)\n",
|
||||
"Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (3.4)\n",
|
||||
"Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (1.26.18)\n",
|
||||
"Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (2023.7.22)\n",
|
||||
"Downloading huggingface_hub-0.20.1-py3-none-any.whl (330 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m330.1/330.1 kB\u001b[0m \u001b[31m8.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m\n",
|
||||
"\u001b[?25hInstalling collected packages: huggingface_hub\n",
|
||||
"Successfully installed huggingface_hub-0.20.1\n",
|
||||
"\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
|
||||
"\u001b[0m"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"!pip install huggingface_hub"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "88731672-9050-4034-8266-11aaace2a44e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from huggingface_hub import notebook_login"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "6b5aa7d7-3b18-4c14-afd4-043c2c545259",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "60df98d7b0294289aad8b6c8cd023c3b",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"#Login to huggingface so you can push the model to hub later\n",
|
||||
"import sys\n",
|
||||
"stdout = sys.stdout\n",
|
||||
"notebook_login()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "b74d0635-5033-4494-b7bd-ff6822103d93",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#I noticed that when you use notebook_login() nothing gets printed after so we use sys \n",
|
||||
"sys.stdout = stdout"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "e3c3b088-45e7-484b-ae39-66beabc48da8",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Cloning into 'axolotl'...\n",
|
||||
"remote: Enumerating objects: 235, done.\u001b[K\n",
|
||||
"remote: Counting objects: 100% (235/235), done.\u001b[K\n",
|
||||
"remote: Compressing objects: 100% (207/207), done.\u001b[K\n",
|
||||
"remote: Total 235 (delta 48), reused 123 (delta 13), pack-reused 0\u001b[K\n",
|
||||
"Receiving objects: 100% (235/235), 1.46 MiB | 11.65 MiB/s, done.\n",
|
||||
"Resolving deltas: 100% (48/48), done.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"#axolotl\n",
|
||||
"!git clone -b main --depth 1 https://github.com/OpenAccess-AI-Collective/axolotl"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "66927751-4fd6-4477-97fc-6ab08c9d9a74",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/axolotl\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"cd axolotl"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "fcccf8da-353b-4d70-8f55-5cfe08c7e6b9",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Requirement already satisfied: packaging in /opt/conda/lib/python3.10/site-packages (23.1)\n",
|
||||
"\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
|
||||
"\u001b[0mObtaining file:///axolotl\n",
|
||||
" Preparing metadata (setup.py) ... \u001b[?25ldone\n",
|
||||
"\u001b[?25hCollecting auto-gptq==0.5.1\n",
|
||||
" Downloading auto_gptq-0.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)\n",
|
||||
"Requirement already satisfied: packaging in /opt/conda/lib/python3.10/site-packages (23.1)\n",
|
||||
"Collecting peft==0.6.0\n",
|
||||
" Downloading peft-0.6.0-py3-none-any.whl.metadata (23 kB)\n",
|
||||
"Collecting transformers==4.36.2\n",
|
||||
" Downloading transformers-4.36.2-py3-none-any.whl.metadata (126 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m126.8/126.8 kB\u001b[0m \u001b[31m9.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hCollecting tokenizers==0.15.0\n",
|
||||
" Downloading tokenizers-0.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)\n",
|
||||
"Collecting bitsandbytes>=0.41.1\n",
|
||||
" Downloading bitsandbytes-0.41.3.post2-py3-none-any.whl.metadata (9.8 kB)\n",
|
||||
"Collecting accelerate==0.24.1\n",
|
||||
" Downloading accelerate-0.24.1-py3-none-any.whl.metadata (18 kB)\n",
|
||||
"Collecting addict\n",
|
||||
" Downloading addict-2.4.0-py3-none-any.whl (3.8 kB)\n",
|
||||
"Collecting fire\n",
|
||||
" Downloading fire-0.5.0.tar.gz (88 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m88.3/88.3 kB\u001b[0m \u001b[31m28.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25ldone\n",
|
||||
"\u001b[?25hRequirement already satisfied: PyYAML>=6.0 in /opt/conda/lib/python3.10/site-packages (6.0.1)\n",
|
||||
"Collecting datasets>=2.15.0\n",
|
||||
" Downloading datasets-2.16.0-py3-none-any.whl.metadata (20 kB)\n",
|
||||
"Collecting sentencepiece\n",
|
||||
" Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m47.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hCollecting wandb\n",
|
||||
" Downloading wandb-0.16.1-py3-none-any.whl.metadata (9.8 kB)\n",
|
||||
"Collecting einops\n",
|
||||
" Downloading einops-0.7.0-py3-none-any.whl.metadata (13 kB)\n",
|
||||
"Collecting optimum==1.13.2\n",
|
||||
" Downloading optimum-1.13.2.tar.gz (300 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m301.0/301.0 kB\u001b[0m \u001b[31m72.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25h Installing build dependencies ... \u001b[?25ldone\n",
|
||||
"\u001b[?25h Getting requirements to build wheel ... \u001b[?25ldone\n",
|
||||
"\u001b[?25h Preparing metadata (pyproject.toml) ... \u001b[?25ldone\n",
|
||||
"\u001b[?25hCollecting hf_transfer\n",
|
||||
" Downloading hf_transfer-0.1.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.5 kB)\n",
|
||||
"Collecting colorama\n",
|
||||
" Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)\n",
|
||||
"Collecting numba\n",
|
||||
" Downloading numba-0.58.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.7 kB)\n",
|
||||
"Requirement already satisfied: numpy>=1.24.4 in /opt/conda/lib/python3.10/site-packages (1.26.0)\n",
|
||||
"Collecting bert-score==0.3.13\n",
|
||||
" Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m61.1/61.1 kB\u001b[0m \u001b[31m20.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hCollecting evaluate==0.4.0\n",
|
||||
" Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m81.4/81.4 kB\u001b[0m \u001b[31m26.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hCollecting rouge-score==0.1.2\n",
|
||||
" Downloading rouge_score-0.1.2.tar.gz (17 kB)\n",
|
||||
" Preparing metadata (setup.py) ... \u001b[?25ldone\n",
|
||||
"\u001b[?25hCollecting scipy\n",
|
||||
" Downloading scipy-1.11.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m60.4/60.4 kB\u001b[0m \u001b[31m17.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hCollecting scikit-learn==1.2.2\n",
|
||||
" Downloading scikit_learn-1.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.6 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m9.6/9.6 MB\u001b[0m \u001b[31m83.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0mm\n",
|
||||
"\u001b[?25hCollecting pynvml\n",
|
||||
" Downloading pynvml-11.5.0-py3-none-any.whl (53 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m53.1/53.1 kB\u001b[0m \u001b[31m13.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hCollecting art\n",
|
||||
" Downloading art-6.1-py3-none-any.whl.metadata (69 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m69.9/69.9 kB\u001b[0m \u001b[31m21.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hCollecting fschat==0.2.34\n",
|
||||
" Downloading fschat-0.2.34-py3-none-any.whl.metadata (20 kB)\n",
|
||||
"Collecting gradio==3.50.2\n",
|
||||
" Downloading gradio-3.50.2-py3-none-any.whl.metadata (17 kB)\n",
|
||||
"Collecting tensorboard\n",
|
||||
" Downloading tensorboard-2.15.1-py3-none-any.whl.metadata (1.7 kB)\n",
|
||||
"Collecting s3fs\n",
|
||||
" Downloading s3fs-2023.12.2-py3-none-any.whl.metadata (1.6 kB)\n",
|
||||
"Collecting gcsfs\n",
|
||||
" Downloading gcsfs-2023.12.2.post1-py2.py3-none-any.whl.metadata (1.6 kB)\n",
|
||||
"Collecting xformers==0.0.23\n",
|
||||
" Downloading xformers-0.0.23-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.0 kB)\n",
|
||||
"Collecting deepspeed\n",
|
||||
" Downloading deepspeed-0.12.6.tar.gz (1.2 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.2/1.2 MB\u001b[0m \u001b[31m109.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25ldone\n",
|
||||
"\u001b[?25hCollecting flash-attn==2.3.3\n",
|
||||
" Downloading flash_attn-2.3.3.tar.gz (2.3 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.3/2.3 MB\u001b[0m \u001b[31m111.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25ldone\n",
|
||||
"\u001b[?25hRequirement already satisfied: psutil in /opt/conda/lib/python3.10/site-packages (from accelerate==0.24.1) (5.9.0)\n",
|
||||
"Requirement already satisfied: torch>=1.10.0 in /opt/conda/lib/python3.10/site-packages (from accelerate==0.24.1) (2.1.1)\n",
|
||||
"Requirement already satisfied: huggingface-hub in /opt/conda/lib/python3.10/site-packages (from accelerate==0.24.1) (0.20.1)\n",
|
||||
"Collecting rouge (from auto-gptq==0.5.1)\n",
|
||||
" Downloading rouge-1.0.1-py3-none-any.whl (13 kB)\n",
|
||||
"Collecting gekko (from auto-gptq==0.5.1)\n",
|
||||
" Downloading gekko-1.0.6-py3-none-any.whl (12.2 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.2/12.2 MB\u001b[0m \u001b[31m77.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m0:01\u001b[0m\n",
|
||||
"\u001b[?25hCollecting safetensors (from auto-gptq==0.5.1)\n",
|
||||
" Downloading safetensors-0.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)\n",
|
||||
"Requirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from auto-gptq==0.5.1) (4.65.0)\n",
|
||||
"Collecting pandas>=1.0.1 (from bert-score==0.3.13)\n",
|
||||
" Downloading pandas-2.1.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)\n",
|
||||
"Requirement already satisfied: requests in /opt/conda/lib/python3.10/site-packages (from bert-score==0.3.13) (2.31.0)\n",
|
||||
"Collecting matplotlib (from bert-score==0.3.13)\n",
|
||||
" Downloading matplotlib-3.8.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.8 kB)\n",
|
||||
"Collecting dill (from evaluate==0.4.0)\n",
|
||||
" Downloading dill-0.3.7-py3-none-any.whl.metadata (9.9 kB)\n",
|
||||
"Collecting xxhash (from evaluate==0.4.0)\n",
|
||||
" Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)\n",
|
||||
"Collecting multiprocess (from evaluate==0.4.0)\n",
|
||||
" Downloading multiprocess-0.70.15-py310-none-any.whl.metadata (7.2 kB)\n",
|
||||
"Requirement already satisfied: fsspec>=2021.05.0 in /opt/conda/lib/python3.10/site-packages (from fsspec[http]>=2021.05.0->evaluate==0.4.0) (2023.10.0)\n",
|
||||
"Collecting responses<0.19 (from evaluate==0.4.0)\n",
|
||||
" Downloading responses-0.18.0-py3-none-any.whl (38 kB)\n",
|
||||
"Collecting ninja (from flash-attn==2.3.3)\n",
|
||||
" Downloading ninja-1.11.1.1-py2.py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl.metadata (5.3 kB)\n",
|
||||
"Collecting aiohttp (from fschat==0.2.34)\n",
|
||||
" Downloading aiohttp-3.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.4 kB)\n",
|
||||
"Collecting fastapi (from fschat==0.2.34)\n",
|
||||
" Downloading fastapi-0.108.0-py3-none-any.whl.metadata (24 kB)\n",
|
||||
"Collecting httpx (from fschat==0.2.34)\n",
|
||||
" Downloading httpx-0.26.0-py3-none-any.whl.metadata (7.6 kB)\n",
|
||||
"Collecting markdown2[all] (from fschat==0.2.34)\n",
|
||||
" Downloading markdown2-2.4.12-py2.py3-none-any.whl.metadata (2.0 kB)\n",
|
||||
"Collecting nh3 (from fschat==0.2.34)\n",
|
||||
" Downloading nh3-0.2.15-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.7 kB)\n",
|
||||
"Requirement already satisfied: prompt-toolkit>=3.0.0 in /opt/conda/lib/python3.10/site-packages (from fschat==0.2.34) (3.0.36)\n",
|
||||
"Collecting pydantic<2,>=1 (from fschat==0.2.34)\n",
|
||||
" Downloading pydantic-1.10.13-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (149 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m149.6/149.6 kB\u001b[0m \u001b[31m42.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hCollecting rich>=10.0.0 (from fschat==0.2.34)\n",
|
||||
" Downloading rich-13.7.0-py3-none-any.whl.metadata (18 kB)\n",
|
||||
"Collecting shortuuid (from fschat==0.2.34)\n",
|
||||
" Downloading shortuuid-1.0.11-py3-none-any.whl (10 kB)\n",
|
||||
"Collecting tiktoken (from fschat==0.2.34)\n",
|
||||
" Downloading tiktoken-0.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)\n",
|
||||
"Collecting uvicorn (from fschat==0.2.34)\n",
|
||||
" Downloading uvicorn-0.25.0-py3-none-any.whl.metadata (6.4 kB)\n",
|
||||
"Collecting aiofiles<24.0,>=22.0 (from gradio==3.50.2)\n",
|
||||
" Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)\n",
|
||||
"Collecting altair<6.0,>=4.2.0 (from gradio==3.50.2)\n",
|
||||
" Downloading altair-5.2.0-py3-none-any.whl.metadata (8.7 kB)\n",
|
||||
"Collecting ffmpy (from gradio==3.50.2)\n",
|
||||
" Downloading ffmpy-0.3.1.tar.gz (5.5 kB)\n",
|
||||
" Preparing metadata (setup.py) ... \u001b[?25ldone\n",
|
||||
"\u001b[?25hCollecting gradio-client==0.6.1 (from gradio==3.50.2)\n",
|
||||
" Downloading gradio_client-0.6.1-py3-none-any.whl.metadata (7.1 kB)\n",
|
||||
"Collecting importlib-resources<7.0,>=1.3 (from gradio==3.50.2)\n",
|
||||
" Downloading importlib_resources-6.1.1-py3-none-any.whl.metadata (4.1 kB)\n",
|
||||
"Requirement already satisfied: jinja2<4.0 in /opt/conda/lib/python3.10/site-packages (from gradio==3.50.2) (3.1.2)\n",
|
||||
"Requirement already satisfied: markupsafe~=2.0 in /opt/conda/lib/python3.10/site-packages (from gradio==3.50.2) (2.1.1)\n",
|
||||
"Collecting orjson~=3.0 (from gradio==3.50.2)\n",
|
||||
" Downloading orjson-3.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (49 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m49.3/49.3 kB\u001b[0m \u001b[31m14.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hRequirement already satisfied: pillow<11.0,>=8.0 in /opt/conda/lib/python3.10/site-packages (from gradio==3.50.2) (10.0.1)\n",
|
||||
"Collecting pydub (from gradio==3.50.2)\n",
|
||||
" Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)\n",
|
||||
"Collecting python-multipart (from gradio==3.50.2)\n",
|
||||
" Downloading python_multipart-0.0.6-py3-none-any.whl (45 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m45.7/45.7 kB\u001b[0m \u001b[31m13.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hCollecting semantic-version~=2.0 (from gradio==3.50.2)\n",
|
||||
" Downloading semantic_version-2.10.0-py2.py3-none-any.whl (15 kB)\n",
|
||||
"Requirement already satisfied: typing-extensions~=4.0 in /opt/conda/lib/python3.10/site-packages (from gradio==3.50.2) (4.7.1)\n",
|
||||
"Collecting websockets<12.0,>=10.0 (from gradio==3.50.2)\n",
|
||||
" Downloading websockets-11.0.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (129 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m129.9/129.9 kB\u001b[0m \u001b[31m30.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hCollecting coloredlogs (from optimum==1.13.2)\n",
|
||||
" Downloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m11.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hRequirement already satisfied: sympy in /opt/conda/lib/python3.10/site-packages (from optimum==1.13.2) (1.11.1)\n",
|
||||
"Collecting absl-py (from rouge-score==0.1.2)\n",
|
||||
" Downloading absl_py-2.0.0-py3-none-any.whl.metadata (2.3 kB)\n",
|
||||
"Collecting nltk (from rouge-score==0.1.2)\n",
|
||||
" Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.5/1.5 MB\u001b[0m \u001b[31m90.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hRequirement already satisfied: six>=1.14.0 in /opt/conda/lib/python3.10/site-packages (from rouge-score==0.1.2) (1.16.0)\n",
|
||||
"Collecting joblib>=1.1.1 (from scikit-learn==1.2.2)\n",
|
||||
" Downloading joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)\n",
|
||||
"Collecting threadpoolctl>=2.0.0 (from scikit-learn==1.2.2)\n",
|
||||
" Downloading threadpoolctl-3.2.0-py3-none-any.whl.metadata (10.0 kB)\n",
|
||||
"Requirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from transformers==4.36.2) (3.9.0)\n",
|
||||
"Collecting regex!=2019.12.17 (from transformers==4.36.2)\n",
|
||||
" Downloading regex-2023.12.25-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m40.9/40.9 kB\u001b[0m \u001b[31m12.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hRequirement already satisfied: networkx in /opt/conda/lib/python3.10/site-packages (from torch>=1.10.0->accelerate==0.24.1) (3.1)\n",
|
||||
"Collecting pyarrow>=8.0.0 (from datasets>=2.15.0)\n",
|
||||
" Downloading pyarrow-14.0.2-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.0 kB)\n",
|
||||
"Collecting pyarrow-hotfix (from datasets>=2.15.0)\n",
|
||||
" Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)\n",
|
||||
"Collecting hjson (from deepspeed)\n",
|
||||
" Downloading hjson-3.1.0-py3-none-any.whl (54 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m54.0/54.0 kB\u001b[0m \u001b[31m19.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hCollecting py-cpuinfo (from deepspeed)\n",
|
||||
" Downloading py_cpuinfo-9.0.0-py3-none-any.whl (22 kB)\n",
|
||||
"Collecting termcolor (from fire)\n",
|
||||
" Downloading termcolor-2.4.0-py3-none-any.whl.metadata (6.1 kB)\n",
|
||||
"Requirement already satisfied: decorator>4.1.2 in /opt/conda/lib/python3.10/site-packages (from gcsfs) (5.1.1)\n",
|
||||
"INFO: pip is looking at multiple versions of gcsfs to determine which version is compatible with other requirements. This could take a while.\n",
|
||||
"Collecting gcsfs\n",
|
||||
" Downloading gcsfs-2023.12.1-py2.py3-none-any.whl.metadata (1.6 kB)\n",
|
||||
" Downloading gcsfs-2023.12.0-py2.py3-none-any.whl.metadata (1.6 kB)\n",
|
||||
" Downloading gcsfs-2023.10.0-py2.py3-none-any.whl.metadata (1.6 kB)\n",
|
||||
"Collecting google-auth>=1.2 (from gcsfs)\n",
|
||||
" Downloading google_auth-2.25.2-py2.py3-none-any.whl.metadata (4.7 kB)\n",
|
||||
"Collecting google-auth-oauthlib (from gcsfs)\n",
|
||||
" Downloading google_auth_oauthlib-1.2.0-py2.py3-none-any.whl.metadata (2.7 kB)\n",
|
||||
"Collecting google-cloud-storage (from gcsfs)\n",
|
||||
" Downloading google_cloud_storage-2.14.0-py2.py3-none-any.whl.metadata (6.1 kB)\n",
|
||||
"Collecting llvmlite<0.42,>=0.41.0dev0 (from numba)\n",
|
||||
" Downloading llvmlite-0.41.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.8 kB)\n",
|
||||
"Collecting aiobotocore<3.0.0,>=2.5.4 (from s3fs)\n",
|
||||
" Downloading aiobotocore-2.9.0-py3-none-any.whl.metadata (20 kB)\n",
|
||||
"INFO: pip is looking at multiple versions of s3fs to determine which version is compatible with other requirements. This could take a while.\n",
|
||||
"Collecting s3fs\n",
|
||||
" Downloading s3fs-2023.12.1-py3-none-any.whl.metadata (1.6 kB)\n",
|
||||
" Downloading s3fs-2023.10.0-py3-none-any.whl.metadata (1.6 kB)\n",
|
||||
"Collecting aiobotocore~=2.7.0 (from s3fs)\n",
|
||||
" Downloading aiobotocore-2.7.0-py3-none-any.whl.metadata (20 kB)\n",
|
||||
"Collecting grpcio>=1.48.2 (from tensorboard)\n",
|
||||
" Downloading grpcio-1.60.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.0 kB)\n",
|
||||
"Collecting markdown>=2.6.8 (from tensorboard)\n",
|
||||
" Downloading Markdown-3.5.1-py3-none-any.whl.metadata (7.1 kB)\n",
|
||||
"Collecting protobuf<4.24,>=3.19.6 (from tensorboard)\n",
|
||||
" Downloading protobuf-4.23.4-cp37-abi3-manylinux2014_x86_64.whl.metadata (540 bytes)\n",
|
||||
"Requirement already satisfied: setuptools>=41.0.0 in /opt/conda/lib/python3.10/site-packages (from tensorboard) (68.0.0)\n",
|
||||
"Collecting tensorboard-data-server<0.8.0,>=0.7.0 (from tensorboard)\n",
|
||||
" Downloading tensorboard_data_server-0.7.2-py3-none-manylinux_2_31_x86_64.whl.metadata (1.1 kB)\n",
|
||||
"Collecting werkzeug>=1.0.1 (from tensorboard)\n",
|
||||
" Downloading werkzeug-3.0.1-py3-none-any.whl.metadata (4.1 kB)\n",
|
||||
"Requirement already satisfied: Click!=8.0.0,>=7.1 in /opt/conda/lib/python3.10/site-packages (from wandb) (8.1.7)\n",
|
||||
"Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)\n",
|
||||
" Downloading GitPython-3.1.40-py3-none-any.whl.metadata (12 kB)\n",
|
||||
"Collecting sentry-sdk>=1.0.0 (from wandb)\n",
|
||||
" Downloading sentry_sdk-1.39.1-py2.py3-none-any.whl.metadata (9.7 kB)\n",
|
||||
"Collecting docker-pycreds>=0.4.0 (from wandb)\n",
|
||||
" Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)\n",
|
||||
"Collecting setproctitle (from wandb)\n",
|
||||
" Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.9 kB)\n",
|
||||
"Collecting appdirs>=1.4.3 (from wandb)\n",
|
||||
" Downloading appdirs-1.4.4-py2.py3-none-any.whl (9.6 kB)\n",
|
||||
"Collecting botocore<1.31.65,>=1.31.16 (from aiobotocore~=2.7.0->s3fs)\n",
|
||||
" Downloading botocore-1.31.64-py3-none-any.whl.metadata (6.1 kB)\n",
|
||||
"Collecting wrapt<2.0.0,>=1.10.10 (from aiobotocore~=2.7.0->s3fs)\n",
|
||||
" Downloading wrapt-1.16.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)\n",
|
||||
"Collecting aioitertools<1.0.0,>=0.5.1 (from aiobotocore~=2.7.0->s3fs)\n",
|
||||
" Downloading aioitertools-0.11.0-py3-none-any.whl (23 kB)\n",
|
||||
"Requirement already satisfied: attrs>=17.3.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp->fschat==0.2.34) (23.1.0)\n",
|
||||
"Collecting multidict<7.0,>=4.5 (from aiohttp->fschat==0.2.34)\n",
|
||||
" Downloading multidict-6.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (114 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m114.5/114.5 kB\u001b[0m \u001b[31m37.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hCollecting yarl<2.0,>=1.0 (from aiohttp->fschat==0.2.34)\n",
|
||||
" Downloading yarl-1.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (31 kB)\n",
|
||||
"Collecting frozenlist>=1.1.1 (from aiohttp->fschat==0.2.34)\n",
|
||||
" Downloading frozenlist-1.4.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)\n",
|
||||
"Collecting aiosignal>=1.1.2 (from aiohttp->fschat==0.2.34)\n",
|
||||
" Downloading aiosignal-1.3.1-py3-none-any.whl (7.6 kB)\n",
|
||||
"Collecting async-timeout<5.0,>=4.0 (from aiohttp->fschat==0.2.34)\n",
|
||||
" Downloading async_timeout-4.0.3-py3-none-any.whl.metadata (4.2 kB)\n",
|
||||
"Requirement already satisfied: jsonschema>=3.0 in /opt/conda/lib/python3.10/site-packages (from altair<6.0,>=4.2.0->gradio==3.50.2) (4.20.0)\n",
|
||||
"Requirement already satisfied: toolz in /opt/conda/lib/python3.10/site-packages (from altair<6.0,>=4.2.0->gradio==3.50.2) (0.12.0)\n",
|
||||
"Collecting gitdb<5,>=4.0.1 (from GitPython!=3.1.29,>=1.0.0->wandb)\n",
|
||||
" Downloading gitdb-4.0.11-py3-none-any.whl.metadata (1.2 kB)\n",
|
||||
"Collecting cachetools<6.0,>=2.0.0 (from google-auth>=1.2->gcsfs)\n",
|
||||
" Downloading cachetools-5.3.2-py3-none-any.whl.metadata (5.2 kB)\n",
|
||||
"Collecting pyasn1-modules>=0.2.1 (from google-auth>=1.2->gcsfs)\n",
|
||||
" Downloading pyasn1_modules-0.3.0-py2.py3-none-any.whl (181 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m181.3/181.3 kB\u001b[0m \u001b[31m59.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hCollecting rsa<5,>=3.1.4 (from google-auth>=1.2->gcsfs)\n",
|
||||
" Downloading rsa-4.9-py3-none-any.whl (34 kB)\n",
|
||||
"Collecting requests-oauthlib>=0.7.0 (from google-auth-oauthlib->gcsfs)\n",
|
||||
" Downloading requests_oauthlib-1.3.1-py2.py3-none-any.whl (23 kB)\n",
|
||||
"Collecting contourpy>=1.0.1 (from matplotlib->bert-score==0.3.13)\n",
|
||||
" Downloading contourpy-1.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.8 kB)\n",
|
||||
"Collecting cycler>=0.10 (from matplotlib->bert-score==0.3.13)\n",
|
||||
" Downloading cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)\n",
|
||||
"Collecting fonttools>=4.22.0 (from matplotlib->bert-score==0.3.13)\n",
|
||||
" Downloading fonttools-4.47.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (157 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m157.2/157.2 kB\u001b[0m \u001b[31m41.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hCollecting kiwisolver>=1.3.1 (from matplotlib->bert-score==0.3.13)\n",
|
||||
" Downloading kiwisolver-1.4.5-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (6.4 kB)\n",
|
||||
"Collecting pyparsing>=2.3.1 (from matplotlib->bert-score==0.3.13)\n",
|
||||
" Downloading pyparsing-3.1.1-py3-none-any.whl.metadata (5.1 kB)\n",
|
||||
"Requirement already satisfied: python-dateutil>=2.7 in /opt/conda/lib/python3.10/site-packages (from matplotlib->bert-score==0.3.13) (2.8.2)\n",
|
||||
"Requirement already satisfied: pytz>=2020.1 in /opt/conda/lib/python3.10/site-packages (from pandas>=1.0.1->bert-score==0.3.13) (2023.3.post1)\n",
|
||||
"Collecting tzdata>=2022.1 (from pandas>=1.0.1->bert-score==0.3.13)\n",
|
||||
" Downloading tzdata-2023.3-py2.py3-none-any.whl (341 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m341.8/341.8 kB\u001b[0m \u001b[31m72.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hRequirement already satisfied: wcwidth in /opt/conda/lib/python3.10/site-packages (from prompt-toolkit>=3.0.0->fschat==0.2.34) (0.2.5)\n",
|
||||
"Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests->bert-score==0.3.13) (2.0.4)\n",
|
||||
"Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests->bert-score==0.3.13) (3.4)\n",
|
||||
"Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests->bert-score==0.3.13) (1.26.18)\n",
|
||||
"Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests->bert-score==0.3.13) (2023.7.22)\n",
|
||||
"Collecting markdown-it-py>=2.2.0 (from rich>=10.0.0->fschat==0.2.34)\n",
|
||||
" Downloading markdown_it_py-3.0.0-py3-none-any.whl.metadata (6.9 kB)\n",
|
||||
"Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /opt/conda/lib/python3.10/site-packages (from rich>=10.0.0->fschat==0.2.34) (2.15.1)\n",
|
||||
"Collecting h11>=0.8 (from uvicorn->fschat==0.2.34)\n",
|
||||
" Downloading h11-0.14.0-py3-none-any.whl (58 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m21.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hCollecting humanfriendly>=9.1 (from coloredlogs->optimum==1.13.2)\n",
|
||||
" Downloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m27.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hCollecting starlette<0.33.0,>=0.29.0 (from fastapi->fschat==0.2.34)\n",
|
||||
" Downloading starlette-0.32.0.post1-py3-none-any.whl.metadata (5.8 kB)\n",
|
||||
"Collecting typing-extensions~=4.0 (from gradio==3.50.2)\n",
|
||||
" Downloading typing_extensions-4.9.0-py3-none-any.whl.metadata (3.0 kB)\n",
|
||||
"Collecting google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5 (from google-cloud-storage->gcsfs)\n",
|
||||
" Downloading google_api_core-2.15.0-py3-none-any.whl.metadata (2.7 kB)\n",
|
||||
"Collecting google-cloud-core<3.0dev,>=2.3.0 (from google-cloud-storage->gcsfs)\n",
|
||||
" Downloading google_cloud_core-2.4.1-py2.py3-none-any.whl.metadata (2.7 kB)\n",
|
||||
"Collecting google-resumable-media>=2.6.0 (from google-cloud-storage->gcsfs)\n",
|
||||
" Downloading google_resumable_media-2.7.0-py2.py3-none-any.whl.metadata (2.2 kB)\n",
|
||||
"Collecting google-crc32c<2.0dev,>=1.0 (from google-cloud-storage->gcsfs)\n",
|
||||
" Downloading google_crc32c-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (32 kB)\n",
|
||||
"Requirement already satisfied: anyio in /opt/conda/lib/python3.10/site-packages (from httpx->fschat==0.2.34) (4.2.0)\n",
|
||||
"Collecting httpcore==1.* (from httpx->fschat==0.2.34)\n",
|
||||
" Downloading httpcore-1.0.2-py3-none-any.whl.metadata (20 kB)\n",
|
||||
"Requirement already satisfied: sniffio in /opt/conda/lib/python3.10/site-packages (from httpx->fschat==0.2.34) (1.3.0)\n",
|
||||
"Collecting wavedrom (from markdown2[all]->fschat==0.2.34)\n",
|
||||
" Downloading wavedrom-2.0.3.post3.tar.gz (137 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m137.7/137.7 kB\u001b[0m \u001b[31m47.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25ldone\n",
|
||||
"\u001b[?25hRequirement already satisfied: mpmath>=0.19 in /opt/conda/lib/python3.10/site-packages (from sympy->optimum==1.13.2) (1.3.0)\n",
|
||||
"Collecting jmespath<2.0.0,>=0.7.1 (from botocore<1.31.65,>=1.31.16->aiobotocore~=2.7.0->s3fs)\n",
|
||||
" Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)\n",
|
||||
"Collecting smmap<6,>=3.0.1 (from gitdb<5,>=4.0.1->GitPython!=3.1.29,>=1.0.0->wandb)\n",
|
||||
" Downloading smmap-5.0.1-py3-none-any.whl.metadata (4.3 kB)\n",
|
||||
"Collecting googleapis-common-protos<2.0.dev0,>=1.56.2 (from google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5->google-cloud-storage->gcsfs)\n",
|
||||
" Downloading googleapis_common_protos-1.62.0-py2.py3-none-any.whl.metadata (1.5 kB)\n",
|
||||
"Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /opt/conda/lib/python3.10/site-packages (from jsonschema>=3.0->altair<6.0,>=4.2.0->gradio==3.50.2) (2023.12.1)\n",
|
||||
"Requirement already satisfied: referencing>=0.28.4 in /opt/conda/lib/python3.10/site-packages (from jsonschema>=3.0->altair<6.0,>=4.2.0->gradio==3.50.2) (0.32.0)\n",
|
||||
"Requirement already satisfied: rpds-py>=0.7.1 in /opt/conda/lib/python3.10/site-packages (from jsonschema>=3.0->altair<6.0,>=4.2.0->gradio==3.50.2) (0.15.2)\n",
|
||||
"Collecting mdurl~=0.1 (from markdown-it-py>=2.2.0->rich>=10.0.0->fschat==0.2.34)\n",
|
||||
" Downloading mdurl-0.1.2-py3-none-any.whl (10.0 kB)\n",
|
||||
"Collecting pyasn1<0.6.0,>=0.4.6 (from pyasn1-modules>=0.2.1->google-auth>=1.2->gcsfs)\n",
|
||||
" Downloading pyasn1-0.5.1-py2.py3-none-any.whl.metadata (8.6 kB)\n",
|
||||
"Collecting oauthlib>=3.0.0 (from requests-oauthlib>=0.7.0->google-auth-oauthlib->gcsfs)\n",
|
||||
" Downloading oauthlib-3.2.2-py3-none-any.whl (151 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m151.7/151.7 kB\u001b[0m \u001b[31m50.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hRequirement already satisfied: exceptiongroup>=1.0.2 in /opt/conda/lib/python3.10/site-packages (from anyio->httpx->fschat==0.2.34) (1.0.4)\n",
|
||||
"Collecting svgwrite (from wavedrom->markdown2[all]->fschat==0.2.34)\n",
|
||||
" Downloading svgwrite-1.4.3-py3-none-any.whl (67 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m67.1/67.1 kB\u001b[0m \u001b[31m21.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading accelerate-0.24.1-py3-none-any.whl (261 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m261.4/261.4 kB\u001b[0m \u001b[31m53.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading auto_gptq-0.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.8 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.8/4.8 MB\u001b[0m \u001b[31m89.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mta \u001b[36m0:00:01\u001b[0m\n",
|
||||
"\u001b[?25hDownloading fschat-0.2.34-py3-none-any.whl (220 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m220.1/220.1 kB\u001b[0m \u001b[31m63.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading gradio-3.50.2-py3-none-any.whl (20.3 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m20.3/20.3 MB\u001b[0m \u001b[31m82.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n",
|
||||
"\u001b[?25hDownloading peft-0.6.0-py3-none-any.whl (134 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.9/134.9 kB\u001b[0m \u001b[31m40.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading tokenizers-0.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.8/3.8 MB\u001b[0m \u001b[31m87.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mta \u001b[36m0:00:01\u001b[0m\n",
|
||||
"\u001b[?25hDownloading transformers-4.36.2-py3-none-any.whl (8.2 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m8.2/8.2 MB\u001b[0m \u001b[31m90.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mta \u001b[36m0:00:01\u001b[0m\n",
|
||||
"\u001b[?25hDownloading xformers-0.0.23-cp310-cp310-manylinux2014_x86_64.whl (213.0 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m213.0/213.0 MB\u001b[0m \u001b[31m36.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
|
||||
"\u001b[?25hDownloading gradio_client-0.6.1-py3-none-any.whl (299 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m299.2/299.2 kB\u001b[0m \u001b[31m64.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading bitsandbytes-0.41.3.post2-py3-none-any.whl (92.6 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m92.6/92.6 MB\u001b[0m \u001b[31m56.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n",
|
||||
"\u001b[?25hDownloading datasets-2.16.0-py3-none-any.whl (507 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m507.1/507.1 kB\u001b[0m \u001b[31m87.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading scipy-1.11.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (36.4 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m36.4/36.4 MB\u001b[0m \u001b[31m77.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n",
|
||||
"\u001b[?25hDownloading art-6.1-py3-none-any.whl (599 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m599.8/599.8 kB\u001b[0m \u001b[31m96.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading einops-0.7.0-py3-none-any.whl (44 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m44.6/44.6 kB\u001b[0m \u001b[31m13.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading gcsfs-2023.10.0-py2.py3-none-any.whl (33 kB)\n",
|
||||
"Downloading hf_transfer-0.1.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.9 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.9/3.9 MB\u001b[0m \u001b[31m99.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m\n",
|
||||
"\u001b[?25hDownloading numba-0.58.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (3.6 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.6/3.6 MB\u001b[0m \u001b[31m100.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m\n",
|
||||
"\u001b[?25hDownloading s3fs-2023.10.0-py3-none-any.whl (28 kB)\n",
|
||||
"Downloading tensorboard-2.15.1-py3-none-any.whl (5.5 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.5/5.5 MB\u001b[0m \u001b[31m96.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mta \u001b[36m0:00:01\u001b[0m\n",
|
||||
"\u001b[?25hDownloading wandb-0.16.1-py3-none-any.whl (2.1 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.1/2.1 MB\u001b[0m \u001b[31m99.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading absl_py-2.0.0-py3-none-any.whl (130 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m130.2/130.2 kB\u001b[0m \u001b[31m36.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading aiobotocore-2.7.0-py3-none-any.whl (73 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m73.5/73.5 kB\u001b[0m \u001b[31m25.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading aiofiles-23.2.1-py3-none-any.whl (15 kB)\n",
|
||||
"Downloading aiohttp-3.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.2/1.2 MB\u001b[0m \u001b[31m99.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading altair-5.2.0-py3-none-any.whl (996 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m996.9/996.9 kB\u001b[0m \u001b[31m110.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading dill-0.3.7-py3-none-any.whl (115 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m115.3/115.3 kB\u001b[0m \u001b[31m34.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading GitPython-3.1.40-py3-none-any.whl (190 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m190.6/190.6 kB\u001b[0m \u001b[31m47.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading google_auth-2.25.2-py2.py3-none-any.whl (184 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m184.2/184.2 kB\u001b[0m \u001b[31m44.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading google_auth_oauthlib-1.2.0-py2.py3-none-any.whl (24 kB)\n",
|
||||
"Downloading grpcio-1.60.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.4 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.4/5.4 MB\u001b[0m \u001b[31m102.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
|
||||
"\u001b[?25hDownloading importlib_resources-6.1.1-py3-none-any.whl (33 kB)\n",
|
||||
"Downloading joblib-1.3.2-py3-none-any.whl (302 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m302.2/302.2 kB\u001b[0m \u001b[31m64.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading llvmlite-0.41.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (43.6 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.6/43.6 MB\u001b[0m \u001b[31m74.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n",
|
||||
"\u001b[?25hDownloading Markdown-3.5.1-py3-none-any.whl (102 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m102.2/102.2 kB\u001b[0m \u001b[31m34.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading matplotlib-3.8.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.6 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m11.6/11.6 MB\u001b[0m \u001b[31m99.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m0:01\u001b[0m\n",
|
||||
"\u001b[?25hDownloading orjson-3.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (138 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m138.7/138.7 kB\u001b[0m \u001b[31m38.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading pandas-2.1.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.3/12.3 MB\u001b[0m \u001b[31m96.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m0:01\u001b[0m\n",
|
||||
"\u001b[?25hDownloading protobuf-4.23.4-cp37-abi3-manylinux2014_x86_64.whl (304 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m304.5/304.5 kB\u001b[0m \u001b[31m68.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading pyarrow-14.0.2-cp310-cp310-manylinux_2_28_x86_64.whl (38.0 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m38.0/38.0 MB\u001b[0m \u001b[31m78.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n",
|
||||
"\u001b[?25hDownloading pydantic-1.10.13-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m95.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading regex-2023.12.25-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (773 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m774.0/774.0 kB\u001b[0m \u001b[31m116.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading rich-13.7.0-py3-none-any.whl (240 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m240.6/240.6 kB\u001b[0m \u001b[31m59.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading safetensors-0.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m102.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading sentry_sdk-1.39.1-py2.py3-none-any.whl (254 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m254.1/254.1 kB\u001b[0m \u001b[31m71.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading tensorboard_data_server-0.7.2-py3-none-manylinux_2_31_x86_64.whl (6.6 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.6/6.6 MB\u001b[0m \u001b[31m104.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
|
||||
"\u001b[?25hDownloading threadpoolctl-3.2.0-py3-none-any.whl (15 kB)\n",
|
||||
"Downloading uvicorn-0.25.0-py3-none-any.whl (60 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m60.3/60.3 kB\u001b[0m \u001b[31m19.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading werkzeug-3.0.1-py3-none-any.whl (226 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m226.7/226.7 kB\u001b[0m \u001b[31m67.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading fastapi-0.108.0-py3-none-any.whl (92 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m92.0/92.0 kB\u001b[0m \u001b[31m33.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading typing_extensions-4.9.0-py3-none-any.whl (32 kB)\n",
|
||||
"Downloading google_cloud_storage-2.14.0-py2.py3-none-any.whl (121 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m121.6/121.6 kB\u001b[0m \u001b[31m36.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading httpx-0.26.0-py3-none-any.whl (75 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m75.9/75.9 kB\u001b[0m \u001b[31m24.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading httpcore-1.0.2-py3-none-any.whl (76 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m76.9/76.9 kB\u001b[0m \u001b[31m28.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading multiprocess-0.70.15-py310-none-any.whl (134 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m48.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading nh3-0.2.15-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m108.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading ninja-1.11.1.1-py2.py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl (307 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m307.2/307.2 kB\u001b[0m \u001b[31m66.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)\n",
|
||||
"Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)\n",
|
||||
"Downloading termcolor-2.4.0-py3-none-any.whl (7.7 kB)\n",
|
||||
"Downloading tiktoken-0.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.0/2.0 MB\u001b[0m \u001b[31m101.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m44.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading async_timeout-4.0.3-py3-none-any.whl (5.7 kB)\n",
|
||||
"Downloading botocore-1.31.64-py3-none-any.whl (11.3 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m11.3/11.3 MB\u001b[0m \u001b[31m98.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m0:01\u001b[0m\n",
|
||||
"\u001b[?25hDownloading cachetools-5.3.2-py3-none-any.whl (9.3 kB)\n",
|
||||
"Downloading contourpy-1.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (310 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m310.7/310.7 kB\u001b[0m \u001b[31m69.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading cycler-0.12.1-py3-none-any.whl (8.3 kB)\n",
|
||||
"Downloading fonttools-4.47.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.6 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.6/4.6 MB\u001b[0m \u001b[31m102.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
|
||||
"\u001b[?25hDownloading frozenlist-1.4.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (239 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m239.5/239.5 kB\u001b[0m \u001b[31m71.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading gitdb-4.0.11-py3-none-any.whl (62 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m62.7/62.7 kB\u001b[0m \u001b[31m23.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading google_api_core-2.15.0-py3-none-any.whl (121 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m122.0/122.0 kB\u001b[0m \u001b[31m32.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading google_cloud_core-2.4.1-py2.py3-none-any.whl (29 kB)\n",
|
||||
"Downloading google_resumable_media-2.7.0-py2.py3-none-any.whl (80 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m80.6/80.6 kB\u001b[0m \u001b[31m22.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading kiwisolver-1.4.5-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.6 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.6/1.6 MB\u001b[0m \u001b[31m102.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading markdown_it_py-3.0.0-py3-none-any.whl (87 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m87.5/87.5 kB\u001b[0m \u001b[31m25.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading pyparsing-3.1.1-py3-none-any.whl (103 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m103.1/103.1 kB\u001b[0m \u001b[31m32.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading starlette-0.32.0.post1-py3-none-any.whl (70 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m70.0/70.0 kB\u001b[0m \u001b[31m19.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading wrapt-1.16.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (80 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m80.3/80.3 kB\u001b[0m \u001b[31m30.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading yarl-1.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (301 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m301.6/301.6 kB\u001b[0m \u001b[31m80.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading markdown2-2.4.12-py2.py3-none-any.whl (41 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m41.2/41.2 kB\u001b[0m \u001b[31m12.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading googleapis_common_protos-1.62.0-py2.py3-none-any.whl (228 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m228.7/228.7 kB\u001b[0m \u001b[31m57.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading pyasn1-0.5.1-py2.py3-none-any.whl (84 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m84.9/84.9 kB\u001b[0m \u001b[31m30.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading smmap-5.0.1-py3-none-any.whl (24 kB)\n",
|
||||
"Building wheels for collected packages: flash-attn, optimum, rouge-score, deepspeed, fire, ffmpy, wavedrom\n",
|
||||
" Building wheel for flash-attn (setup.py) ... \u001b[?25ldone\n",
|
||||
"\u001b[?25h Created wheel for flash-attn: filename=flash_attn-2.3.3-cp310-cp310-linux_x86_64.whl size=57042553 sha256=b1df92cb5bd7657d38b789dd48e907aa3e0bd2715c817eb85f3c4320bb11fb3f\n",
|
||||
" Stored in directory: /root/.cache/pip/wheels/e5/e6/fa/941802ec61d1afd320d27160ab1db98e6dba65381f84b76d4a\n",
|
||||
" Building wheel for optimum (pyproject.toml) ... \u001b[?25ldone\n",
|
||||
"\u001b[?25h Created wheel for optimum: filename=optimum-1.13.2-py3-none-any.whl size=395599 sha256=ff3a73120e1b6eeeda28f76e3fc8cd4cd826e5d66c869b7848ba150e7af79c62\n",
|
||||
" Stored in directory: /root/.cache/pip/wheels/6e/b7/2c/79405d98f0943373d8546daeae25a3d377f7659ca0cbe48699\n",
|
||||
" Building wheel for rouge-score (setup.py) ... \u001b[?25ldone\n",
|
||||
"\u001b[?25h Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24932 sha256=8118ecbbcd3529085e794c803f0ddb182fc6c6d3e8a494103b49a94abf1bec37\n",
|
||||
" Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4\n",
|
||||
" Building wheel for deepspeed (setup.py) ... \u001b[?25ldone\n",
|
||||
"\u001b[?25h Created wheel for deepspeed: filename=deepspeed-0.12.6-py3-none-any.whl size=1306729 sha256=35c46b6f0275b0d3063522e0af4f3cbd9ec1c310114d8917d87cbe2bf43346e2\n",
|
||||
" Stored in directory: /root/.cache/pip/wheels/a3/dc/a2/f585faaed4dec84108916dcc8e8a7c129a216df8202ca32984\n",
|
||||
" Building wheel for fire (setup.py) ... \u001b[?25ldone\n",
|
||||
"\u001b[?25h Created wheel for fire: filename=fire-0.5.0-py2.py3-none-any.whl size=116934 sha256=e76d5185f237f34ec69bb8aa657497bef07408978e4f7efdaef48663bb8cd4ef\n",
|
||||
" Stored in directory: /root/.cache/pip/wheels/90/d4/f7/9404e5db0116bd4d43e5666eaa3e70ab53723e1e3ea40c9a95\n",
|
||||
" Building wheel for ffmpy (setup.py) ... \u001b[?25ldone\n",
|
||||
"\u001b[?25h Created wheel for ffmpy: filename=ffmpy-0.3.1-py3-none-any.whl size=5579 sha256=da3b54dc0ac1a825a1a233315970ac80b8b4c53ebd9cb2a2cfdeab118f453a64\n",
|
||||
" Stored in directory: /root/.cache/pip/wheels/01/a6/d1/1c0828c304a4283b2c1639a09ad86f83d7c487ef34c6b4a1bf\n",
|
||||
" Building wheel for wavedrom (setup.py) ... \u001b[?25ldone\n",
|
||||
"\u001b[?25h Created wheel for wavedrom: filename=wavedrom-2.0.3.post3-py2.py3-none-any.whl size=30052 sha256=7f0cbd15d63ee9c120190bac122ab51bbbfc91ee374bc3c046fadb320816c17e\n",
|
||||
" Stored in directory: /root/.cache/pip/wheels/9c/52/8c/38b454b42f712f325e26f633287484c7dc1ad469e1580c5954\n",
|
||||
"Successfully built flash-attn optimum rouge-score deepspeed fire ffmpy wavedrom\n",
|
||||
"Installing collected packages: sentencepiece, pydub, py-cpuinfo, ninja, nh3, hjson, ffmpy, bitsandbytes, appdirs, addict, xxhash, wrapt, werkzeug, websockets, tzdata, typing-extensions, threadpoolctl, termcolor, tensorboard-data-server, svgwrite, smmap, shortuuid, setproctitle, sentry-sdk, semantic-version, scipy, safetensors, rouge, regex, python-multipart, pyparsing, pynvml, pyasn1, pyarrow-hotfix, pyarrow, protobuf, orjson, oauthlib, multidict, mdurl, markdown2, markdown, llvmlite, kiwisolver, joblib, jmespath, importlib-resources, humanfriendly, hf_transfer, h11, grpcio, google-crc32c, gekko, frozenlist, fonttools, einops, docker-pycreds, dill, cycler, contourpy, colorama, cachetools, async-timeout, art, aioitertools, aiofiles, absl-py, yarl, wavedrom, uvicorn, tiktoken, scikit-learn, rsa, responses, requests-oauthlib, pydantic, pyasn1-modules, pandas, numba, nltk, multiprocess, matplotlib, markdown-it-py, httpcore, googleapis-common-protos, google-resumable-media, gitdb, fire, coloredlogs, botocore, aiosignal, xformers, tokenizers, starlette, rouge-score, rich, httpx, google-auth, GitPython, flash-attn, deepspeed, aiohttp, accelerate, wandb, transformers, gradio-client, google-auth-oauthlib, google-api-core, fastapi, altair, aiobotocore, tensorboard, s3fs, peft, gradio, google-cloud-core, fschat, datasets, bert-score, optimum, google-cloud-storage, evaluate, auto-gptq, gcsfs, axolotl\n",
|
||||
" Attempting uninstall: typing-extensions\n",
|
||||
" Found existing installation: typing_extensions 4.7.1\n",
|
||||
" Uninstalling typing_extensions-4.7.1:\n",
|
||||
" Successfully uninstalled typing_extensions-4.7.1\n",
|
||||
" Running setup.py develop for axolotl\n",
|
||||
"Successfully installed GitPython-3.1.40 absl-py-2.0.0 accelerate-0.24.1 addict-2.4.0 aiobotocore-2.7.0 aiofiles-23.2.1 aiohttp-3.9.1 aioitertools-0.11.0 aiosignal-1.3.1 altair-5.2.0 appdirs-1.4.4 art-6.1 async-timeout-4.0.3 auto-gptq-0.5.1 axolotl-0.3.0 bert-score-0.3.13 bitsandbytes-0.41.3.post2 botocore-1.31.64 cachetools-5.3.2 colorama-0.4.6 coloredlogs-15.0.1 contourpy-1.2.0 cycler-0.12.1 datasets-2.16.0 deepspeed-0.12.6 dill-0.3.7 docker-pycreds-0.4.0 einops-0.7.0 evaluate-0.4.0 fastapi-0.108.0 ffmpy-0.3.1 fire-0.5.0 flash-attn-2.3.3 fonttools-4.47.0 frozenlist-1.4.1 fschat-0.2.34 gcsfs-2023.10.0 gekko-1.0.6 gitdb-4.0.11 google-api-core-2.15.0 google-auth-2.25.2 google-auth-oauthlib-1.2.0 google-cloud-core-2.4.1 google-cloud-storage-2.14.0 google-crc32c-1.5.0 google-resumable-media-2.7.0 googleapis-common-protos-1.62.0 gradio-3.50.2 gradio-client-0.6.1 grpcio-1.60.0 h11-0.14.0 hf_transfer-0.1.4 hjson-3.1.0 httpcore-1.0.2 httpx-0.26.0 humanfriendly-10.0 importlib-resources-6.1.1 jmespath-1.0.1 joblib-1.3.2 kiwisolver-1.4.5 llvmlite-0.41.1 markdown-3.5.1 markdown-it-py-3.0.0 markdown2-2.4.12 matplotlib-3.8.2 mdurl-0.1.2 multidict-6.0.4 multiprocess-0.70.15 nh3-0.2.15 ninja-1.11.1.1 nltk-3.8.1 numba-0.58.1 oauthlib-3.2.2 optimum-1.13.2 orjson-3.9.10 pandas-2.1.4 peft-0.6.0 protobuf-4.23.4 py-cpuinfo-9.0.0 pyarrow-14.0.2 pyarrow-hotfix-0.6 pyasn1-0.5.1 pyasn1-modules-0.3.0 pydantic-1.10.13 pydub-0.25.1 pynvml-11.5.0 pyparsing-3.1.1 python-multipart-0.0.6 regex-2023.12.25 requests-oauthlib-1.3.1 responses-0.18.0 rich-13.7.0 rouge-1.0.1 rouge-score-0.1.2 rsa-4.9 s3fs-2023.10.0 safetensors-0.4.1 scikit-learn-1.2.2 scipy-1.11.4 semantic-version-2.10.0 sentencepiece-0.1.99 sentry-sdk-1.39.1 setproctitle-1.3.3 shortuuid-1.0.11 smmap-5.0.1 starlette-0.32.0.post1 svgwrite-1.4.3 tensorboard-2.15.1 tensorboard-data-server-0.7.2 termcolor-2.4.0 threadpoolctl-3.2.0 tiktoken-0.5.2 tokenizers-0.15.0 transformers-4.36.2 typing-extensions-4.8.0 tzdata-2023.3 uvicorn-0.25.0 
wandb-0.16.1 wavedrom-2.0.3.post3 websockets-11.0.3 werkzeug-3.0.1 wrapt-1.16.0 xformers-0.0.23 xxhash-3.4.1 yarl-1.9.4\n",
|
||||
"\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
|
||||
"\u001b[0mCollecting git+https://github.com/huggingface/peft.git\n",
|
||||
" Cloning https://github.com/huggingface/peft.git to /tmp/pip-req-build-hka8xgk2\n",
|
||||
" Running command git clone --filter=blob:none --quiet https://github.com/huggingface/peft.git /tmp/pip-req-build-hka8xgk2\n",
|
||||
" Resolved https://github.com/huggingface/peft.git to commit cf04d0353f0343cbf66627228c4495f51669af34\n",
|
||||
" Installing build dependencies ... \u001b[?25ldone\n",
|
||||
"\u001b[?25h Getting requirements to build wheel ... \u001b[?25ldone\n",
|
||||
"\u001b[?25h Preparing metadata (pyproject.toml) ... \u001b[?25ldone\n",
|
||||
"\u001b[?25hRequirement already satisfied: numpy>=1.17 in /opt/conda/lib/python3.10/site-packages (from peft==0.7.2.dev0) (1.26.0)\n",
|
||||
"Requirement already satisfied: packaging>=20.0 in /opt/conda/lib/python3.10/site-packages (from peft==0.7.2.dev0) (23.1)\n",
|
||||
"Requirement already satisfied: psutil in /opt/conda/lib/python3.10/site-packages (from peft==0.7.2.dev0) (5.9.0)\n",
|
||||
"Requirement already satisfied: pyyaml in /opt/conda/lib/python3.10/site-packages (from peft==0.7.2.dev0) (6.0.1)\n",
|
||||
"Requirement already satisfied: torch>=1.13.0 in /opt/conda/lib/python3.10/site-packages (from peft==0.7.2.dev0) (2.1.1)\n",
|
||||
"Requirement already satisfied: transformers in /opt/conda/lib/python3.10/site-packages (from peft==0.7.2.dev0) (4.36.2)\n",
|
||||
"Requirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from peft==0.7.2.dev0) (4.65.0)\n",
|
||||
"Requirement already satisfied: accelerate>=0.21.0 in /opt/conda/lib/python3.10/site-packages (from peft==0.7.2.dev0) (0.24.1)\n",
|
||||
"Requirement already satisfied: safetensors in /opt/conda/lib/python3.10/site-packages (from peft==0.7.2.dev0) (0.4.1)\n",
|
||||
"Requirement already satisfied: huggingface-hub>=0.17.0 in /opt/conda/lib/python3.10/site-packages (from peft==0.7.2.dev0) (0.20.1)\n",
|
||||
"Requirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.17.0->peft==0.7.2.dev0) (3.9.0)\n",
|
||||
"Requirement already satisfied: fsspec>=2023.5.0 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.17.0->peft==0.7.2.dev0) (2023.10.0)\n",
|
||||
"Requirement already satisfied: requests in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.17.0->peft==0.7.2.dev0) (2.31.0)\n",
|
||||
"Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.17.0->peft==0.7.2.dev0) (4.8.0)\n",
|
||||
"Requirement already satisfied: sympy in /opt/conda/lib/python3.10/site-packages (from torch>=1.13.0->peft==0.7.2.dev0) (1.11.1)\n",
|
||||
"Requirement already satisfied: networkx in /opt/conda/lib/python3.10/site-packages (from torch>=1.13.0->peft==0.7.2.dev0) (3.1)\n",
|
||||
"Requirement already satisfied: jinja2 in /opt/conda/lib/python3.10/site-packages (from torch>=1.13.0->peft==0.7.2.dev0) (3.1.2)\n",
|
||||
"Requirement already satisfied: regex!=2019.12.17 in /opt/conda/lib/python3.10/site-packages (from transformers->peft==0.7.2.dev0) (2023.12.25)\n",
|
||||
"Requirement already satisfied: tokenizers<0.19,>=0.14 in /opt/conda/lib/python3.10/site-packages (from transformers->peft==0.7.2.dev0) (0.15.0)\n",
|
||||
"Requirement already satisfied: MarkupSafe>=2.0 in /opt/conda/lib/python3.10/site-packages (from jinja2->torch>=1.13.0->peft==0.7.2.dev0) (2.1.1)\n",
|
||||
"Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub>=0.17.0->peft==0.7.2.dev0) (2.0.4)\n",
|
||||
"Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub>=0.17.0->peft==0.7.2.dev0) (3.4)\n",
|
||||
"Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub>=0.17.0->peft==0.7.2.dev0) (1.26.18)\n",
|
||||
"Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub>=0.17.0->peft==0.7.2.dev0) (2023.7.22)\n",
|
||||
"Requirement already satisfied: mpmath>=0.19 in /opt/conda/lib/python3.10/site-packages (from sympy->torch>=1.13.0->peft==0.7.2.dev0) (1.3.0)\n",
|
||||
"Building wheels for collected packages: peft\n",
|
||||
" Building wheel for peft (pyproject.toml) ... \u001b[?25ldone\n",
|
||||
"\u001b[?25h Created wheel for peft: filename=peft-0.7.2.dev0-py3-none-any.whl size=169456 sha256=4c70d23e759fa6abb3827fb2f3a8683be3b24d78777d0f403bbc2c0548e5dd4b\n",
|
||||
" Stored in directory: /tmp/pip-ephem-wheel-cache-my5ncou6/wheels/d7/c7/de/1368fac8590e1b103ddc2ec2a28ad51d83aded1a3830e8a087\n",
|
||||
"Successfully built peft\n",
|
||||
"Installing collected packages: peft\n",
|
||||
" Attempting uninstall: peft\n",
|
||||
" Found existing installation: peft 0.6.0\n",
|
||||
" Uninstalling peft-0.6.0:\n",
|
||||
" Successfully uninstalled peft-0.6.0\n",
|
||||
"\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
|
||||
"axolotl 0.3.0 requires peft==0.6.0, but you have peft 0.7.2.dev0 which is incompatible.\u001b[0m\u001b[31m\n",
|
||||
"\u001b[0mSuccessfully installed peft-0.7.2.dev0\n",
|
||||
"\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
|
||||
"\u001b[0m"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Installing what is needed from inside the axolotl directory\n",
|
||||
"!pip install packaging\n",
|
||||
"!pip install -e '.[flash-attn,deepspeed]'\n",
|
||||
"!pip install -U git+https://github.com/huggingface/peft.git"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"id": "82d1a380-1e87-48fe-89fe-25331326014d",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"The following values were not passed to `accelerate launch` and had defaults used instead:\n",
|
||||
"\t`--num_processes` was set to a value of `3`\n",
|
||||
"\t\tMore than one GPU was found, enabling multi-GPU training.\n",
|
||||
"\t\tIf this was unintended please pass in `--num_processes=1`.\n",
|
||||
"\t`--num_machines` was set to a value of `1`\n",
|
||||
"\t`--mixed_precision` was set to a value of `'no'`\n",
|
||||
"\t`--dynamo_backend` was set to a value of `'no'`\n",
|
||||
"To avoid this warning pass in values for each of the problematic parameters or run `accelerate config`.\n",
|
||||
"/opt/conda/lib/python3.10/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations\n",
|
||||
" warnings.warn(\n",
|
||||
"[2023-12-28 15:44:09,979] [INFO] [datasets.<module>:58] [PID:2814] PyTorch version 2.1.1 available.\n",
|
||||
"/opt/conda/lib/python3.10/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations\n",
|
||||
" warnings.warn(\n",
|
||||
"/opt/conda/lib/python3.10/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations\n",
|
||||
" warnings.warn(\n",
|
||||
"[2023-12-28 15:44:10,011] [INFO] [datasets.<module>:58] [PID:2812] PyTorch version 2.1.1 available.\n",
|
||||
"[2023-12-28 15:44:10,013] [INFO] [datasets.<module>:58] [PID:2813] PyTorch version 2.1.1 available.\n",
|
||||
"[2023-12-28 15:44:10,805] [INFO] [axolotl.normalize_config:150] [PID:2814] [RANK:2] GPU memory usage baseline: 0.000GB (+0.317GB misc)\u001b[39m\n",
|
||||
"[2023-12-28 15:44:10,830] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n",
|
||||
"[2023-12-28 15:44:10,842] [INFO] [axolotl.normalize_config:150] [PID:2813] [RANK:1] GPU memory usage baseline: 0.000GB (+0.317GB misc)\u001b[39m\n",
|
||||
"[2023-12-28 15:44:10,865] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n",
|
||||
"[2023-12-28 15:44:10,869] [INFO] [axolotl.normalize_config:150] [PID:2812] [RANK:0] GPU memory usage baseline: 0.000GB (+0.351GB misc)\u001b[39m\n",
|
||||
"[2023-12-28 15:44:10,887] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n",
|
||||
"[2023-12-28 15:44:10,961] [INFO] [comm.py:637:init_distributed] cdb=None\n",
|
||||
"[2023-12-28 15:44:10,994] [INFO] [comm.py:637:init_distributed] cdb=None\n",
|
||||
"[2023-12-28 15:44:11,015] [INFO] [comm.py:637:init_distributed] cdb=None\n",
|
||||
"[2023-12-28 15:44:11,015] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl\n",
|
||||
" dP dP dP \n",
|
||||
" 88 88 88 \n",
|
||||
" .d8888b. dP. .dP .d8888b. 88 .d8888b. d8888P 88 \n",
|
||||
" 88' `88 `8bd8' 88' `88 88 88' `88 88 88 \n",
|
||||
" 88. .88 .d88b. 88. .88 88 88. .88 88 88 \n",
|
||||
" `88888P8 dP' `dP `88888P' dP `88888P' dP dP \n",
|
||||
" \n",
|
||||
" \n",
|
||||
"\n",
|
||||
"[2023-12-28 15:44:11,412] [DEBUG] [axolotl.load_tokenizer:184] [PID:2812] [RANK:0] EOS: 2 / </s>\u001b[39m\n",
|
||||
"[2023-12-28 15:44:11,412] [DEBUG] [axolotl.load_tokenizer:185] [PID:2812] [RANK:0] BOS: 1 / <s>\u001b[39m\n",
|
||||
"[2023-12-28 15:44:11,412] [DEBUG] [axolotl.load_tokenizer:186] [PID:2812] [RANK:0] PAD: 2 / </s>\u001b[39m\n",
|
||||
"[2023-12-28 15:44:11,412] [DEBUG] [axolotl.load_tokenizer:187] [PID:2812] [RANK:0] UNK: 0 / <unk>\u001b[39m\n",
|
||||
"[2023-12-28 15:44:11,413] [INFO] [axolotl.load_tokenized_prepared_datasets:143] [PID:2812] [RANK:0] Loading prepared dataset from disk at tilemachos/GF_new.json/1adc45d2edc1e98ce657814412c6593c...\u001b[39m\n",
|
||||
"[2023-12-28 15:44:11,415] [INFO] [axolotl.load_tokenized_prepared_datasets:145] [PID:2812] [RANK:0] Prepared dataset loaded from disk...\u001b[39m\n",
|
||||
"[2023-12-28 15:44:11,432] [DEBUG] [axolotl.load_tokenizer:184] [PID:2814] [RANK:2] EOS: 2 / </s>\u001b[39m\n",
|
||||
"[2023-12-28 15:44:11,432] [DEBUG] [axolotl.load_tokenizer:185] [PID:2814] [RANK:2] BOS: 1 / <s>\u001b[39m\n",
|
||||
"[2023-12-28 15:44:11,432] [DEBUG] [axolotl.load_tokenizer:186] [PID:2814] [RANK:2] PAD: 2 / </s>\u001b[39m\n",
|
||||
"[2023-12-28 15:44:11,432] [DEBUG] [axolotl.load_tokenizer:187] [PID:2814] [RANK:2] UNK: 0 / <unk>\u001b[39m\n",
|
||||
"[2023-12-28 15:44:11,530] [DEBUG] [axolotl.load_tokenizer:184] [PID:2813] [RANK:1] EOS: 2 / </s>\u001b[39m\n",
|
||||
"[2023-12-28 15:44:11,531] [DEBUG] [axolotl.load_tokenizer:185] [PID:2813] [RANK:1] BOS: 1 / <s>\u001b[39m\n",
|
||||
"[2023-12-28 15:44:11,531] [DEBUG] [axolotl.load_tokenizer:186] [PID:2813] [RANK:1] PAD: 2 / </s>\u001b[39m\n",
|
||||
"[2023-12-28 15:44:11,531] [DEBUG] [axolotl.load_tokenizer:187] [PID:2813] [RANK:1] UNK: 0 / <unk>\u001b[39m\n",
|
||||
"[2023-12-28 15:44:12,158] [INFO] [axolotl.load_tokenized_prepared_datasets:143] [PID:2813] [RANK:1] Loading prepared dataset from disk at tilemachos/GF_new.json/1adc45d2edc1e98ce657814412c6593c...\u001b[39m\n",
|
||||
"[2023-12-28 15:44:12,158] [INFO] [axolotl.load_tokenized_prepared_datasets:143] [PID:2814] [RANK:2] Loading prepared dataset from disk at tilemachos/GF_new.json/1adc45d2edc1e98ce657814412c6593c...\u001b[39m\n",
|
||||
"[2023-12-28 15:44:12,160] [INFO] [axolotl.load_tokenized_prepared_datasets:145] [PID:2813] [RANK:1] Prepared dataset loaded from disk...\u001b[39m\n",
|
||||
"[2023-12-28 15:44:12,161] [INFO] [axolotl.load_tokenized_prepared_datasets:145] [PID:2814] [RANK:2] Prepared dataset loaded from disk...\u001b[39m\n",
|
||||
"[2023-12-28 15:44:12,236] [DEBUG] [axolotl.log:60] [PID:2812] [RANK:0] total_num_tokens: 28120\u001b[39m\n",
|
||||
"[2023-12-28 15:44:12,238] [DEBUG] [axolotl.log:60] [PID:2812] [RANK:0] `total_supervised_tokens: 7990`\u001b[39m\n",
|
||||
"[2023-12-28 15:44:12,238] [DEBUG] [axolotl.log:60] [PID:2812] [RANK:0] total_num_steps: 6\u001b[39m\n",
|
||||
"[2023-12-28 15:44:12,242] [DEBUG] [axolotl.train.log:60] [PID:2812] [RANK:0] loading tokenizer... mistralai/Mistral-7B-v0.1\u001b[39m\n",
|
||||
"[2023-12-28 15:44:12,518] [DEBUG] [axolotl.load_tokenizer:184] [PID:2812] [RANK:0] EOS: 2 / </s>\u001b[39m\n",
|
||||
"[2023-12-28 15:44:12,518] [DEBUG] [axolotl.load_tokenizer:185] [PID:2812] [RANK:0] BOS: 1 / <s>\u001b[39m\n",
|
||||
"[2023-12-28 15:44:12,518] [DEBUG] [axolotl.load_tokenizer:186] [PID:2812] [RANK:0] PAD: 2 / </s>\u001b[39m\n",
|
||||
"[2023-12-28 15:44:12,518] [DEBUG] [axolotl.load_tokenizer:187] [PID:2812] [RANK:0] UNK: 0 / <unk>\u001b[39m\n",
|
||||
"[2023-12-28 15:44:12,518] [DEBUG] [axolotl.train.log:60] [PID:2812] [RANK:0] loading model and peft_config...\u001b[39m\n",
|
||||
"[2023-12-28 15:44:12,589] [DEBUG] [axolotl.load_tokenizer:184] [PID:2814] [RANK:2] EOS: 2 / </s>\u001b[39m\n",
|
||||
"[2023-12-28 15:44:12,589] [DEBUG] [axolotl.load_tokenizer:185] [PID:2814] [RANK:2] BOS: 1 / <s>\u001b[39m\n",
|
||||
"[2023-12-28 15:44:12,589] [DEBUG] [axolotl.load_tokenizer:186] [PID:2814] [RANK:2] PAD: 2 / </s>\u001b[39m\n",
|
||||
"[2023-12-28 15:44:12,589] [DEBUG] [axolotl.load_tokenizer:187] [PID:2814] [RANK:2] UNK: 0 / <unk>\u001b[39m\n",
|
||||
"[2023-12-28 15:44:12,599] [DEBUG] [axolotl.load_tokenizer:184] [PID:2813] [RANK:1] EOS: 2 / </s>\u001b[39m\n",
|
||||
"[2023-12-28 15:44:12,599] [DEBUG] [axolotl.load_tokenizer:185] [PID:2813] [RANK:1] BOS: 1 / <s>\u001b[39m\n",
|
||||
"[2023-12-28 15:44:12,599] [DEBUG] [axolotl.load_tokenizer:186] [PID:2813] [RANK:1] PAD: 2 / </s>\u001b[39m\n",
|
||||
"[2023-12-28 15:44:12,599] [DEBUG] [axolotl.load_tokenizer:187] [PID:2813] [RANK:1] UNK: 0 / <unk>\u001b[39m\n",
|
||||
"[2023-12-28 15:44:13,049] [INFO] [partition_parameters.py:348:__exit__] finished initializing model - num_params = 291, num_elems = 7.24B\n",
|
||||
"Loading checkpoint shards: 100%|██████████████████| 2/2 [00:11<00:00, 5.81s/it]\n",
|
||||
"Loading checkpoint shards: 100%|██████████████████| 2/2 [00:11<00:00, 5.98s/it]\n",
|
||||
"[2023-12-28 15:44:25,395] [INFO] [axolotl.load_model:503] [PID:2813] [RANK:1] GPU memory usage after model load: 7.576GB (+0.524GB cache, +0.708GB misc)\u001b[39m\n",
|
||||
"[2023-12-28 15:44:25,399] [INFO] [axolotl.load_model:526] [PID:2813] [RANK:1] converting PEFT model w/ prepare_model_for_kbit_training\u001b[39m\n",
|
||||
"[2023-12-28 15:44:25,403] [INFO] [axolotl.load_model:538] [PID:2813] [RANK:1] converting modules to torch.bfloat16 for flash attention\u001b[39m\n",
|
||||
"trainable params: 3,407,872 || all params: 7,245,139,968 || trainable%: 0.04703666202518836\n",
|
||||
"[2023-12-28 15:44:25,480] [INFO] [axolotl.load_model:568] [PID:2813] [RANK:1] GPU memory usage after adapters: 7.589GB (+1.501GB cache, +0.708GB misc)\u001b[39m\n",
|
||||
"[2023-12-28 15:44:25,572] [INFO] [axolotl.load_model:503] [PID:2814] [RANK:2] GPU memory usage after model load: 7.576GB (+0.410GB cache, +0.708GB misc)\u001b[39m\n",
|
||||
"[2023-12-28 15:44:25,576] [INFO] [axolotl.load_model:526] [PID:2814] [RANK:2] converting PEFT model w/ prepare_model_for_kbit_training\u001b[39m\n",
|
||||
"[2023-12-28 15:44:25,580] [INFO] [axolotl.load_model:538] [PID:2814] [RANK:2] converting modules to torch.bfloat16 for flash attention\u001b[39m\n",
|
||||
"trainable params: 3,407,872 || all params: 7,245,139,968 || trainable%: 0.04703666202518836\n",
|
||||
"[2023-12-28 15:44:25,660] [INFO] [axolotl.load_model:568] [PID:2814] [RANK:2] GPU memory usage after adapters: 7.589GB (+1.388GB cache, +0.708GB misc)\u001b[39m\n",
|
||||
"Loading checkpoint shards: 100%|██████████████████| 2/2 [00:12<00:00, 6.30s/it]\n",
|
||||
"[2023-12-28 15:44:26,170] [INFO] [axolotl.load_model:503] [PID:2812] [RANK:0] GPU memory usage after model load: 7.576GB (+0.776GB cache, +0.741GB misc)\u001b[39m\n",
|
||||
"[2023-12-28 15:44:26,177] [INFO] [axolotl.load_model:526] [PID:2812] [RANK:0] converting PEFT model w/ prepare_model_for_kbit_training\u001b[39m\n",
|
||||
"[2023-12-28 15:44:26,181] [INFO] [axolotl.load_model:538] [PID:2812] [RANK:0] converting modules to torch.bfloat16 for flash attention\u001b[39m\n",
|
||||
"trainable params: 3,407,872 || all params: 7,245,139,968 || trainable%: 0.04703666202518836\n",
|
||||
"[2023-12-28 15:44:26,259] [INFO] [axolotl.load_model:568] [PID:2812] [RANK:0] GPU memory usage after adapters: 7.589GB (+1.753GB cache, +0.741GB misc)\u001b[39m\n",
|
||||
"[2023-12-28 15:44:26,293] [INFO] [axolotl.train.log:60] [PID:2812] [RANK:0] Pre-saving adapter config to ./out\u001b[39m\n",
|
||||
"[2023-12-28 15:44:26,296] [INFO] [axolotl.train.log:60] [PID:2812] [RANK:0] Starting trainer...\u001b[39m\n",
|
||||
"Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n",
|
||||
"Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n",
|
||||
"Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n",
|
||||
"Detected CUDA files, patching ldflags\n",
|
||||
"Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/fused_adam/build.ninja...\n",
|
||||
"Building extension module fused_adam...\n",
|
||||
"Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n",
|
||||
"ninja: no work to do.\n",
|
||||
"Loading extension module fused_adam...\n",
|
||||
"Time to load fused_adam op: 0.05891108512878418 seconds\n",
|
||||
"Loading extension module fused_adam...\n",
|
||||
"Time to load fused_adam op: 0.10173463821411133 seconds\n",
|
||||
"Loading extension module fused_adam...\n",
|
||||
"Time to load fused_adam op: 0.10152459144592285 seconds\n",
|
||||
"/opt/conda/lib/python3.10/site-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /opt/conda/conda-bld/pytorch_1699449201336/work/torch/csrc/tensor/python_tensor.cpp:83.)\n",
|
||||
" self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n",
|
||||
"/opt/conda/lib/python3.10/site-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /opt/conda/conda-bld/pytorch_1699449201336/work/torch/csrc/tensor/python_tensor.cpp:83.)\n",
|
||||
" self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n",
|
||||
"/opt/conda/lib/python3.10/site-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /opt/conda/conda-bld/pytorch_1699449201336/work/torch/csrc/tensor/python_tensor.cpp:83.)\n",
|
||||
" self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n",
|
||||
"Parameter Offload: Total persistent parameters: 3674112 in 193 params\n",
|
||||
" 0%| | 0/17 [00:00<?, ?it/s]/opt/conda/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.\n",
|
||||
" warnings.warn(\n",
|
||||
"/opt/conda/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.\n",
|
||||
" warnings.warn(\n",
|
||||
"/opt/conda/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.\n",
|
||||
" warnings.warn(\n",
|
||||
"/opt/conda/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:322: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
|
||||
" warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
|
||||
"/opt/conda/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:322: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
|
||||
" warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
|
||||
"/opt/conda/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:322: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
|
||||
" warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
|
||||
"{'loss': 2.0448, 'learning_rate': 2e-05, 'epoch': 0.06} \n",
|
||||
" 6%|██▌ | 1/17 [00:28<07:32, 28.30s/it]\n",
|
||||
" 0%| | 0/3 [00:00<?, ?it/s]\u001b[A\n",
|
||||
" 67%|██████████████████████████████ | 2/3 [00:03<00:01, 1.85s/it]\u001b[A\n",
|
||||
" \u001b[A\n",
|
||||
"\u001b[A{'eval_loss': 1.9694719314575195, 'eval_runtime': 11.391, 'eval_samples_per_second': 1.492, 'eval_steps_per_second': 0.263, 'epoch': 0.06}\n",
|
||||
" 6%|██▌ | 1/17 [00:39<07:32, 28.30s/it]\n",
|
||||
"100%|█████████████████████████████████████████████| 3/3 [00:07<00:00, 2.65s/it]\u001b[A\n",
|
||||
" \u001b[A[2023-12-28 15:45:35,358] [INFO] [axolotl.callbacks.on_step_end:122] [PID:2812] [RANK:0] GPU memory usage while training: 12.210GB (+4.259GB cache, +0.776GB misc)\u001b[39m\n",
|
||||
" 12%|█████▏ | 2/17 [01:04<08:18, 33.20s/it][2023-12-28 15:45:35,358] [INFO] [axolotl.callbacks.on_step_end:122] [PID:2814] [RANK:2] GPU memory usage while training: 12.269GB (+4.522GB cache, +0.743GB misc)\u001b[39m\n",
|
||||
"[2023-12-28 15:45:35,358] [INFO] [axolotl.callbacks.on_step_end:122] [PID:2813] [RANK:1] GPU memory usage while training: 12.283GB (+4.493GB cache, +0.743GB misc)\u001b[39m\n",
|
||||
"{'loss': 2.0022, 'learning_rate': 4e-05, 'epoch': 0.12} \n",
|
||||
"{'loss': 2.1054, 'learning_rate': 6e-05, 'epoch': 0.17} \n",
|
||||
"{'loss': 1.9004, 'learning_rate': 8e-05, 'epoch': 0.23} \n",
|
||||
"{'loss': 1.8794, 'learning_rate': 0.0001, 'epoch': 0.29} \n",
|
||||
" 29%|████████████▉ | 5/17 [02:20<05:23, 26.92s/it]\n",
|
||||
" 0%| | 0/3 [00:00<?, ?it/s]\u001b[A\n",
|
||||
" 67%|██████████████████████████████ | 2/3 [00:03<00:01, 1.88s/it]\u001b[A\n",
|
||||
" \u001b[A\n",
|
||||
"\u001b[A{'eval_loss': 1.7912336587905884, 'eval_runtime': 11.3106, 'eval_samples_per_second': 1.503, 'eval_steps_per_second': 0.265, 'epoch': 0.29}\n",
|
||||
" 29%|████████████▉ | 5/17 [02:32<05:23, 26.92s/it]\n",
|
||||
"100%|█████████████████████████████████████████████| 3/3 [00:07<00:00, 2.67s/it]\u001b[A\n",
|
||||
"{'loss': 1.7871, 'learning_rate': 0.00012, 'epoch': 0.35} \u001b[A\n",
|
||||
"{'loss': 1.7758, 'learning_rate': 0.00014, 'epoch': 0.4} \n",
|
||||
"{'loss': 1.4645, 'learning_rate': 0.00016, 'epoch': 0.46} \n",
|
||||
"{'loss': 1.4009, 'learning_rate': 0.00018, 'epoch': 0.52} \n",
|
||||
"{'loss': 1.3927, 'learning_rate': 0.0002, 'epoch': 0.58} \n",
|
||||
" 59%|█████████████████████████▎ | 10/17 [04:38<03:04, 26.33s/it]\n",
|
||||
" 0%| | 0/3 [00:00<?, ?it/s]\u001b[A\n",
|
||||
" 67%|██████████████████████████████ | 2/3 [00:03<00:01, 1.89s/it]\u001b[A\n",
|
||||
" \u001b[A\n",
|
||||
"\u001b[A{'eval_loss': 1.1426481008529663, 'eval_runtime': 11.3344, 'eval_samples_per_second': 1.5, 'eval_steps_per_second': 0.265, 'epoch': 0.58}\n",
|
||||
" 59%|█████████████████████████▎ | 10/17 [04:49<03:04, 26.33s/it]\n",
|
||||
"100%|█████████████████████████████████████████████| 3/3 [00:07<00:00, 2.68s/it]\u001b[A\n",
|
||||
"{'loss': 1.0122, 'learning_rate': 0.0001900968867902419, 'epoch': 0.63} \u001b[A\n",
|
||||
"{'loss': 1.0019, 'learning_rate': 0.00016234898018587337, 'epoch': 0.69} \n",
|
||||
"{'loss': 0.8976, 'learning_rate': 0.00012225209339563145, 'epoch': 0.75} \n",
|
||||
"{'loss': 0.9301, 'learning_rate': 7.774790660436858e-05, 'epoch': 0.81} \n",
|
||||
"{'loss': 0.8595, 'learning_rate': 3.7651019814126654e-05, 'epoch': 0.87} \n",
|
||||
" 88%|█████████████████████████████████████▉ | 15/17 [06:55<00:52, 26.17s/it]\n",
|
||||
" 0%| | 0/3 [00:00<?, ?it/s]\u001b[A\n",
|
||||
" 67%|██████████████████████████████ | 2/3 [00:03<00:01, 1.88s/it]\u001b[A\n",
|
||||
" \u001b[A\n",
|
||||
"\u001b[A{'eval_loss': 0.8175248503684998, 'eval_runtime': 11.2932, 'eval_samples_per_second': 1.505, 'eval_steps_per_second': 0.266, 'epoch': 0.87}\n",
|
||||
" 88%|█████████████████████████████████████▉ | 15/17 [07:06<00:52, 26.17s/it]\n",
|
||||
"100%|█████████████████████████████████████████████| 3/3 [00:07<00:00, 2.67s/it]\u001b[A\n",
|
||||
"{'loss': 0.7931, 'learning_rate': 9.903113209758096e-06, 'epoch': 0.92} \u001b[A\n",
|
||||
"{'loss': 0.6909, 'learning_rate': 0.0, 'epoch': 0.98} \n",
|
||||
"100%|███████████████████████████████████████████| 17/17 [07:56<00:00, 28.03s/it]/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n",
|
||||
" warnings.warn(\n",
|
||||
"/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n",
|
||||
" warnings.warn(\n",
|
||||
"/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n",
|
||||
" warnings.warn(\n",
|
||||
"{'train_runtime': 489.0649, 'train_samples_per_second': 0.63, 'train_steps_per_second': 0.035, 'train_loss': 1.408153467318591, 'epoch': 0.98}\n",
|
||||
"100%|███████████████████████████████████████████| 17/17 [08:09<00:00, 28.77s/it]\n",
|
||||
"[2023-12-28 15:52:39,488] [INFO] [axolotl.train.log:60] [PID:2812] [RANK:0] Training Completed!!! Saving pre-trained model to ./out\u001b[39m\n",
|
||||
"\u001b[0m\u001b[0m\u001b[0m"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"\"\"\"\n",
|
||||
"Training with the config.yml file and DeepSpeed's zero3_bf16 config. ZeRO stage 3 is the most aggressive of the zero1, zero2 and zero3 stages: it partitions \n",
|
||||
"not only optimizer states but also gradients and parameters across GPUs. The bf16 indicates mixed-precision training using bfloat16.\n",
|
||||
"For more information read axolotl's readme\n",
|
||||
"\"\"\"\n",
|
||||
"!accelerate launch -m axolotl.cli.train /folder/config.yml --deepspeed deepspeed_configs/zero3_bf16.json"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -1,3 +1,4 @@
|
||||
#Mistral-7b
|
||||
base_model: mistralai/Mistral-7B-v0.1
|
||||
model_type: MistralForCausalLM
|
||||
tokenizer_type: LlamaTokenizer
|
||||
@@ -7,32 +8,26 @@ load_in_4bit: false
|
||||
strict: false
|
||||
|
||||
datasets:
|
||||
- path: mhenrichsen/alpaca_2k_test
|
||||
type: alpaca
|
||||
dataset_prepared_path: last_run_prepared
|
||||
val_set_size: 0.1
|
||||
output_dir: ./lora-out
|
||||
- path: tilemachos/Demo-Dataset #Path to json dataset file in huggingface
|
||||
# For the type and conversation arguments, read the axolotl README and pick what suits your project; this example builds a chatbot, so it uses sharegpt and chatml.
|
||||
type: sharegpt
|
||||
conversation: chatml
|
||||
dataset_prepared_path: tilemachos/Demo-Dataset #Path to json dataset file in huggingface
|
||||
val_set_size: 0.05
|
||||
output_dir: ./out
|
||||
|
||||
#using lora for lower cost
|
||||
adapter: lora
|
||||
lora_model_dir:
|
||||
|
||||
sequence_len: 8192
|
||||
sample_packing: true
|
||||
pad_to_sequence_len: true
|
||||
|
||||
lora_r: 32
|
||||
lora_r: 8
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_linear: true
|
||||
lora_fan_in_fan_out:
|
||||
lora_target_modules:
|
||||
- gate_proj
|
||||
- down_proj
|
||||
- up_proj
|
||||
- q_proj
|
||||
- v_proj
|
||||
- k_proj
|
||||
- o_proj
|
||||
|
||||
sequence_len: 512
|
||||
sample_packing: false
|
||||
pad_to_sequence_len: true
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
@@ -40,17 +35,18 @@ wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
gradient_accumulation_steps: 4
|
||||
# Only 2 epochs because the dataset is small.
|
||||
gradient_accumulation_steps: 3
|
||||
micro_batch_size: 2
|
||||
num_epochs: 1
|
||||
num_epochs: 2
|
||||
optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
bf16: true
|
||||
fp16: false
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
@@ -61,17 +57,18 @@ logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
|
||||
loss_watchdog_threshold: 5.0
|
||||
loss_watchdog_patience: 3
|
||||
|
||||
warmup_steps: 10
|
||||
evals_per_epoch: 4
|
||||
eval_table_size:
|
||||
eval_max_new_tokens: 128
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
# Default DeepSpeed config; switch to a more aggressive stage such as zero2 or zero3 if needed.
|
||||
deepspeed: deepspeed_configs/zero1.json
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
bos_token: "<s>"
|
||||
eos_token: "</s>"
|
||||
unk_token: "<unk>"
|
||||
10
examples/mistral/Mistral-7b-example/data.jsonl
Normal file
10
examples/mistral/Mistral-7b-example/data.jsonl
Normal file
@@ -0,0 +1,10 @@
|
||||
{"conversations": [{"from": "Customer", "value": "\"<Customer>: Who is the Founder of Apple\""}, {"from": "gpt", "value": "\"<Chatbot>: The founder of Apple is Steve Jobs\""}]}
|
||||
{"conversations": [{"from": "Customer", "value": "\"<Customer>: What is the capital of France?\""}, {"from": "gpt", "value": "\"<Chatbot>: The capital of France is Paris.\""}]}
|
||||
{"conversations": [{"from": "Customer", "value": "\"<Customer>: How far is the Moon from Earth?\""}, {"from": "gpt", "value": "\"<Chatbot>: The Moon is approximately 384,400 kilometers from Earth.\""}]}
|
||||
{"conversations": [{"from": "Customer", "value": "\"<Customer>: What is the tallest mountain in the world?\""}, {"from": "gpt", "value": "\"<Chatbot>: The tallest mountain in the world is Mount Everest.\""}]}
|
||||
{"conversations": [{"from": "Customer", "value": "\"<Customer>: Who wrote Romeo and Juliet?\""}, {"from": "gpt", "value": "\"<Chatbot>: Romeo and Juliet was written by William Shakespeare.\""}]}
|
||||
{"conversations": [{"from": "Customer", "value": "\"<Customer>: What is the boiling point of water?\""}, {"from": "gpt", "value": "\"<Chatbot>: The boiling point of water is 100 degrees Celsius.\""}]}
|
||||
{"conversations": [{"from": "Customer", "value": "\"<Customer>: When was the first man on the moon?\""}, {"from": "gpt", "value": "\"<Chatbot>: The first man landed on the moon in 1969.\""}]}
|
||||
{"conversations": [{"from": "Customer", "value": "\"<Customer>: What is the largest ocean?\""}, {"from": "gpt", "value": "\"<Chatbot>: The largest ocean is the Pacific Ocean.\""}]}
|
||||
{"conversations": [{"from": "Customer", "value": "\"<Customer>: Who invented the telephone?\""}, {"from": "gpt", "value": "\"<Chatbot>: The telephone was invented by Alexander Graham Bell.\""}]}
|
||||
{"conversations": [{"from": "Customer", "value": "\"<Customer>: What is the formula for water?\""}, {"from": "gpt", "value": "\"<Chatbot>: The chemical formula for water is H2O.\""}]}
|
||||
@@ -56,3 +56,6 @@ weight_decay: 0.0
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
bos_token: "<s>"
|
||||
eos_token: "</s>"
|
||||
unk_token: "<unk>"
|
||||
|
||||
@@ -75,3 +75,6 @@ weight_decay: 0.0
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
bos_token: "<s>"
|
||||
eos_token: "</s>"
|
||||
unk_token: "<unk>"
|
||||
|
||||
@@ -1,10 +0,0 @@
|
||||
# Qwen
|
||||
|
||||
TODO
|
||||
|
||||
# Qwen2 MoE
|
||||
|
||||
✅ multipack
|
||||
✅ qwen2_moe 4-bit QLoRA
|
||||
✅ qwen2_moe 16-bit LoRA
|
||||
❓ qwen2_moe 8-bit LoRA
|
||||
@@ -1,64 +0,0 @@
|
||||
base_model: Qwen/Qwen1.5-MoE-A2.7B
|
||||
trust_remote_code: true
|
||||
|
||||
load_in_8bit: false
|
||||
load_in_4bit: false
|
||||
strict: false
|
||||
|
||||
datasets:
|
||||
- path: mhenrichsen/alpaca_2k_test
|
||||
type: alpaca
|
||||
dataset_prepared_path:
|
||||
val_set_size: 0.05
|
||||
output_dir: ./out
|
||||
|
||||
sequence_len: 1024 # supports up to 32k
|
||||
sample_packing: false
|
||||
pad_to_sequence_len: false
|
||||
|
||||
adapter: lora
|
||||
lora_model_dir:
|
||||
lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_linear: true
|
||||
lora_fan_in_fan_out:
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
gradient_accumulation_steps: 4
|
||||
micro_batch_size: 1
|
||||
num_epochs: 4
|
||||
optimizer: paged_adamw_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: true
|
||||
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 10
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
@@ -1,64 +0,0 @@
|
||||
base_model: Qwen/Qwen1.5-MoE-A2.7B
|
||||
trust_remote_code: true
|
||||
|
||||
load_in_8bit: false
|
||||
load_in_4bit: true
|
||||
strict: false
|
||||
|
||||
datasets:
|
||||
- path: mhenrichsen/alpaca_2k_test
|
||||
type: alpaca
|
||||
dataset_prepared_path:
|
||||
val_set_size: 0.05
|
||||
output_dir: ./out
|
||||
|
||||
sequence_len: 1024 # supports up to 32k
|
||||
sample_packing: false
|
||||
pad_to_sequence_len: false
|
||||
|
||||
adapter: lora
|
||||
lora_model_dir:
|
||||
lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_linear: true
|
||||
lora_fan_in_fan_out:
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
gradient_accumulation_steps: 4
|
||||
micro_batch_size: 1
|
||||
num_epochs: 4
|
||||
optimizer: paged_adamw_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: true
|
||||
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 10
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
BIN
favicon.jpg
BIN
favicon.jpg
Binary file not shown.
|
Before Width: | Height: | Size: 4.5 KiB |
19
index.qmd
19
index.qmd
@@ -1,19 +0,0 @@
|
||||
|
||||
|
||||
```{python}
|
||||
#|output: asis
|
||||
#|echo: false
|
||||
|
||||
# This cell steals the README as the home page for now, but excludes the table of contents (quarto adds its own)
|
||||
import re
|
||||
pattern = re.compile(
|
||||
r"<table>\s*<tr>\s*<td>\s*## Table of Contents.*?</td>\s*</tr>\s*</table>",
|
||||
re.DOTALL | re.IGNORECASE
|
||||
)
|
||||
|
||||
with open('README.md', 'r') as f:
|
||||
txt = f.read()
|
||||
|
||||
cleaned = pattern.sub("", txt)
|
||||
print(cleaned)
|
||||
```
|
||||
@@ -1,10 +1,10 @@
|
||||
--extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
|
||||
packaging==23.2
|
||||
peft==0.10.0
|
||||
transformers @ git+https://github.com/huggingface/transformers.git@43d17c18360ac9c3d3491389328e2fe55fe8f9ce
|
||||
peft==0.9.0
|
||||
transformers==4.38.2
|
||||
tokenizers==0.15.0
|
||||
bitsandbytes==0.43.0
|
||||
accelerate==0.28.0
|
||||
bitsandbytes>=0.43.0
|
||||
accelerate==0.26.1
|
||||
deepspeed==0.13.1
|
||||
pydantic==2.6.3
|
||||
addict
|
||||
@@ -32,11 +32,12 @@ fschat==0.2.36
|
||||
gradio==3.50.2
|
||||
tensorboard
|
||||
|
||||
mamba-ssm==1.2.0.post1
|
||||
mamba-ssm==1.1.1
|
||||
|
||||
# remote filesystems
|
||||
s3fs
|
||||
gcsfs
|
||||
# adlfs
|
||||
|
||||
trl @ git+https://github.com/huggingface/trl.git@0ee349dcd43b0f4b3169449f16751c38ac4a609f
|
||||
trl>=0.7.9
|
||||
fastcore>=1.5.29
|
||||
|
||||
5
setup.py
5
setup.py
@@ -78,7 +78,7 @@ setup(
|
||||
"deepspeed-kernels",
|
||||
],
|
||||
"mamba-ssm": [
|
||||
"mamba-ssm==1.2.0.post1",
|
||||
"mamba-ssm==1.0.1",
|
||||
],
|
||||
"auto-gptq": [
|
||||
"auto-gptq==0.5.1",
|
||||
@@ -89,8 +89,5 @@ setup(
|
||||
"lion-pytorch": [
|
||||
"lion-pytorch==0.1.2",
|
||||
],
|
||||
"galore": [
|
||||
"galore_torch",
|
||||
],
|
||||
},
|
||||
)
|
||||
|
||||
@@ -54,7 +54,7 @@ def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs):
|
||||
LOG.warning(msg)
|
||||
parsed_cfg.dataset_prepared_path = DEFAULT_DATASET_PREPARED_PATH
|
||||
|
||||
if parsed_cfg.rl and parsed_cfg.rl != "orpo":
|
||||
if parsed_cfg.rl:
|
||||
load_rl_datasets(cfg=parsed_cfg, cli_args=parsed_cli_args)
|
||||
else:
|
||||
load_datasets(cfg=parsed_cfg, cli_args=parsed_cli_args)
|
||||
|
||||
@@ -47,7 +47,7 @@ def do_train(cfg, cli_args) -> Tuple[PreTrainedModel, PreTrainedTokenizer]:
|
||||
else:
|
||||
register_chatml_template()
|
||||
|
||||
if cfg.rl and cfg.rl != "orpo":
|
||||
if cfg.rl:
|
||||
dataset_meta = load_rl_datasets(cfg=cfg, cli_args=cli_args)
|
||||
else:
|
||||
dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
|
||||
|
||||
0
src/axolotl/core/policies/__init__.py
Normal file
0
src/axolotl/core/policies/__init__.py
Normal file
55
src/axolotl/core/policies/auto_wrap.py
Normal file
55
src/axolotl/core/policies/auto_wrap.py
Normal file
@@ -0,0 +1,55 @@
|
||||
"""module for building the auto wrap policy for FSDP"""
|
||||
import functools
|
||||
|
||||
from peft import PrefixEncoder, PromptEmbedding, PromptEncoder
|
||||
from torch.distributed.fsdp.wrap import (
|
||||
_or_policy,
|
||||
lambda_auto_wrap_policy,
|
||||
transformer_auto_wrap_policy,
|
||||
)
|
||||
from transformers.models.llama.modeling_llama import LlamaDecoderLayer
|
||||
from transformers.models.mistral.modeling_mistral import MistralDecoderLayer
|
||||
from transformers.models.mixtral.modeling_mixtral import MixtralDecoderLayer
|
||||
|
||||
SUPPORTED_AUTO_WRAP_MODEL_TYPES = [
|
||||
"llama",
|
||||
"mistral",
|
||||
"mixtral",
|
||||
]
|
||||
|
||||
|
||||
def get_wrapping_policy_factory(model_type):
    """
    Build a factory that produces the FSDP auto-wrap policy for ``model_type``.

    Args:
        model_type: model family name; "llama", "mistral" and "mixtral" are
            handled (see SUPPORTED_AUTO_WRAP_MODEL_TYPES).

    Returns:
        A zero-argument callable. Calling it returns a ``functools.partial`` of
        ``_or_policy`` that combines a lambda policy for trainable leaf modules
        with a transformer policy covering the PEFT prompt/prefix encoders and
        the decoder layer class selected for ``model_type``.

    Raises:
        ValueError: if ``model_type`` is not one of the handled model types.
    """
    if model_type == "llama":
        layer_to_wrap = LlamaDecoderLayer
    elif model_type == "mistral":
        layer_to_wrap = MistralDecoderLayer
    elif model_type == "mixtral":
        layer_to_wrap = MixtralDecoderLayer
    else:
        # Fail fast: without this branch `layer_to_wrap` stays unbound and the
        # returned closure only fails later, with an opaque NameError.
        raise ValueError(
            f"Unsupported model_type {model_type!r} for FSDP auto wrap; "
            "see SUPPORTED_AUTO_WRAP_MODEL_TYPES"
        )

    def get_wrapping_policy():
        """Combine the trainable-leaf policy with the transformer-layer policy."""

        def lambda_policy_fn(module):
            # This checks for lora layers (has weight and requires_grad):
            # a leaf module (no children) whose weight is trainable.
            return (
                len(list(module.named_children())) == 0
                and getattr(module, "weight", None) is not None
                and module.weight.requires_grad
            )

        lambda_policy = functools.partial(
            lambda_auto_wrap_policy, lambda_fn=lambda_policy_fn
        )
        transformer_wrap_policy = functools.partial(
            transformer_auto_wrap_policy,
            transformer_layer_cls=(
                PrefixEncoder,
                PromptEncoder,
                PromptEmbedding,
                layer_to_wrap,
            ),
        )
        # A module is wrapped when either policy accepts it.
        policies = [lambda_policy, transformer_wrap_policy]
        return functools.partial(_or_policy, policies=policies)

    return get_wrapping_policy
|
||||
@@ -8,17 +8,20 @@ import importlib
|
||||
import importlib.util
|
||||
import logging
|
||||
import math
|
||||
import os
|
||||
import sys
|
||||
from abc import abstractmethod
|
||||
from collections import defaultdict
|
||||
from dataclasses import dataclass, field
|
||||
from functools import wraps
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Literal, Optional, Type, Union
|
||||
from typing import List, Optional, Type, Union
|
||||
|
||||
import torch
|
||||
import transformers
|
||||
from accelerate import FullyShardedDataParallelPlugin
|
||||
from accelerate.utils import str_to_bool
|
||||
from datasets import Dataset
|
||||
from torch.distributed.fsdp import MixedPrecision
|
||||
from torch.optim.lr_scheduler import OneCycleLR
|
||||
from torch.utils.data import BatchSampler, DataLoader, RandomSampler, SequentialSampler
|
||||
from transformers import (
|
||||
@@ -30,8 +33,8 @@ from transformers import (
|
||||
from transformers.trainer_utils import seed_worker
|
||||
from transformers.utils import is_sagemaker_mp_enabled
|
||||
from trl import DPOTrainer
|
||||
from trl.trainer.utils import pad_to_length
|
||||
|
||||
from axolotl.core.policies.auto_wrap import get_wrapping_policy_factory
|
||||
from axolotl.loraplus import create_loraplus_optimizer
|
||||
from axolotl.monkeypatch.multipack import SUPPORTED_MULTIPACK_MODEL_TYPES
|
||||
from axolotl.monkeypatch.relora import ReLoRACallback, ReLoRAScheduler
|
||||
@@ -45,7 +48,6 @@ from axolotl.utils.callbacks import (
|
||||
causal_lm_bench_eval_callback_factory,
|
||||
log_prediction_callback_factory,
|
||||
)
|
||||
from axolotl.utils.callbacks.lisa import lisa_callback_factory
|
||||
from axolotl.utils.collators import (
|
||||
BatchSamplerDataCollatorForSeq2Seq,
|
||||
DataCollatorForSeq2Seq,
|
||||
@@ -198,21 +200,6 @@ class AxolotlTrainingArguments(TrainingArguments):
|
||||
default=False,
|
||||
metadata={"help": "whether this is a qlora training"},
|
||||
)
|
||||
orpo_alpha: Optional[float] = field(
|
||||
default=None,
|
||||
)
|
||||
lisa_n_layers: Optional[int] = field(
|
||||
default=None,
|
||||
metadata={"help": "the number of activate layers in LISA"},
|
||||
)
|
||||
lisa_step_interval: Optional[int] = field(
|
||||
default=None,
|
||||
metadata={"help": "how often to switch layers in LISA"},
|
||||
)
|
||||
lisa_layers_attribute: Optional[str] = field(
|
||||
default=None,
|
||||
metadata={"help": "path under the model to access the layers"},
|
||||
)
|
||||
|
||||
|
||||
class AxolotlTrainer(Trainer):
|
||||
@@ -229,16 +216,13 @@ class AxolotlTrainer(Trainer):
|
||||
num_epochs=1,
|
||||
bench_data_collator=None,
|
||||
eval_data_collator=None,
|
||||
**kwargs,
|
||||
**kwargs
|
||||
):
|
||||
self.num_epochs = num_epochs
|
||||
self.bench_data_collator = bench_data_collator
|
||||
self.eval_data_collator = eval_data_collator
|
||||
super().__init__(*_args, **kwargs)
|
||||
self.train_data_collator = self.data_collator
|
||||
self._stored_metrics = defaultdict(lambda: defaultdict(list))
|
||||
if self.args.orpo_alpha:
|
||||
self.loss_fct = torch.nn.CrossEntropyLoss(reduction="none")
|
||||
|
||||
def create_optimizer(self):
|
||||
if self.args.loraplus_lr_ratio is None:
|
||||
@@ -248,7 +232,6 @@ class AxolotlTrainer(Trainer):
|
||||
if self.optimizer is None: # pylint: disable=access-member-before-definition
|
||||
optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(
|
||||
self.args,
|
||||
opt_model,
|
||||
)
|
||||
|
||||
loraplus_lr_ratio = getattr(self.args, "loraplus_lr_ratio", None)
|
||||
@@ -482,165 +465,8 @@ class AxolotlTrainer(Trainer):
|
||||
# outputs = model(**inputs)
|
||||
# loss = trainer_weighted_loss(outputs, labels, shift_labels=True)
|
||||
# return (loss, outputs) if return_outputs else loss
|
||||
if self.args.orpo_alpha:
|
||||
return self.orpo_compute_loss(model, inputs, return_outputs=return_outputs)
|
||||
return super().compute_loss(model, inputs, return_outputs=return_outputs)
|
||||
|
||||
@staticmethod
|
||||
def orpo_concatenate_inputs(inputs, label_pad_token=-100, pad_token=0, device=None):
|
||||
concatenated_batch = {}
|
||||
|
||||
max_length = max(
|
||||
inputs["input_ids"].shape[1], inputs["rejected_input_ids"].shape[1]
|
||||
)
|
||||
# Concatenate positive and negative inputs
|
||||
concatenated_batch["input_ids"] = pad_to_length(
|
||||
inputs["input_ids"], max_length, pad_token
|
||||
)
|
||||
concatenated_batch["rejected_input_ids"] = pad_to_length(
|
||||
inputs["rejected_input_ids"], max_length, pad_token
|
||||
)
|
||||
concatenated_batch["labels"] = pad_to_length(
|
||||
inputs["labels"], max_length, label_pad_token
|
||||
)
|
||||
concatenated_batch["rejected_labels"] = pad_to_length(
|
||||
inputs["rejected_labels"], max_length, label_pad_token
|
||||
)
|
||||
concatenated_batch["attention_mask"] = pad_to_length(
|
||||
inputs["attention_mask"], max_length, 0
|
||||
)
|
||||
concatenated_batch["rejected_attention_mask"] = pad_to_length(
|
||||
inputs["rejected_attention_mask"], max_length, 0
|
||||
)
|
||||
concatenated_batch["prompt_attention_mask"] = pad_to_length(
|
||||
inputs["prompt_attention_mask"], max_length, 0
|
||||
).to(device=device)
|
||||
|
||||
input_ids = torch.cat(
|
||||
[concatenated_batch["input_ids"], concatenated_batch["rejected_input_ids"]],
|
||||
dim=0,
|
||||
).to(device=device)
|
||||
attention_mask = torch.cat(
|
||||
[
|
||||
concatenated_batch["attention_mask"],
|
||||
concatenated_batch["rejected_attention_mask"],
|
||||
],
|
||||
dim=0,
|
||||
).to(device=device)
|
||||
labels = torch.cat(
|
||||
[concatenated_batch["labels"], concatenated_batch["rejected_labels"]], dim=0
|
||||
).to(device=device)
|
||||
|
||||
return {
|
||||
"input_ids": input_ids,
|
||||
"labels": labels,
|
||||
"attention_mask": attention_mask,
|
||||
"prompt_attention_mask": concatenated_batch["prompt_attention_mask"],
|
||||
}
|
||||
|
||||
def orpo_compute_custom_loss(self, logits, labels):
|
||||
logits = logits.contiguous()
|
||||
loss = 0.0
|
||||
|
||||
if labels is not None:
|
||||
# move labels to correct device to enable model parallelism
|
||||
labels = labels.to(logits.device)
|
||||
# Shift so that tokens < n predict n
|
||||
shift_logits = logits[..., :-1, :].contiguous()
|
||||
shift_labels = labels[..., 1:].contiguous()
|
||||
|
||||
# Flatten the tokens
|
||||
loss = self.loss_fct(shift_logits.transpose(2, 1), shift_labels).mean(
|
||||
dim=-1
|
||||
)
|
||||
|
||||
return loss
|
||||
|
||||
def orpo_compute_logps(
|
||||
self, prompt_attention_mask, chosen_inputs, chosen_attention_mask, logits
|
||||
):
|
||||
# Get the shape of chosen_attention_mask[:, :-1]
|
||||
chosen_shape = chosen_attention_mask[:, :-1].shape
|
||||
|
||||
# Calculate the padding size
|
||||
pad_length = chosen_shape[1] - (prompt_attention_mask.shape[1] - 1)
|
||||
|
||||
# Pad prompt_attention_mask with zeros to match the desired shape
|
||||
prompt_attention_mask_padded = torch.nn.functional.pad(
|
||||
prompt_attention_mask[:, 1:], (0, pad_length), mode="constant", value=0
|
||||
)
|
||||
|
||||
# Perform the subtraction operation
|
||||
mask = chosen_attention_mask[:, :-1] > prompt_attention_mask_padded
|
||||
|
||||
per_token_logps = torch.gather(
|
||||
logits[:, :-1, :].log_softmax(-1),
|
||||
dim=2,
|
||||
index=(mask * chosen_inputs[:, 1:]).unsqueeze(2),
|
||||
).squeeze(2)
|
||||
return torch.mul(per_token_logps, mask).sum(dim=1) / mask.sum(dim=1)
|
||||
|
||||
def orpo_compute_loss(self, model, inputs, return_outputs=False):
|
||||
concat_inputs = AxolotlTrainer.orpo_concatenate_inputs(
|
||||
inputs,
|
||||
label_pad_token=-100,
|
||||
pad_token=self.tokenizer.pad_token_id,
|
||||
device=self.accelerator.device,
|
||||
)
|
||||
|
||||
# Perform a single forward pass
|
||||
outputs = model(
|
||||
**{
|
||||
"input_ids": concat_inputs["input_ids"],
|
||||
"attention_mask": concat_inputs["attention_mask"],
|
||||
"labels": concat_inputs["labels"],
|
||||
},
|
||||
output_hidden_states=True,
|
||||
)
|
||||
|
||||
# Split the outputs for positive and negative examples
|
||||
outputs_pos, outputs_neg = outputs.logits.chunk(2)
|
||||
|
||||
# Calculate NLL loss
|
||||
pos_loss = self.orpo_compute_custom_loss(
|
||||
logits=outputs_pos, labels=concat_inputs["input_ids"].chunk(2)[0]
|
||||
)
|
||||
|
||||
# Calculate Log Probability
|
||||
pos_prob = self.orpo_compute_logps(
|
||||
prompt_attention_mask=concat_inputs["prompt_attention_mask"],
|
||||
chosen_inputs=concat_inputs["input_ids"].chunk(2)[0],
|
||||
chosen_attention_mask=concat_inputs["attention_mask"].chunk(2)[0],
|
||||
logits=outputs_pos,
|
||||
)
|
||||
neg_prob = self.orpo_compute_logps(
|
||||
prompt_attention_mask=concat_inputs["prompt_attention_mask"],
|
||||
chosen_inputs=concat_inputs["input_ids"].chunk(2)[1],
|
||||
chosen_attention_mask=concat_inputs["attention_mask"].chunk(2)[1],
|
||||
logits=outputs_neg,
|
||||
)
|
||||
|
||||
# Calculate log odds
|
||||
log_odds = (pos_prob - neg_prob) - (
|
||||
torch.log(1 - torch.exp(pos_prob)) - torch.log(1 - torch.exp(neg_prob))
|
||||
)
|
||||
sig_ratio = torch.nn.functional.sigmoid(log_odds)
|
||||
ratio = torch.log(sig_ratio)
|
||||
|
||||
# Calculate the Final Loss
|
||||
loss = torch.mean(pos_loss - self.args.orpo_alpha * ratio).to(
|
||||
dtype=torch.bfloat16
|
||||
)
|
||||
|
||||
metrics = {}
|
||||
metrics["chosen_geometric_mean"] = torch.mean(pos_prob).cpu().item()
|
||||
metrics["rejected_geometric_mean"] = torch.mean(neg_prob).cpu().item()
|
||||
metrics["log_odds_ratio"] = torch.mean(ratio).cpu().item()
|
||||
metrics["log_odds"] = torch.mean(log_odds).cpu().item()
|
||||
self.store_metrics(metrics, train_eval="train")
|
||||
|
||||
return (loss, outputs_pos) if return_outputs else loss
|
||||
|
||||
@wraps(Trainer.push_to_hub)
|
||||
def push_to_hub(self, *args, **kwargs) -> str:
|
||||
"""
|
||||
@@ -653,39 +479,54 @@ class AxolotlTrainer(Trainer):
|
||||
|
||||
@wraps(Trainer.create_accelerator_and_postprocess)
|
||||
def create_accelerator_and_postprocess(self):
|
||||
rank = int(os.environ.get("LOCAL_RANK", 0))
|
||||
res = super().create_accelerator_and_postprocess()
|
||||
|
||||
if self.args.qlora is False:
|
||||
return res
|
||||
|
||||
# the rest of this method override is specific to fsdp + qlora (for now)
|
||||
sync_module_states = (
|
||||
str_to_bool(os.environ.get("FSDP_SYNC_MODULE_STATES", "True")) == 1
|
||||
)
|
||||
|
||||
mp_policy = None
|
||||
amp = os.environ["ACCELERATE_MIXED_PRECISION"]
|
||||
if amp == "fp16":
|
||||
mp_policy = MixedPrecision(
|
||||
param_dtype=torch.float32,
|
||||
reduce_dtype=torch.float32,
|
||||
buffer_dtype=torch.float32,
|
||||
)
|
||||
elif amp == "bf16":
|
||||
mp_policy = MixedPrecision(
|
||||
param_dtype=torch.float32,
|
||||
reduce_dtype=torch.float32,
|
||||
buffer_dtype=torch.float32,
|
||||
)
|
||||
|
||||
# If somehow we figure out how we want to parameterize we want to autocast buffers...
|
||||
# mp_policy = MixedPrecision(param_dtype=torch.bfloat16, reduce_dtype=torch.bfloat16, buffer_dtype=torch.float32)
|
||||
# load_param_skip_names = ['inv_freq']
|
||||
|
||||
if self.is_fsdp_enabled:
|
||||
if (
|
||||
"limit_all_gathers" in self.args.fsdp_config
|
||||
and self.args.fsdp_config["limit_all_gathers"]
|
||||
):
|
||||
self.accelerator.state.fsdp_plugin.limit_all_gathers = True
|
||||
wrapping_policy = get_wrapping_policy_factory(self.args.model_type)
|
||||
fsdp_plugin = FullyShardedDataParallelPlugin(
|
||||
auto_wrap_policy=wrapping_policy(),
|
||||
cpu_offload=False,
|
||||
use_orig_params=False,
|
||||
limit_all_gathers=True,
|
||||
param_init_fn=lambda module: module.to_empty(
|
||||
device=torch.device("cuda"), recurse=False
|
||||
)
|
||||
if (rank != 0 and sync_module_states)
|
||||
else None,
|
||||
mixed_precision_policy=mp_policy,
|
||||
)
|
||||
self.accelerator.state.fsdp_plugin = fsdp_plugin
|
||||
|
||||
return res
|
||||
|
||||
def log(self, logs: Dict[str, float]) -> None:
|
||||
"""
|
||||
Log `logs` on the various objects watching training, including stored metrics.
|
||||
|
||||
Args:
|
||||
logs (`Dict[str, float]`):
|
||||
The values to log.
|
||||
"""
|
||||
# logs either has 'loss' or 'eval_loss'
|
||||
train_eval = "train" if "loss" in logs else "eval"
|
||||
# Add averaged stored metrics to logs
|
||||
for key, metrics in self._stored_metrics[train_eval].items():
|
||||
logs[key] = torch.tensor(metrics).mean().item()
|
||||
del self._stored_metrics[train_eval]
|
||||
return super().log(logs)
|
||||
|
||||
def store_metrics(
|
||||
self, metrics: Dict[str, float], train_eval: Literal["train", "eval"] = "train"
|
||||
) -> None:
|
||||
for key, value in metrics.items():
|
||||
self._stored_metrics[train_eval][key].append(value)
|
||||
|
||||
|
||||
class AxolotlMambaTrainer(AxolotlTrainer):
|
||||
"""
|
||||
@@ -818,12 +659,6 @@ class TrainerBuilderBase(abc.ABC):
|
||||
self.model = model
|
||||
self.tokenizer = tokenizer
|
||||
|
||||
# in case the model supports tagging, add the axolotl tag.
|
||||
# This makes sure the tag is correctly pushed even if a user calls
|
||||
# model.push_to_hub instead of trainer.push_to_hub.
|
||||
if hasattr(model, "add_model_tags"):
|
||||
model.add_model_tags(["axolotl"])
|
||||
|
||||
@property
|
||||
def model_ref(self):
|
||||
return self._model_ref
|
||||
@@ -951,8 +786,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
|
||||
)
|
||||
callbacks.append(early_stop_cb)
|
||||
|
||||
if self.cfg.lisa_step_interval and self.cfg.lisa_n_layers:
|
||||
callbacks.append(lisa_callback_factory(trainer))
|
||||
return callbacks
|
||||
|
||||
def _get_trainer_cls(self):
|
||||
@@ -1004,6 +837,10 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
|
||||
training_arguments_kwargs[
|
||||
"gradient_checkpointing_kwargs"
|
||||
] = self.cfg.gradient_checkpointing_kwargs
|
||||
else:
|
||||
training_arguments_kwargs["gradient_checkpointing_kwargs"] = {
|
||||
"use_reentrant": False
|
||||
}
|
||||
if self.cfg.fsdp:
|
||||
training_arguments_kwargs["fsdp"] = self.cfg.fsdp
|
||||
if self.cfg.fsdp_config:
|
||||
@@ -1066,11 +903,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
|
||||
elif self.cfg.sample_packing and self.cfg.eval_sample_packing is False:
|
||||
training_arguments_kwargs["dataloader_drop_last"] = True
|
||||
|
||||
if self.cfg.remove_unused_columns is not None:
|
||||
training_arguments_kwargs[
|
||||
"remove_unused_columns"
|
||||
] = self.cfg.remove_unused_columns
|
||||
|
||||
if not self.cfg.test_datasets and self.cfg.val_set_size == 0:
|
||||
# no eval set, so don't eval
|
||||
training_arguments_kwargs["evaluation_strategy"] = "no"
|
||||
@@ -1184,18 +1016,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
|
||||
training_arguments_kwargs["optim"] = (
|
||||
self.cfg.optimizer if self.cfg.optimizer else "adamw_hf"
|
||||
)
|
||||
if self.cfg.optim_args:
|
||||
if isinstance(self.cfg.optim_args, dict):
|
||||
optim_args = ",".join(
|
||||
[f"{key}={value}" for key, value in self.cfg.optim_args.items()]
|
||||
)
|
||||
else:
|
||||
optim_args = self.cfg.optim_args
|
||||
training_arguments_kwargs["optim_args"] = optim_args
|
||||
if self.cfg.optim_target_modules:
|
||||
training_arguments_kwargs[
|
||||
"optim_target_modules"
|
||||
] = self.cfg.optim_target_modules
|
||||
training_arguments_kwargs["loraplus_lr_ratio"] = self.cfg.loraplus_lr_ratio
|
||||
training_arguments_kwargs[
|
||||
"loraplus_lr_embedding"
|
||||
@@ -1244,24 +1064,12 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
|
||||
"relora_prune_ratio"
|
||||
] = self.cfg.relora_prune_ratio
|
||||
|
||||
if self.cfg.lisa_step_interval and self.cfg.lisa_n_layers:
|
||||
training_arguments_kwargs["lisa_n_layers"] = self.cfg.lisa_n_layers
|
||||
training_arguments_kwargs[
|
||||
"lisa_step_interval"
|
||||
] = self.cfg.lisa_step_interval
|
||||
training_arguments_kwargs[
|
||||
"lisa_layers_attribute"
|
||||
] = self.cfg.lisa_layers_attribute
|
||||
|
||||
training_arguments_kwargs = self.hook_pre_create_training_args(
|
||||
training_arguments_kwargs
|
||||
)
|
||||
training_arguments_kwargs["model_type"] = self.cfg.model_config_type
|
||||
training_arguments_kwargs["pretraining"] = bool(self.cfg.pretraining_dataset)
|
||||
|
||||
if self.cfg.rl == "orpo":
|
||||
training_arguments_kwargs["orpo_alpha"] = self.cfg.orpo_alpha
|
||||
|
||||
if self.cfg.neftune_noise_alpha is not None:
|
||||
training_arguments_kwargs[
|
||||
"neftune_noise_alpha"
|
||||
@@ -1325,7 +1133,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
|
||||
train_dataset=self.train_dataset,
|
||||
eval_dataset=self.eval_dataset,
|
||||
args=training_args,
|
||||
tokenizer=self.tokenizer,
|
||||
data_collator=self.build_collator(training_args, **data_collator_kwargs),
|
||||
eval_data_collator=self.build_collator(
|
||||
training_args, is_eval=True, **data_collator_kwargs
|
||||
|
||||
@@ -284,7 +284,12 @@ def flashattn_forward_with_s2attn(
|
||||
# [bsz, nh, q_len, hd]
|
||||
# pylint: disable=duplicate-code
|
||||
|
||||
cos, sin = self.rotary_emb(value_states, position_ids=position_ids)
|
||||
kv_seq_len = key_states.shape[-2]
|
||||
if past_key_value is not None:
|
||||
kv_seq_len += past_key_value[0].shape[-2]
|
||||
cos, sin = self.rotary_emb(
|
||||
value_states, seq_len=kv_seq_len, position_ids=position_ids
|
||||
)
|
||||
query_states, key_states = apply_rotary_pos_emb(
|
||||
query_states, key_states, cos, sin, position_ids
|
||||
)
|
||||
@@ -430,7 +435,13 @@ def flashattn_forward(
|
||||
# [bsz, q_len, nh, hd]
|
||||
# [bsz, nh, q_len, hd]
|
||||
|
||||
cos, sin = self.rotary_emb(value_states, position_ids=position_ids)
|
||||
kv_seq_len = key_states.shape[-2]
|
||||
if past_key_value is not None:
|
||||
kv_seq_len += past_key_value[0].shape[-2]
|
||||
|
||||
cos, sin = self.rotary_emb(
|
||||
value_states, seq_len=kv_seq_len, position_ids=position_ids
|
||||
)
|
||||
query_states, key_states = apply_rotary_pos_emb(
|
||||
query_states, key_states, cos, sin, position_ids
|
||||
)
|
||||
|
||||
@@ -80,7 +80,11 @@ def xformers_forward(
|
||||
# [bsz, q_len, nh, hd]
|
||||
# [bsz, nh, q_len, hd]
|
||||
|
||||
cos, sin = self.rotary_emb(value_states)
|
||||
kv_seq_len = key_states.shape[-2]
|
||||
if past_key_value is not None:
|
||||
kv_seq_len += past_key_value[0].shape[-2]
|
||||
|
||||
cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
|
||||
query_states, key_states = apply_rotary_pos_emb(
|
||||
query_states, key_states, cos, sin, position_ids
|
||||
)
|
||||
|
||||
0
src/axolotl/monkeypatch/moe/__init__.py
Normal file
0
src/axolotl/monkeypatch/moe/__init__.py
Normal file
147
src/axolotl/monkeypatch/moe/linear.py
Normal file
147
src/axolotl/monkeypatch/moe/linear.py
Normal file
@@ -0,0 +1,147 @@
|
||||
"""
|
||||
Adapted from:
|
||||
https://github.com/shawntan/scattermoe
|
||||
https://arxiv.org/abs/2403.08245
|
||||
"""
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from axolotl.monkeypatch.moe import ops
|
||||
|
||||
class ParallelLinear(torch.autograd.Function):
    """
    Autograd function that applies per-expert weight matrices to routed rows
    via ``ops.scatter2scatter`` and, optionally, mixes the ``k`` expert outputs
    of each row with ``gates``.
    """

    @staticmethod
    def forward(
        ctx, x, expert_weights, k,
        sorted_expert_idxs, sorted_scattered_idxs,
        padded_block_idxs, expert_offsets,
        gates=None, grouped_in=False, grouped_out=False,
    ):
        """
        Run the expert projection.

        When ``gates`` is given, the kernel output is viewed as
        ``(gates.size(0), gates.size(1), out_dim)`` and reduced with a batched
        matmul against ``gates``; the un-reduced view is kept for backward.
        """
        output = ops.scatter2scatter(
            X=x,
            W=expert_weights,
            sorted_expert_idxs=sorted_expert_idxs,
            sorted_scattered_idxs=sorted_scattered_idxs,
            padded_block_idxs=padded_block_idxs,
            k=k,
            x_grouped=grouped_in,
            y_grouped=grouped_out,
        )

        output_expanded = None
        if gates is not None:
            output_expanded = output.view(
                gates.size(0), gates.size(1), output.size(-1)
            )
            output = torch.bmm(gates[:, None, :], output_expanded).squeeze(1)

        ctx.save_for_backward(
            x,
            expert_weights,
            sorted_expert_idxs,
            sorted_scattered_idxs,
            padded_block_idxs,
            expert_offsets,
            gates,
            output_expanded,
        )
        # Non-tensor arguments are stashed directly on the context.
        ctx.k = k
        ctx.grouped_in = grouped_in
        ctx.grouped_out = grouped_out
        return output

    @staticmethod
    def backward(ctx, grad_out):
        """
        Compute gradients for ``x``, ``expert_weights`` and ``gates``.

        Returns one entry per ``forward`` argument (after ``ctx``); every
        index / flag argument receives ``None``.
        """
        (
            x,
            expert_weights,
            sorted_expert_idxs,
            sorted_scattered_idxs,
            padded_block_idxs,
            expert_offsets,
            gates,
            output_expanded,
        ) = ctx.saved_tensors
        k = ctx.k
        grouped_in = ctx.grouped_in
        grouped_out = ctx.grouped_out

        # Defaults for the un-gated case.
        d_gates = None
        gates_flat = None
        gate_fan = 1
        grad_buffer = None
        if gates is not None:
            # The gate gradient must be read out of output_expanded *before*
            # that buffer is handed to ops.group as its output below.
            d_gates = torch.bmm(output_expanded, grad_out[:, :, None]).squeeze(-1)
            gates_flat = gates.flatten()
            gate_fan = gates.size(1)
            grad_buffer = output_expanded.flatten(0, 1)

        if grouped_out:
            grouped_grad_out = grad_out
        else:
            grouped_grad_out = ops.group(
                grad_out,
                sorted_scattered_idxs,
                fan_out=gate_fan,
                coeff=gates_flat,
                out=grad_buffer,
            )

        if grouped_in:
            grouped_x = x
            input_grad_buffer = None
        else:
            grouped_x = ops.group(x, sorted_scattered_idxs, fan_out=k)
            # grouped_x is recycled as the output buffer of the input-gradient
            # kernel; the weight gradient is computed first, while it still
            # holds the grouped inputs.
            input_grad_buffer = grouped_x

        d_weights = ops.group_bwd_W(
            DY=grouped_grad_out,
            X=grouped_x,
            expert_offsets=expert_offsets,
            E=expert_weights.size(0),
        )
        d_expanded_input = ops.scatter2scatter(
            X=grouped_grad_out,
            x_grouped=True,
            W=expert_weights.permute(0, 2, 1),
            padded_block_idxs=padded_block_idxs,
            sorted_expert_idxs=sorted_expert_idxs,
            sorted_scattered_idxs=sorted_scattered_idxs,
            k=1,
            y_grouped=grouped_in,
            out=input_grad_buffer,
        )

        if k == 1:
            d_input = d_expanded_input
        else:
            # Sum the k per-expert contributions belonging to each input row.
            d_input = d_expanded_input.view(
                x.size(0), k, d_expanded_input.size(-1)
            ).sum(-2)

        return (
            d_input,    # x
            d_weights,  # expert_weights
            None,       # k
            None,       # sorted_expert_idxs
            None,       # sorted_scattered_idxs
            None,       # padded_block_idxs
            None,       # expert_offsets
            d_gates,    # gates
            None,       # grouped_in
            None,       # grouped_out
        )
|
||||
|
||||
def parallel_linear(inputs, expert_weights, k,
                    sorted_expert_idxs, sorted_scattered_idxs,
                    padded_block_idxs, expert_offsets,
                    gates=None):
    """
    Functional wrapper around ``ParallelLinear.apply``.

    The ``grouped_in`` / ``grouped_out`` flags are not exposed here, so the
    autograd function runs with its defaults for both.
    """
    return ParallelLinear.apply(
        inputs,
        expert_weights,
        k,
        sorted_expert_idxs,
        sorted_scattered_idxs,
        padded_block_idxs,
        expert_offsets,
        gates,
    )
|
||||
|
||||
class ParallelExperts(nn.Module):
    """
    Holds the weights of all experts in one parameter of shape
    ``(num_experts, output_size, input_size)`` and applies them through
    ``ParallelLinear``.
    """

    def __init__(self, num_experts, input_size, output_size) -> None:
        super().__init__()
        self.num_experts = num_experts
        self.input_size = input_size
        self.output_size = output_size
        # Allocated with torch.empty: the values are undefined until a caller
        # fills them in.
        self.weight = nn.Parameter(
            torch.empty(num_experts, output_size, input_size)
        )

    def extra_repr(self):
        """Summary of the layer dimensions shown in the module repr."""
        template = 'num_experts={}, input_size={}, output_size={}'
        return template.format(self.num_experts, self.input_size, self.output_size)

    def forward(self, inputs, k, sorted_expert_idxs, sorted_scattered_idxs,
                padded_block_idxs, expert_offsets,
                gates=None, grouped_in=False, grouped_out=False):
        """Apply every expert's linear map to its routed rows of ``inputs``."""
        # ParallelLinear expects weights laid out as (expert, in, out).
        weights_in_out = self.weight.permute(0, 2, 1)
        return ParallelLinear.apply(
            inputs,
            weights_in_out,
            k,
            sorted_expert_idxs,
            sorted_scattered_idxs,
            padded_block_idxs,
            expert_offsets,
            gates,
            grouped_in,
            grouped_out,
        )
|
||||
90
src/axolotl/monkeypatch/moe/mlp.py
Normal file
90
src/axolotl/monkeypatch/moe/mlp.py
Normal file
@@ -0,0 +1,90 @@
|
||||
"""
|
||||
Adapted from:
|
||||
https://github.com/shawntan/scattermoe
|
||||
https://arxiv.org/abs/2403.08245
|
||||
"""
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
from axolotl.monkeypatch.moe import ops
|
||||
from axolotl.monkeypatch.moe.linear import ParallelExperts
|
||||
|
||||
|
||||
class FusedExperts(nn.Module):
    """
    Fused experts compatible with Mixtral.

    Each expert is a gated linear unit (typically with SiLU):
    ``w2(act(w1(x)) * w3(x))``. The ``w1`` and ``w3`` weights of every expert
    are concatenated (``w1`` first) and stacked into ``self.experts``; the
    ``w2`` weights are stacked into ``self.output_experts``.
    """

    def __init__(
        self,
        experts=None,
        hidden_dim=128,
        ffn_dim=512,
        num_experts=8,
        top_k=2,
        activation=nn.SiLU(),
    ):
        """
        Args:
            experts: sequence of expert modules exposing ``w1``, ``w2`` and
                ``w3`` linear layers whose weights are copied into the fused
                parameters. Required despite the ``None`` default.
            hidden_dim: model hidden size.
            ffn_dim: per-expert intermediate size.
            num_experts: number of experts.
            top_k: experts selected per token (capped at ``num_experts``).
            activation: activation applied to the ``w1`` projection.

        Raises:
            ValueError: if ``experts`` is ``None``.
        """
        super(FusedExperts, self).__init__()

        if experts is None:
            raise ValueError(
                "FusedExperts requires the expert modules whose weights are fused"
            )

        self.num_experts = num_experts
        self.hidden_dim = hidden_dim
        self.ffn_dim = ffn_dim
        self.experts = ParallelExperts(num_experts, hidden_dim, 2 * ffn_dim)
        self.output_experts = ParallelExperts(num_experts, ffn_dim, hidden_dim)
        self.top_k = min(top_k, self.num_experts)
        self.activation = activation

        with torch.no_grad():
            # parallelize all w1 and w3 computation by concat + stack;
            # rows [0, ffn_dim) of each expert are w1, rows [ffn_dim, 2*ffn_dim) are w3
            torch.stack(
                [
                    torch.cat([experts[i].w1.weight, experts[i].w3.weight], dim=0)
                    for i in range(len(experts))
                ],
                dim=0,
                out=self.experts.weight.data,
            )

            # parallelize all w2 computation by stack
            torch.stack(
                [expert.w2.weight for expert in experts],
                dim=0,
                out=self.output_experts.weight.data,
            )

    def forward(
        self, x: torch.Tensor, routing_weights: torch.Tensor, selected_experts: torch.Tensor
    ):
        """
        Route ``x`` through the selected experts and combine their outputs
        with ``routing_weights``.

        Returns a tensor with the leading dimensions of ``x`` and the output
        width of ``self.output_experts``.
        """
        x_shape = x.size()
        x = x.view(-1, x_shape[-1])
        with torch.no_grad():
            sorted_expert_idxs, sorted_scattered_idxs = ops.flatten_and_sort(
                selected_experts
            )
            padded_block_idxs, expert_offsets = ops.padded_block_indices(
                sorted_expert_idxs, self.num_experts
            )

        # The fused weight is cat([w1, w3]), so the first half of the output
        # is the w1 (gate) projection and the second half is the w3 (up)
        # projection. Mixtral applies the activation to w1's output.
        gate_proj, up_proj = self.experts(
            x,
            self.top_k,
            sorted_expert_idxs,
            sorted_scattered_idxs,
            padded_block_idxs,
            expert_offsets,
            grouped_out=True,
        ).chunk(2, dim=-1)
        h = self.activation(gate_proj) * up_proj
        y = self.output_experts(
            h,
            1,
            sorted_expert_idxs,
            sorted_scattered_idxs,
            padded_block_idxs,
            expert_offsets,
            grouped_in=True,
            gates=routing_weights,
        )
        y = y.view(*x_shape[:-1], y.size(-1))
        return y
|
||||
73
src/axolotl/monkeypatch/moe/moe.py
Normal file
73
src/axolotl/monkeypatch/moe/moe.py
Normal file
@@ -0,0 +1,73 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from axolotl.monkeypatch.moe.mlp import FusedExperts
|
||||
|
||||
class SparseMoeBlock(nn.Module):
    """
    Sparse mixture-of-experts block that routes tokens with ``gate`` and
    evaluates the selected experts through ``FusedExperts``.
    """

    def __init__(self, experts, gate, hidden_dim, ffn_dim, num_experts, top_k):
        """
        Args:
            experts: original expert modules (each with ``w1``/``w2``/``w3``
                and ``act_fn``) whose weights are fused.
            gate: router module mapping hidden states to expert logits.
            hidden_dim: model hidden size.
            ffn_dim: per-expert intermediate size.
            num_experts: number of experts.
            top_k: experts selected per token.
        """
        super().__init__()
        self.hidden_dim = hidden_dim
        self.ffn_dim = ffn_dim
        self.num_experts = num_experts
        self.top_k = top_k
        self.gate = gate
        self.experts = FusedExperts(
            experts=experts,
            hidden_dim=hidden_dim,
            ffn_dim=ffn_dim,
            num_experts=num_experts,
            top_k=top_k,
            activation=experts[0].act_fn
        )

    def _post_training(self, model, name):
        """
        Replace this block on ``model`` (attribute ``name``) with a module
        tree that mirrors the un-fused layout: ``gate`` plus an ``experts``
        ModuleList whose entries carry ``w1``, ``w2``, ``w3`` and ``act_fn``.

        NOTE(review): the replacement is a bare ``nn.Module`` container with
        no ``forward``; it restores parameter names/shapes only.
        """
        # Reverse the concat + stack done in FusedExperts: the fused weight is
        # (num_experts, 2 * ffn_dim, hidden_dim) with w1 rows first, w3 second.
        fused_w13 = self.experts.experts.weight.data
        w1s, w3s = torch.chunk(fused_w13, 2, dim=1)
        w2s = self.experts.output_experts.weight.data

        # Recreate the structure of the original MixtralSparseMoeBlock
        original_moe = nn.Module()
        original_moe.hidden_dim = self.hidden_dim
        original_moe.ffn_dim = self.ffn_dim
        original_moe.num_experts = self.num_experts
        original_moe.top_k = self.top_k

        # Recreate the gating module
        original_moe.gate = nn.Linear(self.hidden_dim, self.num_experts, bias=False)
        original_moe.gate.weight.data = self.gate.weight.data

        # Recreate the experts as a ModuleList
        original_moe.experts = nn.ModuleList()
        for expert_idx in range(self.num_experts):
            expert = nn.Module()
            expert.w1 = nn.Linear(self.hidden_dim, self.ffn_dim, bias=False)
            expert.w2 = nn.Linear(self.ffn_dim, self.hidden_dim, bias=False)
            expert.w3 = nn.Linear(self.hidden_dim, self.ffn_dim, bias=False)
            expert.act_fn = self.experts.activation

            # Clone so each restored weight owns its storage instead of being
            # a view into the fused parameter.
            expert.w1.weight.data = w1s[expert_idx].clone()
            expert.w2.weight.data = w2s[expert_idx].clone()
            expert.w3.weight.data = w3s[expert_idx].clone()

            original_moe.experts.append(expert)

        # Replace the SparseMoeBlock with the recreated MixtralSparseMoeBlock structure
        setattr(model, name, original_moe)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """
        Route ``hidden_states`` of shape (batch, sequence, hidden) through the
        top-k experts.

        Returns ``(final_hidden_states, router_logits)`` where the first has
        the input shape and the second is (batch * sequence, n_experts).
        """
        batch_size, sequence_length, hidden_dim = hidden_states.shape
        hidden_states = hidden_states.view(-1, hidden_dim)

        # router_logits: (batch * sequence_length, n_experts)
        router_logits = self.gate(hidden_states)
        routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
        routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
        # Renormalise so the selected experts' weights sum to one per token.
        routing_weights /= routing_weights.sum(dim=-1, keepdim=True)

        # we cast back to the input dtype
        routing_weights = routing_weights.to(hidden_states.dtype)

        # Fused expert forward
        final_hidden_states = self.experts(hidden_states, routing_weights, selected_experts)

        final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim)
        return final_hidden_states, router_logits
|
||||
353
src/axolotl/monkeypatch/moe/ops.py
Normal file
353
src/axolotl/monkeypatch/moe/ops.py
Normal file
@@ -0,0 +1,353 @@
|
||||
"""
|
||||
Adapted from:
|
||||
https://github.com/shawntan/scattermoe
|
||||
https://arxiv.org/abs/2403.08245
|
||||
"""
|
||||
|
||||
import torch
|
||||
import triton
|
||||
import triton.language as tl
|
||||
from torch.nn import functional as F
|
||||
|
||||
BLOCK_M = 128
|
||||
|
||||
@torch.jit.script
def flatten_and_sort(expert_idxs:torch.Tensor):
    """
    Flatten the expert assignments and sort them.

    Returns ``(sorted values, indices into the flattened input)`` as produced
    by ``torch.sort``.
    """
    flat = expert_idxs.flatten()
    sorted_vals, sort_order = torch.sort(flat)
    return sorted_vals, sort_order
|
||||
|
||||
@torch.jit.script
def padded_block_indices(sorted_experts_idxs: torch.Tensor, k: int, N_BLOCK_SIZE: int=BLOCK_M) :
    """
    Compute, for every ``N_BLOCK_SIZE``-row block of work, the offset of its
    first row in the expert-sorted ordering.

    ``k`` is passed to ``torch.bincount`` as ``minlength`` (the number of
    expert bins). Returns ``(block start offsets, cumulative row count per
    expert)``.
    """
    counts = torch.bincount(sorted_experts_idxs, minlength=k)

    # Row ranges [start, end) owned by each expert in the sorted ordering.
    row_end = counts.cumsum(-1)
    row_start = row_end - counts

    # Block ranges [start, end) owned by each expert.
    blocks_per_expert = ((counts - 1) // N_BLOCK_SIZE) + 1
    blk_end = blocks_per_expert.cumsum(-1)
    blk_start = blk_end - blocks_per_expert

    blk_ids = torch.arange(blk_end[-1],
                           dtype=sorted_experts_idxs.dtype,
                           device=sorted_experts_idxs.device)
    blk_col = blk_ids[:, None]

    # True where a block does NOT belong to the expert in that column.
    not_owned = (blk_col < blk_start) | (blk_col >= blk_end)
    candidate_starts = N_BLOCK_SIZE * (blk_col - blk_start) + row_start
    # Zero the non-owning columns, then sum across experts to pick the one left.
    block_row_starts = candidate_starts.masked_fill(not_owned, 0).sum(-1)
    return block_row_starts, row_end
|
||||
|
||||
|
||||
|
||||
def _scatter2scatter_configs():
    """Autotune candidates for the ``_scatter2scatter`` kernel (a single config)."""
    tile = {'BLOCK_N': 128, 'BLOCK_K': 32}
    only_config = triton.Config(tile, num_stages=4, num_warps=4)
    return [only_config]
|
||||
|
||||
@triton.autotune(configs=_scatter2scatter_configs(), key=['M', 'N', 'K'], )
@triton.heuristics({
    "NO_K_MASK": lambda args: (args['K'] % args['BLOCK_K']) == 0,
    "NO_N_MASK": lambda args: (args['N'] % args['BLOCK_N']) == 0,
})
@triton.jit
def _scatter2scatter(
    X_ptr, stride_xm, stride_xk,
    W_ptr, stride_we, stride_wk, stride_wn,
    Y_ptr, stride_ym, stride_yn,
    grouped_idx_ptr, expert_idxs_ptr, block_start_idx_ptr,
    FAN_OUT: tl.constexpr,
    M: tl.constexpr, K: tl.constexpr, N: tl.constexpr, E: tl.constexpr,
    BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
    ACC_TYPE: tl.constexpr,
    OUT_M: tl.constexpr,
    allow_tf32: tl.constexpr,
    x_grouped: tl.constexpr, y_grouped: tl.constexpr,
    NO_K_MASK: tl.constexpr, NO_N_MASK: tl.constexpr
):
    # Each program computes one (BLOCK_M x BLOCK_N) output tile for a single
    # expert: rows come from one padded row-block, columns from one N-block.
    pid = tl.program_id(axis=0)

    n_blocks = tl.cdiv(N, BLOCK_N)
    M_block_id = pid // n_blocks
    N_block_id = pid % n_blocks

    row_offsets = tl.arange(0, BLOCK_M)
    block_start_idx = tl.load(block_start_idx_ptr + M_block_id)
    M_block = tl.max_contiguous(block_start_idx + row_offsets, BLOCK_M)

    # Rows past the end of the sorted list read the sentinel E; the smallest
    # expert id in the block is the one this program handles, and rows that
    # belong to any other expert are masked out.
    E_idxs = tl.load(expert_idxs_ptr + M_block, mask=M_block < (FAN_OUT * M), other=E)
    E_idx = tl.min(E_idxs)
    E_mask = E_idxs == E_idx
    M_idx = tl.load(grouped_idx_ptr + M_block, mask=E_mask, other=0)

    # Input rows: sorted position when X is already grouped, otherwise the
    # scattered index divided by the fan-out.
    if x_grouped:
        M_in_idx = M_block
    else:
        M_in_idx = M_idx // FAN_OUT

    # Output rows: sorted position when Y is grouped, else the scattered index.
    if y_grouped:
        M_out_idx = M_block
    else:
        M_out_idx = M_idx

    K_block = tl.arange(0, BLOCK_K)

    N_block = N_block_id * BLOCK_N + tl.arange(0, BLOCK_N)
    N_mask = N_block < N

    X_blk_ptrs = X_ptr + M_in_idx[:, None] * stride_xm + K_block[None, :] * stride_xk
    W_blk_ptrs = W_ptr + K_block[:, None] * stride_wk + N_block[None, :] * stride_wn + E_idx * stride_we

    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
    num_k_steps = tl.cdiv(K, BLOCK_K)
    for K_block_id in range(0, num_k_steps):
        if NO_K_MASK:
            # K divides evenly into BLOCK_K: only the row mask is needed for X.
            x = tl.load(X_blk_ptrs, mask=E_mask[:, None])
            if NO_N_MASK:
                w = tl.load(W_blk_ptrs)
            else:
                w = tl.load(W_blk_ptrs, mask=N_mask[None, :])
        else:
            K_mask = (K_block_id * BLOCK_K + K_block) < K
            x = tl.load(X_blk_ptrs, mask=E_mask[:, None] & K_mask[None, :])
            w = tl.load(W_blk_ptrs, mask=K_mask[:, None] & N_mask[None, :])
        # Advance both operands along K before accumulating this step.
        X_blk_ptrs += BLOCK_K * stride_xk
        W_blk_ptrs += BLOCK_K * stride_wk
        acc += tl.dot(x, w, allow_tf32=allow_tf32, out_dtype=ACC_TYPE)

    Y_blk_ptrs = Y_ptr + (M_out_idx[:, None] * stride_ym + N_block[None, :] * stride_yn)
    tl.store(Y_blk_ptrs, acc, mask=E_mask[:, None] & N_mask[None, :])
|
||||
|
||||
def scatter2scatter(X, W, sorted_expert_idxs, sorted_scattered_idxs, k,
                    padded_block_idxs, x_grouped=False, y_grouped=False,
                    out=None):
    """
    Launch the ``_scatter2scatter`` kernel.

    Args:
        X: 2-D input; ``X.size(0) * k`` must equal the number of sorted indices.
        W: 3-D expert weights; ``W.size(0)`` is passed as the expert count and
            ``W.size(-1)`` is the output width.
        sorted_expert_idxs / sorted_scattered_idxs: sorted expert ids and the
            matching indices, of equal length.
        k: fan-out passed to the kernel as ``FAN_OUT``.
        padded_block_idxs: one start offset per row-block of work.
        x_grouped / y_grouped: whether input / output rows are addressed in
            sorted order instead of through the scattered indices.
        out: optional pre-allocated ``(len(sorted_expert_idxs), W.size(-1))``
            output buffer.

    Returns:
        The output tensor (``out`` itself when provided).
    """
    num_routed = sorted_expert_idxs.size(0)
    assert sorted_scattered_idxs.size(0) == num_routed
    assert sorted_scattered_idxs.size(0) == X.size(0) * k

    # Pre-kernel setup
    y_dim = W.size(-1)
    if out is not None:
        assert out.size(0) == num_routed and out.size(1) == y_dim
        result = out
    else:
        result = torch.empty((num_routed, y_dim), device=X.device, dtype=X.dtype)

    def launch_grid(meta):
        # One program per (row-block, N-block) pair, flattened to one axis.
        n_blocks = triton.cdiv(meta['N'], meta['BLOCK_N'])
        return (padded_block_idxs.size(0) * n_blocks,)

    _scatter2scatter[launch_grid](
        # X_ptr, stride_xm, stride_xk,
        X, X.stride(0), X.stride(1),
        # W_ptr, stride_we, stride_wk, stride_wn,
        W, W.stride(0), W.stride(1), W.stride(2),
        # Y_ptr, stride_ym, stride_yn,
        result, result.stride(0), result.stride(1),
        grouped_idx_ptr=sorted_scattered_idxs,
        expert_idxs_ptr=sorted_expert_idxs,
        block_start_idx_ptr=padded_block_idxs,
        FAN_OUT=k,
        M=X.size(0),
        K=X.size(1),
        N=result.size(1),
        E=W.size(0),
        BLOCK_M=BLOCK_M,
        ACC_TYPE=tl.float32,
        OUT_M=result.size(0),
        allow_tf32=True,
        x_grouped=x_grouped,
        y_grouped=y_grouped,
    )
    return result
|
||||
|
||||
|
||||
def _config_XtY():
    """Autotune candidates for the ``_groupXtY`` kernel (a single config)."""
    tile = {'BLOCK_N': 128, 'BLOCK_K': 128, 'BLOCK_M': 32}
    only_config = triton.Config(tile, num_stages=4, num_warps=4)
    return [only_config]
|
||||
|
||||
def group_bwd_W(DY, X, expert_offsets, E):
    """
    Compute per-expert weight gradients with the ``_groupXtY`` kernel.

    The result is allocated zero-filled as ``(E, DY.size(-1), X.size(-1))``
    and returned as its ``permute(0, 2, 1)`` view, i.e. with shape
    ``(E, X.size(-1), DY.size(-1))``.
    """
    n_dim = DY.size(-1)
    k_dim = X.size(-1)
    dw_storage = torch.zeros((E, n_dim, k_dim), device=DY.device, dtype=DY.dtype)
    DW = dw_storage.permute(0, 2, 1)

    def launch_grid(meta):
        # axis 0: (expert, K-block) pairs; axis 1: N-blocks
        return (
            E * triton.cdiv(meta['K'], meta['BLOCK_K']),
            triton.cdiv(meta['N'], meta['BLOCK_N']),
        )

    _groupXtY[launch_grid](
        # DY_ptr, stride_dym, stride_dyk,
        DY, DY.stride(0), DY.stride(1),
        # X_ptr, stride_xm, stride_xn,
        X, X.stride(0), X.stride(1),
        # DW_ptr, stride_dwe, stride_dwk, stride_dwn,
        DW, DW.stride(0), DW.stride(1), DW.stride(2),
        # expert_offsets_ptr,
        expert_offsets,
        M=DY.size(0),
        N=n_dim,
        K=k_dim,
        ACC_TYPE=tl.float32,
        allow_tf32=True,
    )
    return DW
|
||||
|
||||
@triton.autotune(configs=_config_XtY(), key=['M', 'N', 'K'], )
@triton.heuristics({
    "NO_K_MASK": lambda args: (args['K'] % args['BLOCK_K']) == 0,
    "NO_N_MASK": lambda args: (args['N'] % args['BLOCK_N']) == 0,
})
@triton.jit
def _groupXtY(
    DY_ptr, stride_dym, stride_dyk,
    X_ptr, stride_xm, stride_xn,
    DW_ptr, stride_dwe, stride_dwk, stride_dwn,
    expert_offsets_ptr,
    M: tl.constexpr, K: tl.constexpr, N: tl.constexpr,
    BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
    ACC_TYPE: tl.constexpr,
    allow_tf32: tl.constexpr,
    NO_K_MASK: tl.constexpr, NO_N_MASK: tl.constexpr
):
    # Each program accumulates one (BLOCK_K x BLOCK_N) tile of X^T @ DY over
    # the rows owned by a single expert and writes it into that expert's DW.
    pid0 = tl.program_id(axis=0)
    pid1 = tl.program_id(axis=1)
    num0 = tl.num_programs(0)
    num1 = tl.num_programs(1)
    pid1, pid0 = tl.swizzle2d(pid1, pid0, num1, num0, 128)

    k_blocks = tl.cdiv(K, BLOCK_K)
    E_idx = pid0 // k_blocks
    K_block_id = pid0 % k_blocks
    N_block_id = pid1

    # Row range [start_idx, end_idx) for this expert; expert_offsets holds the
    # end offset of every expert, so expert 0 starts at row 0.
    if E_idx == 0:
        start_idx = 0
    else:
        start_idx = tl.load(expert_offsets_ptr + E_idx - 1).to(tl.int32)
    end_idx = tl.load(expert_offsets_ptr + E_idx).to(tl.int32)

    # Experts with no rows store nothing.
    if end_idx > start_idx:
        M_block = tl.max_contiguous(start_idx + tl.arange(0, BLOCK_M), BLOCK_M)

        K_block = K_block_id * BLOCK_K + tl.arange(0, BLOCK_K)
        K_mask = K_block < K
        K_block = tl.max_contiguous(tl.multiple_of(K_block % K, BLOCK_K), BLOCK_K)

        N_block = N_block_id * BLOCK_N + tl.arange(0, BLOCK_N)
        N_mask = N_block < N
        N_block = tl.max_contiguous(tl.multiple_of(N_block % N, BLOCK_N), BLOCK_N)

        M_idxs = M_block
        xt_blk_ptrs = X_ptr + K_block[:, None] * stride_xn + M_idxs[None, :] * stride_xm
        dy_blk_ptrs = DY_ptr + M_idxs[:, None] * stride_dym + N_block[None, :] * stride_dyk

        acc = tl.zeros((BLOCK_K, BLOCK_N), dtype=ACC_TYPE)
        num_m_steps = tl.cdiv(end_idx - start_idx, BLOCK_M)
        for i in range(0, num_m_steps):
            # Mask rows that run past this expert's last row.
            M_mask = (i * BLOCK_M + M_block) < end_idx
            if NO_K_MASK:
                xt = tl.load(xt_blk_ptrs, mask=M_mask[None, :])
            else:
                xt = tl.load(xt_blk_ptrs, mask=K_mask[:, None] & M_mask[None, :])
            if NO_N_MASK:
                dy = tl.load(dy_blk_ptrs, mask=M_mask[:, None])
            else:
                dy = tl.load(dy_blk_ptrs, mask=M_mask[:, None] & N_mask[None, :])
            acc += tl.dot(xt, dy, out_dtype=ACC_TYPE, allow_tf32=allow_tf32)
            xt_blk_ptrs += BLOCK_M * stride_xm
            dy_blk_ptrs += BLOCK_M * stride_dym

        DW_blk_ptrs = DW_ptr + E_idx * stride_dwe + K_block[:, None] * stride_dwk + N_block[None, :] * stride_dwn
        # Cast the accumulator to the element type of the destination buffer.
        acc = acc.to(DW_blk_ptrs.dtype.element_ty)
        tl.store(DW_blk_ptrs, acc, mask=K_mask[:, None] & N_mask[None, :])
|
||||
|
||||
|
||||
def _config_grouping():
    """Autotune candidates for the ``_group`` kernel, largest tile first."""
    tile_shapes = (
        {'BLOCK_N': 256, 'BLOCK_K': 128},
        {'BLOCK_N': 128, 'BLOCK_K': 64},
        {'BLOCK_N': 64, 'BLOCK_K': 32},
    )
    return [
        triton.Config(tile, num_stages=4, num_warps=4)
        for tile in tile_shapes
    ]
|
||||
|
||||
def group(A, sorted_expert_idxs, coeff=None, fan_out=1, out=None):
    """
    Gather rows of ``A`` into the order given by ``sorted_expert_idxs`` using
    the ``_group`` kernel, optionally scaling each gathered row by ``coeff``.

    ``A.size(0) * fan_out`` must equal the number of indices. When ``out`` is
    given it is written to and returned; otherwise an ``(N, A.size(1))``
    tensor is allocated.
    """
    N = sorted_expert_idxs.size(0)
    K = A.size(1)
    assert A.size(0) * fan_out == N

    Y = out if out is not None else torch.empty((N, K), dtype=A.dtype, device=A.device)

    def launch_grid(meta):
        # One program per BLOCK_N rows of the grouped output.
        return (triton.cdiv(meta['N'], meta['BLOCK_N']),)

    has_coeff = coeff is not None
    _group[launch_grid](
        # A_ptr, stride_an, stride_ai,
        A, A.stride(0), A.stride(1), has_coeff, coeff, fan_out,
        # Y_ptr, stride_yn, stride_yk,
        Y, Y.stride(0), Y.stride(1),
        # grouped_idx_ptr,
        sorted_expert_idxs,
        # N: tl.constexpr, K: tl.constexpr,
        N, K
    )
    return Y
|
||||
|
||||
@triton.autotune(configs=_config_grouping(), key=['K'])
@triton.heuristics({
    "NO_K_MASK": lambda args: (args['K'] % args['BLOCK_K']) == 0
})
@triton.jit
def _group(
    src_ptr, stride_sn, stride_sk, has_coeff: tl.constexpr, coeff_ptr, FAN_OUT: tl.constexpr,
    tgt_ptr, stride_tn, stride_ti,
    grouped_idx_ptr,
    N: tl.constexpr, K: tl.constexpr,
    BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
    NO_K_MASK: tl.constexpr
):
    # Each program copies BLOCK_N target rows: target row n is read from
    # source row (grouped_idx[n] // FAN_OUT), optionally scaled by
    # coeff[grouped_idx[n]].
    pid = tl.program_id(axis=0)

    N_block_id = pid
    N_blk = N_block_id * BLOCK_N + tl.arange(0, BLOCK_N)
    N_mask = N_blk < N
    N_blk = tl.max_contiguous(tl.multiple_of(N_blk % N, BLOCK_N), BLOCK_N)
    N_idx = tl.load(grouped_idx_ptr + N_blk, mask=N_mask, other=0)

    K_blk = tl.arange(0, BLOCK_K)
    src_blk_ptrs = src_ptr + (N_idx // FAN_OUT)[:, None] * stride_sn + K_blk[None, :] * stride_sk
    tgt_blk_ptrs = tgt_ptr + N_blk[:, None] * stride_tn + K_blk[None, :] * stride_ti

    if has_coeff:
        c = tl.load(coeff_ptr + N_idx, mask=N_mask)[:, None]

    num_k_steps = tl.cdiv(K, BLOCK_K)
    for i in range(0, num_k_steps):
        if NO_K_MASK:
            # The load is unmasked here; only the store is row-masked.
            block = tl.load(src_blk_ptrs)
            if has_coeff:
                block *= c
            tl.store(tgt_blk_ptrs, block, mask=N_mask[:, None])
        else:
            K_mask = (i * BLOCK_K + K_blk) < K
            mask = N_mask[:, None] & K_mask[None, :]
            block = tl.load(src_blk_ptrs, mask=mask)
            if has_coeff:
                block *= c
            tl.store(tgt_blk_ptrs, block, mask=mask)

        src_blk_ptrs += BLOCK_K * stride_sk
        tgt_blk_ptrs += BLOCK_K * stride_ti
|
||||
66
src/axolotl/monkeypatch/moe/single.py
Normal file
66
src/axolotl/monkeypatch/moe/single.py
Normal file
@@ -0,0 +1,66 @@
|
||||
"""
|
||||
Adapted from:
|
||||
https://github.com/shawntan/scattermoe
|
||||
https://arxiv.org/abs/2403.08245
|
||||
"""
|
||||
|
||||
import torch
|
||||
import triton
|
||||
import triton.language as tl
|
||||
from torch.nn import functional as F
|
||||
|
||||
@triton.jit
def _single2scatter(
    X_ptr, stride_xm, stride_xk,
    W_ptr, stride_we, stride_wk, stride_wn,
    Y_ptr, stride_ym, stride_yn,
    expert_idxs_ptr,
    FAN_OUT: tl.constexpr,
    K: tl.constexpr, N: tl.constexpr, E: tl.constexpr,
    BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
    ACC_TYPE: tl.constexpr,
):
    # Each program computes BLOCK_N output columns of one output row as a
    # vector-matrix product against the weights of the expert for that row.
    # No load or store in this kernel is masked.
    pid0 = tl.program_id(axis=0)
    pid1 = tl.program_id(axis=1)

    N_block_id = pid0
    # With a fan-out of 1 each output row has its own input row; otherwise
    # every output row reads input row 0.
    if FAN_OUT == 1:
        in_idx = pid1
    else:
        in_idx = 0
    out_idx = pid1

    K_block = tl.arange(0, BLOCK_K)
    N_block = tl.max_contiguous(tl.multiple_of((N_block_id * BLOCK_N + tl.arange(0, BLOCK_N)) % N, BLOCK_N), BLOCK_N)
    E_idx = tl.load(expert_idxs_ptr + pid1)
    X_blk_ptrs = X_ptr + in_idx * stride_xm + K_block[:, None] * stride_xk
    W_blk_ptrs = W_ptr + E_idx * stride_we + K_block[:, None] * stride_wk + N_block[None, :] * stride_wn

    acc = tl.zeros((1, BLOCK_N), dtype=ACC_TYPE)
    for K_block_id in range(0, tl.cdiv(K, BLOCK_K)):
        x = tl.load(X_blk_ptrs)
        w = tl.load(W_blk_ptrs)
        # Broadcast x over the N columns and reduce over K.
        acc += tl.sum(x * w, axis=0)[None, :]
        X_blk_ptrs += BLOCK_K * stride_xk
        W_blk_ptrs += BLOCK_K * stride_wk

    Y_blk_ptrs = Y_ptr + out_idx * stride_ym + N_block[None, :] * stride_yn
    tl.store(Y_blk_ptrs, acc)
|
||||
|
||||
def single2scatter(X, W, expert_idxs):
    """
    Launch the ``_single2scatter`` kernel.

    Args:
        X: 2-D input with either one row or ``expert_idxs.size(1)`` rows.
        W: expert weights of shape ``(E, xdim, ydim)``.
        expert_idxs: 2-D tensor; its second dimension gives the number of
            output rows, one expert id per row.

    Returns:
        Tensor of shape ``(expert_idxs.size(1), ydim)``.

    Raises:
        AssertionError: if the row counts do not match, or if ``xdim`` /
            ``ydim`` are not multiples of the kernel tile sizes.
    """
    E, xdim, ydim = W.size()
    k = expert_idxs.size(1)
    assert X.size(0) == k or X.size(0) == 1

    BLOCK_N = 128
    BLOCK_K = 128
    # The kernel performs unmasked loads/stores and the grid below uses floor
    # division, so a partial tile would either be skipped (leaving columns of
    # the torch.empty output unwritten) or read past the end of a row.
    assert ydim % BLOCK_N == 0, "single2scatter requires ydim to be a multiple of BLOCK_N"
    assert xdim % BLOCK_K == 0, "single2scatter requires xdim to be a multiple of BLOCK_K"

    Y = torch.empty((k, ydim), device=X.device, dtype=X.dtype)
    # axis 0: N-blocks, axis 1: output rows
    grid = ydim // BLOCK_N, k
    _single2scatter[grid](
        X, X.stride(0), X.stride(1),
        W, W.stride(0), W.stride(1), W.stride(2),
        Y, Y.stride(0), Y.stride(1),
        expert_idxs,
        FAN_OUT=Y.size(0) // X.size(0),
        K=xdim, N=ydim, E=E,
        BLOCK_N=BLOCK_N, BLOCK_K=BLOCK_K,
        ACC_TYPE=tl.float32
    )
    return Y
|
||||
@@ -12,7 +12,6 @@ from axolotl.monkeypatch.utils import get_unpad_data
|
||||
SUPPORTED_MULTIPACK_MODEL_TYPES = [
|
||||
"mixtral",
|
||||
"qwen2",
|
||||
"qwen2_moe",
|
||||
"falcon",
|
||||
"phi",
|
||||
"gemma",
|
||||
@@ -32,10 +31,6 @@ def patch_for_multipack(model_type, model_name=None):
|
||||
transformers.models.qwen2.modeling_qwen2._get_unpad_data = ( # pylint: disable=protected-access
|
||||
get_unpad_data
|
||||
)
|
||||
elif model_type == "qwen2_moe":
|
||||
transformers.models.qwen2_moe.modeling_qwen2_moe._get_unpad_data = ( # pylint: disable=protected-access
|
||||
get_unpad_data
|
||||
)
|
||||
elif model_type == "falcon":
|
||||
transformers.models.falcon.modeling_falcon._get_unpad_data = ( # pylint: disable=protected-access
|
||||
get_unpad_data
|
||||
@@ -53,16 +48,14 @@ def patch_for_multipack(model_type, model_name=None):
|
||||
get_unpad_data
|
||||
)
|
||||
elif model_type == "gemmoe":
|
||||
patch_remote(model_name, ".configuration_gemmoe", ".modeling_gemmoe")
|
||||
elif model_type == "jamba":
|
||||
patch_remote(model_name, ".configuration_jamba", ".modeling_jamba")
|
||||
|
||||
|
||||
def patch_remote(model_name, config_name, modeling_name):
|
||||
model_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
|
||||
# we need to load the model here in order for modeling_* to be available
|
||||
with init_empty_weights():
|
||||
AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
|
||||
module_name = model_config.__class__.__module__.replace(config_name, modeling_name)
|
||||
modeling_arch = importlib.import_module(module_name)
|
||||
modeling_arch._get_unpad_data = get_unpad_data # pylint: disable=protected-access
|
||||
model_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
|
||||
# we need to load the model here in order for modeling_gemmoe to be available
|
||||
with init_empty_weights():
|
||||
AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
|
||||
module_name = model_config.__class__.__module__.replace(
|
||||
".configuration_gemmoe", ".modeling_gemmoe"
|
||||
)
|
||||
modeling_gemmoe = importlib.import_module(module_name)
|
||||
modeling_gemmoe._get_unpad_data = ( # pylint: disable=protected-access
|
||||
get_unpad_data
|
||||
)
|
||||
|
||||
@@ -1,20 +0,0 @@
|
||||
"""
|
||||
module for base dataset transform strategies
|
||||
"""
|
||||
|
||||
import importlib
|
||||
import logging
|
||||
|
||||
LOG = logging.getLogger("axolotl")
|
||||
|
||||
|
||||
def load(strategy, cfg, module_base=None, **kwargs):
    """
    Resolve a dotted ``strategy`` path relative to ``module_base`` and call it.

    Everything before the last dot is imported as a submodule of
    ``module_base``; the final component is looked up on that module and
    called as ``func(cfg, **kwargs)``.

    Returns the callable's result, or ``None`` (after logging a warning) if
    anything in the process raises.
    """
    try:
        # Split "pkg.mod.func" into module path "pkg.mod" and attribute "func".
        strategy, _, load_fn = strategy.rpartition(".")
        strategy_module = importlib.import_module(f".{strategy}", module_base)
        return getattr(strategy_module, load_fn)(cfg, **kwargs)
    except Exception:  # pylint: disable=broad-exception-caught
        LOG.warning(f"unable to load strategy {strategy}")
        return None
|
||||
@@ -1,8 +1,20 @@
|
||||
"""
|
||||
module for DPO style dataset transform strategies
|
||||
"""
|
||||
from functools import partial
|
||||
|
||||
from ..base import load as load_base
|
||||
import importlib
|
||||
import logging
|
||||
|
||||
load = partial(load_base, module_base="axolotl.prompt_strategies.dpo")
|
||||
LOG = logging.getLogger("axolotl")
|
||||
|
||||
|
||||
def load(strategy, cfg, **kwargs):
    """
    Resolve a dotted ``strategy`` path inside ``axolotl.prompt_strategies.dpo``
    and call it.

    Everything before the last dot is imported as a submodule; the final
    component is looked up on that module and called as
    ``func(cfg, **kwargs)``.

    Returns the callable's result, or ``None`` (after logging a warning) if
    anything in the process raises.
    """
    try:
        # Split "mod.func" into module path "mod" and attribute "func".
        strategy, _, load_fn = strategy.rpartition(".")
        strategy_module = importlib.import_module(
            f".{strategy}", "axolotl.prompt_strategies.dpo"
        )
        return getattr(strategy_module, load_fn)(cfg, **kwargs)
    except Exception:  # pylint: disable=broad-exception-caught
        LOG.warning(f"unable to load strategy {strategy}")
        return None
|
||||
|
||||
@@ -1,9 +0,0 @@
|
||||
"""
|
||||
module for ORPO style dataset transform strategies
|
||||
"""
|
||||
|
||||
from functools import partial
|
||||
|
||||
from ..base import load as load_base
|
||||
|
||||
# Bind the package used for the relative import in load_base. The keyword must
# be ``module_base`` (load_base's parameter name); any other name is swallowed
# by **kwargs, leaving the relative import without a package.
load = partial(load_base, module_base="axolotl.prompt_strategies.orpo")
|
||||
@@ -1,188 +0,0 @@
|
||||
"""chatml prompt tokenization strategy for ORPO"""
|
||||
from typing import Any, Dict, Generator, List, Optional, Tuple
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
from axolotl.prompt_tokenizers import IGNORE_INDEX, PromptTokenizingStrategy
|
||||
from axolotl.prompters import Prompter
|
||||
from axolotl.utils.chat_templates import chat_templates
|
||||
|
||||
|
||||
class Message(BaseModel):
    """A single conversation turn."""

    # speaker of the turn, e.g. "system", "user" or "assistant"
    role: str
    # text of the turn
    content: str
    # optional per-turn flag; unset (None) by default
    label: Optional[bool] = None
|
||||
|
||||
|
||||
class MessageList(BaseModel):
    """A conversation: an ordered list of ``Message`` turns."""

    # the turns, in order
    messages: List[Message]
|
||||
|
||||
|
||||
def load(
    tokenizer, cfg, ds_cfg: Optional[Dict[str, Any]] = None, **kwargs
):  # pylint: disable=possibly-unused-variable,unused-argument
    """
    Build the ORPO tokenizing strategy for a dataset.

    The chat template defaults to the registered "chatml" template. When
    ``ds_cfg`` carries a ``chat_template`` entry, that value is passed to
    ``chat_templates``; if the lookup raises ``ValueError`` the configured
    value itself is used as the template. The chosen template is stored on
    ``tokenizer.chat_template`` and handed to the prompter.

    ``cfg.train_on_inputs`` and ``cfg.sequence_len`` are forwarded to the
    strategy; extra ``kwargs`` are accepted and ignored.
    """
    template = chat_templates("chatml")

    if ds_cfg and "chat_template" in ds_cfg:
        template = ds_cfg["chat_template"]
        try:
            template = chat_templates(template)
        except ValueError:
            # lookup rejected the value: keep what the dataset config supplied
            pass

    tokenizer.chat_template = template

    prompter = ORPOPrompter(template, tokenizer)
    return ORPOTokenizingStrategy(
        prompter,
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
        dataset_parser=ORPODatasetParsingStrategy(),
    )
|
||||
|
||||
|
||||
class ORPODatasetParsingStrategy:
    """Turns a chosen/rejected dataset row into a MessageList per response."""

    @staticmethod
    def _conversation_thread(prompt, response_field) -> MessageList:
        """
        Assemble system (if present and truthy), user and assistant turns.

        The assistant text is read from ``prompt[response_field][1]["content"]``.
        """
        turns: List[Message] = []

        system = prompt.get("system", None)
        if system:
            turns.append(Message(role="system", content=system, label=False))

        turns.append(Message(role="user", content=prompt["prompt"], label=False))

        reply = prompt[response_field][1]["content"]
        turns.append(Message(role="assistant", content=reply, label=True))

        return MessageList(messages=turns)

    def get_chosen_conversation_thread(self, prompt) -> MessageList:
        """Conversation ending with the row's "chosen" assistant reply."""
        return self._conversation_thread(prompt, "chosen")

    def get_rejected_conversation_thread(self, prompt) -> MessageList:
        """Conversation ending with the row's "rejected" assistant reply."""
        return self._conversation_thread(prompt, "rejected")
|
||||
|
||||
|
||||
class ORPOTokenizingStrategy(PromptTokenizingStrategy):
    """
    Tokenizes one preference row into a paired (chosen, rejected) sample.

    ``tokenize_prompt`` returns a dict with these keys:
      rejected_input_ids, rejected_labels, rejected_attention_mask,
      input_ids, labels, attention_mask, prompt_attention_mask
    """

    def __init__(self, *args, dataset_parser=None, **kwargs):
        # everything except `dataset_parser` is forwarded to the base strategy
        super().__init__(*args, **kwargs)
        self.dataset_parser = dataset_parser

    def _encode_thread(self, message_list):
        """
        Tokenize the cumulative prompt parts produced by the prompter.

        Each non-empty part is encoded in full and only the tokens beyond what
        has already been collected are appended. Parts flagged as labels copy
        their new tokens into the labels; other parts contribute IGNORE_INDEX.

        Returns ``(input_ids, labels, masked_len)`` where ``masked_len`` is the
        running token count recorded at the last non-label part (0 if none).
        """
        token_ids = []
        targets = []
        masked_len = 0

        for part, is_target in self.prompter.build_prompt(message_list):
            if not part:
                continue

            encoded = self.tokenizer.encode(part, add_special_tokens=False)
            start = len(token_ids)
            token_ids += encoded[start:]

            if is_target:
                targets += token_ids[start:]
            else:
                targets += [IGNORE_INDEX] * (len(token_ids) - start)
                masked_len = len(token_ids)

        return token_ids, targets, masked_len

    def tokenize_prompt(self, prompt):
        # rejected thread first; the prompt length is measured on this pass
        rejected_thread = self.dataset_parser.get_rejected_conversation_thread(
            prompt
        )
        rejected_input_ids, rejected_labels, prompt_len = self._encode_thread(
            rejected_thread
        )

        # chosen thread second; its own masked length is not used
        chosen_thread = self.dataset_parser.get_chosen_conversation_thread(prompt)
        input_ids, labels, _ = self._encode_thread(chosen_thread)

        # ones over the prompt tokens, zeros over the rest of the chosen sample
        prompt_mask = [1] * prompt_len + [0] * (len(labels) - prompt_len)

        return {
            "rejected_input_ids": rejected_input_ids,
            "rejected_labels": rejected_labels,
            "rejected_attention_mask": [1] * len(rejected_labels),
            "input_ids": input_ids,
            "labels": labels,
            "attention_mask": [1] * len(labels),
            "prompt_attention_mask": prompt_mask,
        }
|
||||
|
||||
|
||||
class ORPOPrompter(Prompter):
    """Renders an ORPO message list through the tokenizer's chat template."""

    def __init__(self, chat_template, tokenizer):
        self.chat_template = chat_template
        self.tokenizer = tokenizer

    def build_prompt(
        self,
        message_list: MessageList,
    ) -> Generator[Tuple[str, bool], None, None]:
        """
        Yield ``(rendered_text, label)`` once per system/user/assistant turn.

        Every yield renders the whole conversation so far (untokenized), so
        successive texts are cumulative. User turns are rendered with the
        generation prompt appended; only assistant turns yield ``True``.
        Turns with any other role are added to the history but yield nothing.
        """
        # role -> (add_generation_prompt, label yielded with the text)
        rendering = {
            "system": (False, False),
            "user": (True, False),
            "assistant": (False, True),
        }

        history = []
        for message in message_list.messages:
            history.append(message.model_dump())

            if message.role not in rendering:
                continue

            with_generation_prompt, label = rendering[message.role]
            rendered = self.tokenizer.apply_chat_template(
                history,
                add_generation_prompt=with_generation_prompt,
                chat_template=self.chat_template,
                tokenize=False,
            )
            yield rendered, label
|
||||
@@ -1,6 +1,5 @@
|
||||
"""Module containing the SimpleShareGPTPromptTokenizingStrategy class"""
|
||||
|
||||
import logging
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
from fastchat.conversation import Conversation, SeparatorStyle, register_conv_template
|
||||
@@ -12,8 +11,6 @@ from axolotl.utils.tokenization import (
|
||||
merge_consecutive_messages,
|
||||
)
|
||||
|
||||
LOG = logging.getLogger("axolotl")
|
||||
|
||||
|
||||
def register_chatml_template(system_message=None):
|
||||
system_message = system_message or "You are a helpful assistant."
|
||||
@@ -45,13 +42,11 @@ def load(tokenizer, cfg, ds_cfg: Optional[Dict[str, Any]] = None):
|
||||
)
|
||||
field_human = ds_cfg["field_human"] if ds_cfg and "field_human" in ds_cfg else None
|
||||
field_model = ds_cfg["field_model"] if ds_cfg and "field_model" in ds_cfg else None
|
||||
roles = ds_cfg["roles"].to_dict() if ds_cfg and "roles" in ds_cfg else None
|
||||
strategy = SimpleShareGPTPromptTokenizingStrategy(
|
||||
ShareGPTPrompterV2(
|
||||
conversation=conversation,
|
||||
role_key_model=field_model,
|
||||
role_key_human=field_human,
|
||||
roles=roles,
|
||||
),
|
||||
tokenizer,
|
||||
cfg.train_on_inputs,
|
||||
@@ -147,12 +142,7 @@ class SimpleShareGPTPromptTokenizingStrategy(ShareGPTPromptTokenizingStrategy):
|
||||
"system": "system",
|
||||
}
|
||||
turns = [
|
||||
{
|
||||
"from": (
|
||||
role_map[t[role_key]] if t[role_key] in role_map else t[role_key]
|
||||
),
|
||||
"value": t[value_key],
|
||||
}
|
||||
{"from": role_map[t[role_key]], "value": t[value_key]}
|
||||
for t in conversations
|
||||
]
|
||||
return turns
|
||||
|
||||
@@ -11,7 +11,7 @@ from transformers import BatchEncoding, PreTrainedTokenizer
|
||||
from axolotl.monkeypatch.fastchat_conversation_turns import (
|
||||
add_get_turns_to_conversation,
|
||||
)
|
||||
from axolotl.prompters import IGNORE_TOKEN_ID, Prompter
|
||||
from axolotl.prompters import IGNORE_TOKEN_ID
|
||||
|
||||
LOG = logging.getLogger("axolotl")
|
||||
|
||||
@@ -37,7 +37,7 @@ class PromptTokenizingStrategy(abc.ABC):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
prompter: Prompter,
|
||||
prompter,
|
||||
tokenizer,
|
||||
train_on_inputs: bool = False,
|
||||
sequence_len: int = 2048,
|
||||
@@ -340,23 +340,6 @@ class ShareGPTPromptTokenizingStrategy(PromptTokenizingStrategy):
|
||||
self.prompter._conversation.copy() # pylint: disable=protected-access
|
||||
)
|
||||
|
||||
input_roles = {conversation.roles[0]}
|
||||
output_roles = {conversation.roles[1]}
|
||||
|
||||
if len(conversation.roles) == 3:
|
||||
tool_role_label = conversation.roles[2]
|
||||
input_roles.add(tool_role_label)
|
||||
|
||||
# Add roles from the config
|
||||
if self.prompter.roles:
|
||||
if "input" in self.prompter.roles and self.prompter.roles["input"]:
|
||||
for role in self.prompter.roles["input"]:
|
||||
input_roles.add(role)
|
||||
|
||||
if "output" in self.prompter.roles and self.prompter.roles["output"]:
|
||||
for role in self.prompter.roles["output"]:
|
||||
output_roles.add(role)
|
||||
|
||||
# support for custom roles from the dataset, only useful for vicuna style prompts/roles
|
||||
role_remap = []
|
||||
if (
|
||||
@@ -377,18 +360,19 @@ class ShareGPTPromptTokenizingStrategy(PromptTokenizingStrategy):
|
||||
LOG.warning(f"expected tuple, got {part}")
|
||||
continue
|
||||
|
||||
tool_role_label = None
|
||||
if len(conversation.roles) == 3:
|
||||
(
|
||||
user_role_label,
|
||||
assistant_role_label,
|
||||
tool_role_label,
|
||||
) = conversation.roles
|
||||
else:
|
||||
user_role_label, assistant_role_label = conversation.roles
|
||||
role, content = part
|
||||
|
||||
# Uses "in" because role contains extra characters
|
||||
input_turn = any(r.lower() in role.lower() for r in input_roles)
|
||||
output_turn = any(r.lower() in role.lower() for r in output_roles)
|
||||
empty_role = role.strip() == ""
|
||||
|
||||
if not any([input_turn, output_turn, empty_role]):
|
||||
LOG.warning(f"unhandled role: {role}")
|
||||
continue
|
||||
|
||||
if input_turn:
|
||||
if user_role_label in role:
|
||||
role = (
|
||||
role.replace(role_remap[0]["from"], role_remap[0]["to"])
|
||||
if role_remap
|
||||
@@ -408,7 +392,7 @@ class ShareGPTPromptTokenizingStrategy(PromptTokenizingStrategy):
|
||||
else:
|
||||
# everything from this is masked out from the labels
|
||||
labels = [IGNORE_TOKEN_ID] * len(res["input_ids"])
|
||||
elif output_turn:
|
||||
elif assistant_role_label in role:
|
||||
role = (
|
||||
role.replace(role_remap[1]["from"], role_remap[1]["to"])
|
||||
if role_remap
|
||||
@@ -439,7 +423,7 @@ class ShareGPTPromptTokenizingStrategy(PromptTokenizingStrategy):
|
||||
labels[:len_role] = [IGNORE_TOKEN_ID] * min(
|
||||
len_role, len(labels)
|
||||
)
|
||||
elif empty_role:
|
||||
elif role == "":
|
||||
turn = content
|
||||
# this is only ever the first part, should include the bos token and the user query
|
||||
res = self._tokenize(
|
||||
@@ -450,6 +434,11 @@ class ShareGPTPromptTokenizingStrategy(PromptTokenizingStrategy):
|
||||
else:
|
||||
# everything from this is masked out from the labels
|
||||
labels = [IGNORE_TOKEN_ID] * len(res["input_ids"])
|
||||
elif tool_role_label and tool_role_label in role:
|
||||
labels = [IGNORE_TOKEN_ID] * len(res["input_ids"])
|
||||
else:
|
||||
LOG.warning(f"unhandled role: {role}")
|
||||
continue
|
||||
|
||||
# pylint: disable=duplicate-code
|
||||
result, current_len = parse_tokenized_to_result(
|
||||
|
||||
@@ -259,12 +259,6 @@ SHAREGPT_ASSERTION_FAILED_ROLE = (
|
||||
"Role did not alternate between turns (gpt and human). Please check your data."
|
||||
)
|
||||
|
||||
CONVERSATION_ROLE_FORMAT = {
|
||||
"chatml": "<|im_start|>{ROLE}",
|
||||
"zephyr": "<|{ROLE}|>",
|
||||
"vicuna_v1.1": "{ROLE}",
|
||||
}
|
||||
|
||||
|
||||
class ShareGPTPrompter(Prompter): # pylint: disable=too-few-public-methods
|
||||
"""
|
||||
@@ -274,9 +268,7 @@ class ShareGPTPrompter(Prompter): # pylint: disable=too-few-public-methods
|
||||
role_key_human = "human"
|
||||
role_key_model = "gpt"
|
||||
# Optional, only used for tool usage datasets.
|
||||
role_key_tool: Optional[str] = None
|
||||
# Optional, role input/output mapping
|
||||
roles: Optional[dict] = None
|
||||
role_key_tool = None
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -285,7 +277,6 @@ class ShareGPTPrompter(Prompter): # pylint: disable=too-few-public-methods
|
||||
role_key_human: Optional[str] = None,
|
||||
role_key_model: Optional[str] = None,
|
||||
role_key_tool: Optional[str] = None,
|
||||
roles: Optional[dict] = None,
|
||||
):
|
||||
if conversation:
|
||||
if isinstance(conversation, Conversation):
|
||||
@@ -300,8 +291,6 @@ class ShareGPTPrompter(Prompter): # pylint: disable=too-few-public-methods
|
||||
self.role_key_model = role_key_model
|
||||
if role_key_tool:
|
||||
self.role_key_tool = role_key_tool
|
||||
if roles:
|
||||
self.roles = roles
|
||||
|
||||
def _build_result(self, source):
|
||||
if len(source) < 2:
|
||||
@@ -333,23 +322,11 @@ class ShareGPTPrompter(Prompter): # pylint: disable=too-few-public-methods
|
||||
|
||||
conv.messages = []
|
||||
for _, sentence in enumerate(source):
|
||||
from_role = sentence["from"]
|
||||
if from_role in roles:
|
||||
role = roles[from_role]
|
||||
else:
|
||||
if self._conversation.name not in CONVERSATION_ROLE_FORMAT:
|
||||
raise NotImplementedError(
|
||||
f"Role ({role}) not in default roles, and {self._conversation.name} does not support role remapping yet."
|
||||
"Please help us by creating an Issue to add support for this conversation type."
|
||||
)
|
||||
|
||||
role = CONVERSATION_ROLE_FORMAT[self._conversation.name].format(
|
||||
ROLE=from_role
|
||||
)
|
||||
|
||||
if len(conv.messages) > 0 and ((role == conv.messages[-1][0])):
|
||||
role = roles[sentence["from"]]
|
||||
if len(conv.messages) > 0 and (
|
||||
(role == conv.messages[-1][0]) or (role not in conv.roles)
|
||||
):
|
||||
LOG.warning(f"{SHAREGPT_ASSERTION_FAILED_ROLE}: {sentence}")
|
||||
|
||||
conv.append_message(role, sentence["value"])
|
||||
|
||||
return conv.get_turns()
|
||||
@@ -377,13 +354,11 @@ class ShareGPTPrompterV2(ShareGPTPrompter):
|
||||
conversation: Optional[Union[str, Conversation]] = None,
|
||||
role_key_human: Optional[str] = None,
|
||||
role_key_model: Optional[str] = None,
|
||||
roles: Optional[dict] = None,
|
||||
):
|
||||
super().__init__(
|
||||
conversation=conversation,
|
||||
role_key_human=role_key_human,
|
||||
role_key_model=role_key_model,
|
||||
roles=roles,
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -85,7 +85,7 @@ def train(
|
||||
model.generation_config.do_sample = True
|
||||
|
||||
model_ref = None
|
||||
if cfg.rl and cfg.rl != "orpo":
|
||||
if cfg.rl:
|
||||
if cfg.adapter and not cfg.rl_adapter_ref_model:
|
||||
# use built-in trl autounwrap
|
||||
LOG.debug("Passing model_ref: None to RL trainer")
|
||||
@@ -110,6 +110,9 @@ def train(
|
||||
total_num_steps,
|
||||
)
|
||||
|
||||
if hasattr(model, "config"):
|
||||
model.config.use_cache = False
|
||||
|
||||
# go ahead and presave, so we have the adapter config available to inspect
|
||||
if peft_config:
|
||||
LOG.info(f"Pre-saving adapter config to {cfg.output_dir}")
|
||||
|
||||
@@ -1,91 +0,0 @@
|
||||
"""
|
||||
module for LISA
|
||||
|
||||
Adapted from https://github.com/OptimalScale/LMFlow/pull/701 for HF transformers & Axolotl
|
||||
Arxiv: https://arxiv.org/abs/2403.17919
|
||||
License: Apache 2.0
|
||||
"""
|
||||
|
||||
import logging
|
||||
from functools import reduce
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import numpy as np
|
||||
from transformers import TrainerCallback
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from axolotl.core.trainer_builder import AxolotlTrainer
|
||||
|
||||
LOG = logging.getLogger("axolotl.callbacks.lisa")
|
||||
|
||||
|
||||
def lisa_callback_factory(trainer: "AxolotlTrainer"):
    """
    Build a LISA layer-switching callback bound to ``trainer``.

    The callback is configured from ``trainer.args.lisa_n_layers``,
    ``trainer.args.lisa_step_interval`` and ``trainer.args.lisa_layers_attribute``.
    """

    class LISACallback(TrainerCallback):
        """trainer callback for lisa layer switching"""

        def __init__(
            self, n_layers, step_interval, trainer, layers_attribute="model.layers"
        ):
            super().__init__()
            self.n_layers = n_layers
            self.step_interval = step_interval
            self.layers_attribute = layers_attribute
            self.trainer = trainer

            # Resolve the layer container once here; a bad attribute path
            # raises AttributeError at construction time.
            layers = self._get_layers()
            self.total_layers = len(layers)
            self.active_layers_indices = []

            LOG.info(
                f"LISA will activate {self.n_layers}/{len(layers)} layers ({self.n_layers*100/len(layers)}%) every {self.step_interval} steps"
            )

        def _get_layers(self):
            """Follow the dotted ``layers_attribute`` path from the trainer's model."""
            return reduce(
                getattr, self.layers_attribute.split("."), self.trainer.model
            )

        def freeze_all_layers(self):
            """Set ``requires_grad = False`` on every parameter of every layer."""
            for layer in self._get_layers():
                for param in layer.parameters():
                    param.requires_grad = False

        def on_step_begin(
            self, args, state, control, **kwargs
        ):  # pylint: disable=unused-argument
            # Switch on every multiple of step_interval (which includes step 0)
            # and additionally when global_step is exactly 1.
            if state.global_step % self.step_interval == 0 or state.global_step == 1:
                self.switch_active_layers()

        def switch_active_layers(self):
            """Freeze all layers, then unfreeze a fresh random subset of n_layers."""
            # First, disable gradients for all layers
            self.freeze_all_layers()

            # Randomly select n_layers distinct indices to activate
            layers = self._get_layers()
            self.active_layers_indices = np.random.choice(
                range(self.total_layers), self.n_layers, replace=False
            )
            LOG.info(
                f"Activating layers at indices: {self.active_layers_indices} for the next steps."
            )

            # Enable gradients only for the selected layers
            for idx in self.active_layers_indices:
                for param in layers[idx].parameters():
                    param.requires_grad = True

    lisa_callback = LISACallback(
        n_layers=trainer.args.lisa_n_layers,
        step_interval=trainer.args.lisa_step_interval,
        trainer=trainer,
        layers_attribute=trainer.args.lisa_layers_attribute,
    )

    return lisa_callback
|
||||
@@ -21,7 +21,7 @@ def chat_templates(user_choice: str):
|
||||
templates = {
|
||||
"alpaca": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '### Instruction: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ '### Response: ' + message['content'] + eos_token}}{% endif %}{% endfor %}",
|
||||
"inst": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}", # I don't know what this one is called. Used by Mistral/Mixtral.
|
||||
"chatml": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
|
||||
"chatml": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = 'You are a helpful assistant.' %}{% endif %}{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{{'<|im_start|>system\n' + system_message + '<|im_end|>\n'}}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
|
||||
"gemma": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\n' + message['content'] | trim + '<end_of_turn>\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\n'}}{% endif %}",
|
||||
}
|
||||
|
||||
|
||||
@@ -119,10 +119,6 @@ def normalize_config(cfg):
|
||||
model_config = load_model_config(cfg)
|
||||
cfg.model_config_type = model_config.model_type
|
||||
|
||||
cfg.tokenizer_config = (
|
||||
cfg.tokenizer_config or cfg.base_model_config or cfg.base_model
|
||||
)
|
||||
|
||||
# figure out if the model is llama
|
||||
cfg.is_llama_derived_model = (
|
||||
(hasattr(model_config, "model_type") and model_config.model_type == "llama")
|
||||
@@ -195,11 +191,6 @@ def normalize_cfg_datasets(cfg):
|
||||
f"updating dataset {ds_cfg.path} with `conversation: chatml` to match your chat_template"
|
||||
)
|
||||
cfg.datasets[idx].conversation = "chatml"
|
||||
if ds_cfg.type == "orpo.chat_template" and not ds_cfg.chat_template:
|
||||
LOG.info(
|
||||
f"updating dataset {ds_cfg.path} with `chat_template: chatml` to match your chat_template"
|
||||
)
|
||||
cfg.datasets[idx].chat_template = "chatml"
|
||||
|
||||
|
||||
def validate_config(cfg: DictDefault, capabilities: Optional[dict] = None):
|
||||
@@ -208,11 +199,11 @@ def validate_config(cfg: DictDefault, capabilities: Optional[dict] = None):
|
||||
dict(
|
||||
AxolotlConfigWCapabilities(
|
||||
**cfg.to_dict(), capabilities=capabilities
|
||||
).model_dump(exclude_none=True)
|
||||
).model_dump(exclude_unset=True)
|
||||
)
|
||||
)
|
||||
return DictDefault(
|
||||
dict(AxolotlInputConfig(**cfg.to_dict()).model_dump(exclude_none=True))
|
||||
dict(AxolotlInputConfig(**cfg.to_dict()).model_dump(exclude_unset=True))
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@ Module for pydantic models for configuration
|
||||
import logging
|
||||
import os
|
||||
from enum import Enum
|
||||
from typing import Any, Dict, List, Literal, Optional, Tuple, Union
|
||||
from typing import Any, Dict, List, Literal, Optional, Union
|
||||
|
||||
from pydantic import BaseModel, Field, conlist, field_validator, model_validator
|
||||
from transformers import SchedulerType
|
||||
@@ -96,8 +96,6 @@ class SFTDataset(BaseModel):
|
||||
field_human: Optional[str] = None
|
||||
field_model: Optional[str] = None
|
||||
|
||||
roles: Optional[Dict[str, List[str]]] = None
|
||||
|
||||
|
||||
class UserDefinedDPOType(BaseModel):
|
||||
"""User defined typing for DPO"""
|
||||
@@ -126,7 +124,6 @@ class RLType(str, Enum):
|
||||
dpo = "dpo" # pylint: disable=invalid-name
|
||||
ipo = "ipo" # pylint: disable=invalid-name
|
||||
kto_pair = "kto_pair" # pylint: disable=invalid-name
|
||||
orpo = "orpo" # pylint: disable=invalid-name
|
||||
|
||||
|
||||
class ChatTemplate(str, Enum):
|
||||
@@ -151,6 +148,12 @@ class PeftConfig(BaseModel):
|
||||
loftq_config: Optional[LoftQConfig] = None
|
||||
|
||||
|
||||
class AutoType(str, Enum):
    """String enum with the single value "auto"; used by the bf16 setting."""

    # members are also `str` instances, so AUTO compares equal to "auto"
    AUTO = "auto"
|
||||
|
||||
|
||||
class SpecialTokensConfig(BaseModel):
|
||||
"""Special tokens configuration subset"""
|
||||
|
||||
@@ -179,8 +182,7 @@ class LoraConfig(BaseModel):
|
||||
peft_layers_to_transform: Optional[List[int]] = None
|
||||
peft: Optional[PeftConfig] = None
|
||||
peft_use_dora: Optional[bool] = None
|
||||
peft_use_rslora: Optional[bool] = None
|
||||
peft_layer_replication: Optional[List[Tuple[int, int]]] = None
|
||||
peft_use_relora: Optional[bool] = None
|
||||
|
||||
lora_on_cpu: Optional[bool] = None
|
||||
gptq: Optional[bool] = None
|
||||
@@ -302,25 +304,14 @@ class HyperparametersConfig(BaseModel):
|
||||
},
|
||||
)
|
||||
|
||||
train_on_inputs: Optional[bool] = False
|
||||
train_on_inputs: Optional[bool] = None
|
||||
group_by_length: Optional[bool] = None
|
||||
|
||||
learning_rate: Union[str, float]
|
||||
weight_decay: Optional[float] = 0.0
|
||||
optimizer: Optional[
|
||||
Union[OptimizerNames, Literal["lion_pytorch"]]
|
||||
] = OptimizerNames.ADAMW_HF.value
|
||||
optim_args: Optional[Union[str, Dict[str, Any]]] = Field(
|
||||
default=None, metadata={"help": "Optional arguments to supply to optimizer."}
|
||||
)
|
||||
optim_target_modules: Optional[Union[List[str], Literal["all_linear"]]] = Field(
|
||||
default=None,
|
||||
metadata={
|
||||
"help": "The target modules to optimize, i.e. the module names that you would like to train."
|
||||
},
|
||||
)
|
||||
weight_decay: Optional[float] = None
|
||||
optimizer: Optional[Union[OptimizerNames, Literal["lion_pytorch"]]] = None
|
||||
torchdistx_path: Optional[str] = None
|
||||
lr_scheduler: Optional[SchedulerType] = "cosine"
|
||||
lr_scheduler: Optional[SchedulerType] = None
|
||||
lr_scheduler_kwargs: Optional[Dict[str, Any]] = None
|
||||
lr_quadratic_warmup: Optional[bool] = None
|
||||
cosine_min_lr_ratio: Optional[float] = None
|
||||
@@ -370,23 +361,6 @@ class MLFlowConfig(BaseModel):
|
||||
hf_mlflow_log_artifacts: Optional[bool] = None
|
||||
|
||||
|
||||
class LISAConfig(BaseModel):
    """Configuration fields for LISA layer switching."""

    # how many layers are active at a time; unset by default
    lisa_n_layers: Optional[int] = Field(
        default=None,
        metadata={"help": "the number of activate layers in LISA"},
    )

    # number of steps between layer switches; unset by default
    lisa_step_interval: Optional[int] = Field(
        default=None,
        metadata={"help": "how often to switch layers in LISA"},
    )

    # dotted attribute path from the model to its layer container
    lisa_layers_attribute: Optional[str] = Field(
        default="model.layers",
        metadata={"help": "path under the model to access the layers"},
    )
|
||||
|
||||
|
||||
class WandbConfig(BaseModel):
|
||||
"""wandb configuration subset"""
|
||||
|
||||
@@ -421,7 +395,6 @@ class AxolotlInputConfig(
|
||||
HyperparametersConfig,
|
||||
WandbConfig,
|
||||
MLFlowConfig,
|
||||
LISAConfig,
|
||||
RemappedParameters,
|
||||
DeprecatedParameters,
|
||||
BaseModel,
|
||||
@@ -442,7 +415,6 @@ class AxolotlInputConfig(
|
||||
|
||||
datasets: Optional[conlist(Union[SFTDataset, DPODataset], min_length=1)] = None # type: ignore
|
||||
test_datasets: Optional[conlist(Union[SFTDataset, DPODataset], min_length=1)] = None # type: ignore
|
||||
shuffle_merged_datasets: Optional[bool] = True
|
||||
dataset_prepared_path: Optional[str] = None
|
||||
dataset_shard_num: Optional[int] = None
|
||||
dataset_shard_idx: Optional[int] = None
|
||||
@@ -459,8 +431,6 @@ class AxolotlInputConfig(
|
||||
dataloader_prefetch_factor: Optional[int] = None
|
||||
dataloader_drop_last: Optional[bool] = None
|
||||
|
||||
remove_unused_columns: Optional[bool] = None
|
||||
|
||||
push_dataset_to_hub: Optional[str] = None
|
||||
hf_use_auth_token: Optional[bool] = None
|
||||
|
||||
@@ -488,7 +458,7 @@ class AxolotlInputConfig(
|
||||
loss_watchdog_threshold: Optional[float] = None
|
||||
loss_watchdog_patience: Optional[int] = None
|
||||
|
||||
bf16: Optional[Union[Literal["auto"], bool]] = "auto"
|
||||
bf16: Optional[Union[AutoType, bool]] = AutoType.AUTO
|
||||
fp16: Optional[bool] = None
|
||||
bfloat16: Optional[bool] = None # for non-AMP cases
|
||||
float16: Optional[bool] = None # for non-AMP cases
|
||||
@@ -502,7 +472,7 @@ class AxolotlInputConfig(
|
||||
|
||||
unfrozen_parameters: Optional[List[str]] = None
|
||||
|
||||
sequence_len: int = Field(default=512)
|
||||
sequence_len: int = Field(default=1024)
|
||||
sample_packing: Optional[bool] = None
|
||||
eval_sample_packing: Optional[bool] = None
|
||||
pad_to_sequence_len: Optional[bool] = None
|
||||
@@ -545,13 +515,10 @@ class AxolotlInputConfig(
|
||||
|
||||
neftune_noise_alpha: Optional[float] = None
|
||||
|
||||
orpo_alpha: Optional[float] = None
|
||||
|
||||
max_memory: Optional[
|
||||
Dict[Union[int, Literal["cpu", "disk"]], Union[int, str]]
|
||||
] = None
|
||||
gpu_memory_limit: Optional[Union[int, str]] = None
|
||||
low_cpu_mem_usage: Optional[bool] = None
|
||||
|
||||
chat_template: Optional[ChatTemplate] = None
|
||||
default_system_message: Optional[str] = None
|
||||
@@ -564,10 +531,10 @@ class AxolotlInputConfig(
|
||||
sample_packing_eff_est: Optional[float] = None
|
||||
axolotl_config_path: Optional[str] = None
|
||||
|
||||
is_falcon_derived_model: Optional[bool] = Field(default=None)
|
||||
is_llama_derived_model: Optional[bool] = Field(default=None)
|
||||
is_mistral_derived_model: Optional[bool] = Field(default=None)
|
||||
is_qwen_derived_model: Optional[bool] = Field(default=None)
|
||||
is_falcon_derived_model: Optional[bool] = Field(default=False)
|
||||
is_llama_derived_model: Optional[bool] = Field(default=False)
|
||||
is_mistral_derived_model: Optional[bool] = Field(default=False)
|
||||
is_qwen_derived_model: Optional[bool] = Field(default=False)
|
||||
|
||||
@field_validator("datasets", mode="before")
|
||||
@classmethod
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
"""Module containing data utilities"""
|
||||
|
||||
import functools
|
||||
import hashlib
|
||||
import logging
|
||||
@@ -135,7 +134,7 @@ def load_tokenized_prepared_datasets(
|
||||
split="train",
|
||||
) -> Tuple[DatasetDict, List[Prompter]]:
|
||||
cfg_datasets = cfg.test_datasets if split == "test" else cfg.datasets
|
||||
tokenizer_name = cfg.tokenizer_config
|
||||
tokenizer_name = tokenizer.__class__.__name__
|
||||
ds_hash = str(
|
||||
md5(
|
||||
(
|
||||
@@ -224,7 +223,7 @@ def load_tokenized_prepared_datasets(
|
||||
token=use_auth_token,
|
||||
)
|
||||
ds_from_hub = True
|
||||
except (FileNotFoundError, ConnectionError, HFValidationError, ValueError):
|
||||
except (FileNotFoundError, ConnectionError, HFValidationError):
|
||||
pass
|
||||
|
||||
ds_from_cloud = False
|
||||
@@ -291,17 +290,14 @@ def load_tokenized_prepared_datasets(
|
||||
local_path = Path(config_dataset.path)
|
||||
if local_path.exists():
|
||||
if local_path.is_dir():
|
||||
if config_dataset.data_files:
|
||||
ds_type = get_ds_type(config_dataset)
|
||||
ds = load_dataset(
|
||||
ds_type,
|
||||
name=config_dataset.name,
|
||||
data_files=config_dataset.data_files,
|
||||
streaming=False,
|
||||
split=None,
|
||||
)
|
||||
else:
|
||||
ds = load_from_disk(config_dataset.path)
|
||||
# TODO dirs with arrow or parquet files could be loaded with `load_from_disk`
|
||||
ds = load_dataset(
|
||||
config_dataset.path,
|
||||
name=config_dataset.name,
|
||||
data_files=config_dataset.data_files,
|
||||
streaming=False,
|
||||
split=None,
|
||||
)
|
||||
elif local_path.is_file():
|
||||
ds_type = get_ds_type(config_dataset)
|
||||
|
||||
@@ -419,11 +415,8 @@ def load_tokenized_prepared_datasets(
|
||||
dataset = concatenate_datasets(datasets)
|
||||
|
||||
if len(datasets) > 1:
|
||||
if cfg.shuffle_merged_datasets:
|
||||
LOG.debug("shuffle merged datasets")
|
||||
dataset = dataset.shuffle(seed=seed)
|
||||
else:
|
||||
LOG.debug("NOT shuffling merged datasets")
|
||||
LOG.info("shuffle merged datasets")
|
||||
dataset = dataset.shuffle(seed=seed)
|
||||
|
||||
dataset, _ = process_datasets_for_packing(cfg, dataset, None)
|
||||
|
||||
@@ -826,11 +819,7 @@ def wrap_pretraining_dataset(
|
||||
else:
|
||||
encode = functools.partial(encode_pretraining, tokenizer, max_tokens)
|
||||
|
||||
if cfg.shuffle_merged_datasets:
|
||||
dataset = dataset.shuffle(seed=seed, buffer_size=buffer_size)
|
||||
else:
|
||||
LOG.debug("NOT shuffling merged pretraining datasets")
|
||||
|
||||
dataset = dataset.shuffle(seed=seed, buffer_size=buffer_size)
|
||||
dataset = dataset.map(
|
||||
encode,
|
||||
batched=True,
|
||||
|
||||
@@ -3,7 +3,7 @@ module to freeze/unfreeze parameters by name
|
||||
"""
|
||||
import logging
|
||||
import re
|
||||
from typing import Callable, List, Tuple, Union
|
||||
from typing import Callable, List, Tuple
|
||||
|
||||
from axolotl.utils.distributed import is_main_process
|
||||
|
||||
@@ -99,7 +99,7 @@ def _invert_ranges(
|
||||
|
||||
|
||||
def _merge_ranges(
|
||||
given_ranges: List[Tuple[int, Union[int, None]]], layer_size: int
|
||||
given_ranges: List[Tuple[int, int | None]], layer_size: int
|
||||
) -> List[Tuple[int, int]]:
|
||||
"""
|
||||
Merges overlapping ranges and sorts the given ranges.
|
||||
@@ -194,9 +194,7 @@ class LayerNamePattern:
|
||||
"""
|
||||
return self.name_regex.match(name) is not None
|
||||
|
||||
def _parse_pattern(
|
||||
self, pattern: str
|
||||
) -> Tuple[str, Union[Tuple[int, Union[int, None]], None]]:
|
||||
def _parse_pattern(self, pattern: str) -> Tuple[str, Tuple[int, int | None] | None]:
|
||||
"""
|
||||
Extracts the range pattern from the given pattern.
|
||||
|
||||
|
||||
@@ -5,14 +5,16 @@ import logging
|
||||
import math
|
||||
import os
|
||||
import types
|
||||
from typing import Any, Dict, Optional, Tuple, Union # noqa: F401
|
||||
from typing import Any, Dict, List, Optional, Tuple, Type, Union # noqa: F401
|
||||
|
||||
import addict
|
||||
import bitsandbytes as bnb
|
||||
import safetensors
|
||||
import torch
|
||||
import transformers
|
||||
from accelerate import init_empty_weights
|
||||
from bitsandbytes.nn import Params4bit
|
||||
from bitsandbytes.nn import Linear4bit, Params4bit
|
||||
from fastcore.parallel import parallel
|
||||
from peft import (
|
||||
LoftQConfig,
|
||||
PeftConfig,
|
||||
@@ -21,7 +23,7 @@ from peft import (
|
||||
prepare_model_for_kbit_training,
|
||||
)
|
||||
from peft.tuners.lora import QuantLinear
|
||||
from torch import nn
|
||||
from torch import Tensor, nn
|
||||
from transformers import ( # noqa: F401
|
||||
AddedToken,
|
||||
AutoConfig,
|
||||
@@ -33,7 +35,9 @@ from transformers import ( # noqa: F401
|
||||
PreTrainedTokenizerBase,
|
||||
)
|
||||
from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
|
||||
from transformers.utils import SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, hub
|
||||
|
||||
from axolotl.core.policies.auto_wrap import SUPPORTED_AUTO_WRAP_MODEL_TYPES
|
||||
from axolotl.models.mamba import fix_mamba_attn_for_loss
|
||||
from axolotl.monkeypatch.multipack import (
|
||||
SUPPORTED_MULTIPACK_MODEL_TYPES,
|
||||
@@ -134,8 +138,9 @@ def load_tokenizer(cfg):
|
||||
if cfg.tokenizer_type:
|
||||
tokenizer_cls = getattr(transformers, cfg.tokenizer_type)
|
||||
|
||||
tokenizer_config = cfg.tokenizer_config or cfg.base_model_config or cfg.base_model
|
||||
tokenizer = tokenizer_cls.from_pretrained(
|
||||
cfg.tokenizer_config,
|
||||
tokenizer_config,
|
||||
trust_remote_code=cfg.trust_remote_code or False,
|
||||
use_fast=use_fast,
|
||||
**tokenizer_kwargs,
|
||||
@@ -267,6 +272,117 @@ def load_tokenizer(cfg):
|
||||
return tokenizer
|
||||
|
||||
|
||||
def replace_linear(
|
||||
model: nn.Module,
|
||||
linear_replacement: Type[nn.Module],
|
||||
quant_config: Union[dict, None] = None,
|
||||
skip_modules=None,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
Replace linear modules with a new Linear module.
|
||||
Parameters:
|
||||
model (`torch.nn.Module`):
|
||||
Input model or `torch.nn.Module` as the function is run recursively.
|
||||
linear_replacement (`torch.nn.Module`):
|
||||
The linear module that replaces the old one. Only expects standard arguments.
|
||||
If other arguments need to be passed, use a lambda.
|
||||
skip_modules (`List[str]`, *optional*, defaults to `lm_head`):
|
||||
List of modules names not to convert. Defaults to `lm_head`.
|
||||
"""
|
||||
if skip_modules is None:
|
||||
skip_modules = ["lm_head"]
|
||||
for name, module in model.named_children():
|
||||
if len(list(module.children())) > 0:
|
||||
replace_linear(
|
||||
module, linear_replacement, quant_config, skip_modules, **kwargs
|
||||
)
|
||||
|
||||
if isinstance(module, torch.nn.Linear) and name not in skip_modules:
|
||||
if issubclass(linear_replacement, Linear4bit):
|
||||
model._modules[ # pylint: disable=protected-access
|
||||
name
|
||||
] = linear_replacement(
|
||||
module.in_features,
|
||||
module.out_features,
|
||||
module.bias is not None,
|
||||
**kwargs,
|
||||
)
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Unsupported linear replacement: {type(linear_replacement)}"
|
||||
)
|
||||
return model
|
||||
|
||||
|
||||
def load_and_quantize(
|
||||
module: nn.Module,
|
||||
name: str,
|
||||
value: Tensor,
|
||||
device: torch.device = None,
|
||||
dtype: torch.dtype = None,
|
||||
skip_names: Optional[List[str]] = None,
|
||||
is_meta_rank: bool = False,
|
||||
low_memory: bool = True,
|
||||
verbose: bool = False,
|
||||
quant_method: str = "bnb",
|
||||
):
|
||||
"""
|
||||
Loads `value` tensor into submodule of `module`, optionally skipping `skip_names` and converting to `dtype`.
|
||||
|
||||
Quantizes `Params4bit` on `device` then places on "cpu" if low_memory=True or "meta" if is_meta_rank=True.
|
||||
"""
|
||||
|
||||
if skip_names is None:
|
||||
skip_names = []
|
||||
|
||||
def place_on_device(value):
|
||||
if is_meta_rank:
|
||||
device = "meta"
|
||||
elif low_memory:
|
||||
device = "cpu"
|
||||
else:
|
||||
device = "cuda"
|
||||
return value.to(device=device, dtype=dtype)
|
||||
|
||||
if any(skip_name in name for skip_name in skip_names):
|
||||
if verbose:
|
||||
print(f"Skipping {name} because it is in skip_names")
|
||||
return
|
||||
|
||||
module_key, _, value_key = name.rpartition(".")
|
||||
try:
|
||||
submodule = module.get_submodule(module_key)
|
||||
except AttributeError as exc:
|
||||
print(f"Module {module_key} not found:\n{exc}")
|
||||
return
|
||||
|
||||
try:
|
||||
if quant_method == "bnb":
|
||||
param = submodule.get_parameter(value_key)
|
||||
if isinstance(param, Params4bit):
|
||||
# With `sync_module_states=True`, a meta device Params4bit needs to be the same
|
||||
# shape as the quantized Params4bit with an initialized quant_state. However,
|
||||
# FSDP only syncs parameters and buffers, so the quant_state isn't copied. This
|
||||
# workaround quantizes Params4bit to initialize quant_state on all ranks, then
|
||||
# replaces Params4bit's data with a meta tensor to free memory on non-rank 0.
|
||||
value = type(param)(
|
||||
value.to(device=device, dtype=dtype).data, **param.__dict__
|
||||
).cuda(device)
|
||||
if is_meta_rank:
|
||||
value = type(param)(value.data.to("meta"), **value.__dict__)
|
||||
elif low_memory:
|
||||
value = type(param)(value.data.to("cpu"), **value.__dict__)
|
||||
else:
|
||||
value = type(param)(place_on_device(value).data)
|
||||
|
||||
except AttributeError:
|
||||
# it's a buffer
|
||||
value = place_on_device(value)
|
||||
|
||||
setattr(submodule, value_key, value)
|
||||
|
||||
|
||||
def load_model(
|
||||
cfg: DictDefault,
|
||||
tokenizer: PreTrainedTokenizerBase,
|
||||
@@ -402,9 +518,7 @@ def load_model(
|
||||
from accelerate import infer_auto_device_map
|
||||
|
||||
with init_empty_weights():
|
||||
model_canvas = AutoModelForCausalLM.from_config(
|
||||
model_config, trust_remote_code=cfg.trust_remote_code or False
|
||||
)
|
||||
model_canvas = AutoModelForCausalLM.from_config(model_config)
|
||||
model_canvas.tie_weights()
|
||||
device_map = infer_auto_device_map(
|
||||
model_canvas,
|
||||
@@ -454,12 +568,7 @@ def load_model(
|
||||
"bnb_4bit_compute_dtype": cfg.torch_dtype,
|
||||
"bnb_4bit_use_double_quant": True,
|
||||
"bnb_4bit_quant_type": "nf4",
|
||||
"bnb_4bit_quant_storage": torch.bfloat16,
|
||||
}
|
||||
if not cfg.deepspeed:
|
||||
# for some reason, this causes the loss to be off by an order of magnitude
|
||||
# but deepspeed needs this still in bfloat16
|
||||
bnb_config["bnb_4bit_quant_storage"] = torch.float32
|
||||
|
||||
if cfg.bnb_config_kwargs:
|
||||
bnb_config.update(cfg.bnb_config_kwargs)
|
||||
@@ -508,13 +617,78 @@ def load_model(
|
||||
model_kwargs["attn_implementation"] = "eager"
|
||||
model_config._attn_implementation = "eager" # pylint: disable=protected-access
|
||||
|
||||
if cfg.low_cpu_mem_usage:
|
||||
model_kwargs["low_cpu_mem_usage"] = True
|
||||
|
||||
qlora_fsdp = cfg.fsdp and cfg.adapter == "qlora"
|
||||
qlora_fsdp = (
|
||||
cfg.fsdp
|
||||
and cfg.adapter == "qlora"
|
||||
and model_config.model_type in SUPPORTED_AUTO_WRAP_MODEL_TYPES
|
||||
)
|
||||
|
||||
try:
|
||||
if (
|
||||
if qlora_fsdp:
|
||||
if cfg.bf16 or cfg.bfloat16:
|
||||
torch_dtype, compute_dtype = torch.float32, torch.bfloat16
|
||||
elif cfg.fp16 or cfg.float16:
|
||||
torch_dtype, compute_dtype = torch.float32, torch.float16
|
||||
else:
|
||||
torch_dtype, compute_dtype = torch.float32, torch.float16
|
||||
|
||||
with init_empty_weights():
|
||||
LOG.info("Loading model with empty weights.")
|
||||
model = AutoModelForCausalLM.from_config(model_config)
|
||||
model.model = replace_linear(
|
||||
model.model,
|
||||
Linear4bit,
|
||||
compute_dtype=compute_dtype,
|
||||
quant_type="nf4",
|
||||
quant_storage=torch_dtype,
|
||||
)
|
||||
|
||||
model.is_loaded_in_4bit = True
|
||||
|
||||
# Grab the safetensors files that hold the weights
|
||||
try:
|
||||
idx = hub.cached_file(base_model, SAFE_WEIGHTS_INDEX_NAME)
|
||||
files, _ = hub.get_checkpoint_shard_files(base_model, idx)
|
||||
except OSError:
|
||||
try:
|
||||
# This means the model doesn't have a model.safetensors.index.json because it is not sharded
|
||||
files = []
|
||||
files.append(hub.cached_file(base_model, SAFE_WEIGHTS_NAME))
|
||||
except OSError as exc:
|
||||
# This means the model probably doesn't have a safetensors file
|
||||
raise exc
|
||||
|
||||
# Load in the weights, using our custom load_and_quantize method which quantizes Params4bit on the fly
|
||||
# and then places each layer on CPU or meta if using low_memory to minimize GPU memory usage
|
||||
def load_and_quantize_parallel(name_param, model, **kwargs):
|
||||
name, param = name_param
|
||||
load_and_quantize(model, name, param, **kwargs)
|
||||
|
||||
param_count = sum((p.numel() for n, p in model.named_parameters()))
|
||||
for filename in files:
|
||||
weights = safetensors.torch.load_file(filename)
|
||||
quant_method = "bnb"
|
||||
devprops = torch.cuda.get_device_properties(torch.cuda.current_device())
|
||||
left = int(os.cpu_count() / torch.cuda.device_count())
|
||||
right = int(
|
||||
8 * (devprops.total_memory / 1e9 / 40) * (70 / (param_count / 1e9))
|
||||
)
|
||||
n_workers = min(left, right)
|
||||
parallel(
|
||||
load_and_quantize_parallel,
|
||||
weights.items(),
|
||||
n_workers=n_workers,
|
||||
threadpool=True,
|
||||
model=model,
|
||||
dtype=torch_dtype,
|
||||
device=cfg.local_rank,
|
||||
skip_names=[],
|
||||
is_meta_rank=(cfg.local_rank != 0),
|
||||
verbose=False,
|
||||
quant_method=quant_method,
|
||||
)
|
||||
|
||||
elif (
|
||||
model_config.model_type == "llama"
|
||||
and not cfg.trust_remote_code
|
||||
and not cfg.gptq
|
||||
@@ -541,6 +715,27 @@ def load_model(
|
||||
if cfg.flash_attn_fuse_qkv:
|
||||
LOG.info("patching with fused QKV")
|
||||
replace_llama_qkv_with_fused(model)
|
||||
elif (
|
||||
model_config.model_type == "mixtral"
|
||||
and not cfg.adapter
|
||||
and cfg.fuse_moe
|
||||
):
|
||||
from axolotl.monkeypatch.utils import set_module_name
|
||||
from axolotl.monkeypatch.moe.moe import SparseMoeBlock
|
||||
from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock
|
||||
|
||||
for name, module in model.named_modules():
|
||||
if isinstance(module, MixtralSparseMoeBlock):
|
||||
smoe = SparseMoeBlock(
|
||||
experts=module.experts,
|
||||
gate=module.gate,
|
||||
hidden_dim=module.hidden_dim,
|
||||
ffn_dim=module.ffn_dim,
|
||||
num_experts=module.num_experts,
|
||||
top_k=module.top_k,
|
||||
)
|
||||
set_module_name(model, name, smoe)
|
||||
|
||||
elif model_type == "MambaLMHeadModel":
|
||||
# FIXME this is janky at best and hacked together to make it work
|
||||
MambaLMHeadModel = fix_mamba_attn_for_loss() # pylint: disable=invalid-name
|
||||
@@ -688,9 +883,7 @@ def load_model(
|
||||
|
||||
if cfg.adapter in ["lora", "qlora"]:
|
||||
if cfg.gradient_checkpointing:
|
||||
model.gradient_checkpointing_enable(
|
||||
gradient_checkpointing_kwargs=cfg.gradient_checkpointing_kwargs
|
||||
)
|
||||
model.gradient_checkpointing_enable()
|
||||
if (
|
||||
cfg.load_in_8bit or cfg.load_in_4bit
|
||||
) and not skip_prepare_model_for_kbit_training:
|
||||
@@ -858,9 +1051,7 @@ def load_lora(model, cfg, inference=False, config_only=False):
|
||||
if cfg.peft_use_dora:
|
||||
lora_config_kwargs["use_dora"] = cfg.peft_use_dora
|
||||
if cfg.peft_use_rslora:
|
||||
lora_config_kwargs["use_rslora"] = cfg.peft_use_rslora
|
||||
if cfg.peft_layer_replication:
|
||||
lora_config_kwargs["layer_replication"] = cfg.peft_layer_replication
|
||||
lora_config_kwargs["use_rslora"] = cfg.use_rslora
|
||||
|
||||
lora_config = LoraConfig(
|
||||
r=cfg.lora_r,
|
||||
|
||||
@@ -11,7 +11,6 @@ import torch.cuda
|
||||
from accelerate.logging import get_logger
|
||||
from datasets import set_caching_enabled
|
||||
from torch.utils.data import DataLoader, RandomSampler
|
||||
from transformers.utils import is_torch_bf16_gpu_available
|
||||
|
||||
from axolotl.core.trainer_builder import HFCausalTrainerBuilder, HFDPOTrainerBuilder
|
||||
from axolotl.utils.distributed import is_main_process, reduce_and_broadcast, zero_first
|
||||
@@ -125,10 +124,9 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
|
||||
eval_dataset = eval_dataset.remove_columns("attention_mask")
|
||||
|
||||
if cfg.model_config_type == "falcon":
|
||||
LOG.info("dropping token_type_ids column if it exists")
|
||||
if "token_type_ids" in train_dataset.column_names:
|
||||
train_dataset = train_dataset.remove_columns("token_type_ids")
|
||||
if eval_dataset and "token_type_ids" in eval_dataset.column_names:
|
||||
LOG.info("dropping token_type_ids column")
|
||||
train_dataset = train_dataset.remove_columns("token_type_ids")
|
||||
if eval_dataset:
|
||||
eval_dataset = eval_dataset.remove_columns("token_type_ids")
|
||||
|
||||
train_dataset = train_dataset.filter(
|
||||
@@ -306,14 +304,8 @@ def setup_fsdp_envs(cfg):
|
||||
os.environ["FSDP_OFFLOAD_PARAMS"] = "true"
|
||||
if cfg.fsdp_config.fsdp_sync_module_states:
|
||||
os.environ["FSDP_SYNC_MODULE_STATES"] = "true"
|
||||
if cfg.fsdp_config.fsdp_cpu_ram_efficient_loading:
|
||||
os.environ["FSDP_CPU_RAM_EFFICIENT_LOADING"] = "true"
|
||||
if cfg.fsdp_config.fsdp_use_orig_params:
|
||||
os.environ["FSDP_USE_ORIG_PARAMS"] = "true"
|
||||
if cfg.fsdp_config.fsdp_state_dict_type:
|
||||
os.environ["FSDP_STATE_DICT_TYPE"] = cfg.fsdp_config.fsdp_state_dict_type
|
||||
if cfg.fsdp_config.fsdp_auto_wrap_policy:
|
||||
os.environ["FSDP_AUTO_WRAP_POLICY"] = cfg.fsdp_config.fsdp_auto_wrap_policy
|
||||
if cfg.fsdp_config.fsdp_transformer_layer_cls_to_wrap:
|
||||
os.environ[
|
||||
"FSDP_TRANSFORMER_CLS_TO_WRAP"
|
||||
@@ -327,11 +319,6 @@ def prepare_optim_env(cfg):
|
||||
os.environ["ACCELERATE_USE_DEEPSPEED"] = "true"
|
||||
os.environ["ACCELERATE_DEEPSPEED_CONFIG_FILE"] = cfg.deepspeed
|
||||
|
||||
if (cfg.bf16 == "auto" and is_torch_bf16_gpu_available()) or cfg.bf16 is True:
|
||||
os.environ["ACCELERATE_MIXED_PRECISION"] = "bf16"
|
||||
elif cfg.fp16:
|
||||
os.environ["ACCELERATE_MIXED_PRECISION"] = "fp16"
|
||||
|
||||
|
||||
def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_steps):
|
||||
if cfg.rl in ["dpo", "ipo", "kto_pair"]:
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
/* css styles */
|
||||
@@ -1,18 +1,16 @@
|
||||
"""
|
||||
unit tests for axolotl.core.trainer_builder
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from axolotl.core.trainer_builder import HFDPOTrainerBuilder
|
||||
from axolotl.utils.config import normalize_config
|
||||
from axolotl.utils.dict import DictDefault
|
||||
from axolotl.utils.models import load_model, load_tokenizer
|
||||
|
||||
|
||||
@pytest.fixture(name="cfg")
|
||||
def fixture_cfg():
|
||||
cfg = DictDefault(
|
||||
return DictDefault(
|
||||
{
|
||||
"base_model": "TinyLlama/TinyLlama-1.1B-Chat-v0.6",
|
||||
"model_type": "AutoModelForCausalLM",
|
||||
@@ -36,10 +34,6 @@ def fixture_cfg():
|
||||
}
|
||||
)
|
||||
|
||||
normalize_config(cfg)
|
||||
|
||||
return cfg
|
||||
|
||||
|
||||
@pytest.fixture(name="tokenizer")
|
||||
def fixture_tokenizer(cfg):
|
||||
|
||||
@@ -77,7 +77,7 @@ class TestMixtral(unittest.TestCase):
|
||||
model, _ = train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
|
||||
assert (
|
||||
model.base_model.model.model.layers[0].block_sparse_moe.gate.weight.dtype
|
||||
== torch.float32
|
||||
== torch.uint8
|
||||
)
|
||||
assert (Path(temp_dir) / "adapter_model.bin").exists()
|
||||
|
||||
@@ -131,7 +131,7 @@ class TestMixtral(unittest.TestCase):
|
||||
model, _ = train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
|
||||
assert (
|
||||
model.base_model.model.model.layers[0].block_sparse_moe.gate.weight.dtype
|
||||
== torch.float32
|
||||
== torch.uint8
|
||||
)
|
||||
assert (Path(temp_dir) / "adapter_model.bin").exists()
|
||||
|
||||
|
||||
60
tests/monkeypatch/test_moe.py
Normal file
60
tests/monkeypatch/test_moe.py
Normal file
@@ -0,0 +1,60 @@
|
||||
import torch
|
||||
import pytest
|
||||
from torch import nn
|
||||
from torch.nn import functional as F
|
||||
from axolotl.monkeypatch.moe.mlp import FusedExperts
|
||||
from axolotl.monkeypatch.moe.moe import SparseMoeBlock
|
||||
|
||||
from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock, MixtralConfig
|
||||
|
||||
def test_fused_mixtral_moe():
|
||||
# NOTE: Requires torch 2.2.0
|
||||
# Set random seeds for reproducibility
|
||||
torch.set_default_dtype(torch.float16)
|
||||
torch.set_default_device("cuda")
|
||||
torch.manual_seed(0)
|
||||
|
||||
# Define the configuration for the MixtralSparseMoeBlock
|
||||
config = MixtralConfig(
|
||||
hidden_size=128,
|
||||
intermediate_size=512,
|
||||
num_local_experts=8,
|
||||
num_experts_per_tok=2,
|
||||
)
|
||||
|
||||
# Initialize the MixtralSparseMoeBlock and SparseMoeBlock with the same configuration
|
||||
mixtral_moe = MixtralSparseMoeBlock(config)
|
||||
sparse_moe = SparseMoeBlock(
|
||||
experts=mixtral_moe.experts,
|
||||
gate=mixtral_moe.gate,
|
||||
hidden_dim=config.hidden_size,
|
||||
ffn_dim=config.intermediate_size,
|
||||
num_experts=config.num_local_experts,
|
||||
top_k=config.num_experts_per_tok
|
||||
)
|
||||
|
||||
assert torch.cat([
|
||||
mixtral_moe.experts[0].w1.weight.data,
|
||||
mixtral_moe.experts[0].w3.weight.data], dim=0
|
||||
).equal(sparse_moe.experts.experts.weight[0])
|
||||
|
||||
# Generate random input data
|
||||
batch_size = 16
|
||||
sequence_length = 32
|
||||
input_data = torch.randn(batch_size, sequence_length, config.hidden_size)
|
||||
|
||||
# Run the forward pass with gradients for both models
|
||||
with torch.no_grad():
|
||||
mixtral_output, mixtral_router_logits = mixtral_moe(input_data)
|
||||
sparse_output, sparse_router_logits = sparse_moe(input_data)
|
||||
|
||||
# Compute the difference between the outputs
|
||||
output_diff = torch.abs(mixtral_output - sparse_output).mean().item()
|
||||
router_diff = torch.abs(mixtral_router_logits - sparse_router_logits).mean().item()
|
||||
|
||||
# Define the tolerance for the difference
|
||||
tolerance = 0.05
|
||||
|
||||
# # Check if the difference is within the tolerance
|
||||
assert output_diff < 0.05, f"Output difference is {output_diff}, which is greater than the tolerance of {tolerance}"
|
||||
assert router_diff == 0, f"Output difference is {output_diff}, which is greater than the tolerance of {tolerance}"
|
||||
@@ -62,38 +62,6 @@ def fixture_sharegpt_glaive_dataset():
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture(name="multi_role_dataset")
|
||||
def fixture_multi_role_dataset():
|
||||
return Dataset.from_list(
|
||||
[
|
||||
{
|
||||
"conversations": [
|
||||
{
|
||||
"from": "system",
|
||||
"value": "use get_weather(city) to get the weather for a city",
|
||||
},
|
||||
{
|
||||
"from": "human",
|
||||
"value": "hello, what's the weather in New York?",
|
||||
},
|
||||
{
|
||||
"from": "gpt",
|
||||
"value": "let me get that for you",
|
||||
},
|
||||
{
|
||||
"from": "tool",
|
||||
"value": "get_weather(New York)",
|
||||
},
|
||||
{
|
||||
"from": "gpt",
|
||||
"value": "the weather in New York is 70 degrees and sunny",
|
||||
},
|
||||
]
|
||||
}
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture(name="tokenizer")
|
||||
def fixture_tokenizer():
|
||||
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
|
||||
@@ -228,39 +196,3 @@ class TestSharegpt:
|
||||
32001, 13892, 13, 28737, 28742, 28719, 7371, 28725, 562, 315, 949, 28742, 28707, 506, 272, 21368, 298, 1820, 22447, 28723, 28705, 523, 28766, 416, 1009, 772, 28766, 28767, 32000, 28705, 13 # gpt
|
||||
]
|
||||
# fmt: on
|
||||
|
||||
def test_multi_role_dataset(self, multi_role_dataset, tokenizer):
|
||||
strategy = SimpleShareGPTPromptTokenizingStrategy(
|
||||
ShareGPTPrompterV2(conversation="chatml", roles={"input": ["tool"]}),
|
||||
tokenizer,
|
||||
False, # train_on_inputs
|
||||
2048, # sequence_len
|
||||
)
|
||||
|
||||
dataset_wrapper = TokenizedPromptDataset(
|
||||
strategy, multi_role_dataset, process_count=1
|
||||
)
|
||||
|
||||
input_ids = dataset_wrapper[0]["input_ids"]
|
||||
# fmt: off
|
||||
assert input_ids == [
|
||||
1, # bos
|
||||
32001, 1587, 13, 1730, 625, 28730, 769, 1223, 28732, 18373, 28731, 298, 625, 272, 8086, 354, 264, 2990, 32000, 28705, 13, # system
|
||||
32001, 2188, 13, 21558, 28725, 767, 28742, 28713, 272, 8086, 297, 1450, 2726, 28804, 32000, 28705, 13, # human
|
||||
32001, 13892, 13, 895, 528, 625, 369, 354, 368, 32000, 28705, 13, # gpt
|
||||
32001, 3921, 13, 527, 28730, 769, 1223, 28732, 2972, 2726, 28731, 32000, 28705, 13, # tool
|
||||
32001, 13892, 13, 1237, 8086, 297, 1450, 2726, 349, 28705, 28787, 28734, 11182, 304, 4376, 1780, 32000, 28705, 13 # gpt
|
||||
]
|
||||
# fmt: on
|
||||
|
||||
labels = dataset_wrapper[0]["labels"]
|
||||
# fmt: off
|
||||
assert labels == [
|
||||
-100, # bos
|
||||
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, # system
|
||||
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, # human
|
||||
-100, -100, 13, 895, 528, 625, 369, 354, 368, 32000, 28705, 13, # gpt
|
||||
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, # tool
|
||||
-100, -100, 13, 1237, 8086, 297, 1450, 2726, 349, 28705, 28787, 28734, 11182, 304, 4376, 1780, 32000, 28705, 13 # gpt
|
||||
]
|
||||
# fmt: on
|
||||
|
||||
@@ -1,272 +0,0 @@
|
||||
"""
|
||||
Test dataset loading under various conditions.
|
||||
"""
|
||||
|
||||
import shutil
|
||||
import tempfile
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
|
||||
from datasets import Dataset
|
||||
from huggingface_hub import snapshot_download
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from axolotl.utils.data import load_tokenized_prepared_datasets
|
||||
from axolotl.utils.dict import DictDefault
|
||||
|
||||
|
||||
class TestDatasetPreparation(unittest.TestCase):
|
||||
"""Test a configured dataloader."""
|
||||
|
||||
def setUp(self) -> None:
|
||||
self.tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
|
||||
self.tokenizer.add_special_tokens(
|
||||
{
|
||||
"bos_token": "<s>",
|
||||
"eos_token": "</s>",
|
||||
"unk_token": "<unk>",
|
||||
}
|
||||
)
|
||||
# Alpaca dataset.
|
||||
self.dataset = Dataset.from_list(
|
||||
[
|
||||
{
|
||||
"instruction": "Evaluate this sentence for spelling and grammar mistakes",
|
||||
"input": "He finnished his meal and left the resturant",
|
||||
"output": "He finished his meal and left the restaurant.",
|
||||
}
|
||||
]
|
||||
)
|
||||
|
||||
def test_load_hub(self):
|
||||
"""Core use case. Verify that processing data from the hub works"""
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
prepared_path = Path(tmp_dir) / "prepared"
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"tokenizer_config": "huggyllama/llama-7b",
|
||||
"sequence_len": 1024,
|
||||
"datasets": [
|
||||
{
|
||||
"path": "mhenrichsen/alpaca_2k_test",
|
||||
"type": "alpaca",
|
||||
},
|
||||
],
|
||||
}
|
||||
)
|
||||
|
||||
dataset, _ = load_tokenized_prepared_datasets(
|
||||
self.tokenizer, cfg, prepared_path
|
||||
)
|
||||
|
||||
assert len(dataset) == 2000
|
||||
assert "input_ids" in dataset.features
|
||||
assert "attention_mask" in dataset.features
|
||||
assert "labels" in dataset.features
|
||||
|
||||
def test_load_local_hub(self):
|
||||
"""Niche use case. Verify that a local copy of a hub dataset can be loaded"""
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
tmp_ds_path = Path("mhenrichsen/alpaca_2k_test")
|
||||
tmp_ds_path.mkdir(parents=True, exist_ok=True)
|
||||
snapshot_download(
|
||||
repo_id="mhenrichsen/alpaca_2k_test",
|
||||
repo_type="dataset",
|
||||
local_dir=tmp_ds_path,
|
||||
)
|
||||
|
||||
prepared_path = Path(tmp_dir) / "prepared"
|
||||
# Right now a local copy that doesn't fully conform to a dataset
|
||||
# must list data_files and ds_type otherwise the loader won't know
|
||||
# how to load it.
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"tokenizer_config": "huggyllama/llama-7b",
|
||||
"sequence_len": 1024,
|
||||
"datasets": [
|
||||
{
|
||||
"path": "mhenrichsen/alpaca_2k_test",
|
||||
"ds_type": "parquet",
|
||||
"type": "alpaca",
|
||||
"data_files": [
|
||||
"mhenrichsen/alpaca_2k_test/alpaca_2000.parquet",
|
||||
],
|
||||
},
|
||||
],
|
||||
}
|
||||
)
|
||||
|
||||
dataset, _ = load_tokenized_prepared_datasets(
|
||||
self.tokenizer, cfg, prepared_path
|
||||
)
|
||||
|
||||
assert len(dataset) == 2000
|
||||
assert "input_ids" in dataset.features
|
||||
assert "attention_mask" in dataset.features
|
||||
assert "labels" in dataset.features
|
||||
shutil.rmtree(tmp_ds_path)
|
||||
|
||||
def test_load_from_save_to_disk(self):
|
||||
"""Usual use case. Verify datasets saved via `save_to_disk` can be loaded."""
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
tmp_ds_name = Path(tmp_dir) / "tmp_dataset"
|
||||
self.dataset.save_to_disk(tmp_ds_name)
|
||||
|
||||
prepared_path = Path(tmp_dir) / "prepared"
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"tokenizer_config": "huggyllama/llama-7b",
|
||||
"sequence_len": 256,
|
||||
"datasets": [
|
||||
{
|
||||
"path": str(tmp_ds_name),
|
||||
"type": "alpaca",
|
||||
},
|
||||
],
|
||||
}
|
||||
)
|
||||
|
||||
dataset, _ = load_tokenized_prepared_datasets(
|
||||
self.tokenizer, cfg, prepared_path
|
||||
)
|
||||
|
||||
assert len(dataset) == 1
|
||||
assert "input_ids" in dataset.features
|
||||
assert "attention_mask" in dataset.features
|
||||
assert "labels" in dataset.features
|
||||
|
||||
def test_load_from_dir_of_parquet(self):
|
||||
"""Usual use case. Verify a directory of parquet files can be loaded."""
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
tmp_ds_dir = Path(tmp_dir) / "tmp_dataset"
|
||||
tmp_ds_dir.mkdir()
|
||||
tmp_ds_path = tmp_ds_dir / "shard1.parquet"
|
||||
self.dataset.to_parquet(tmp_ds_path)
|
||||
|
||||
prepared_path: Path = Path(tmp_dir) / "prepared"
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"tokenizer_config": "huggyllama/llama-7b",
|
||||
"sequence_len": 256,
|
||||
"datasets": [
|
||||
{
|
||||
"path": str(tmp_ds_dir),
|
||||
"ds_type": "parquet",
|
||||
"name": "test_data",
|
||||
"data_files": [
|
||||
str(tmp_ds_path),
|
||||
],
|
||||
"type": "alpaca",
|
||||
},
|
||||
],
|
||||
}
|
||||
)
|
||||
|
||||
dataset, _ = load_tokenized_prepared_datasets(
|
||||
self.tokenizer, cfg, prepared_path
|
||||
)
|
||||
|
||||
assert len(dataset) == 1
|
||||
assert "input_ids" in dataset.features
|
||||
assert "attention_mask" in dataset.features
|
||||
assert "labels" in dataset.features
|
||||
|
||||
def test_load_from_dir_of_json(self):
|
||||
"""Standard use case. Verify a directory of json files can be loaded."""
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
tmp_ds_dir = Path(tmp_dir) / "tmp_dataset"
|
||||
tmp_ds_dir.mkdir()
|
||||
tmp_ds_path = tmp_ds_dir / "shard1.json"
|
||||
self.dataset.to_json(tmp_ds_path)
|
||||
|
||||
prepared_path: Path = Path(tmp_dir) / "prepared"
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"tokenizer_config": "huggyllama/llama-7b",
|
||||
"sequence_len": 256,
|
||||
"datasets": [
|
||||
{
|
||||
"path": str(tmp_ds_dir),
|
||||
"ds_type": "json",
|
||||
"name": "test_data",
|
||||
"data_files": [
|
||||
str(tmp_ds_path),
|
||||
],
|
||||
"type": "alpaca",
|
||||
},
|
||||
],
|
||||
}
|
||||
)
|
||||
|
||||
dataset, _ = load_tokenized_prepared_datasets(
|
||||
self.tokenizer, cfg, prepared_path
|
||||
)
|
||||
|
||||
assert len(dataset) == 1
|
||||
assert "input_ids" in dataset.features
|
||||
assert "attention_mask" in dataset.features
|
||||
assert "labels" in dataset.features
|
||||
|
||||
def test_load_from_single_parquet(self):
|
||||
"""Standard use case. Verify a single parquet file can be loaded."""
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
tmp_ds_path = Path(tmp_dir) / "tmp_dataset.parquet"
|
||||
self.dataset.to_parquet(tmp_ds_path)
|
||||
|
||||
prepared_path: Path = Path(tmp_dir) / "prepared"
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"tokenizer_config": "huggyllama/llama-7b",
|
||||
"sequence_len": 256,
|
||||
"datasets": [
|
||||
{
|
||||
"path": str(tmp_ds_path),
|
||||
"name": "test_data",
|
||||
"type": "alpaca",
|
||||
},
|
||||
],
|
||||
}
|
||||
)
|
||||
|
||||
dataset, _ = load_tokenized_prepared_datasets(
|
||||
self.tokenizer, cfg, prepared_path
|
||||
)
|
||||
|
||||
assert len(dataset) == 1
|
||||
assert "input_ids" in dataset.features
|
||||
assert "attention_mask" in dataset.features
|
||||
assert "labels" in dataset.features
|
||||
|
||||
def test_load_from_single_json(self):
|
||||
"""Standard use case. Verify a single json file can be loaded."""
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
tmp_ds_path = Path(tmp_dir) / "tmp_dataset.json"
|
||||
self.dataset.to_json(tmp_ds_path)
|
||||
|
||||
prepared_path: Path = Path(tmp_dir) / "prepared"
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"tokenizer_config": "huggyllama/llama-7b",
|
||||
"sequence_len": 256,
|
||||
"datasets": [
|
||||
{
|
||||
"path": str(tmp_ds_path),
|
||||
"name": "test_data",
|
||||
"type": "alpaca",
|
||||
},
|
||||
],
|
||||
}
|
||||
)
|
||||
|
||||
dataset, _ = load_tokenized_prepared_datasets(
|
||||
self.tokenizer, cfg, prepared_path
|
||||
)
|
||||
|
||||
assert len(dataset) == 1
|
||||
assert "input_ids" in dataset.features
|
||||
assert "attention_mask" in dataset.features
|
||||
assert "labels" in dataset.features
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
@@ -8,8 +8,7 @@ from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
from datasets import load_dataset
|
||||
from transformers import AddedToken, AutoTokenizer, LlamaTokenizer
|
||||
from transformers import AutoTokenizer, LlamaTokenizer
|
||||
|
||||
from axolotl.prompt_strategies.alpaca_chat import NoSystemPrompter
|
||||
from axolotl.prompt_strategies.alpaca_w_system import (
|
||||
@@ -20,14 +19,12 @@ from axolotl.prompt_strategies.llama2_chat import (
|
||||
Llama2ChatPrompter,
|
||||
LLama2ChatTokenizingStrategy,
|
||||
)
|
||||
from axolotl.prompt_strategies.orpo.chat_template import load
|
||||
from axolotl.prompt_strategies.sharegpt import GlaiveShareGPTPromptTokenizingStrategy
|
||||
from axolotl.prompt_tokenizers import (
|
||||
AlpacaPromptTokenizingStrategy,
|
||||
ShareGPTPromptTokenizingStrategy,
|
||||
)
|
||||
from axolotl.prompters import AlpacaPrompter, PromptStyle, ShareGPTPrompterV2
|
||||
from axolotl.utils.dict import DictDefault
|
||||
|
||||
LOG = logging.getLogger("axolotl")
|
||||
|
||||
@@ -449,57 +446,5 @@ If a question does not make any sense, or is not factually coherent, explain why
|
||||
)
|
||||
|
||||
|
||||
class OrpoTokenizationTest(unittest.TestCase):
|
||||
"""test case for the ORPO tokenization"""
|
||||
|
||||
def setUp(self) -> None:
|
||||
# pylint: disable=duplicate-code
|
||||
tokenizer = LlamaTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
|
||||
tokenizer.add_special_tokens(
|
||||
{
|
||||
"eos_token": AddedToken(
|
||||
"<|im_end|>", rstrip=False, lstrip=False, normalized=False
|
||||
)
|
||||
}
|
||||
)
|
||||
tokenizer.add_tokens(
|
||||
[
|
||||
AddedToken(
|
||||
"<|im_start|>", rstrip=False, lstrip=False, normalized=False
|
||||
),
|
||||
]
|
||||
)
|
||||
self.tokenizer = tokenizer
|
||||
self.dataset = load_dataset(
|
||||
"argilla/ultrafeedback-binarized-preferences-cleaned", split="train"
|
||||
).select([0])
|
||||
|
||||
def test_orpo_integration(self):
|
||||
strat = load(
|
||||
self.tokenizer,
|
||||
DictDefault({"train_on_inputs": False}),
|
||||
DictDefault({"chat_template": "chatml"}),
|
||||
)
|
||||
res = strat.tokenize_prompt(self.dataset[0])
|
||||
assert "rejected_input_ids" in res
|
||||
assert "rejected_labels" in res
|
||||
assert "input_ids" in res
|
||||
assert "labels" in res
|
||||
assert "prompt_attention_mask" in res
|
||||
|
||||
assert len(res["rejected_input_ids"]) == len(res["rejected_labels"])
|
||||
assert len(res["input_ids"]) == len(res["labels"])
|
||||
assert len(res["input_ids"]) == len(res["prompt_attention_mask"])
|
||||
|
||||
assert res["rejected_labels"][0] == -100
|
||||
assert res["rejected_input_ids"][-1] == res["rejected_labels"][-1]
|
||||
|
||||
assert res["labels"][0] == -100
|
||||
assert res["input_ids"][-1] == res["labels"][-1]
|
||||
|
||||
assert res["prompt_attention_mask"][0] == 1
|
||||
assert res["prompt_attention_mask"][-1] == 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
||||
@@ -54,18 +54,6 @@ class TestValidation(BaseValidation):
|
||||
Test the validation module
|
||||
"""
|
||||
|
||||
def test_defaults(self, minimal_cfg):
|
||||
test_cfg = DictDefault(
|
||||
{
|
||||
"weight_decay": None,
|
||||
}
|
||||
| minimal_cfg
|
||||
)
|
||||
cfg = validate_config(test_cfg)
|
||||
|
||||
assert cfg.train_on_inputs is False
|
||||
assert cfg.weight_decay is None
|
||||
|
||||
def test_datasets_min_length(self):
|
||||
cfg = DictDefault(
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user