Compare commits: q-galore ... pytest-ski
1 commit (f8bb4185bc)
.github/CONTRIBUTING.md (8 changes)

@@ -21,12 +21,12 @@ All contributors are expected to adhere to our [Code of Conduct](CODE_OF_CONDUCT

 ## Getting Started

-Bugs? Please check for open issue else create a new [Issue](https://github.com/axolotl-ai-cloud/axolotl/issues/new).
+Bugs? Please check for open issue else create a new [Issue](https://github.com/OpenAccess-AI-Collective/axolotl/issues/new).

 PRs are **greatly welcome**!

 1. Fork the repository and clone it to your local machine.
-2. Set up the development environment by following the instructions in the [README.md](https://github.com/axolotl-ai-cloud/axolotl/tree/main/README.md) file.
+2. Set up the development environment by following the instructions in the [README.md](https://github.com/OpenAccess-AI-Collective/axolotl/tree/main/README.md) file.
 3. Explore the codebase, run tests, and verify that everything works as expected.

 Please run below to setup env
@@ -42,11 +42,11 @@ pytest tests/

 ### Reporting Bugs

-If you encounter a bug or issue while using axolotl, please open a new issue on the [GitHub Issues](https://github.com/axolotl-ai-cloud/axolotl/issues) page. Provide a clear and concise description of the problem, steps to reproduce it, and any relevant error messages or logs.
+If you encounter a bug or issue while using axolotl, please open a new issue on the [GitHub Issues](https://github.com/OpenAccess-AI-Collective/axolotl/issues) page. Provide a clear and concise description of the problem, steps to reproduce it, and any relevant error messages or logs.

 ### Suggesting Enhancements

-We welcome ideas for improvements and new features. To suggest an enhancement, open a new issue on the [GitHub Issues](https://github.com/axolotl-ai-cloud/axolotl/issues) page. Describe the enhancement in detail, explain the use case, and outline the benefits it would bring to the project.
+We welcome ideas for improvements and new features. To suggest an enhancement, open a new issue on the [GitHub Issues](https://github.com/OpenAccess-AI-Collective/axolotl/issues) page. Describe the enhancement in detail, explain the use case, and outline the benefits it would bring to the project.

 ### Submitting Pull Requests
.github/ISSUE_TEMPLATE/bug-report.yaml (2 changes)

@@ -15,7 +15,7 @@ body:
       label: "Please check that this issue hasn't been reported before."
       description: "The **Label filters** may help make your search more focussed."
       options:
-        - label: "I searched previous [Bug Reports](https://github.com/axolotl-ai-cloud/axolotl/labels/bug) didn't find any similar reports."
+        - label: "I searched previous [Bug Reports](https://github.com/OpenAccess-AI-Collective/axolotl/labels/bug) didn't find any similar reports."
           required: true

   - type: textarea
.github/ISSUE_TEMPLATE/config.yml (2 changes)

@@ -1,7 +1,7 @@
 blank_issues_enabled: false
 contact_links:
   - name: Ask a question
-    url: https://github.com/axolotl-ai-cloud/axolotl/discussions/categories/q-a
+    url: https://github.com/OpenAccess-AI-Collective/axolotl/discussions/categories/q-a
     about: Ask questions and discuss with other community members
   - name: Discuss the Project in Discord
     url: https://discord.gg/HhrNrHJPRb
.github/ISSUE_TEMPLATE/docs.yml (2 changes)

@@ -10,7 +10,7 @@ body:
       value: |
         * Ask questions in [Discord](https://discord.gg/HhrNrHJPRb).
         * Before you file an issue read the [Contributing guide](./CONTRIBUTING.md).
-        * Check to make sure someone hasn't already opened a [similar issue](https://github.com/axolotl-ai-cloud/axolotl/issues).
+        * Check to make sure someone hasn't already opened a [similar issue](https://github.com/OpenAccess-AI-Collective/axolotl/issues).
   - type: textarea
     attributes:
       label: What piece of documentation is affected?
.github/ISSUE_TEMPLATE/feature-request.yaml (4 changes)

@@ -8,9 +8,9 @@ body:
       label: "⚠️ Please check that this feature request hasn't been suggested before."
       description: "There are two locations for previous feature requests. Please search in both. Thank you. The **Label filters** may help make your search more focussed."
       options:
-        - label: "I searched previous [Ideas in Discussions](https://github.com/axolotl-ai-cloud/axolotl/discussions/categories/ideas) didn't find any similar feature requests."
+        - label: "I searched previous [Ideas in Discussions](https://github.com/OpenAccess-AI-Collective/axolotl/discussions/categories/ideas) didn't find any similar feature requests."
           required: true
-        - label: "I searched previous [Issues](https://github.com/axolotl-ai-cloud/axolotl/labels/enhancement) didn't find any similar feature requests."
+        - label: "I searched previous [Issues](https://github.com/OpenAccess-AI-Collective/axolotl/labels/enhancement) didn't find any similar feature requests."
           required: true

   - type: textarea
.github/workflows/base.yml (14 changes)

@@ -5,7 +5,7 @@ on:

 jobs:
   build-base:
-    if: github.repository_owner == 'axolotl-ai-cloud'
+    if: github.repository_owner == 'OpenAccess-AI-Collective'
    # this job needs to be run on self-hosted GPU runners...
    runs-on: axolotl-gpu-runner
    strategy:
@@ -30,17 +30,7 @@ jobs:
-          - cuda: "121"
-            cuda_version: 12.1.0
-            python_version: "3.11"
-            pytorch: 2.2.2
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-          - cuda: "121"
-            cuda_version: 12.1.0
-            python_version: "3.11"
-            pytorch: 2.3.0
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
           - cuda: "121"
             cuda_version: 12.1.0
             python_version: "3.11"
-            pytorch: 2.3.1
+            pytorch: 2.2.1
             torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
     steps:
       - name: Checkout
.github/workflows/lint.yml (1 change)

@@ -7,7 +7,6 @@ on:
       - 'requirements.txt'
       - '.github/workflows/*.yml'
       - "*.md"
-      - "examples/**/*.y[a]?ml"
   workflow_dispatch:

 jobs:
.github/workflows/main.yml (64 changes)

@@ -8,7 +8,7 @@ on:

 jobs:
   build-axolotl:
-    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'axolotl-ai-cloud' }}
+    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'OpenAccess-AI-Collective' }}
     strategy:
       fail-fast: false
       matrix:
@@ -19,6 +19,7 @@ jobs:
             pytorch: 2.1.2
             axolotl_extras:
             axolotl_args: "--extra-index-url https://download.pytorch.org/whl/cu118"
+            is_latest: true
           - cuda: 121
             cuda_version: 12.1.0
             python_version: "3.10"
@@ -27,14 +28,8 @@ jobs:
           - cuda: 121
             cuda_version: 12.1.0
             python_version: "3.11"
-            pytorch: 2.2.2
+            pytorch: 2.2.1
             axolotl_extras:
-          - cuda: 121
-            cuda_version: 12.1.0
-            python_version: "3.11"
-            pytorch: 2.3.1
-            axolotl_extras:
-            is_latest: true
     runs-on: axolotl-gpu-runner
     steps:
       - name: Checkout
@@ -70,7 +65,7 @@ jobs:

   build-axolotl-cloud:
     needs: build-axolotl
-    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'axolotl-ai-cloud' }}
+    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'OpenAccess-AI-Collective' }}
     # this job needs to be run on self-hosted GPU runners...
     strategy:
       matrix:
@@ -80,6 +75,7 @@ jobs:
             python_version: "3.10"
             pytorch: 2.1.2
             axolotl_extras:
+            is_latest: true
           - cuda: 121
             cuda_version: 12.1.0
             python_version: "3.10"
@@ -88,14 +84,8 @@ jobs:
           - cuda: 121
             cuda_version: 12.1.0
             python_version: "3.11"
-            pytorch: 2.2.2
+            pytorch: 2.2.1
             axolotl_extras:
-          - cuda: 121
-            cuda_version: 12.1.0
-            python_version: "3.11"
-            pytorch: 2.3.1
-            axolotl_extras:
-            is_latest: true
     runs-on: axolotl-gpu-runner
     steps:
       - name: Checkout
@@ -125,45 +115,3 @@ jobs:
             ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
             ${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }}
           labels: ${{ steps.metadata.outputs.labels }}

-  build-axolotl-cloud-no-tmux:
-    needs: build-axolotl
-    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'axolotl-ai-cloud' }}
-    # this job needs to be run on self-hosted GPU runners...
-    strategy:
-      matrix:
-        include:
-          - cuda: 121
-            cuda_version: 12.1.0
-            python_version: "3.11"
-            pytorch: 2.3.1
-            axolotl_extras:
-    runs-on: axolotl-gpu-runner
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      - name: Docker metadata
-        id: metadata
-        uses: docker/metadata-action@v5
-        with:
-          images: winglian/axolotl-cloud-term
-      - name: Login to Docker Hub
-        uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_TOKEN }}
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v2
-      - name: Build
-        uses: docker/build-push-action@v5
-        with:
-          context: .
-          build-args: |
-            BASE_TAG=${{ github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
-            CUDA=${{ matrix.cuda }}
-          file: ./docker/Dockerfile-cloud-no-tmux
-          push: ${{ github.event_name != 'pull_request' }}
-          tags: |
-            ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
-            ${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }}
-          labels: ${{ steps.metadata.outputs.labels }}
.github/workflows/nightlies.yml (22 changes)

@@ -7,7 +7,7 @@ on:

 jobs:
   build-axolotl:
-    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'axolotl-ai-cloud' }}
+    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'OpenAccess-AI-Collective' }}
     strategy:
       fail-fast: false
       matrix:
@@ -18,6 +18,7 @@ jobs:
             pytorch: 2.1.2
             axolotl_extras:
             axolotl_args: "--extra-index-url https://download.pytorch.org/whl/cu118"
+            is_latest: true
           - cuda: 121
             cuda_version: 12.1.0
             python_version: "3.10"
@@ -26,14 +27,8 @@ jobs:
           - cuda: 121
             cuda_version: 12.1.0
             python_version: "3.11"
-            pytorch: 2.2.2
+            pytorch: 2.2.1
             axolotl_extras:
-          - cuda: 121
-            cuda_version: 12.1.0
-            python_version: "3.11"
-            pytorch: 2.3.1
-            axolotl_extras:
-            is_latest: true
     runs-on: axolotl-gpu-runner
     steps:
       - name: Checkout
@@ -70,7 +65,7 @@ jobs:

   build-axolotl-cloud:
     needs: build-axolotl
-    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'axolotl-ai-cloud' }}
+    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'OpenAccess-AI-Collective' }}
     # this job needs to be run on self-hosted GPU runners...
     strategy:
      matrix:
@@ -80,6 +75,7 @@ jobs:
             python_version: "3.10"
             pytorch: 2.1.2
             axolotl_extras:
+            is_latest: true
           - cuda: 121
             cuda_version: 12.1.0
             python_version: "3.10"
@@ -88,14 +84,8 @@ jobs:
           - cuda: 121
             cuda_version: 12.1.0
             python_version: "3.11"
-            pytorch: 2.2.2
+            pytorch: 2.2.1
             axolotl_extras:
-          - cuda: 121
-            cuda_version: 12.1.0
-            python_version: "3.11"
-            pytorch: 2.3.1
-            axolotl_extras:
-            is_latest: true
     runs-on: axolotl-gpu-runner
     steps:
       - name: Checkout
.github/workflows/tests.yml (9 changes)

@@ -58,7 +58,7 @@ jobs:
           pytest --ignore=tests/e2e/ tests/

   docker-e2e-tests:
-    if: github.repository_owner == 'axolotl-ai-cloud'
+    if: github.repository_owner == 'OpenAccess-AI-Collective'
     # this job needs to be run on self-hosted GPU runners...
     runs-on: [self-hosted, modal]
     timeout-minutes: 60
@@ -82,12 +82,7 @@ jobs:
-          - cuda: 121
-            cuda_version: 12.1.0
-            python_version: "3.11"
-            pytorch: 2.2.2
-            num_gpus: 1
           - cuda: 121
             cuda_version: 12.1.0
             python_version: "3.11"
-            pytorch: 2.3.1
+            pytorch: 2.2.1
             num_gpus: 1
     steps:
       - name: Checkout
.gitignore (7 changes)

@@ -133,7 +133,6 @@ venv/
 ENV/
 env.bak/
 venv.bak/
-venv3.10/

 # Spyder project settings
 .spyderproject
@@ -176,9 +175,3 @@ qlora-out/*
 mlruns/*

 /.quarto/
-prepared-datasets/
-submit.sh
-*.out*
-
-typings/
-out/
README.md (65 changes)

@@ -34,7 +34,6 @@ Features:
     - [Mac](#mac)
     - [Google Colab](#google-colab)
     - [Launching on public clouds via SkyPilot](#launching-on-public-clouds-via-skypilot)
-    - [Launching on public clouds via dstack](#launching-on-public-clouds-via-dstack)
   - [Dataset](#dataset)
   - [Config](#config)
   - [Train](#train)
@@ -45,7 +44,6 @@ Features:
 - Advanced Topics
   - [Multipack](./docs/multipack.qmd)<svg width="24" height="24" viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg"><path d="M17 13.5v6H5v-12h6m3-3h6v6m0-6-9 9" class="icon_svg-stroke" stroke="#666" stroke-width="1.5" fill="none" fill-rule="evenodd" stroke-linecap="round" stroke-linejoin="round"></path></svg>
   - [RLHF & DPO](./docs/rlhf.qmd)<svg width="24" height="24" viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg"><path d="M17 13.5v6H5v-12h6m3-3h6v6m0-6-9 9" class="icon_svg-stroke" stroke="#666" stroke-width="1.5" fill="none" fill-rule="evenodd" stroke-linecap="round" stroke-linejoin="round"></path></svg>
-  - [Dataset Pre-Processing](./docs/dataset_preprocessing.qmd)<svg width="24" height="24" viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg"><path d="M17 13.5v6H5v-12h6m3-3h6v6m0-6-9 9" class="icon_svg-stroke" stroke="#666" stroke-width="1.5" fill="none" fill-rule="evenodd" stroke-linecap="round" stroke-linejoin="round"></path></svg>
 - [Common Errors](#common-errors-)
   - [Tokenization Mismatch b/w Training & Inference](#tokenization-mismatch-bw-inference--training)
 - [Debugging Axolotl](#debugging-axolotl)
@@ -67,8 +65,8 @@ Features:
   <p>
     Go ahead and Axolotl questions!!
   </p>
-  <img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/pre-commit.yml/badge.svg?branch=main" alt="pre-commit">
-  <img alt="PyTest Status" src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/tests.yml/badge.svg?branch=main">
+  <img src="https://github.com/OpenAccess-AI-Collective/axolotl/actions/workflows/pre-commit.yml/badge.svg?branch=main" alt="pre-commit">
+  <img alt="PyTest Status" src="https://github.com/OpenAccess-AI-Collective/axolotl/actions/workflows/tests.yml/badge.svg?branch=main">
 </div>
 </div>

@@ -83,7 +81,6 @@ Features:
 | llama | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
 | Mistral | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
 | Mixtral-MoE | ✅ | ✅ | ✅ | ❓ | ❓ | ❓ | ❓ |
-| Mixtral8X22 | ✅ | ✅ | ✅ | ❓ | ❓ | ❓ | ❓ |
 | Pythia | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❓ |
 | cerebras | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❓ |
 | btlm | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❓ |
@@ -107,7 +104,7 @@ Get started with Axolotl in just a few steps! This quickstart guide will walk yo
 **Requirements**: Python >=3.10 and Pytorch >=2.1.1.

 ```bash
-git clone https://github.com/axolotl-ai-cloud/axolotl
+git clone https://github.com/OpenAccess-AI-Collective/axolotl
 cd axolotl

 pip3 install packaging ninja
@@ -124,15 +121,15 @@ accelerate launch -m axolotl.cli.train examples/openllama-3b/lora.yml

 # inference
 accelerate launch -m axolotl.cli.inference examples/openllama-3b/lora.yml \
-    --lora_model_dir="./outputs/lora-out"
+    --lora_model_dir="./lora-out"

 # gradio
 accelerate launch -m axolotl.cli.inference examples/openllama-3b/lora.yml \
-    --lora_model_dir="./outputs/lora-out" --gradio
+    --lora_model_dir="./lora-out" --gradio

 # remote yaml files - the yaml config can be hosted on a public URL
 # Note: the yaml config must directly link to the **raw** yaml
-accelerate launch -m axolotl.cli.train https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/examples/openllama-3b/lora.yml
+accelerate launch -m axolotl.cli.train https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/main/examples/openllama-3b/lora.yml
 ```

 ## Advanced Setup
@@ -293,47 +290,11 @@ HF_TOKEN=xx sky launch axolotl.yaml --env HF_TOKEN
 HF_TOKEN=xx BUCKET=<unique-name> sky spot launch axolotl-spot.yaml --env HF_TOKEN --env BUCKET
 ```

-#### Launching on public clouds via dstack
-To launch on GPU instance (both on-demand and spot instances) on public clouds (GCP, AWS, Azure, Lambda Labs, TensorDock, Vast.ai, and CUDO), you can use [dstack](https://dstack.ai/).
-
-Write a job description in YAML as below:
-
-```yaml
-# dstack.yaml
-type: task
-
-image: winglian/axolotl-cloud:main-20240429-py3.11-cu121-2.2.2
-
-env:
-  - HUGGING_FACE_HUB_TOKEN
-  - WANDB_API_KEY
-
-commands:
-  - accelerate launch -m axolotl.cli.train config.yaml
-
-ports:
-  - 6006
-
-resources:
-  gpu:
-    memory: 24GB..
-    count: 2
-```
-
-then, simply run the job with `dstack run` command. Append `--spot` option if you want spot instance. `dstack run` command will show you the instance with cheapest price across multi cloud services:
-
-```bash
-pip install dstack
-HUGGING_FACE_HUB_TOKEN=xxx WANDB_API_KEY=xxx dstack run . -f dstack.yaml  # --spot
-```
-
-For further and fine-grained use cases, please refer to the official [dstack documents](https://dstack.ai/docs/) and the detailed description of [axolotl example](https://github.com/dstackai/dstack/tree/master/examples/fine-tuning/axolotl) on the official repository.
-
 ### Dataset

 Axolotl supports a variety of dataset formats. It is recommended to use a JSONL. The schema of the JSONL depends upon the task and the prompt template you wish to use. Instead of a JSONL, you can also use a HuggingFace dataset with columns for each JSONL field.

-See [these docs](https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/) for more information on how to use different dataset formats.
+See [these docs](https://openaccess-ai-collective.github.io/axolotl/docs/dataset-formats/) for more information on how to use different dataset formats.

 ### Config

@@ -464,7 +425,7 @@ deepspeed: deepspeed_configs/zero1.json
 ```

 ```shell
-accelerate launch -m axolotl.cli.train examples/llama-2/config.yml --deepspeed deepspeed_configs/zero1.json
+accelerate launch -m axolotl.cli.train examples/llama-2/config.py --deepspeed deepspeed_configs/zero1.json
 ```

 ##### FSDP
@@ -609,7 +570,7 @@ If you decode a prompt constructed by axolotl, you might see spaces between toke
 3. Make sure the inference string from #2 looks **exactly** like the data you fine tuned on from #1, including spaces and new lines. If they aren't the same, adjust your inference server accordingly.
 4. As an additional troubleshooting step, you can look at the token ids between 1 and 2 to make sure they are identical.

-Having misalignment between your prompts during training and inference can cause models to perform very poorly, so it is worth checking this. See [this blog post](https://hamel.dev/notes/llm/finetuning/05_tokenizer_gotchas.html) for a concrete example.
+Having misalignment between your prompts during training and inference can cause models to perform very poorly, so it is worth checking this. See [this blog post](https://hamel.dev/notes/llm/05_tokenizer_gotchas.html) for a concrete example.

 ## Debugging Axolotl

@@ -626,10 +587,10 @@ Need dedicated support? Please contact us at [✉️wing@openaccessaicollective.
 Building something cool with Axolotl? Consider adding a badge to your model card.

 ```markdown
-[<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
+[<img src="https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/OpenAccess-AI-Collective/axolotl)
 ```

-[<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
+[<img src="https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/OpenAccess-AI-Collective/axolotl)

 ## Community Showcase
@@ -647,7 +608,7 @@ PocketDoc Labs

 Please read the [contributing guide](./.github/CONTRIBUTING.md)

-Bugs? Please check the [open issues](https://github.com/axolotl-ai-cloud/axolotl/issues/bug) else create a new Issue.
+Bugs? Please check the [open issues](https://github.com/OpenAccess-AI-Collective/axolotl/issues/bug) else create a new Issue.

 PRs are **greatly welcome**!
@@ -665,7 +626,7 @@ pre-commit run --all-files

 Thanks to all of our contributors to date. Help drive open source AI progress forward by contributing to Axolotl.

-<a href="https://github.com/axolotl-ai-cloud/axolotl/graphs/contributors">
+<a href="https://github.com/openaccess-ai-collective/axolotl/graphs/contributors">
   <img src="https://contrib.rocks/image?repo=openaccess-ai-collective/axolotl" alt="contributor chart by https://contrib.rocks"/>
 </a>
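As a companion to the Dataset section above, here is a sketch of one alpaca-style JSONL record together with the `datasets:` stanza that would load it; the file path and record text are illustrative placeholders, not taken from this diff:

```yaml
# Hypothetical record in data/alpaca_sample.jsonl (alpaca format):
#   {"instruction": "Name the capital of France.", "input": "", "output": "Paris."}
datasets:
  - path: data/alpaca_sample.jsonl  # placeholder path
    ds_type: json                   # load a local JSON/JSONL file
    type: alpaca                    # parse records with the alpaca prompt format
```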
@@ -14,7 +14,7 @@ website:
       - icon: twitter
         href: https://twitter.com/axolotl_ai
       - icon: github
-        href: https://github.com/axolotl-ai-cloud/axolotl/
+        href: https://github.com/OpenAccess-AI-Collective/axolotl/
       - icon: discord
         href: https://discord.gg/7m9sfhzaf3
@@ -14,7 +14,7 @@ RUN apt-get update && \

 WORKDIR /workspace

-RUN git clone --depth=1 https://github.com/axolotl-ai-cloud/axolotl.git
+RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git

 WORKDIR /workspace/axolotl
@@ -24,13 +24,13 @@ RUN git fetch origin +$GITHUB_REF && \
 # If AXOLOTL_EXTRAS is set, append it in brackets
 RUN pip install causal_conv1d
 RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
-        pip install -e .[deepspeed,flash-attn,mamba-ssm,optimizers,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
+        pip install -e .[deepspeed,flash-attn,mamba-ssm,galore,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
     else \
-        pip install -e .[deepspeed,flash-attn,mamba-ssm,optimizers] $AXOLOTL_ARGS; \
+        pip install -e .[deepspeed,flash-attn,mamba-ssm,galore] $AXOLOTL_ARGS; \
     fi

 # So we can test the Docker image
-RUN pip install -r requirements-tests.txt
+RUN pip install pytest

 # fix so that git fetch/pull from remote works
 RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
@@ -1,5 +1,4 @@
 #!/bin/bash
-set -e

 pytest --ignore=tests/e2e/ /workspace/axolotl/tests/
 pytest /workspace/axolotl/tests/e2e/patched/
@@ -1,6 +1,4 @@
 {
-  "zero_force_ds_cpu_optimizer": false,
-  "zero_allow_untested_optimizer": true,
   "zero_optimization": {
     "stage": 3,
     "offload_optimizer": {
@@ -1,6 +1,4 @@
 {
-  "zero_force_ds_cpu_optimizer": false,
-  "zero_allow_untested_optimizer": true,
   "zero_optimization": {
     "stage": 3,
     "offload_param": {
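Both hunks above drop the same two top-level keys from ZeRO-3 offload configs. For reference, a minimal ZeRO-3 CPU-offload config with those keys in place might look like the following sketch; DeepSpeed configs are JSON, and everything outside the two `zero_*` keys is illustrative:

```json
{
  "zero_force_ds_cpu_optimizer": false,
  "zero_allow_untested_optimizer": true,
  "zero_optimization": {
    "stage": 3,
    "offload_optimizer": {
      "device": "cpu",
      "pin_memory": true
    }
  },
  "bf16": {
    "enabled": "auto"
  },
  "train_micro_batch_size_per_gpu": "auto"
}
```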
@@ -11,20 +11,20 @@ ARG PYTORCH_VERSION="2.1.2"
 ENV PYTORCH_VERSION=$PYTORCH_VERSION

 RUN apt-get update && \
-    apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev rsync s3fs
+    apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev

 WORKDIR /workspace

-RUN git clone --depth=1 https://github.com/axolotl-ai-cloud/axolotl.git
+RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git

 WORKDIR /workspace/axolotl

 # If AXOLOTL_EXTRAS is set, append it in brackets
 RUN pip install causal_conv1d
 RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
-        pip install -e .[deepspeed,flash-attn,mamba-ssm,optimizers,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
+        pip install -e .[deepspeed,flash-attn,mamba-ssm,galore,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
     else \
-        pip install -e .[deepspeed,flash-attn,mamba-ssm,optimizers] $AXOLOTL_ARGS; \
+        pip install -e .[deepspeed,flash-attn,mamba-ssm,galore] $AXOLOTL_ARGS; \
     fi

 # So we can test the Docker image
@@ -1,27 +0,0 @@
-ARG BASE_TAG=main
-FROM winglian/axolotl:$BASE_TAG
-
-ENV HF_DATASETS_CACHE="/workspace/data/huggingface-cache/datasets"
-ENV HUGGINGFACE_HUB_CACHE="/workspace/data/huggingface-cache/hub"
-ENV TRANSFORMERS_CACHE="/workspace/data/huggingface-cache/hub"
-ENV HF_HOME="/workspace/data/huggingface-cache/hub"
-ENV HF_HUB_ENABLE_HF_TRANSFER="1"
-
-EXPOSE 8888
-EXPOSE 22
-
-COPY scripts/cloud-entrypoint-term.sh /root/cloud-entrypoint.sh
-COPY scripts/motd /etc/motd
-
-RUN pip install jupyterlab notebook ipywidgets && \
-    jupyter lab clean
-RUN apt install --yes --no-install-recommends openssh-server tmux sudo && \
-    pip3 install -U --no-cache-dir grpcio ray[default]==2.9.3 && \
-    mkdir -p ~/.ssh && \
-    chmod 700 ~/.ssh && \
-    printf "[ ! -z \"\$TERM\" -a -r /etc/motd ] && cat /etc/motd\n" >> ~/.bashrc && \
-    chmod +x /workspace/axolotl/scripts/cloud-entrypoint.sh && \
-    chmod +x /root/cloud-entrypoint.sh
-
-ENTRYPOINT ["/root/cloud-entrypoint.sh"]
-CMD ["sleep", "infinity"]
@@ -16,7 +16,7 @@ RUN apt-get update && \

 WORKDIR /workspace

-RUN git clone --depth=1 https://github.com/axolotl-ai-cloud/axolotl.git
+RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git

 WORKDIR /workspace/axolotl
|
||||
@@ -138,7 +138,7 @@ test_datasets:
|
||||
data_files:
|
||||
- /workspace/data/eval.jsonl
|
||||
|
||||
# use RL training: 'dpo', 'ipo', 'kto'
|
||||
# use RL training: 'dpo', 'ipo', 'kto_pair'
|
||||
rl:
|
||||
|
||||
# Saves the desired chat template to the tokenizer_config.json for easier inferencing
|
||||
@@ -186,11 +186,6 @@ eval_sample_packing:
|
||||
# The trainer will provide recommended values for these values.
|
||||
sample_packing_eff_est:
|
||||
total_num_tokens:
|
||||
# Increasing the following values helps with packing, but usually only slightly (<%1.)
|
||||
# The number of samples packed at a time.
|
||||
sample_packing_group_size: 100000
|
||||
# The number of samples which can be packed into one sequence. Increase if using a large sequence_len with many short samples.
|
||||
sample_packing_bin_size: 200
|
||||
|
||||
# Passed through to transformers when loading the model when launched without accelerate
|
||||
# Use `sequential` when training w/ model parallelism to limit memory
|
||||
@@ -232,12 +227,6 @@ lora_modules_to_save:
|
||||
|
||||
lora_fan_in_fan_out: false
|
||||
|
||||
# LoRA+ hyperparameters
|
||||
# For more details about the following options, see:
|
||||
# https://arxiv.org/abs/2402.12354 and `src/axolotl/core/train_builder.py`
|
||||
loraplus_lr_ratio: # loraplus learning rate ratio lr_B / lr_A. Recommended value is 2^4.
|
||||
loraplus_lr_embedding: # loraplus learning rate for lora embedding layers. Default value is 1e-6.
|
||||
|
||||
peft:
|
||||
# Configuration options for loftq initialization for LoRA
|
||||
# https://huggingface.co/docs/peft/developer_guides/quantization#loftq-initialization
|
||||
@@ -279,7 +268,6 @@ torch_compile_backend: # Optional[str]
|
||||
# If greater than 1, backpropagation will be skipped and the gradients will be accumulated for the given number of steps.
|
||||
gradient_accumulation_steps: 1
|
||||
# The number of samples to include in each batch. This is the number of samples sent to each GPU.
|
||||
# Batch size per gpu = micro_batch_size * gradient_accumulation_steps
|
||||
micro_batch_size: 2
|
||||
eval_batch_size:
|
||||
num_epochs: 4
|
||||
@@ -290,7 +278,7 @@ lr_quadratic_warmup:
|
||||
logging_steps:
|
||||
eval_steps: # Leave empty to eval at each epoch, integers for every N steps. decimal for fraction of total steps
|
||||
evals_per_epoch: # number of times per epoch to run evals, mutually exclusive with eval_steps
|
||||
save_strategy: # Set to `"no"` to skip checkpoint saves
|
||||
save_strategy: # Set to `no` to skip checkpoint saves
|
||||
save_steps: # Leave empty to save at each epoch
|
||||
saves_per_epoch: # number of times per epoch to save a checkpoint, mutually exclusive with save_steps
|
||||
save_total_limit: # Checkpoints saved at a time
|
||||
@@ -424,7 +412,6 @@ special_tokens:
|
||||
# bos_token: "<s>"
|
||||
# eos_token: "</s>"
|
||||
# unk_token: "<unk>"
|
||||
# pad_token: "[PAD]"
|
||||
|
||||
# Add extra tokens.
|
||||
tokens:
|
||||
|
||||
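The packing and LoRA+ options removed above can be written out as a config fragment; the values below are the ones quoted in the removed comments (2^4 = 16 for the LoRA+ ratio), not tuned recommendations:

```yaml
# Sample packing tuning; gains are usually only slight (<1%).
sample_packing_group_size: 100000  # number of samples packed at a time
sample_packing_bin_size: 200       # samples that may share one sequence; raise for many short samples

# LoRA+ (https://arxiv.org/abs/2402.12354)
loraplus_lr_ratio: 16              # loraplus learning rate ratio lr_B / lr_A
loraplus_lr_embedding: 1.0e-6      # learning rate for lora embedding layers
```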
@@ -4,25 +4,9 @@ description: How to use a custom pre-tokenized dataset.
 order: 5
 ---

-- Pass an empty `type:` in your axolotl config.
+- Do not pass a `type:` in your axolotl config.
 - Columns in Dataset must be exactly `input_ids`, `attention_mask`, `labels`
-- To indicate that a token should be ignored during training, set its corresponding label to `-100`.
-- Do not add BOS/EOS. Axolotl will add them for you based on the default tokenizer for the model you're using.
-- For pretraining, do not truncate/pad documents to the context window length.
-- For instruction training, documents must be truncated/padded as desired.
-
-Sample config:
-
 ```{.yaml filename="config.yml"}
 datasets:
-  - path: /path/to/your/file.jsonl
-    ds_type: json
-    type:
-```
-
-Sample jsonl:
-
-```jsonl
-{"input_ids":[271,299,99],"attention_mask":[1,1,1],"labels":[271,-100,99]}
-{"input_ids":[87,227,8383,12],"attention_mask":[1,1,1,1],"labels":[87,227,8383,12]}
+  - path: ...
 ```
@@ -1,35 +0,0 @@
----
-title: Dataset Preprocessing
-description: How datasets are processed
----
-
-Dataset pre-processing is the step where Axolotl takes each dataset you've configured alongside
-the (dataset format)[../dataset-formats/] and prompt strategies to:
-- parse the dataset based on the *dataset format*
-- transform the dataset to how you would interact with the model based on the *prompt strategy*
-- tokenize the dataset based on the configured model & tokenizer
-- shuffle and merge multiple datasets together if using more than one
-
-The processing of the datasets can happen one of two ways:
-
-1. Before kicking off training by calling `python -m axolotl.cli.preprocess /path/to/your.yaml --debug`
-2. When training is started
-
-What are the benefits of pre-processing? When training interactively or for sweeps
-(e.g. you are restarting the trainer often), processing the datasets can oftentimes be frustratingly
-slow. Pre-processing will cache the tokenized/formatted datasets according to a hash of dependent
-training parameters so that it will intelligently pull from its cache when possible.
-
-The path of the cache is controlled by `dataset_prepared_path:` and is often left blank in example
-YAMLs as this leads to a more robust solution that prevents unexpectedly reusing cached data.
-
-If `dataset_prepared_path:` is left empty, when training, the processed dataset will be cached in a
-default path of `./last_run_prepared/`, but will ignore anything already cached there. By explicitly
-setting `dataset_prepared_path: ./last_run_prepared`, the trainer will use whatever pre-processed
-data is in the cache.
-
-What are the edge cases? Let's say you are writing a custom prompt strategy or using a user-defined
-prompt template. Because the trainer cannot readily detect these changes, we cannot change the
-calculated hash value for the pre-processed dataset. If you have `dataset_prepared_path: ...` set
-and change your prompt templating logic, it may not pick up the changes you made and you will be
-training over the old prompt.
|
||||
On the host that is running axolotl (ex: if you are using a remote host), clone the axolotl repo and change your current directory to the root:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/axolotl-ai-cloud/axolotl
|
||||
git clone https://github.com/OpenAccess-AI-Collective/axolotl
|
||||
cd axolotl
|
||||
```
|
||||
|
||||
|
||||
@@ -20,7 +20,7 @@ To enable `QLoRA` with `FSDP`, you need to perform the following steps:
 > See the [example config](#example-config) file in addition to reading these instructions.

 1. Set `adapter: qlora` in your axolotl config file.
-2. Enable FSDP in your axolotl config, as [described here](https://github.com/axolotl-ai-cloud/axolotl?tab=readme-ov-file#fsdp).
+2. Enable FSDP in your axolotl config, as [described here](https://github.com/OpenAccess-AI-Collective/axolotl?tab=readme-ov-file#fsdp).
 3. Use one of the supported model types: `llama`, `mistral` or `mixtral`.

 ## Example Config
@@ -29,7 +29,7 @@ To enable `QLoRA` with `FSDP`, you need to perform the following steps:

 ## References

-- [PR #1378](https://github.com/axolotl-ai-cloud/axolotl/pull/1378) enabling QLoRA in FSDP in Axolotl.
+- [PR #1378](https://github.com/OpenAccess-AI-Collective/axolotl/pull/1378) enabling QLoRA in FSDP in Axolotl.
 - [Blog Post](https://www.answer.ai/posts/2024-03-06-fsdp-qlora.html) from the [Answer.AI](https://www.answer.ai/) team describing the work that enabled QLoRA in FSDP.
 - Related HuggingFace PRs Enabling FDSP + QLoRA:
   - Accelerate [PR#2544](https://github.com/huggingface/accelerate/pull/2544 )
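A config-form sketch of the three steps listed above; the base model and FSDP values shown are common choices, not the contents of the example config the page links to:

```yaml
base_model: meta-llama/Llama-2-7b-hf       # step 3: a supported model type (llama)
adapter: qlora                             # step 1
load_in_4bit: true

fsdp:                                      # step 2: enable FSDP
  - full_shard
  - auto_wrap
fsdp_config:
  fsdp_offload_params: true
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
```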
@@ -25,7 +25,7 @@ description: "Template-free prompt construction with the `input_output` format"
 ### Masking Inputs

 One of the most popular features of
-[axolotl](https://github.com/axolotl-ai-cloud/axolotl) is
+[axolotl](https://github.com/OpenAccess-AI-Collective/axolotl) is
 setting the following configuration value:

@@ -33,7 +33,7 @@ setting the following configuration value:
 train_on_inputs: false
 ```

-If you declare a [dataset formats](https://github.com/axolotl-ai-cloud/axolotl?tab=readme-ov-file#dataset)
+If you declare a [dataset formats](https://github.com/OpenAccess-AI-Collective/axolotl?tab=readme-ov-file#dataset)
 such as `alpaca` or `chatml`, axolotl knows what is an input
 (i.e. human) vs. an output (i.e. the assistant) and masks the input
 labels so that your model can focus on predicting the outputs only.

@@ -49,7 +49,7 @@ remove_unused_columns: false
 chat_template: chatml
 datasets:
   - path: argilla/ultrafeedback-binarized-preferences-cleaned
-    type: chat_template.argilla
+    type: orpo.chat_template
 ```

 #### Using local dataset files
@@ -38,7 +38,7 @@ wandb_watch:
 wandb_name:
 wandb_log_model:

-output_dir: ./outputs/btlm-out
+output_dir: btlm-out
 gradient_accumulation_steps: 1
 micro_batch_size: 1
 num_epochs: 1

@@ -25,7 +25,7 @@ wandb_entity:
 wandb_watch:
 wandb_name:
 wandb_log_model:
-output_dir: ./outputs/qlora-out
+output_dir: ./qlora-out
 batch_size: 4
 micro_batch_size: 4
 num_epochs: 2

@@ -11,7 +11,7 @@ datasets:
     type: alpaca
 dataset_prepared_path:
 val_set_size: 0.05
-output_dir: ./outputs/lora-out
+output_dir: ./lora-out

 sequence_len: 4096
 sample_packing: true

@@ -11,7 +11,7 @@ datasets:
     type: alpaca
 dataset_prepared_path:
 val_set_size: 0.05
-output_dir: ./outputs/qlora-out
+output_dir: ./qlora-out

 adapter: qlora
 lora_model_dir:

@@ -11,7 +11,7 @@ datasets:
     type: alpaca
 dataset_prepared_path:
 val_set_size: 0.05
-output_dir: ./outputs/lora-out
+output_dir: ./lora-out

 sequence_len: 4096
 sample_packing: true

@@ -11,7 +11,7 @@ datasets:
     type: alpaca
 dataset_prepared_path:
 val_set_size: 0.05
-output_dir: ./outputs/qlora-out
+output_dir: ./qlora-out

 adapter: qlora
 lora_model_dir:
@@ -1,223 +1,216 @@
 {
   "cells": [
     {
       "cell_type": "markdown",
       "metadata": {
         "id": "AKjdG7tbTb-n"
       },
       "source": [
         "# Example notebook for running Axolotl on google colab"
       ]
     },
     {
       "cell_type": "code",
       "execution_count": null,
       "metadata": {
         "id": "RcbNpOgWRcii"
       },
       "outputs": [],
       "source": [
         "import torch\n",
         "# Check so there is a gpu available, a T4(free tier) is enough to run this notebook\n",
         "assert (torch.cuda.is_available()==True)"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {
         "id": "h3nLav8oTRA5"
       },
       "source": [
         "## Install Axolotl and dependencies"
       ]
     },
     {
       "cell_type": "code",
       "execution_count": null,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
         },
         "id": "3c3yGAwnOIdi",
         "outputId": "e3777b5a-40ef-424f-e181-62dfecd1dd01"
       },
       "outputs": [],
       "source": [
         "!pip install torch==\"2.1.2\"\n",
-        "!pip install -e git+https://github.com/axolotl-ai-cloud/axolotl#egg=axolotl\n",
+        "!pip install -e git+https://github.com/OpenAccess-AI-Collective/axolotl#egg=axolotl\n",
         "!pip install flash-attn==\"2.5.0\"\n",
-        "!pip install deepspeed==\"0.13.1\"\n",
-        "!pip install mlflow==\"2.13.0\""
+        "!pip install deepspeed==\"0.13.1\""
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {
         "id": "BW2MFr7HTjub"
       },
       "source": [
         "## Create an yaml config file"
       ]
     },
     {
       "cell_type": "code",
       "execution_count": null,
       "metadata": {
         "id": "9pkF2dSoQEUN"
       },
       "outputs": [],
       "source": [
         "import yaml\n",
         "\n",
         "# Your YAML string\n",
         "yaml_string = \"\"\"\n",
         "base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T\n",
         "model_type: LlamaForCausalLM\n",
         "tokenizer_type: LlamaTokenizer\n",
+        "is_llama_derived_model: true\n",
         "\n",
         "load_in_8bit: false\n",
         "load_in_4bit: true\n",
         "strict: false\n",
         "\n",
         "datasets:\n",
         "  - path: mhenrichsen/alpaca_2k_test\n",
         "    type: alpaca\n",
         "dataset_prepared_path:\n",
         "val_set_size: 0.05\n",
-        "output_dir: ./outputs/qlora-out\n",
+        "output_dir: ./qlora-out\n",
         "\n",
         "adapter: qlora\n",
         "lora_model_dir:\n",
         "\n",
-        "sequence_len: 4096\n",
+        "sequence_len: 1096\n",
         "sample_packing: true\n",
-        "eval_sample_packing: false\n",
         "pad_to_sequence_len: true\n",
         "\n",
         "lora_r: 32\n",
         "lora_alpha: 16\n",
         "lora_dropout: 0.05\n",
         "lora_target_modules:\n",
         "lora_target_linear: true\n",
         "lora_fan_in_fan_out:\n",
         "\n",
         "wandb_project:\n",
         "wandb_entity:\n",
         "wandb_watch:\n",
         "wandb_name:\n",
         "wandb_log_model:\n",
         "\n",
-        "gradient_accumulation_steps: 4\n",
-        "micro_batch_size: 2\n",
+        "mlflow_experiment_name: colab-example\n",
+        "\n",
+        "gradient_accumulation_steps: 1\n",
+        "micro_batch_size: 1\n",
         "num_epochs: 4\n",
+        "max_steps: 20\n",
         "optimizer: paged_adamw_32bit\n",
         "lr_scheduler: cosine\n",
         "learning_rate: 0.0002\n",
         "\n",
         "train_on_inputs: false\n",
         "group_by_length: false\n",
-        "bf16: auto\n",
-        "fp16:\n",
+        "bf16: false\n",
+        "fp16: true\n",
         "tf32: false\n",
         "\n",
         "gradient_checkpointing: true\n",
         "early_stopping_patience:\n",
         "resume_from_checkpoint:\n",
         "local_rank:\n",
         "logging_steps: 1\n",
         "xformers_attention:\n",
-        "flash_attention: true\n",
+        "flash_attention: false\n",
         "\n",
         "warmup_steps: 10\n",
-        "evals_per_epoch: 4\n",
-        "saves_per_epoch: 1\n",
+        "evals_per_epoch:\n",
+        "saves_per_epoch:\n",
         "debug:\n",
         "deepspeed:\n",
         "weight_decay: 0.0\n",
         "fsdp:\n",
         "fsdp_config:\n",
         "special_tokens:\n",
         "\n",
         "\"\"\"\n",
         "\n",
         "# Convert the YAML string to a Python dictionary\n",
         "yaml_dict = yaml.safe_load(yaml_string)\n",
         "\n",
         "# Specify your file path\n",
         "file_path = 'test_axolotl.yaml'\n",
         "\n",
         "# Write the YAML file\n",
         "with open(file_path, 'w') as file:\n",
         "    yaml.dump(yaml_dict, file)\n"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {
         "id": "bidoj8YLTusD"
       },
       "source": [
         "## Launch the training"
       ]
     },
     {
       "cell_type": "code",
       "execution_count": null,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
         },
         "id": "ydTI2Jk2RStU",
         "outputId": "d6d0df17-4b53-439c-c802-22c0456d301b"
       },
       "outputs": [],
       "source": [
-        "# By using the ! the comand will be executed as a bash command\n",
+        "# Buy using the ! the comand will be executed as a bash command\n",
         "!accelerate launch -m axolotl.cli.train /content/test_axolotl.yaml"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
         "## Play with inference"
       ]
     },
     {
       "cell_type": "code",
       "execution_count": null,
       "metadata": {},
       "outputs": [],
       "source": [
-        "# By using the ! the comand will be executed as a bash command\n",
+        "# Buy using the ! the comand will be executed as a bash command\n",
         "!accelerate launch -m axolotl.cli.inference /content/test_axolotl.yaml \\\n",
         "    --qlora_model_dir=\"./qlora-out\" --gradio"
       ]
     }
   ],
   "metadata": {
     "accelerator": "GPU",
     "colab": {
       "gpuType": "T4",
       "provenance": []
     },
     "kernelspec": {
-      "display_name": "Python 3 (ipykernel)",
-      "language": "python",
+      "display_name": "Python 3",
       "name": "python3"
     },
     "language_info": {
-      "codemirror_mode": {
-        "name": "ipython",
-        "version": 3
-      },
-      "file_extension": ".py",
-      "mimetype": "text/x-python",
-      "name": "python",
-      "nbconvert_exporter": "python",
-      "pygments_lexer": "ipython3",
-      "version": "3.12.1"
+      "name": "python"
     }
   },
   "nbformat": 4,
-  "nbformat_minor": 4
+  "nbformat_minor": 0
 }
@@ -1,81 +0,0 @@
-base_model: LnL-AI/dbrx-base-converted-v2
-trust_remote_code: true
-
-load_in_8bit: false
-load_in_4bit: false
-strict: false
-
-datasets:
-  - path: tatsu-lab/alpaca
-    type: alpaca
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.0
-output_dir: ./outputs/out
-
-sequence_len: 512
-sample_packing: false
-pad_to_sequence_len: false
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-adapter: lora
-lora_model_dir:
-lora_r: 8
-lora_alpha: 16
-lora_dropout: 0.05
-# w1, w2, & v1 will hang the trainer
-lora_target_modules:
-  - q_proj # attn
-  - k_proj # attn
-  - v_proj # attn
-  - out_proj # attn
-  - layer # router
-#  - w1
-#  - w2
-#  - v1
-
-gradient_accumulation_steps: 1
-micro_batch_size: 1
-num_epochs: 1
-optimizer: paged_adamw_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-train_on_inputs: false
-group_by_length: false
-bf16: auto
-fp16:
-tf32: false
-
-gradient_checkpointing: false # don't use with fsdp_activation_checkpointing
-gradient_checkpointing_kwargs:
-  use_reentrant: false
-early_stopping_patience:
-resume_from_checkpoint:
-local_rank:
-logging_steps: 1
-xformers_attention:
-flash_attention: true
-
-warmup_steps: 10
-evals_per_epoch:
-saves_per_epoch: 1
-debug:
-weight_decay: 0.0
-fsdp:
-  - full_shard
-  - auto_wrap
-fsdp_config:
-  fsdp_limit_all_gathers: true
-  fsdp_sync_module_states: true
-  fsdp_offload_params: false
-  fsdp_use_orig_params: false
-  fsdp_cpu_ram_efficient_loading: true
-  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
-  fsdp_transformer_layer_cls_to_wrap: DbrxBlock
-  fsdp_state_dict_type: FULL_STATE_DICT
-  fsdp_activation_checkpointing: true
@@ -1,81 +0,0 @@
-base_model: LnL-AI/dbrx-base-converted-v2
-trust_remote_code: true
-
-load_in_8bit: true
-load_in_4bit: false
-strict: false
-
-datasets:
-  - path: tatsu-lab/alpaca
-    type: alpaca
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.0
-output_dir: ./outputs/out
-
-sequence_len: 512
-sample_packing: false
-pad_to_sequence_len: false
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-adapter: lora
-lora_model_dir:
-lora_r: 8
-lora_alpha: 16
-lora_dropout: 0.05
-# w1, w2, & v1 will hang the trainer
-lora_target_modules:
-  - q_proj # attn
-  - k_proj # attn
-  - v_proj # attn
-  - out_proj # attn
-  - layer # router
-#  - w1
-#  - w2
-#  - v1
-
-gradient_accumulation_steps: 1
-micro_batch_size: 1
-num_epochs: 1
-optimizer: paged_adamw_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-train_on_inputs: false
-group_by_length: false
-bf16: auto
-fp16:
-tf32: false
-
-gradient_checkpointing: false # don't use with fsdp_activation_checkpointing
-gradient_checkpointing_kwargs:
-  use_reentrant: false
-early_stopping_patience:
-resume_from_checkpoint:
-local_rank:
-logging_steps: 1
-xformers_attention:
-flash_attention: true
-
-warmup_steps: 10
-evals_per_epoch:
-saves_per_epoch: 1
-debug:
-weight_decay: 0.0
-fsdp:
-  - full_shard
-  - auto_wrap
-fsdp_config:
-  fsdp_limit_all_gathers: true
-  fsdp_sync_module_states: true
-  fsdp_offload_params: false
-  fsdp_use_orig_params: false
-  fsdp_cpu_ram_efficient_loading: true
-  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
-  fsdp_transformer_layer_cls_to_wrap: DbrxBlock
-  fsdp_state_dict_type: FULL_STATE_DICT
-  fsdp_activation_checkpointing: true

@@ -1,26 +0,0 @@
# DBRX MoE

Currently, for LoRA, only the `q_proj`, `k_proj`, `v_proj`, `out_proj`, and `layer` Linear layers are trainable.

We use the "converted" base models (see [this issue](https://huggingface.co/databricks/dbrx-instruct/discussions/10)),
where the experts are fused as an `nn.Parameter` rather than an `nn.Linear` layer. The implementation
is still somewhat buggy, and attempting to train a LoRA adapter over the `w1`, `w2`, and `v1` layers
hangs the trainer.


### FSDP
We've tested using the [`LnL-AI/dbrx-base-converted-v2`](https://huggingface.co/LnL-AI/dbrx-base-converted-v2) model as the base model for FSDP.

The high memory usage seen w/ FSDP is due to FSDP not supporting 8bit optimizers.

- 16-bit LoRA w/ FSDP
  - ✅ w/o CPU Offload - 8x80GB uses ~80GiB/gpu
  - ❌ w/ CPU Offload - `paged_adamw_8bit` optimizer errors from being on cpu
- ✅ 8-bit LoRA w/ FSDP
- ❌ 4-bit QLoRA w/ FSDP - errors w/: `Error an illegal memory access was encountered at line 90 in file /src/csrc/ops.cu`
- ✅ bf16 full finetune w/ FSDP, freezing all but first 8 layers (8x80GB uses ~78GiB/gpu)


### Deepspeed

WIP
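
As a reference for why those fused layers can't be targeted: PEFT injects LoRA by wrapping `nn.Linear` submodules found via `named_modules()`, and a fused expert weight stored as a bare `nn.Parameter` never appears there. A minimal sketch (hypothetical module, not the DBRX source):

```python
import torch
import torch.nn as nn

# Minimal sketch (hypothetical module, not the DBRX source): LoRA targets
# nn.Linear submodules, but a fused expert weight kept as an nn.Parameter
# has no submodule for PEFT to wrap.
class FusedMoeBlock(nn.Module):
    def __init__(self, hidden=64, ffn=256, n_experts=4):
        super().__init__()
        self.q_proj = nn.Linear(hidden, hidden)  # targetable by LoRA
        # fused experts: one big tensor, not a module
        self.w1 = nn.Parameter(torch.empty(n_experts, ffn, hidden))

block = FusedMoeBlock()
linear_names = [n for n, m in block.named_modules() if isinstance(m, nn.Linear)]
print(linear_names)  # ['q_proj']; w1 is invisible to module-based targeting
```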

@@ -1,56 +0,0 @@
base_model: LnL-AI/dbrx-base-converted-v2
trust_remote_code: true

load_in_8bit: false
load_in_4bit: false
strict: false

datasets:
  - path: tatsu-lab/alpaca
    type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/out

sequence_len: 512
sample_packing: false
pad_to_sequence_len: false

unfrozen_parameters:
  - transformer.blocks.[0-7].

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 1
optimizer: paged_adamw_8bit
lr_scheduler: cosine
learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true

warmup_steps: 10
evals_per_epoch:
saves_per_epoch: 1
debug:
weight_decay: 0.0
deepspeed: deepspeed_configs/zero3_bf16.json
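
The `unfrozen_parameters` entries above are regular expressions matched against parameter names; everything that doesn't match stays frozen. A minimal sketch of that behavior (the model and patterns are stand-ins, not axolotl's implementation):

```python
import re
import torch.nn as nn

# Stand-in model and patterns; illustrates regex-based selective unfreezing,
# not axolotl's actual implementation.
model = nn.Sequential(nn.Embedding(10, 8), nn.Linear(8, 8), nn.Linear(8, 10))
patterns = [r"^0\.", r"^2\."]  # unfreeze the embedding and the head

for name, param in model.named_parameters():
    param.requires_grad = any(re.search(p, name) for p in patterns)

trainable = [n for n, p in model.named_parameters() if p.requires_grad]
print(trainable)  # ['0.weight', '2.weight', '2.bias']
```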

@@ -28,7 +28,7 @@ wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
output_dir: ./outputs/falcon-7b
output_dir: ./falcon-7b
batch_size: 2
micro_batch_size: 1
num_epochs: 4

@@ -42,7 +42,7 @@ wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
output_dir: ./outputs/qlora-out
output_dir: ./qlora-out

# QLoRA paper Table 9
# - 16 for 7b & 13b

@@ -28,7 +28,7 @@ wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
output_dir: ./outputs/falcon-7b
output_dir: ./falcon-7b
batch_size: 2
micro_batch_size: 1
num_epochs: 4

@@ -12,7 +12,7 @@ datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
val_set_size: 0.1
output_dir: ./outputs/out
output_dir: ./out

adapter: qlora
lora_r: 32

@@ -1,68 +0,0 @@
base_model: google/gemma-2-9b
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer

load_in_8bit: false
load_in_4bit: true
strict: false

# huggingface repo
chat_template: gemma
datasets:
  - path: cgato/SlimOrcaDedupCleaned
    type: chat_template
    chat_template: gemma
    drop_system_message: true
val_set_size: 0.0
output_dir: ./outputs/out

adapter: qlora
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true

sequence_len: 2048
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:


gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: true

gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch:
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:

@@ -23,7 +23,7 @@ wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
output_dir: ./outputs/qlora-out
output_dir: ./qlora-out
gradient_accumulation_steps: 2
micro_batch_size: 2
num_epochs: 2

@@ -10,7 +10,7 @@ datasets:
    type: alpaca
dataset_prepared_path:
val_set_size: 0.0
output_dir: ./outputs/out
output_dir: ./out

sequence_len: 4096
sample_packing: false

@@ -10,7 +10,7 @@ datasets:
    type: alpaca
dataset_prepared_path:
val_set_size: 0.0
output_dir: ./outputs/out
output_dir: ./out

sequence_len: 4096
sample_packing: false

@@ -21,7 +21,7 @@ wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
output_dir: ./outputs/jeopardy-bot-7b
output_dir: ./jeopardy-bot-7b
gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 4

@@ -11,7 +11,7 @@ datasets:
    type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.05
output_dir: ./outputs/out
output_dir: ./out

sequence_len: 4096
sample_packing: true

@@ -33,7 +33,7 @@ wandb_project:
wandb_watch:
wandb_name:
wandb_log_model:
output_dir: ./outputs/model-out
output_dir: ./model-out
gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 4

@@ -11,7 +11,7 @@ datasets:
    type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.05
output_dir: ./outputs/lisa-out
output_dir: ./lisa-out

sequence_len: 4096
sample_packing: true

@@ -11,7 +11,7 @@ datasets:
    type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/lora-out
output_dir: ./lora-out

sequence_len: 4096
sample_packing: true

@@ -11,7 +11,7 @@ datasets:
    type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/lora-out
output_dir: ./lora-out

sequence_len: 4096
sample_packing: true

@@ -11,7 +11,7 @@ datasets:
    type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.05
output_dir: ./outputs/qlora-out
output_dir: ./qlora-out

adapter: qlora
lora_model_dir:

@@ -65,14 +65,12 @@ deepspeed:
weight_decay: 0.0
fsdp:
  - full_shard
  - auto_wrap
fsdp_config:
  fsdp_limit_all_gathers: true
  fsdp_sync_module_states: true
  fsdp_offload_params: true
  fsdp_use_orig_params: false
  fsdp_cpu_ram_efficient_loading: true
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_state_dict_type: SHARDED_STATE_DICT
special_tokens:

@@ -11,7 +11,7 @@ datasets:
    type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/qlora-out
output_dir: ./qlora-out

adapter: qlora
lora_model_dir:

@@ -12,7 +12,7 @@ datasets:
    type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/relora-out
output_dir: ./relora-out

adapter: qlora
lora_model_dir:

@@ -1,13 +0,0 @@
# Llama-3

https://llama.meta.com/llama3/

[8B Base Model](https://huggingface.co/meta-llama/Meta-Llama-3-8B)
- [Full Fine Tune](./fft-8b.yaml)
  - Single GPU @ 48GB VRAM
- [LoRA](./lora-8b.yml)
  - Single GPU @ 11GB VRAM

[70B Base Model](https://huggingface.co/meta-llama/Meta-Llama-3-70B)
- [QLORA+FSDP](./qlora-fsdp-70b.yaml)
  - Dual GPU @ 21GB VRAM

@@ -1,58 +0,0 @@
base_model: meta-llama/Meta-Llama-3-8B
model_type: LlamaForCausalLM
tokenizer_type: AutoTokenizer

load_in_8bit: false
load_in_4bit: false
strict: false

datasets:
  - path: tatsu-lab/alpaca
    type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.05
output_dir: ./outputs/out

sequence_len: 8192
sample_packing: true
pad_to_sequence_len: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 8
micro_batch_size: 1
num_epochs: 1
optimizer: paged_adamw_8bit
lr_scheduler: cosine
learning_rate: 2e-5

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
early_stopping_patience:
resume_from_checkpoint:
logging_steps: 1
xformers_attention:
flash_attention: true

warmup_steps: 100
evals_per_epoch: 2
eval_table_size:
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
  pad_token: <|end_of_text|>

@@ -1,76 +0,0 @@
base_model: meta-llama/Meta-Llama-3-8B-Instruct
model_type: LlamaForCausalLM
tokenizer_type: AutoTokenizer

load_in_8bit: true
load_in_4bit: false
strict: false

chat_template: llama3
datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template
    chat_template: llama3
    field_messages: messages
    message_field_role: role
    message_field_content: content
    roles:
      user:
        - user
      assistant:
        - assistant

dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/lora-out

sequence_len: 4096
sample_packing: false
pad_to_sequence_len: true

adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false

gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
s2_attention:

warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
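
The `chat_template: llama3` mapping above renders each `messages` list into a single training string. A rough illustration with the transformers tokenizer API (the example row is made up, and the repo is gated; any checkpoint shipping the same template works):

```python
from transformers import AutoTokenizer

# Rough illustration (hypothetical example row); renders a `messages` list
# with the tokenizer's built-in llama3 chat template.
tok = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
messages = [
    {"role": "user", "content": "Give me a haiku about axolotls."},
    {"role": "assistant", "content": "Pink gills in still water."},
]
print(tok.apply_chat_template(messages, tokenize=False))
# -> "<|begin_of_text|><|start_header_id|>user<|end_header_id|>..." etc.
```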

@@ -1,71 +0,0 @@
base_model: meta-llama/Meta-Llama-3-8B
model_type: LlamaForCausalLM
tokenizer_type: AutoTokenizer

load_in_8bit: true
load_in_4bit: false
strict: false

datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/lora-out

sequence_len: 4096
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true

adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:
lora_modules_to_save:
  - embed_tokens
  - lm_head

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false

gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
s2_attention:

warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
  pad_token: <|end_of_text|>

@@ -1,80 +0,0 @@
base_model: casperhansen/llama-3-70b-fp16
model_type: LlamaForCausalLM
tokenizer_type: AutoTokenizer # PreTrainedTokenizerFast

load_in_8bit: false
load_in_4bit: true
strict: false

datasets:
  - path: tatsu-lab/alpaca
    type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.05
output_dir: ./outputs/out/qlora-llama3-70b

adapter: qlora
lora_model_dir:

sequence_len: 512
sample_packing: false
pad_to_sequence_len: true

lora_r: 8
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules:
lora_target_linear: true
lora_fan_in_fan_out:

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 4
optimizer: adamw_torch
lr_scheduler: cosine
learning_rate: 0.00001

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true

warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
  - full_shard
  - auto_wrap
fsdp_config:
  fsdp_limit_all_gathers: true
  fsdp_sync_module_states: true
  fsdp_offload_params: true
  fsdp_use_orig_params: false
  fsdp_cpu_ram_efficient_loading: true
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_sharding_strategy: FULL_SHARD
special_tokens:
  pad_token: <|end_of_text|>

@@ -1,67 +0,0 @@
base_model: meta-llama/Meta-Llama-3-8B
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer

load_in_8bit: false
load_in_4bit: true
strict: false

datasets:
  - path: aaditya/alpaca_subset_1
    type: alpaca
dataset_prepared_path:
val_set_size: 0
output_dir: ./outputs/qlora-out

adapter: qlora
lora_model_dir:

sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true

lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules:
lora_target_linear: true
lora_fan_in_fan_out:

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 4
optimizer: paged_adamw_32bit
lr_scheduler: cosine
learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false

gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true

warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
  pad_token: "<|end_of_text|>"

@@ -12,7 +12,7 @@ datasets:
    type: alpaca
dataset_prepared_path:
val_set_size: 0.0
output_dir: ./outputs/out
output_dir: ./out

sequence_len: 2048
sample_packing: false

@@ -1,63 +0,0 @@
base_model: mistral-community/Mixtral-8x22B-v0.1
model_type: AutoModelForCausalLM
tokenizer_type: LlamaTokenizer
trust_remote_code: true

load_in_8bit: false
load_in_4bit: false
strict: false

unfrozen_parameters:
  - ^lm_head.weight$
  - ^model.embed_tokens.weight$
  - model.layers.4[4-9]+.block_sparse_moe.gate
  - model.layers.4[4-9]+.block_sparse_moe.experts
  - model.layers.5[0-5]+.block_sparse_moe.gate
  - model.layers.5[0-5]+.block_sparse_moe.experts

model_config:
  output_router_logits: true

datasets:
  - path: tatsu-lab/alpaca
    type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.05
output_dir: ./outputs/out

sequence_len: 2048
sample_packing: true
pad_to_sequence_len: true

gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 3
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0001

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false

gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true

save_total_limit: 1
save_steps:
debug:
deepspeed: deepspeed_configs/zero3_bf16_cpuoffload_params.json
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
  eos_token: "<|im_end|>"
tokens:
  - "<|im_start|>"

@@ -11,7 +11,7 @@ datasets:
    type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/out
output_dir: ./out

sequence_len: 8192
sample_packing: true

@@ -11,7 +11,7 @@ datasets:
    type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0
output_dir: ./outputs/lora-out
output_dir: ./lora-out
eval_sample_packing: false

adapter: lora

@@ -11,7 +11,7 @@ datasets:
    type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.1
output_dir: ./outputs/lora-out
output_dir: ./lora-out

adapter: lora
lora_model_dir:

@@ -1,82 +0,0 @@
base_model: mistralai/Mixtral-8x7B-v0.1
model_type: AutoModelForCausalLM
tokenizer_type: LlamaTokenizer
trust_remote_code: true

load_in_8bit: false
load_in_4bit: true
strict: false

datasets:
  - path: tatsu-lab/alpaca
    type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.02
output_dir: ./outputs/qlora-out

model_config:
  output_router_logits: true

adapter: qlora
lora_model_dir:

sequence_len: 1024
sample_packing: false
pad_to_sequence_len: false

lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: paged_adamw_8bit
lr_scheduler: cosine
learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false

gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true

loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
weight_decay: 0.0
fsdp:
  - full_shard
  - auto_wrap
fsdp_config:
  fsdp_limit_all_gathers: true
  fsdp_sync_module_states: true
  fsdp_offload_params: false
  fsdp_use_orig_params: false
  fsdp_cpu_ram_efficient_loading: false
  fsdp_transformer_layer_cls_to_wrap: MistralDecoderLayer
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
special_tokens:

@@ -1,82 +0,0 @@
base_model: mistralai/Mistral-7B-v0.1
model_type: MistralForCausalLM
tokenizer_type: LlamaTokenizer

load_in_8bit: false
load_in_4bit: true
strict: false

rl: orpo
orpo_alpha: 0.1
remove_unused_columns: false

chat_template: chatml
datasets:
  - path: argilla/ultrafeedback-binarized-preferences-cleaned
    type: chat_template.argilla
dataset_prepared_path: last_run_prepared
val_set_size: 0.1
output_dir: ./outputs/mistral-qlora-orpo-out

adapter: qlora
lora_model_dir:

sequence_len: 4096
sample_packing: false
pad_to_sequence_len: true

lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:
lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false

gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true

loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
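
For reference, `rl: orpo` with `orpo_alpha: 0.1` corresponds, per the ORPO paper, to adding an odds-ratio term on top of the usual SFT loss, weighted by alpha. A sketch of that objective (not axolotl's code; inputs are length-averaged log-probs of the chosen and rejected responses):

```python
import torch
import torch.nn.functional as F

# Sketch of the ORPO objective from the paper, not axolotl's implementation:
# loss = NLL(chosen) + alpha * (-logsigmoid(log odds ratio)),
# where odds(y|x) = P(y|x) / (1 - P(y|x)).
def orpo_loss(logp_chosen, logp_rejected, nll_chosen, alpha=0.1):
    # log(odds) = logp - log(1 - exp(logp)); log1p keeps this stable
    log_odds_ratio = (logp_chosen - logp_rejected) - (
        torch.log1p(-torch.exp(logp_chosen))
        - torch.log1p(-torch.exp(logp_rejected))
    )
    return nll_chosen + alpha * -F.logsigmoid(log_odds_ratio)
```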

@@ -1,81 +0,0 @@
base_model: mistral-community/Mixtral-8x22B-v0.1
model_type: AutoModelForCausalLM
tokenizer_type: LlamaTokenizer

load_in_8bit: false
load_in_4bit: true
strict: false

datasets:
  - path: tatsu-lab/alpaca
    type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.02
output_dir: ./outputs/qlora-out

model_config:
  output_router_logits: true

adapter: qlora
lora_model_dir:

sequence_len: 1024
sample_packing: false
pad_to_sequence_len: false

lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_torch
lr_scheduler: cosine
learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: true

gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true

loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
weight_decay: 0.0
fsdp:
  - full_shard
  - auto_wrap
fsdp_config:
  fsdp_limit_all_gathers: true
  fsdp_sync_module_states: true
  fsdp_offload_params: true
  fsdp_use_orig_params: false
  fsdp_cpu_ram_efficient_loading: true
  fsdp_transformer_layer_cls_to_wrap: MixtralSparseMoeBlock
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
special_tokens:

@@ -12,7 +12,7 @@ datasets:
    type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.02
output_dir: ./outputs/qlora-out
output_dir: ./qlora-out

model_config:
  output_router_logits: true

@@ -39,7 +39,7 @@ wandb_log_model:
gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_torch
optimizer: paged_adamw_8bit
lr_scheduler: cosine
learning_rate: 0.0002

@@ -47,7 +47,7 @@ train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: true
tf32: false

gradient_checkpointing: true
early_stopping_patience:

@@ -69,17 +69,6 @@ debug:
weight_decay: 0.0
fsdp:
  - full_shard
  - auto_wrap
fsdp_config:
  fsdp_limit_all_gathers: true
  fsdp_sync_module_states: true
  fsdp_offload_params: true
  fsdp_use_orig_params: false
  fsdp_cpu_ram_efficient_loading: true
  fsdp_transformer_layer_cls_to_wrap: MixtralSparseMoeBlock
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_sharding_strategy: FULL_SHARD
  fsdp_forward_prefetch: false
  fsdp_backward_prefetch: BACKWARD_PRE
special_tokens:

@@ -12,7 +12,7 @@ datasets:
    type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/qlora-out
output_dir: ./qlora-out

## You can optionally freeze the entire model and unfreeze a subset of parameters
unfrozen_parameters:

@@ -1,61 +0,0 @@
base_model: mistral-community/Mixtral-8x22B-v0.1
model_type: AutoModelForCausalLM
tokenizer_type: LlamaTokenizer
trust_remote_code: true

load_in_8bit: false
load_in_4bit: false
strict: false

unfrozen_parameters:
  - ^lm_head.weight$
  - ^model.embed_tokens.weight$
  - model.layers.4[4-9]+.block_sparse_moe.gate
  - model.layers.4[4-9]+.block_sparse_moe.experts
  - model.layers.5[0-5]+.block_sparse_moe.gate
  - model.layers.5[0-5]+.block_sparse_moe.experts

model_config:
  output_router_logits: true

datasets:
  - path: yahma/alpaca-cleaned
    type: alpaca
output_dir: ./outputs/out

sequence_len: 8000
sample_packing: true
pad_to_sequence_len: true

gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 3
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0001

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false

gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true

save_total_limit: 1
save_steps:
debug:
deepspeed: deepspeed_configs/zero3_bf16_cpuoffload_all.json
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
  eos_token: "<|im_end|>"
tokens:
  - "<|im_start|>"

@@ -11,7 +11,7 @@ datasets:
    type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.1
output_dir: ./outputs/qlora-out
output_dir: ./qlora-out

adapter: qlora
lora_model_dir:

@@ -23,7 +23,7 @@ wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
output_dir: ./outputs/mpt-alpaca-7b
output_dir: ./mpt-alpaca-7b
gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 4

@@ -25,7 +25,7 @@ wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
output_dir: ./outputs/openllama-out
output_dir: ./openllama-out
gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 4

@@ -31,7 +31,7 @@ wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
output_dir: ./outputs/lora-out
output_dir: ./lora-out
gradient_accumulation_steps: 1
micro_batch_size: 2
num_epochs: 4

@@ -25,7 +25,7 @@ wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
output_dir: ./outputs/qlora-out
output_dir: ./qlora-out
gradient_accumulation_steps: 1
micro_batch_size: 2
num_epochs: 4

@@ -12,7 +12,7 @@ datasets:

dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/phi-sft-out
output_dir: ./phi-sft-out

sequence_len: 2048
sample_packing: true

@@ -12,7 +12,7 @@ datasets:

dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/phi-sft-out
output_dir: ./phi-sft-out

sequence_len: 2048
sample_packing: true

@@ -12,7 +12,7 @@ datasets:

dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/phi-sft-out
output_dir: ./phi-sft-out

sequence_len: 2048
sample_packing: true

@@ -1,83 +0,0 @@
base_model: microsoft/Phi-3-mini-4k-instruct
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer

load_in_8bit: false
load_in_4bit: false
strict: false

datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca

dataset_prepared_path:
val_set_size: 0
output_dir: ./phi-sft-out

sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true
trust_remote_code: true

adapter:
lora_model_dir:
lora_r:
lora_alpha:
lora_dropout:
lora_target_linear:
lora_fan_in_fan_out:

wandb_project: phi3
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 2
micro_batch_size: 12
num_epochs: 2
optimizer: adamw_torch
adam_beta2: 0.95
adam_epsilon: 0.00001
max_grad_norm: 1.0
lr_scheduler: cosine
learning_rate: 0.000003

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true

warmup_steps: 100
evals_per_epoch: 4
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.1
fsdp:
  - full_shard
  - auto_wrap
fsdp_config:
  fsdp_limit_all_gathers: true
  fsdp_sync_module_states: true
  fsdp_offload_params: true
  fsdp_use_orig_params: false
  fsdp_cpu_ram_efficient_loading: true
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_transformer_layer_cls_to_wrap: Phi3DecoderLayer
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_sharding_strategy: FULL_SHARD
resize_token_embeddings_to_32x: true
special_tokens:
  pad_token: "<|endoftext|>"
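
`resize_token_embeddings_to_32x` pads the embedding matrix up to the next multiple of 32 after special tokens are added, which tends to be friendlier to tensor-core kernels. The rounding itself is just (a sketch, not axolotl's code):

```python
def round_up_to_32x(vocab_size: int) -> int:
    # Next multiple of 32 at or above vocab_size (sketch, not axolotl's code).
    return ((vocab_size + 31) // 32) * 32

print(round_up_to_32x(32011))  # 32032
```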

@@ -1,64 +0,0 @@
base_model: microsoft/Phi-3-mini-4k-instruct
trust_remote_code: true
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
chat_template: phi_3

load_in_8bit: false
load_in_4bit: false
strict: false

datasets:
  - path: garage-bAInd/Open-Platypus
    type: alpaca:phi

dataset_prepared_path:
val_set_size: 0.01
output_dir: ./out

sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true

adapter: lora
lora_model_dir:
lora_r: 64
lora_alpha: 32
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:

gradient_accumulation_steps: 1
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_torch
adam_beta2: 0.95
adam_epsilon: 0.00001
max_grad_norm: 1.0
lr_scheduler: cosine
learning_rate: 5.0e-6

train_on_inputs: false
group_by_length: false
bf16: auto

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: True
early_stopping_patience: 3
logging_steps: 1
flash_attention: true

eval_steps: 1000
save_steps: 5000
eval_table_size: 2
eval_batch_size: 2
eval_sample_packing: false
eval_max_new_tokens: 32
eval_causal_lm_metrics: ["perplexity"]
do_causal_lm_eval: true

warmup_ratio: 0.2
debug: true
weight_decay: 0.1
resize_token_embeddings_to_32x: true

@@ -26,7 +26,7 @@ wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
output_dir: ./outputs/pythia-12b
output_dir: ./pythia-12b
gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 5

@@ -20,7 +20,7 @@ wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
output_dir: ./outputs/lora-alpaca-pythia
output_dir: ./lora-alpaca-pythia
gradient_accumulation_steps: 1
micro_batch_size: 4
num_epochs: 4

@@ -13,7 +13,7 @@ datasets:
    type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/lora-out
output_dir: ./lora-out

sequence_len: 2048 # supports up to 8192
sample_packing: false

@@ -13,7 +13,7 @@ datasets:
    type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/lora-out
output_dir: ./lora-out

sequence_len: 2048 # supports up to 8192
sample_packing: false

@@ -10,7 +10,7 @@ datasets:
    type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/out
output_dir: ./out

sequence_len: 1024 # supports up to 32k
sample_packing: false

@@ -10,13 +10,13 @@ datasets:
    type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/out
output_dir: ./out

sequence_len: 1024 # supports up to 32k
sample_packing: false
pad_to_sequence_len: false

adapter: qlora
adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16

@@ -1,75 +0,0 @@
base_model: Qwen/Qwen2-7B
trust_remote_code: true

load_in_8bit: false
load_in_4bit: true
strict: false

datasets:
  - path: tatsu-lab/alpaca
    type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/out

sequence_len: 2048
sample_packing: true
eval_sample_packing: true
pad_to_sequence_len: true

adapter: qlora
lora_model_dir:
lora_r: 32
lora_alpha: 64
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 4
optimizer: adamw_torch
lr_scheduler: cosine
learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true

warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
  - full_shard
  - auto_wrap
fsdp_config:
  fsdp_limit_all_gathers: true
  fsdp_sync_module_states: true
  fsdp_offload_params: true
  fsdp_use_orig_params: false
  fsdp_cpu_ram_efficient_loading: true
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_transformer_layer_cls_to_wrap: Qwen2DecoderLayer
  fsdp_state_dict_type: FULL_STATE_DICT
special_tokens:

@@ -24,7 +24,7 @@ wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
output_dir: ./outputs/redpajama-alpaca-3b
output_dir: ./redpajama-alpaca-3b
batch_size: 4
micro_batch_size: 1
num_epochs: 4

@@ -23,7 +23,7 @@ wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
output_dir: ./outputs/lora-replit
output_dir: ./lora-replit
batch_size: 8
micro_batch_size: 1
num_epochs: 4

@@ -12,7 +12,7 @@ datasets:
    type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.05
output_dir: ./outputs/out
output_dir: ./out

sequence_len: 4096
sample_packing: true

@@ -12,7 +12,7 @@ datasets:
    type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/lora-out
output_dir: ./lora-out

sequence_len: 4096
sample_packing: true

@@ -11,7 +11,7 @@ datasets:

dataset_prepared_path:
val_set_size: 0.2
output_dir: ./outputs/qlora
output_dir: ./qlora

adapter: qlora
lora_model_dir:

@@ -11,7 +11,7 @@ datasets:
    type: alpaca
dataset_prepared_path:
val_set_size: 0
output_dir: ./outputs/lora-out
output_dir: ./lora-out

sequence_len: 4096
sample_packing: true

@@ -11,7 +11,7 @@ datasets:
    type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/lora-out
output_dir: ./lora-out

sequence_len: 4096
sample_packing: true

@@ -14,7 +14,7 @@ pretraining_dataset:
    type: pretrain
dataset_prepared_path:
val_set_size: 0.0
output_dir: ./outputs/model-out
output_dir: ./model-out

sequence_len: 2048
sample_packing: true

@@ -11,14 +11,13 @@ datasets:
    type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/qlora-out
output_dir: ./qlora-out

adapter: qlora
lora_model_dir:

sequence_len: 4096
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true

lora_r: 32

Some files were not shown because too many files have changed in this diff.