Compare commits
51 Commits
enable_tp ... iterable-o
| SHA1 |
|---|
| 1cfb8feb2d |
| 887513285d |
| 20620771f1 |
| 6086162488 |
| b2774af66c |
| 74f9782fc3 |
| 8a7a0b07dc |
| 8fb72cbc0b |
| bb9d4102c4 |
| af727eedf7 |
| 8606093921 |
| cba5a457d9 |
| 19cd83d408 |
| 1ed4de73b6 |
| f89e962119 |
| bc1c9c20e3 |
| dd26cc3c0f |
| d8b4027200 |
| fb3352e21c |
| ed77e7001e |
| 7669a03fb4 |
| 6553683170 |
| 5e0124e2ab |
| 2e8d7c1adb |
| 3c1921e400 |
| 7faf2b6e8e |
| c1b920f291 |
| 3915abee4c |
| 7a38dbe674 |
| e0a2eb2ebd |
| d852d7af7a |
| 3742deb1de |
| 2312caaa98 |
| 307cf7c685 |
| 70541145f1 |
| 42bd32a233 |
| 5b8fb5e939 |
| bd2a594b89 |
| 3798229d85 |
| 10cfecf02e |
| 339f3c67e2 |
| d91feaffc8 |
| e246ceffa4 |
| 8ddc18ec8d |
| 1c14c4a15c |
| 1f623e6cc8 |
| f865464ae5 |
| 33090486d7 |
| effc4dc409 |
| 02629c7cdf |
| 78a4aa86d6 |
`.github/workflows/lint.yml` (vendored, 1 change)

```diff
@@ -1,6 +1,7 @@
 name: lint
 on:
   # check on PRs, and manual triggers
+  merge_group:
   pull_request:
     paths:
       - '**.py'
```
`.github/workflows/main.yml` (vendored, 4 changes)

```diff
@@ -25,7 +25,6 @@ jobs:
            python_version: "3.11"
            pytorch: 2.3.1
            axolotl_extras: mamba-ssm
-           is_latest: true
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
@@ -36,6 +35,7 @@ jobs:
            python_version: "3.11"
            pytorch: 2.5.1
            axolotl_extras:
+           is_latest: true
    runs-on: axolotl-gpu-runner
    steps:
      - name: Checkout
@@ -92,7 +92,6 @@ jobs:
            python_version: "3.11"
            pytorch: 2.3.1
            axolotl_extras:
-           is_latest: true
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
@@ -103,6 +102,7 @@ jobs:
            python_version: "3.11"
            pytorch: 2.5.1
            axolotl_extras:
+           is_latest: true
    runs-on: axolotl-gpu-runner
    steps:
      - name: Checkout
```
`.github/workflows/multi-gpu-e2e.yml` (vendored, 2 changes)

```diff
@@ -52,7 +52,7 @@ jobs:
      - name: Install Modal
        run: |
          python -m pip install --upgrade pip
-         pip install modal==0.63.64 jinja2
+         pip install modal==0.71.8 jinja2
      - name: Update env vars
        run: |
          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
```
`.github/workflows/tests-nightly.yml` (vendored, 7 changes)

```diff
@@ -44,6 +44,11 @@ jobs:
          python-version: ${{ matrix.python_version }}
          cache: 'pip' # caching pip dependencies
 
+      - name: upgrade pip
+        run: |
+          pip3 install --upgrade pip
+          pip3 install --upgrade packaging setuptools wheel
+
      - name: Install PyTorch
        run: |
          pip3 install torch==${{ matrix.pytorch_version }} --index-url https://download.pytorch.org/whl/cpu
@@ -124,7 +129,7 @@ jobs:
      - name: Install Modal
        run: |
          python -m pip install --upgrade pip
-         pip install modal==0.63.64 jinja2
+         pip install modal==0.71.8 jinja2
      - name: Update env vars
        run: |
          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
```
`.github/workflows/tests.yml` (vendored, 41 changes)

```diff
@@ -1,6 +1,7 @@
 name: Tests
 on:
   # check on push/merge to main, PRs, and manual triggers
+  merge_group:
   push:
     branches:
       - "main"
@@ -60,6 +61,15 @@ jobs:
      - name: Check out repository code
        uses: actions/checkout@v4
 
+      - name: Restore HF cache
+        id: hf-cache-restore
+        uses: actions/cache/restore@v4
+        with:
+          path: |
+            /home/runner/.cache/huggingface/hub/datasets--*
+            /home/runner/.cache/huggingface/hub/models--*
+          key: ${{ runner.os }}-hf-hub-cache-${{ hashFiles('**/conftest.py') }}
+
      - name: Setup Python
        uses: actions/setup-python@v5
        with:
@@ -100,6 +110,15 @@ jobs:
        run: |
          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
 
+      - name: Save HF cache
+        id: hf-cache
+        uses: actions/cache/save@v4
+        with:
+          path: |
+            /home/runner/.cache/huggingface/hub/datasets--*
+            /home/runner/.cache/huggingface/hub/models--*
+          key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
+
  pytest-sdist:
    name: PyTest from Source Dist
    runs-on: ubuntu-latest
@@ -115,6 +134,15 @@ jobs:
      - name: Check out repository code
        uses: actions/checkout@v4
 
+      - name: Restore HF cache
+        id: hf-cache-restore
+        uses: actions/cache/restore@v4
+        with:
+          path: |
+            /home/runner/.cache/huggingface/hub/datasets--*
+            /home/runner/.cache/huggingface/hub/models--*
+          key: ${{ runner.os }}-hf-hub-cache-${{ hashFiles('**/conftest.py') }}
+
      - name: Setup Python
        uses: actions/setup-python@v5
        with:
@@ -156,6 +184,15 @@ jobs:
        run: |
          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
 
+      - name: Save HF cache
+        id: hf-cache
+        uses: actions/cache/save@v4
+        with:
+          path: |
+            /home/runner/.cache/huggingface/hub/datasets--*
+            /home/runner/.cache/huggingface/hub/models--*
+          key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
+
  docker-e2e-tests-1st:
    if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' }}
    # this job needs to be run on self-hosted GPU runners...
@@ -183,7 +220,7 @@ jobs:
      - name: Install Modal
        run: |
          python -m pip install --upgrade pip
-         pip install modal==0.63.64 jinja2
+         pip install modal==0.71.8 jinja2
      - name: Update env vars
        run: |
          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
@@ -229,7 +266,7 @@ jobs:
      - name: Install Modal
        run: |
          python -m pip install --upgrade pip
-         pip install modal==0.63.64 jinja2
+         pip install modal==0.71.8 jinja2
      - name: Update env vars
        run: |
          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
```
`.gitignore` (vendored, 1 change)

```diff
@@ -1,6 +1,7 @@
 **/axolotl.egg-info
 configs
 last_run_prepared/
+outputs
 .vscode
 _site/
 
```
*(pre-commit config)*

```diff
@@ -23,7 +23,7 @@ repos:
    hooks:
      - id: flake8
  - repo: https://github.com/PyCQA/pylint
-    rev: v2.17.4
+    rev: v3.3.0
    hooks:
      - id: pylint
  - repo: https://github.com/pre-commit/mirrors-mypy
```
*(pylint config)*

```diff
@@ -1,5 +1,5 @@
 [MASTER]
-init-hook="from pylint.config import find_pylintrc; import os, sys; sys.path.append(os.path.dirname(find_pylintrc()))"
+init-hook="from pylint.config import find_default_config_files; import sys; sys.path.append(next(find_default_config_files()).parent.as_posix())"
 
 [TYPECHECK]
 
@@ -12,3 +12,4 @@ generated-members=numpy.*, torch.*
 disable=missing-function-docstring, line-too-long, import-error,
    too-many-arguments, too-many-locals, too-many-statements, too-many-branches, too-few-public-methods,
    too-many-instance-attributes, fixme, import-outside-toplevel, logging-fstring-interpolation,
+    too-many-positional-arguments, possibly-used-before-assignment
```
*(README, dataset loading section)*

```diff
@@ -519,8 +519,8 @@ See [examples](examples) for quick start. It is recommended to duplicate and mod
   train_on_split: validation
 
 # loading from s3 or gcs
-# s3 creds will be loaded from the system default and gcs only supports public access
-- path: s3://path_to_ds # Accepts folder with arrow/parquet or file path like above. Supports s3, gcs.
+# s3 creds will be loaded from the system default / gcs will attempt to load from gcloud creds, google metadata service, or anon
+- path: s3://path_to_ds # Accepts folder with arrow/parquet or file path like above
 ...
 
 # Loading Data From a Public URL
```
*(Dockerfile template)*

```diff
@@ -8,6 +8,7 @@ ENV PYTORCH_VERSION="{{ PYTORCH_VERSION }}"
 ENV GITHUB_REF="{{ GITHUB_REF }}"
 ENV GITHUB_SHA="{{ GITHUB_SHA }}"
 ENV NIGHTLY_BUILD="{{ NIGHTLY_BUILD }}"
+ENV HF_HOME="{{ HF_HOME }}"
 
 RUN apt-get update && \
     apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev
```
*(CI test script)*

```diff
@@ -5,6 +5,7 @@ python -c "import torch; assert '$PYTORCH_VERSION' in torch.__version__"
 
 pytest -v --durations=10 -n8 --ignore=tests/e2e/ --ignore=tests/patched/ /workspace/axolotl/tests/
 # pytest -v --durations=10 -n8 --dist loadfile /workspace/axolotl/tests/patched/
-pytest -v --durations=10 -n1 --dist loadfile /workspace/axolotl/tests/e2e/patched/
-pytest -v --durations=10 -n1 --dist loadfile /workspace/axolotl/tests/e2e/integrations/
-pytest -v --durations=10 --ignore=tests/e2e/patched/ --ignore=tests/e2e/multigpu/ --ignore=tests/e2e/integrations/ /workspace/axolotl/tests/e2e/
+pytest -v --durations=10 /workspace/axolotl/tests/e2e/patched/
+pytest -v --durations=10 -n1 /workspace/axolotl/tests/e2e/solo/
+pytest -v --durations=10 /workspace/axolotl/tests/e2e/integrations/
+pytest -v --durations=10 --ignore=tests/e2e/solo/ --ignore=tests/e2e/patched/ --ignore=tests/e2e/multigpu/ --ignore=tests/e2e/integrations/ /workspace/axolotl/tests/e2e/
```
*(Modal CI script, multi GPU)*

```diff
@@ -28,6 +28,7 @@ df_args = {
     "CUDA": os.environ.get("CUDA", "121"),
     "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
     "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
+    "HF_HOME": "/workspace/data/huggingface-cache/hub",
 }
 
 dockerfile_contents = df_template.render(**df_args)
@@ -48,6 +49,12 @@ cicd_image = (
 
 app = App("Axolotl CI/CD", secrets=[])
 
+hf_cache_volume = modal.Volume.from_name(
+    "axolotl-ci-hf-hub-cache", create_if_missing=True
+)
+VOLUME_CONFIG = {
+    "/workspace/data/huggingface-cache/hub": hf_cache_volume,
+}
+
 N_GPUS = int(os.environ.get("N_GPUS", 2))
 GPU_CONFIG = modal.gpu.H100(count=N_GPUS)
@@ -67,6 +74,7 @@ def run_cmd(cmd: str, run_folder: str):
     timeout=60 * 60,
     cpu=8.0,
     memory=131072 * N_GPUS,
+    volumes=VOLUME_CONFIG,
 )
 def cicd_pytest():
     run_cmd("./cicd/multigpu.sh", "/workspace/axolotl")
```
*(Modal CI script, single GPU)*

```diff
@@ -29,6 +29,7 @@ df_args = {
     "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
     "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
     "NIGHTLY_BUILD": os.environ.get("NIGHTLY_BUILD", ""),
+    "HF_HOME": "/workspace/data/huggingface-cache/hub",
 }
 
 dockerfile_contents = df_template.render(**df_args)
@@ -50,6 +51,12 @@ cicd_image = (
 
 app = App("Axolotl CI/CD", secrets=[])
 
+hf_cache_volume = modal.Volume.from_name(
+    "axolotl-ci-hf-hub-cache", create_if_missing=True
+)
+VOLUME_CONFIG = {
+    "/workspace/data/huggingface-cache/hub": hf_cache_volume,
+}
+
 N_GPUS = int(os.environ.get("N_GPUS", 1))
 GPU_CONFIG = modal.gpu.A10G(count=N_GPUS)
@@ -69,6 +76,7 @@ def run_cmd(cmd: str, run_folder: str):
     timeout=60 * 60,
     cpu=8.0,
     memory=131072,
+    volumes=VOLUME_CONFIG,
 )
 def cicd_pytest():
     run_cmd("./cicd/cicd.sh", "/workspace/axolotl")
```
`deepspeed_configs/zero1_torch_compile.json` (new file, 27 lines)

```json
{
  "zero_optimization": {
    "stage": 1,
    "overlap_comm": true
  },
  "bf16": {
    "enabled": "auto"
  },
  "fp16": {
    "enabled": "auto",
    "auto_cast": false,
    "loss_scale": 0,
    "initial_scale_power": 32,
    "loss_scale_window": 1000,
    "hysteresis": 2,
    "min_loss_scale": 1
  },
  "compile": {
    "disable": false,
    "backend": "inductor"
  },
  "gradient_accumulation_steps": "auto",
  "gradient_clipping": "auto",
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
  "wall_clock_breakdown": false
}
```
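For context, an Axolotl config typically opts into a DeepSpeed JSON like this one via the `deepspeed` key. A minimal sketch (the model and dataset here are placeholders, not part of this diff):

```yaml
# minimal sketch; base_model and datasets are placeholders
base_model: NousResearch/Llama-3.2-1B
datasets:
  - path: teknium/GPT4-LLM-Cleaned
    type: alpaca
deepspeed: deepspeed_configs/zero1_torch_compile.json
bf16: auto
micro_batch_size: 1
gradient_accumulation_steps: 4
output_dir: ./outputs/zero1-compile
```

The `"auto"` values in the JSON are resolved from the matching trainer settings at runtime, so batch size and precision are only specified once.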
*(cloud Dockerfile)*

```diff
@@ -20,7 +20,8 @@ RUN apt install --yes --no-install-recommends openssh-server tmux && \
     printf "\n[[ -z \"\$TMUX\" ]] && { tmux attach-session -t ssh_tmux || tmux new-session -s ssh_tmux; exit; }\n" >> ~/.bashrc && \
     printf "[ ! -z \"\$TERM\" -a -r /etc/motd ] && cat /etc/motd\n" >> ~/.bashrc && \
     chmod +x /workspace/axolotl/scripts/cloud-entrypoint.sh && \
-    chmod +x /root/cloud-entrypoint.sh
+    chmod +x /root/cloud-entrypoint.sh && \
+    echo 'set-option -g history-limit 5000' >> ~/.tmux.conf
 
 ENTRYPOINT ["/root/cloud-entrypoint.sh"]
 CMD ["sleep", "infinity"]
```
*(docs: config reference)*

```diff
@@ -127,34 +127,40 @@ datasets:
   # - tokenizer_default_fallback_*: where * is the name of the chat template to fallback to if the tokenizer does not have a chat template else default to tokenizer. E.g. tokenizer_default_fallback_chatml.
   # - jinja: Uses a custom jinja template for the chat template. The custom jinja template should be provided in the chat_template_jinja field.
   chat_template: tokenizer_default
-  # Custom jinja template for chat template. This will be only used if `chat_template` is set to `jinja` or empty (in which case chat_template is automatically set to `jinja`).
+
+  # Custom jinja chat template. Used only if `chat_template: jinja` or empty.
   chat_template_jinja:
-  # The key in the data example that contains the messages. Default is "messages".
+
+  # Key containing the messages (default: "messages")
   field_messages: messages
-  # The key in the message turn that contains the role. Default is "role".
+  # Key for role in each message (default: "role")
   message_field_role: role
-  # The key in the message turn that contains the content. Default is "content".
+  # Key for content in each message (default: "content")
   message_field_content: content
-  # Optional[Dict[str, List]]. Roles mapping for the messages.
+
+  # Optional[Dict[str, List]]. Roles mapping in the messages. The default is:
   roles:
     user: ["human", "user"]
-    assistant: ["gpt", "assistant", "ai"]
+    assistant: ["gpt", "assistant"]
     system: ["system"]
+    tool: ["tool"]
 
-  ## NOTE: Leaving the below empty will default to using the simple legacy tokenization strategy where only last message is trained on.
+  # IMPORTANT: The following fields determine which parts of the conversation to train on.
+  # Priority order: message_field_training > message_field_training_detail > train_on_inputs or role in roles_to_train
+  # See examples at `docs/dataset-formats/conversation.qmd`
+  # Note: If the below 4 fields are empty, defaults to training only on the last message.
 
   # Optional[List[str]]. Roles to train on. The tokens from these roles will be considered for the loss.
-  roles_to_train: ["gpt", "assistant"]
+  roles_to_train: ["assistant"] # default
   # Optional[str]. Which EOS tokens to train on in the conversation. Possible values are:
   # - all: train on all EOS tokens
-  # - turn: train on the EOS token at the end of each trainable turn
+  # - turn (default): train on the EOS token at the end of each trainable turn
   # - last: train on the last EOS token in the conversation
   train_on_eos: last
   # The key in the message turn that indicates via boolean whether tokens of a turn should be considered for training. Useful to selectively train on certain turns besides the `roles_to_train`.
   message_field_training: training
   # The key in the message turn that contains the training details. Useful to selectively train on certain tokens in a turn.
   # The value of the key is a List[Dict] containing `begin_offset` (start character index in content), `end_offset` (end character index in content), and `train` (boolean whether to train).
-  # See example at `docs/dataset-formats/conversation.qmd`
   message_field_training_detail: train_detail
```
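Pulling those fields together, a hypothetical dataset entry might look like the following (the path and values are illustrative, not taken from this diff):

```yaml
chat_template: tokenizer_default_fallback_chatml
datasets:
  - path: ./data/conversations.jsonl  # placeholder path
    type: chat_template
    field_messages: messages
    roles_to_train: ["assistant"]
    train_on_eos: turn
    # optionally opt individual turns in or out of the loss
    message_field_training: training
```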
```diff
@@ -238,6 +244,11 @@ total_num_tokens:
 sample_packing_group_size: 100000
 # The number of samples which can be packed into one sequence. Increase if using a large sequence_len with many short samples.
 sample_packing_bin_size: 200
+# whether to concatenate samples during pretraining
+pretraining_sample_concatenation:
+
+# Use batch flattening for speedups when not using sample_packing
+batch_flattening:
 
 # Passed through to transformers when loading the model when launched without accelerate
 # Use `sequential` when training w/ model parallelism to limit memory
```
```diff
@@ -331,7 +342,8 @@ comet_experiment_config: # Dictionary for additional configuration settings, see
 output_dir: ./completed-model
 
 # Whether to use torch.compile and which backend to use
-torch_compile: # bool
+# setting to `auto` will enable torch compile when torch>=2.5.1
+torch_compile: # Optional[Union[Literal["auto"], bool]]
 torch_compile_backend: # Optional[str]
 
 # Training hyperparameters
```
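Read concretely, the new `auto` option lets a config request compilation only on new-enough torch builds; a minimal sketch based on the comment above:

```yaml
torch_compile: auto  # compiles only when torch>=2.5.1 is installed
torch_compile_backend: inductor
```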
```diff
@@ -348,10 +360,11 @@ warmup_ratio: 0.05 # cannot use with warmup_steps
 learning_rate: 0.00003
 lr_quadratic_warmup:
 logging_steps:
-eval_steps: # Leave empty to eval at each epoch, integers for every N steps. decimal for fraction of total steps
+eval_steps: # Leave empty to eval at each epoch, integer for every N steps. float for fraction of total steps
 evals_per_epoch: # number of times per epoch to run evals, mutually exclusive with eval_steps
-save_strategy: # Set to `"no"` to skip checkpoint saves
-save_steps: # Leave empty to save at each epoch
+eval_strategy: # Set to `"no"` to skip evaluation, `"epoch"` at end of each epoch, leave empty to infer from `eval_steps`.
+save_strategy: # Set to `"no"` to skip checkpoint saves, `"epoch"` at end of each epoch, `"best"` when better result is achieved, leave empty to infer from `save_steps`.
+save_steps: # Leave empty to save at each epoch, integer for every N steps. float for fraction of total steps
 saves_per_epoch: # number of times per epoch to save a checkpoint, mutually exclusive with save_steps
 save_total_limit: # Checkpoints saved at a time
 # Maximum number of iterations to train for. It precedes num_epochs which means that
```
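Under the documented semantics, the strategies can usually be left implicit and inferred from the step settings; a small sketch (the numbers are arbitrary):

```yaml
eval_steps: 0.25     # float: evaluate every quarter of total steps; eval_strategy is inferred
save_steps: 500      # integer: checkpoint every 500 steps; save_strategy is inferred
save_total_limit: 3  # keep only the three most recent checkpoints
```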
```diff
@@ -363,6 +376,10 @@ eval_table_size: # Approximate number of predictions sent to wandb depending on
 eval_max_new_tokens: # Total number of tokens generated for predictions sent to wandb. Default is 128
 eval_causal_lm_metrics: # HF evaluate metrics used during evaluation. Default is ["sacrebleu", "comet", "ter", "chrf", "perplexity"]
 
+profiler_steps: # enable the pytorch profiler to capture the first N steps of training to the output_dir.
+# see https://pytorch.org/blog/understanding-gpu-memory-1/ for more information
+# snapshots can be visualized @ https://pytorch.org/memory_viz
+
 loss_watchdog_threshold: # High loss value, indicating the learning has broken down (a good estimate is ~2 times the loss at the start of training)
 loss_watchdog_patience: # Number of high-loss steps in a row before the trainer aborts (default: 3)
```
*(docs: conversation dataset format)*

````diff
@@ -68,6 +68,8 @@ We recommend checking the below examples for other usecases.
 datasets:
   - path: ...
     type: chat_template
+    roles_to_train:
+    train_on_eos:
 ```
 
 2. Using the `gemma` chat template to override the tokenizer_config.json's chat template on OpenAI messages format, training on all assistant messages.
@@ -77,7 +79,7 @@ chat_template: gemma # this overwrites the tokenizer's chat_template
 datasets:
   - path: ...
     type: chat_template
-    roles_to_train: ["assistant"]
+    roles_to_train: ["assistant"] # default value
 ```
 
 3. Using the tokenizer_config.json's chat template or `chatml` as fallback if the former's chat template does not exist, on OpenAI messages format, training on all assistant messages.
@@ -87,7 +89,6 @@ chat_template: tokenizer_default_fallback_chatml # this overwrites the tokenizer
 datasets:
   - path: ...
     type: chat_template
-    roles_to_train: ["assistant"]
 ```
 
 4. Using a custom jinja template on OpenAI messages format, training on all assistant messages.
@@ -99,7 +100,6 @@ chat_template_jinja: "{{ bos_token }}{% for message in messages %}{% if (message
 datasets:
   - path: ...
     type: chat_template
-    roles_to_train: ["assistant"]
 ```
 
 5. (Advanced) Using fine-grained control over tokens and turns to train in a conversation
````
*(docs: pretraining)*

````diff
@@ -19,7 +19,14 @@ For pretraining, there is no prompt template or roles. The only required field
 Axolotl usually loads the entire dataset into memory. This will be challenging for large datasets. Use the following config to enable streaming:
 
 ```{.yaml filename="config.yaml"}
-pretraining_dataset: # hf path only
+pretraining_dataset:
+  - name:
+    path:
+    split:
+    text_column: # column in dataset with the data, usually `text`
+    type: pretrain
+    trust_remote_code:
+    skip: # number of rows of data to skip over from the beginning
 ...
 ```
 
````
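A hypothetical filled-in version of that skeleton (the dataset path and column are placeholders):

```yaml
pretraining_dataset:
  - path: allenai/c4  # placeholder streaming dataset
    name: en
    split: train
    text_column: text
    type: pretrain
    skip: 0
max_steps: 5000  # streamed data has no known length, so bound training by steps
```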
`docs/lr_groups.qmd` (new file, 29 lines)

````markdown
---
title: Learning Rate Groups
description: "Setting different learning rates by module name"
---

## Background

Inspired by LoRA+, Axolotl allows practitioners to specify separate learning rates for each module or groups of
modules in a model.

## Example

```yaml
lr_groups:
  - name: o_proj
    modules:
      - self_attn.o_proj.weight
    lr: 1e-6
  - name: q_proj
    modules:
      - model.layers.2.self_attn.q_proj.weight
    lr: 1e-5

learning_rate: 2e-5
```

In this example, we have a default learning rate of 2e-5 across the entire model, but we have a separate learning rate
of 1e-6 for all the self attention `o_proj` modules across all layers, and a learning rate of 1e-5 for the 3rd layer's
self attention `q_proj` module.
````
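A hedged variation on that example, assuming modules are matched by parameter-name suffix as the doc's own `o_proj` example suggests, grouping all attention projections under one rate:

```yaml
lr_groups:
  - name: attn_proj  # hypothetical group name
    modules:
      - self_attn.q_proj.weight
      - self_attn.k_proj.weight
      - self_attn.v_proj.weight
      - self_attn.o_proj.weight
    lr: 5e-6

learning_rate: 2e-5  # default for all other parameters
```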
*(example configs)*

```diff
@@ -1,6 +1,10 @@
 base_model: cerebras/btlm-3b-8k-base
+# optionally might have model_type or tokenizer_type
 model_type: AutoModelForCausalLM
 tokenizer_type: GPT2Tokenizer
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
+
 trust_remote_code: true
 tokenizer_use_fast: true
 tokenizer_legacy: true
@@ -1,4 +1,7 @@
 base_model: cerebras/Cerebras-GPT-1.3B
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
+
 load_in_8bit: false
 load_in_4bit: true
 strict: false
@@ -1,6 +1,9 @@
 base_model: codellama/CodeLlama-13b-hf
+# optionally might have model_type or tokenizer_type
 model_type: LlamaForCausalLM
 tokenizer_type: CodeLlamaTokenizer
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
 
 load_in_8bit: true
 load_in_4bit: false
@@ -1,6 +1,9 @@
 base_model: codellama/CodeLlama-13b-hf
+# optionally might have model_type or tokenizer_type
 model_type: LlamaForCausalLM
 tokenizer_type: CodeLlamaTokenizer
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
 
 load_in_8bit: false
 load_in_4bit: true
@@ -1,6 +1,9 @@
 base_model: codellama/CodeLlama-34b-hf
+# optionally might have model_type or tokenizer_type
 model_type: LlamaForCausalLM
 tokenizer_type: CodeLlamaTokenizer
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
 
 load_in_8bit: true
 load_in_4bit: false
@@ -1,6 +1,9 @@
 base_model: codellama/CodeLlama-34b-hf
+# optionally might have model_type or tokenizer_type
 model_type: LlamaForCausalLM
 tokenizer_type: CodeLlamaTokenizer
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
 
 load_in_8bit: false
 load_in_4bit: true
@@ -1,6 +1,9 @@
 base_model: codellama/CodeLlama-7b-hf
+# optionally might have model_type or tokenizer_type
 model_type: LlamaForCausalLM
 tokenizer_type: CodeLlamaTokenizer
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
 
 load_in_8bit: true
 load_in_4bit: false
@@ -1,6 +1,9 @@
 base_model: codellama/CodeLlama-7b-hf
+# optionally might have model_type or tokenizer_type
 model_type: LlamaForCausalLM
 tokenizer_type: CodeLlamaTokenizer
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
 
 load_in_8bit: false
 load_in_4bit: true
@@ -1,4 +1,7 @@
 base_model: LnL-AI/dbrx-base-converted-v2
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
+
 trust_remote_code: true
 
 load_in_8bit: false
@@ -1,4 +1,7 @@
 base_model: LnL-AI/dbrx-base-converted-v2
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
+
 trust_remote_code: true
 
 load_in_8bit: true
@@ -1,4 +1,7 @@
 base_model: LnL-AI/dbrx-base-converted-v2
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
+
 trust_remote_code: true
 
 load_in_8bit: false
@@ -1,4 +1,6 @@
 base_model: deepseek-ai/DeepSeek-V2-Lite
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
 trust_remote_code: true
 
 load_in_8bit: false
@@ -1,4 +1,7 @@
 base_model: axolotl-quants/DeepSeek-V2.5-bnb-nf4-bf16
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
+
 trust_remote_code: true
 
 load_in_8bit: false
```
```diff
@@ -1,7 +1,12 @@
 base_model: tiiuae/falcon-7b
-trust_remote_code: true
+# optionally might have model_type or tokenizer_type
 model_type: AutoModelForCausalLM
 tokenizer_type: AutoTokenizer
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
+
+# required by falcon custom model code: https://huggingface.co/tiiuae/falcon-7b/tree/main
+trust_remote_code: true
 
 load_in_8bit: true
 load_in_4bit: false
@@ -1,10 +1,15 @@
 # 1b: tiiuae/falcon-rw-1b
 # 40b: tiiuae/falcon-40b
 base_model: tiiuae/falcon-7b
-# required by falcon custom model code: https://huggingface.co/tiiuae/falcon-7b/tree/main
-trust_remote_code: true
+# optionally might have model_type or tokenizer_type
 model_type: AutoModelForCausalLM
 tokenizer_type: AutoTokenizer
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
+
+# required by falcon custom model code: https://huggingface.co/tiiuae/falcon-7b/tree/main
+trust_remote_code: true
+
 
 load_in_8bit: false
 # enable 4bit for QLoRA
@@ -1,7 +1,12 @@
 base_model: tiiuae/falcon-7b
-trust_remote_code: true
+# optionally might have model_type or tokenizer_type
 model_type: AutoModelForCausalLM
 tokenizer_type: AutoTokenizer
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
+
+# required by falcon custom model code: https://huggingface.co/tiiuae/falcon-7b/tree/main
+trust_remote_code: true
 
 load_in_8bit: false
 load_in_4bit: false
@@ -1,7 +1,10 @@
 # use google/gemma-7b if you have access
 base_model: mhenrichsen/gemma-7b
+# optionally might have model_type or tokenizer_type
 model_type: AutoModelForCausalLM
 tokenizer_type: AutoTokenizer
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
 
 load_in_8bit: false
 load_in_4bit: true
@@ -1,6 +1,9 @@
 base_model: google/gemma-2-9b
+# optionally might have model_type or tokenizer_type
 model_type: AutoModelForCausalLM
 tokenizer_type: AutoTokenizer
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
 
 load_in_8bit: false
 load_in_4bit: true
@@ -1,6 +1,9 @@
 base_model: google/gemma-2-2b
+# optionally might have model_type or tokenizer_type
 model_type: AutoModelForSequenceClassification
 tokenizer_type: AutoTokenizer
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
 
 load_in_8bit: false
 load_in_4bit: false
@@ -1,4 +1,7 @@
 base_model: EleutherAI/gpt-j-6b
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
+
 load_in_8bit: false
 load_in_4bit: true
 strict: false
@@ -1,4 +1,7 @@
 base_model: ai21labs/Jamba-v0.1
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
+
 trust_remote_code: true
 
 load_in_8bit: false
@@ -1,4 +1,6 @@
 base_model: ai21labs/Jamba-v0.1
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
 trust_remote_code: true
 
 load_in_8bit: false
@@ -1,5 +1,8 @@
 base_model: ai21labs/AI21-Jamba-1.5-Large
+# optionally might have model_type or tokenizer_type
 tokenizer_type: AutoTokenizer
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
 
 load_in_4bit: true
 strict: false
```
```diff
@@ -1,6 +1,10 @@
 base_model: huggyllama/llama-7b
+# optionally might have model_type or tokenizer_type
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
+
 load_in_8bit: false
 datasets:
   - path: openaccess-ai-collective/jeopardy
@@ -1,6 +1,9 @@
 base_model: NousResearch/Llama-2-7b-hf
+# optionally might have model_type or tokenizer_type
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
 
 load_in_8bit: false
 load_in_4bit: false
@@ -1,8 +1,13 @@
 base_model: TheBloke/Llama-2-7B-GPTQ
-gptq: true
-gptq_disable_exllama: true
+# optionally might have model_type or tokenizer_type
 model_type: AutoModelForCausalLM
 tokenizer_type: LlamaTokenizer
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
+
+gptq: true
+gptq_disable_exllama: true
+
 tokenizer_use_fast: true
 tokenizer_legacy: true
 load_in_8bit: false
@@ -1,6 +1,9 @@
 base_model: NousResearch/Llama-2-7b-hf
+# optionally might have model_type or tokenizer_type
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
 
 load_in_8bit: false
 load_in_4bit: false
@@ -1,6 +1,9 @@
 base_model: NousResearch/Llama-2-7b-hf
+# optionally might have model_type or tokenizer_type
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
 
 load_in_8bit: false
 load_in_4bit: false
@@ -1,6 +1,9 @@
 base_model: NousResearch/Llama-2-7b-hf
+# optionally might have model_type or tokenizer_type
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
 
 load_in_8bit: true
 load_in_4bit: false
@@ -1,6 +1,9 @@
 base_model: NousResearch/Llama-2-7b-hf
+# optionally might have model_type or tokenizer_type
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
 
 load_in_8bit: false
 load_in_4bit: true
@@ -1,6 +1,9 @@
 base_model: NousResearch/Llama-2-7b-hf
+# optionally might have model_type or tokenizer_type
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
 
 load_in_8bit: false
 load_in_4bit: true
```
```diff
@@ -1,5 +1,9 @@
 base_model: alpindale/Llama-3.2-11B-Vision-Instruct
+# optionally might have model_type or tokenizer_type or processor_type
 processor_type: AutoProcessor
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
+
 strict: false
 
 # these 3 lines are needed for now to handle vision chat templates w images
@@ -1,4 +1,6 @@
 base_model: NousResearch/Meta-Llama-3.1-8B
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
 
 plugins:
   - axolotl.integrations.liger.LigerPlugin
@@ -1,4 +1,6 @@
 base_model: NousResearch/Meta-Llama-3.1-8B
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
 
 load_in_8bit: false
 load_in_4bit: false
@@ -1,6 +1,9 @@
 base_model: meta-llama/Meta-Llama-3-8B-Instruct
+# optionally might have model_type or tokenizer_type
 model_type: LlamaForCausalLM
 tokenizer_type: AutoTokenizer
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
 
 load_in_8bit: true
 load_in_4bit: false
@@ -1,6 +1,9 @@
 base_model: NousResearch/Meta-Llama-3-8B-Instruct
+# optionally might have model_type or tokenizer_type
 model_type: LlamaForCausalLM
 tokenizer_type: AutoTokenizer
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
 
 load_in_8bit: true
 load_in_4bit: false
@@ -1,6 +1,9 @@
 base_model: meta-llama/Llama-3.2-1B
+# optionally might have model_type or tokenizer_type
 model_type: LlamaForCausalLM
 tokenizer_type: AutoTokenizer
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
 
 load_in_8bit: true
 load_in_4bit: false
@@ -1,6 +1,9 @@
 base_model: meta-llama/Llama-3.2-1B
+# optionally might have model_type or tokenizer_type
 model_type: LlamaForCausalLM
 tokenizer_type: AutoTokenizer
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
 
 load_in_8bit: true
 load_in_4bit: false
@@ -1,4 +1,6 @@
 base_model: NousResearch/Llama-3.2-1B
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
 
 load_in_8bit: false
 load_in_4bit: false
@@ -1,6 +1,9 @@
 base_model: NousResearch/Meta-Llama-3-8B
+# optionally might have model_type or tokenizer_type
 model_type: LlamaForCausalLM
 tokenizer_type: AutoTokenizer
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
 
 load_in_8bit: true
 load_in_4bit: false
@@ -1,4 +1,6 @@
 base_model: meta-llama/Llama-3.2-1B
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
 
 load_in_8bit: false
 load_in_4bit: true
@@ -1,4 +1,6 @@
 base_model: NousResearch/Llama-3.2-1B
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
 
 load_in_8bit: false
 load_in_4bit: true
@@ -1,5 +1,8 @@
 base_model: hugging-quants/Meta-Llama-3.1-405B-BNB-NF4-BF16
+# optionally might have model_type or tokenizer_type
 tokenizer_type: AutoTokenizer
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
 
 load_in_4bit: true
 strict: false
@@ -1,6 +1,9 @@
 base_model: casperhansen/llama-3-70b-fp16
+# optionally might have model_type or tokenizer_type
 model_type: LlamaForCausalLM
 tokenizer_type: AutoTokenizer # PreTrainedTokenizerFast
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
 
 load_in_8bit: false
 load_in_4bit: true
@@ -1,6 +1,9 @@
 base_model: NousResearch/Meta-Llama-3-8B
+# optionally might have model_type or tokenizer_type
 model_type: AutoModelForCausalLM
 tokenizer_type: AutoTokenizer
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
 
 load_in_8bit: false
 load_in_4bit: true
@@ -1,7 +1,10 @@
 base_model: state-spaces/mamba-2.8b
+# optionally might have model_type or tokenizer_type or tokenizer_config
 model_type: MambaLMHeadModel
 tokenizer_type: AutoTokenizer
 tokenizer_config: EleutherAI/gpt-neox-20b
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
 
 load_in_8bit: false
 load_in_4bit: false
@@ -1,6 +1,10 @@
 base_model: mistral-community/Mixtral-8x22B-v0.1
+# optionally might have model_type or tokenizer_type
 model_type: AutoModelForCausalLM
 tokenizer_type: LlamaTokenizer
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
+
 trust_remote_code: true
 
 load_in_8bit: false
@@ -1,6 +1,9 @@
 base_model: mistralai/Mistral-7B-v0.1
+# optionally might have model_type or tokenizer_type
 model_type: MistralForCausalLM
 tokenizer_type: LlamaTokenizer
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
 
 load_in_8bit: false
 load_in_4bit: false
@@ -1,6 +1,9 @@
 base_model: mistralai/Mistral-7B-v0.1
+# optionally might have model_type or tokenizer_type
 model_type: MistralForCausalLM
 tokenizer_type: LlamaTokenizer
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
 
 load_in_8bit: false
 load_in_4bit: false
@@ -1,6 +1,9 @@
 base_model: mistralai/Mistral-7B-v0.1
+# optionally might have model_type or tokenizer_type
 model_type: MistralForCausalLM
 tokenizer_type: LlamaTokenizer
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
 
 load_in_8bit: true
 load_in_4bit: false
@@ -4,8 +4,11 @@
 #face problems with the special tokens.
 
 base_model: mistralai/Mistral-7B-Instruct-v0.2
```
|
base_model: mistralai/Mistral-7B-Instruct-v0.2
|
||||||
|
# optionally might have model_type or tokenizer_type
|
||||||
model_type: MistralForCausalLM
|
model_type: MistralForCausalLM
|
||||||
tokenizer_type: LlamaTokenizer
|
tokenizer_type: LlamaTokenizer
|
||||||
|
# Automatically upload checkpoint and final model to HF
|
||||||
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
|
|||||||
@@ -1,6 +1,10 @@
|
|||||||
base_model: mistralai/Mixtral-8x7B-v0.1
|
base_model: mistralai/Mixtral-8x7B-v0.1
|
||||||
|
# optionally might have model_type or tokenizer_type
|
||||||
model_type: AutoModelForCausalLM
|
model_type: AutoModelForCausalLM
|
||||||
tokenizer_type: LlamaTokenizer
|
tokenizer_type: LlamaTokenizer
|
||||||
|
# Automatically upload checkpoint and final model to HF
|
||||||
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
trust_remote_code: true
|
trust_remote_code: true
|
||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
|
|||||||
@@ -1,6 +1,9 @@
|
|||||||
base_model: mistralai/Mistral-7B-v0.1
|
base_model: mistralai/Mistral-7B-v0.1
|
||||||
|
# optionally might have model_type or tokenizer_type
|
||||||
model_type: MistralForCausalLM
|
model_type: MistralForCausalLM
|
||||||
tokenizer_type: LlamaTokenizer
|
tokenizer_type: LlamaTokenizer
|
||||||
|
# Automatically upload checkpoint and final model to HF
|
||||||
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
|
|||||||
@@ -1,6 +1,9 @@
|
|||||||
base_model: mistral-community/Mixtral-8x22B-v0.1
|
base_model: mistral-community/Mixtral-8x22B-v0.1
|
||||||
|
# optionally might have model_type or tokenizer_type
|
||||||
model_type: AutoModelForCausalLM
|
model_type: AutoModelForCausalLM
|
||||||
tokenizer_type: LlamaTokenizer
|
tokenizer_type: LlamaTokenizer
|
||||||
|
# Automatically upload checkpoint and final model to HF
|
||||||
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
|
|||||||
@@ -1,6 +1,10 @@
|
|||||||
base_model: mistralai/Mixtral-8x7B-v0.1
|
base_model: mistralai/Mixtral-8x7B-v0.1
|
||||||
|
# optionally might have model_type or tokenizer_type
|
||||||
model_type: AutoModelForCausalLM
|
model_type: AutoModelForCausalLM
|
||||||
tokenizer_type: LlamaTokenizer
|
tokenizer_type: LlamaTokenizer
|
||||||
|
# Automatically upload checkpoint and final model to HF
|
||||||
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
trust_remote_code: true
|
trust_remote_code: true
|
||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
|
|||||||
@@ -1,6 +1,10 @@
|
|||||||
base_model: mistralai/Mixtral-8x7B-v0.1
|
base_model: mistralai/Mixtral-8x7B-v0.1
|
||||||
|
# optionally might have model_type or tokenizer_type
|
||||||
model_type: AutoModelForCausalLM
|
model_type: AutoModelForCausalLM
|
||||||
tokenizer_type: LlamaTokenizer
|
tokenizer_type: LlamaTokenizer
|
||||||
|
# Automatically upload checkpoint and final model to HF
|
||||||
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
trust_remote_code: true
|
trust_remote_code: true
|
||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
|
|||||||
@@ -1,6 +1,10 @@
|
|||||||
base_model: mistral-community/Mixtral-8x22B-v0.1
|
base_model: mistral-community/Mixtral-8x22B-v0.1
|
||||||
|
# optionally might have model_type or tokenizer_type
|
||||||
model_type: AutoModelForCausalLM
|
model_type: AutoModelForCausalLM
|
||||||
tokenizer_type: LlamaTokenizer
|
tokenizer_type: LlamaTokenizer
|
||||||
|
# Automatically upload checkpoint and final model to HF
|
||||||
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
trust_remote_code: true
|
trust_remote_code: true
|
||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
|
|||||||
@@ -1,6 +1,9 @@
|
|||||||
base_model: mistralai/Mistral-7B-v0.1
|
base_model: mistralai/Mistral-7B-v0.1
|
||||||
|
# optionally might have model_type or tokenizer_type
|
||||||
model_type: MistralForCausalLM
|
model_type: MistralForCausalLM
|
||||||
tokenizer_type: LlamaTokenizer
|
tokenizer_type: LlamaTokenizer
|
||||||
|
# Automatically upload checkpoint and final model to HF
|
||||||
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
|
|||||||
@@ -1,5 +1,9 @@
|
|||||||
base_model: mosaicml/mpt-7b
|
base_model: mosaicml/mpt-7b
|
||||||
|
# optionally might have model_type or tokenizer_type
|
||||||
tokenizer_type: AutoTokenizer
|
tokenizer_type: AutoTokenizer
|
||||||
|
# Automatically upload checkpoint and final model to HF
|
||||||
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
trust_remote_code: true # required for mpt as their model class is not merged into transformers yet
|
trust_remote_code: true # required for mpt as their model class is not merged into transformers yet
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
datasets:
|
datasets:
|
||||||
|
|||||||
@@ -1,6 +1,10 @@
|
|||||||
base_model: openlm-research/open_llama_3b_v2
|
base_model: openlm-research/open_llama_3b_v2
|
||||||
|
# optionally might have model_type or tokenizer_type
|
||||||
model_type: LlamaForCausalLM
|
model_type: LlamaForCausalLM
|
||||||
tokenizer_type: LlamaTokenizer
|
tokenizer_type: LlamaTokenizer
|
||||||
|
# Automatically upload checkpoint and final model to HF
|
||||||
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: false
|
load_in_4bit: false
|
||||||
strict: false
|
strict: false
|
||||||
|
|||||||
@@ -1,6 +1,10 @@
|
|||||||
base_model: openlm-research/open_llama_3b_v2
|
base_model: openlm-research/open_llama_3b_v2
|
||||||
|
# optionally might have model_type or tokenizer_type
|
||||||
model_type: LlamaForCausalLM
|
model_type: LlamaForCausalLM
|
||||||
tokenizer_type: LlamaTokenizer
|
tokenizer_type: LlamaTokenizer
|
||||||
|
# Automatically upload checkpoint and final model to HF
|
||||||
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
load_in_8bit: true
|
load_in_8bit: true
|
||||||
load_in_4bit: false
|
load_in_4bit: false
|
||||||
strict: false
|
strict: false
|
||||||
|
|||||||
@@ -1,6 +1,10 @@
|
|||||||
base_model: openlm-research/open_llama_3b_v2
|
base_model: openlm-research/open_llama_3b_v2
|
||||||
|
# optionally might have model_type or tokenizer_type
|
||||||
model_type: LlamaForCausalLM
|
model_type: LlamaForCausalLM
|
||||||
tokenizer_type: LlamaTokenizer
|
tokenizer_type: LlamaTokenizer
|
||||||
|
# Automatically upload checkpoint and final model to HF
|
||||||
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
strict: false
|
strict: false
|
||||||
|
|||||||
@@ -1,6 +1,9 @@
|
|||||||
base_model: microsoft/Phi-3.5-mini-instruct
|
base_model: microsoft/Phi-3.5-mini-instruct
|
||||||
|
# optionally might have model_type or tokenizer_type
|
||||||
model_type: AutoModelForCausalLM
|
model_type: AutoModelForCausalLM
|
||||||
tokenizer_type: AutoTokenizer
|
tokenizer_type: AutoTokenizer
|
||||||
|
# Automatically upload checkpoint and final model to HF
|
||||||
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
load_in_8bit: true
|
load_in_8bit: true
|
||||||
load_in_4bit: false
|
load_in_4bit: false
|
||||||
|
|||||||
@@ -1,6 +1,9 @@
|
|||||||
base_model: microsoft/phi-1_5
|
base_model: microsoft/phi-1_5
|
||||||
|
# optionally might have model_type or tokenizer_type
|
||||||
model_type: AutoModelForCausalLM
|
model_type: AutoModelForCausalLM
|
||||||
tokenizer_type: AutoTokenizer
|
tokenizer_type: AutoTokenizer
|
||||||
|
# Automatically upload checkpoint and final model to HF
|
||||||
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: false
|
load_in_4bit: false
|
||||||
|
|||||||
@@ -1,6 +1,9 @@
|
|||||||
base_model: microsoft/phi-1_5
|
base_model: microsoft/phi-1_5
|
||||||
|
# optionally might have model_type or tokenizer_type
|
||||||
model_type: AutoModelForCausalLM
|
model_type: AutoModelForCausalLM
|
||||||
tokenizer_type: AutoTokenizer
|
tokenizer_type: AutoTokenizer
|
||||||
|
# Automatically upload checkpoint and final model to HF
|
||||||
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
|
|||||||
@@ -1,6 +1,9 @@
|
|||||||
base_model: microsoft/phi-2
|
base_model: microsoft/phi-2
|
||||||
|
# optionally might have model_type or tokenizer_type
|
||||||
model_type: AutoModelForCausalLM
|
model_type: AutoModelForCausalLM
|
||||||
tokenizer_type: AutoTokenizer
|
tokenizer_type: AutoTokenizer
|
||||||
|
# Automatically upload checkpoint and final model to HF
|
||||||
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: false
|
load_in_4bit: false
|
||||||
|
|||||||
@@ -1,6 +1,9 @@
|
|||||||
base_model: microsoft/Phi-3-mini-4k-instruct
|
base_model: microsoft/Phi-3-mini-4k-instruct
|
||||||
|
# optionally might have model_type or tokenizer_type
|
||||||
model_type: AutoModelForCausalLM
|
model_type: AutoModelForCausalLM
|
||||||
tokenizer_type: AutoTokenizer
|
tokenizer_type: AutoTokenizer
|
||||||
|
# Automatically upload checkpoint and final model to HF
|
||||||
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: false
|
load_in_4bit: false
|
||||||
|
|||||||
@@ -1,7 +1,11 @@
|
|||||||
base_model: microsoft/Phi-3-mini-4k-instruct
|
base_model: microsoft/Phi-3-mini-4k-instruct
|
||||||
|
# optionally might have model_type or tokenizer_type
|
||||||
trust_remote_code: true
|
trust_remote_code: true
|
||||||
model_type: AutoModelForCausalLM
|
model_type: AutoModelForCausalLM
|
||||||
tokenizer_type: AutoTokenizer
|
tokenizer_type: AutoTokenizer
|
||||||
|
# Automatically upload checkpoint and final model to HF
|
||||||
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
chat_template: phi_3
|
chat_template: phi_3
|
||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
|
|||||||
@@ -1,7 +1,11 @@
|
|||||||
base_model: EleutherAI/pythia-12b-deduped
|
base_model: EleutherAI/pythia-12b-deduped
|
||||||
base_model_ignore_patterns: pytorch* # prefer safetensors
|
base_model_ignore_patterns: pytorch* # prefer safetensors
|
||||||
|
# optionally might have model_type or tokenizer_type
|
||||||
model_type: GPTNeoXForCausalLM
|
model_type: GPTNeoXForCausalLM
|
||||||
tokenizer_type: AutoTokenizer
|
tokenizer_type: AutoTokenizer
|
||||||
|
# Automatically upload checkpoint and final model to HF
|
||||||
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: false
|
load_in_4bit: false
|
||||||
gptq: false
|
gptq: false
|
||||||
|
|||||||
@@ -1,4 +1,7 @@
|
|||||||
base_model: EleutherAI/pythia-1.4b-deduped
|
base_model: EleutherAI/pythia-1.4b-deduped
|
||||||
|
# Automatically upload checkpoint and final model to HF
|
||||||
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
load_in_8bit: true
|
load_in_8bit: true
|
||||||
datasets:
|
datasets:
|
||||||
- path: teknium/GPT4-LLM-Cleaned
|
- path: teknium/GPT4-LLM-Cleaned
|
||||||
|
|||||||
@@ -1,6 +1,9 @@
|
|||||||
base_model: Qwen/Qwen-7B
|
base_model: Qwen/Qwen-7B
|
||||||
|
# optionally might have model_type or tokenizer_type
|
||||||
model_type: AutoModelForCausalLM
|
model_type: AutoModelForCausalLM
|
||||||
tokenizer_type: AutoTokenizer
|
tokenizer_type: AutoTokenizer
|
||||||
|
# Automatically upload checkpoint and final model to HF
|
||||||
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
trust_remote_code: true
|
trust_remote_code: true
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,9 @@
|
|||||||
base_model: Qwen/Qwen-7B
|
base_model: Qwen/Qwen-7B
|
||||||
|
# optionally might have model_type or tokenizer_type
|
||||||
model_type: AutoModelForCausalLM
|
model_type: AutoModelForCausalLM
|
||||||
tokenizer_type: AutoTokenizer
|
tokenizer_type: AutoTokenizer
|
||||||
|
# Automatically upload checkpoint and final model to HF
|
||||||
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
trust_remote_code: true
|
trust_remote_code: true
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,7 @@
|
|||||||
base_model: Qwen/Qwen1.5-MoE-A2.7B
|
base_model: Qwen/Qwen1.5-MoE-A2.7B
|
||||||
|
# Automatically upload checkpoint and final model to HF
|
||||||
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
trust_remote_code: true
|
trust_remote_code: true
|
||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
|
|||||||
@@ -1,4 +1,7 @@
|
|||||||
base_model: Qwen/Qwen1.5-MoE-A2.7B
|
base_model: Qwen/Qwen1.5-MoE-A2.7B
|
||||||
|
# Automatically upload checkpoint and final model to HF
|
||||||
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
trust_remote_code: true
|
trust_remote_code: true
|
||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
|
|||||||
@@ -1,4 +1,6 @@
|
|||||||
base_model: Qwen/Qwen2.5-0.5B
|
base_model: Qwen/Qwen2.5-0.5B
|
||||||
|
# Automatically upload checkpoint and final model to HF
|
||||||
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
strict: false
|
strict: false
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,7 @@
|
|||||||
base_model: Qwen/Qwen2-7B
|
base_model: Qwen/Qwen2-7B
|
||||||
|
# Automatically upload checkpoint and final model to HF
|
||||||
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
trust_remote_code: true
|
trust_remote_code: true
|
||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
|
|||||||
@@ -1,6 +1,10 @@
|
|||||||
base_model: togethercomputer/RedPajama-INCITE-Chat-3B-v1
|
base_model: togethercomputer/RedPajama-INCITE-Chat-3B-v1
|
||||||
|
# optionally might have model_type or tokenizer_type
|
||||||
model_type: GPTNeoXForCausalLM
|
model_type: GPTNeoXForCausalLM
|
||||||
tokenizer_type: AutoTokenizer
|
tokenizer_type: AutoTokenizer
|
||||||
|
# Automatically upload checkpoint and final model to HF
|
||||||
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
trust_remote_code:
|
trust_remote_code:
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
datasets:
|
datasets:
|
||||||
|
|||||||
@@ -1,4 +1,7 @@
|
|||||||
base_model: replit/replit-code-v1-3b
|
base_model: replit/replit-code-v1-3b
|
||||||
|
# Automatically upload checkpoint and final model to HF
|
||||||
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
trust_remote_code: true
|
trust_remote_code: true
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
datasets:
|
datasets:
|
||||||
|
|||||||
@@ -1,6 +1,10 @@
|
|||||||
base_model: stabilityai/stablelm-2-1_6b
|
base_model: stabilityai/stablelm-2-1_6b
|
||||||
|
# optionally might have model_type or tokenizer_type
|
||||||
model_type: AutoModelForCausalLM
|
model_type: AutoModelForCausalLM
|
||||||
tokenizer_type: AutoTokenizer
|
tokenizer_type: AutoTokenizer
|
||||||
|
# Automatically upload checkpoint and final model to HF
|
||||||
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
trust_remote_code: true
|
trust_remote_code: true
|
||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
|
|||||||
@@ -1,6 +1,10 @@
|
|||||||
base_model: stabilityai/stablelm-2-1_6b
|
base_model: stabilityai/stablelm-2-1_6b
|
||||||
|
# optionally might have model_type or tokenizer_type
|
||||||
model_type: AutoModelForCausalLM
|
model_type: AutoModelForCausalLM
|
||||||
tokenizer_type: AutoTokenizer
|
tokenizer_type: AutoTokenizer
|
||||||
|
# Automatically upload checkpoint and final model to HF
|
||||||
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
trust_remote_code: true
|
trust_remote_code: true
|
||||||
|
|
||||||
load_in_8bit: true
|
load_in_8bit: true
|
||||||
|
|||||||
@@ -1,4 +1,6 @@
|
|||||||
base_model: bigcode/starcoder2-3b
|
base_model: bigcode/starcoder2-3b
|
||||||
|
# Automatically upload checkpoint and final model to HF
|
||||||
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
|
|||||||
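Every hunk above applies the same change: each example config gains a comment pair pointing at the hub_model_id option. A minimal sketch of what enabling it would look like, assuming the user is already authenticated to the Hugging Face Hub and owns the target repo (the repo id below is a placeholder, not taken from this diff):

# Automatically upload checkpoint and final model to HF
hub_model_id: your-username/your-custom-model  # placeholder repo id

With the line uncommented and set, checkpoints and the final model are pushed to that Hub repo, per the comment these diffs add.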
Some files were not shown because too many files have changed in this diff.