Compare commits

3 Commits

peft-updat...mm_mc_chat

| Author | SHA1 | Date |
|---|---|---|
|  | c5c01c11d8 |  |
|  | 00ebf2faf9 |  |
|  | 641e84188b |  |
.github/workflows/base.yml (14 changes, vendored)

@@ -40,24 +40,12 @@ jobs:
             python_version: "3.11"
             pytorch: 2.6.0
             torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-          - cuda: "126"
-            cuda_version: 12.6.3
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: 2.6.0
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
           - cuda: "128"
             cuda_version: 12.8.1
             cudnn_version: ""
             python_version: "3.11"
             pytorch: nightly
             torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-          - cuda: "128"
-            cuda_version: 12.8.1
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: next
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
     steps:
       - name: Checkout
         uses: actions/checkout@v4

@@ -79,7 +67,7 @@ jobs:
         uses: docker/build-push-action@v4
         with:
           context: .
-          file: ${{ matrix.pytorch == 'nightly' && './docker/Dockerfile-base-nightly' || matrix.pytorch == 'next' && './docker/Dockerfile-base-next' || './docker/Dockerfile-base' }}
+          file: ${{ matrix.pytorch == 'nightly' && './docker/Dockerfile-base-nightly' || './docker/Dockerfile-base' }}
           push: ${{ github.event_name != 'pull_request' }}
           tags: ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
           labels: ${{ steps.metadata.outputs.labels }}
.github/workflows/main.yml (4 changes, vendored)

@@ -25,12 +25,12 @@ jobs:
             python_version: "3.11"
             pytorch: 2.5.1
             axolotl_extras: vllm
+            is_latest: true
           - cuda: 124
             cuda_version: 12.4.1
             python_version: "3.11"
             pytorch: 2.6.0
             axolotl_extras:
-            is_latest: true
     runs-on: axolotl-gpu-runner
     steps:
       - name: Checkout

@@ -87,12 +87,12 @@ jobs:
             python_version: "3.11"
             pytorch: 2.5.1
             axolotl_extras:
+            is_latest: true
           - cuda: 124
             cuda_version: 12.4.1
             python_version: "3.11"
             pytorch: 2.6.0
             axolotl_extras:
-            is_latest: true
     runs-on: axolotl-gpu-runner
     steps:
       - name: Checkout
.github/workflows/multi-gpu-e2e.yml (15 changes, vendored)

@@ -24,13 +24,6 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - cuda: 124
-            cuda_version: 12.4.1
-            python_version: "3.11"
-            pytorch: 2.6.0
-            axolotl_extras: vllm
-            num_gpus: 2
-            nightly_build: "true"
           - cuda: 124
             cuda_version: 12.4.1
             python_version: "3.11"

@@ -45,6 +38,14 @@ jobs:
             axolotl_extras: vllm
             num_gpus: 2
             nightly_build: "true"
+          - cuda: 124
+            cuda_version: 12.4.1
+            python_version: "3.11"
+            pytorch: 2.6.0
+            # awaiting vllm#12721
+            axolotl_extras:
+            num_gpus: 2
+            nightly_build: "true"
     runs-on: [self-hosted, modal]
     timeout-minutes: 120
     steps:
.github/workflows/tests-nightly.yml (25 changes, vendored)

@@ -33,15 +33,6 @@ jobs:
       - name: Check out repository code
        uses: actions/checkout@v4

-      - name: Restore HF cache
-        id: hf-cache-restore
-        uses: actions/cache/restore@v4
-        with:
-          path: |
-            /home/runner/.cache/huggingface/hub/datasets--*
-            /home/runner/.cache/huggingface/hub/models--*
-          key: ${{ runner.os }}-hf-hub-cache-v2
-
       - name: Setup Python
         uses: actions/setup-python@v5
         with:

@@ -55,7 +46,7 @@ jobs:

       - name: Install PyTorch
         run: |
-          pip3 install torch==${{ matrix.pytorch_version }}
+          pip3 install torch==${{ matrix.pytorch_version }} --index-url https://download.pytorch.org/whl/cpu

       - name: Update requirements.txt
         run: |

@@ -67,7 +58,8 @@ jobs:

       - name: Install dependencies
         run: |
-          pip3 show torch
+          pip3 install --upgrade pip
+          pip3 install --upgrade packaging==23.2
           pip3 install --no-build-isolation -U -e .
           python scripts/unsloth_install.py | sh
           python scripts/cutcrossentropy_install.py | sh

@@ -81,15 +73,10 @@ jobs:
         run: |
           axolotl --help

-      - name: Pre-Download dataset fixture
-        run: |
-          huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
-
       - name: Run tests
         run: |
-          pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/
-          pytest -v tests/patched/
-          pytest -v tests/cli/
+          pytest -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ tests/
+          pytest tests/patched/

       - name: cleanup pip cache
         run: |

@@ -149,4 +136,4 @@ jobs:
           echo "NIGHTLY_BUILD=${{ matrix.nightly_build }}" >> $GITHUB_ENV
       - name: Run tests job on Modal
         run: |
-          modal run cicd.e2e_tests
+          modal run cicd.tests
.github/workflows/tests.yml (107 changes, vendored)

@@ -63,7 +63,7 @@ jobs:
          path: |
            /home/runner/.cache/huggingface/hub/datasets--*
            /home/runner/.cache/huggingface/hub/models--*
-          key: ${{ runner.os }}-hf-hub-cache-v2
+          key: ${{ runner.os }}-hf-hub-cache-${{ hashFiles('**/conftest.py') }}

       - name: Setup Python
         uses: actions/setup-python@v5

@@ -96,10 +96,6 @@ jobs:
         run: |
           axolotl --help

-      - name: Pre-Download dataset fixture
-        run: |
-          huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
-
       - name: Run tests
         run: |
           pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/

@@ -141,7 +137,7 @@ jobs:
          path: |
            /home/runner/.cache/huggingface/hub/datasets--*
            /home/runner/.cache/huggingface/hub/models--*
-          key: ${{ runner.os }}-hf-hub-cache-v2
+          key: ${{ runner.os }}-hf-hub-cache-${{ hashFiles('**/conftest.py') }}

       - name: Setup Python
         uses: actions/setup-python@v5

@@ -175,9 +171,6 @@ jobs:
         run: |
           axolotl --help

-      - name: Show HF cache
-        run: huggingface-cli scan-cache
-
       - name: Run tests
         run: |
           pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/

@@ -208,53 +201,6 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - cuda: 124
-            cuda_version: 12.4.1
-            python_version: "3.11"
-            pytorch: 2.6.0
-            num_gpus: 1
-            axolotl_extras: vllm
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      - name: Install Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: "3.11"
-      - name: Install Modal
-        run: |
-          python -m pip install --upgrade pip
-          pip install modal==0.71.8 jinja2
-      - name: Update env vars
-        run: |
-          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
-          echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
-          echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
-          echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
-          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
-          echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
-          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
-      - name: Run tests job on Modal
-        run: |
-          modal run cicd.e2e_tests
-
-  docker-e2e-tests:
-    if: github.repository_owner == 'axolotl-ai-cloud'
-    # this job needs to be run on self-hosted GPU runners...
-    runs-on: [self-hosted, modal]
-    timeout-minutes: 90
-    needs: [pre-commit, pytest, docker-e2e-tests-1st]
-
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - cuda: 124
-            cuda_version: 12.4.1
-            python_version: "3.11"
-            pytorch: 2.4.1
-            num_gpus: 1
-            axolotl_extras:
           - cuda: 124
             cuda_version: 12.4.1
             python_version: "3.11"

@@ -283,4 +229,51 @@ jobs:
           echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
       - name: Run tests job on Modal
         run: |
-          modal run cicd.e2e_tests
+          modal run cicd.tests
+
+  docker-e2e-tests:
+    if: github.repository_owner == 'axolotl-ai-cloud'
+    # this job needs to be run on self-hosted GPU runners...
+    runs-on: [self-hosted, modal]
+    timeout-minutes: 90
+    needs: [pre-commit, pytest, docker-e2e-tests-1st]
+
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - cuda: 124
+            cuda_version: 12.4.1
+            python_version: "3.11"
+            pytorch: 2.4.1
+            num_gpus: 1
+            axolotl_extras:
+          - cuda: 124
+            cuda_version: 12.4.1
+            python_version: "3.11"
+            pytorch: 2.6.0
+            num_gpus: 1
+            axolotl_extras:
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Install Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+      - name: Install Modal
+        run: |
+          python -m pip install --upgrade pip
+          pip install modal==0.71.8 jinja2
+      - name: Update env vars
+        run: |
+          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
+          echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
+          echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
+          echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
+          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
+          echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
+          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
+      - name: Run tests job on Modal
+        run: |
+          modal run cicd.tests
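Both cache hunks above replace the hand-bumped `-v2` suffix with `hashFiles('**/conftest.py')`, so the HF hub cache key rolls over automatically whenever any conftest.py changes. A minimal sketch of a restore step using the new key, modeled on the step removed from tests-nightly.yml earlier in this compare (the step name and `id` are carried over from that removed step, not part of the new diff):

```yaml
# Sketch only: HF hub cache restore keyed off conftest.py contents.
- name: Restore HF cache
  id: hf-cache-restore
  uses: actions/cache/restore@v4
  with:
    path: |
      /home/runner/.cache/huggingface/hub/datasets--*
      /home/runner/.cache/huggingface/hub/models--*
    # hashFiles() re-keys the cache when any conftest.py changes,
    # replacing the manually bumped "-v2" suffix.
    key: ${{ runner.os }}-hf-hub-cache-${{ hashFiles('**/conftest.py') }}
```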
(file name not shown)

@@ -1,4 +1,3 @@
 [settings]
 profile=black
 known_third_party=wandb,comet_ml
-known_local_folder=src,tests
(file name not shown)

@@ -40,7 +40,6 @@ quartodoc:
       - cli.preprocess
       - cli.sweeps
       - cli.utils
-      - cli.vllm_serve
       - cli.cloud.base
       - cli.cloud.modal_
   - title: Trainers

@@ -244,7 +243,6 @@ website:
         - docs/unsloth.qmd
         - docs/torchao.qmd
         - docs/custom_integrations.qmd
-        - docs/sequence_parallelism.qmd

       - section: "Troubleshooting"
         contents:
(file name not shown)

@@ -2,5 +2,4 @@
 set -e

 # only run one test at a time so as not to OOM the GPU
-pytest -v --durations=10 -n2 /workspace/axolotl/tests/e2e/multigpu/ --ignore=/workspace/axolotl/tests/e2e/multigpu/solo/
-pytest -v --durations=10 -n1 /workspace/axolotl/tests/e2e/multigpu/solo/
+pytest -v -n2 /workspace/axolotl/tests/e2e/multigpu/
(file name not shown)

@@ -20,9 +20,9 @@ WORKDIR /workspace/axolotl

 # If AXOLOTL_EXTRAS is set, append it in brackets
 RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
-        pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
+        pip install --no-build-isolation -e .[deepspeed,flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
     else \
-        pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray] $AXOLOTL_ARGS; \
+        pip install --no-build-isolation -e .[deepspeed,flash-attn,optimizers,ray] $AXOLOTL_ARGS; \
     fi

 RUN python scripts/unsloth_install.py | sh
(file name not shown; deleted file)

@@ -1,38 +0,0 @@
-ARG CUDA_VERSION="12.8.1"
-ARG CUDNN_VERSION="8"
-ARG UBUNTU_VERSION="22.04"
-ARG MAX_JOBS=4
-
-FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION AS base-builder
-
-ENV PATH="/root/miniconda3/bin:${PATH}"
-
-ARG PYTHON_VERSION="3.11"
-ARG PYTORCH_VERSION="next"
-ARG CUDA="128"
-ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
-
-ENV PYTHON_VERSION=$PYTHON_VERSION
-ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
-
-RUN apt-get update \
-    && apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev pkg-config && rm -rf /var/lib/apt/lists/* \
-    && wget \
-        https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
-    && mkdir /root/.conda \
-    && bash Miniconda3-latest-Linux-x86_64.sh -b \
-    && rm -f Miniconda3-latest-Linux-x86_64.sh \
-    && conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}"
-
-ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
-
-WORKDIR /workspace
-
-RUN python3 -m pip install --upgrade pip && pip3 install packaging && \
-    python3 -m pip install --no-cache-dir -U torch==2.7.0 --extra-index-url https://download.pytorch.org/whl/test/cu$CUDA && \
-    python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \
-    python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main"
-
-RUN git lfs install --skip-repo && \
-    pip3 install awscli && \
-    pip3 install -U --no-cache-dir pydantic==2.10.6
docs/cli.qmd (40 changes)

@@ -170,7 +170,7 @@ axolotl merge-sharded-fsdp-weights config.yml

 ### evaluate

-Evaluates a model's performance (loss etc) on the train and eval datasets.
+Evaluates a model's performance using metrics specified in the config.

 ```bash
 # Basic evaluation

@@ -197,8 +197,6 @@ lm_eval_batch_size: # Batch size for evaluation
 output_dir: # Directory to save evaluation results
 ```

-See [LM Eval Harness](https://github.com/EleutherAI/lm-evaluation-harness) for more details.
-
 ## Legacy CLI Usage

 While the new Click-based CLI is preferred, Axolotl still supports the legacy module-based CLI:

@@ -237,7 +235,7 @@ Create a cloud config YAML with your Modal settings:
 ```yaml
 # cloud_config.yml
 provider: modal
 gpu: a100  # Supported: l40s, a100-40gb, a100-80gb, a10g, h100, t4, l4
 gpu_count: 1  # Number of GPUs to use
 timeout: 86400  # Maximum runtime in seconds (24 hours)
 branch: main  # Git branch to use (optional)

@@ -250,7 +248,7 @@ volumes: # Persistent storage volumes
   - name: axolotl-artifacts
     mount: /workspace/artifacts

-secrets: # Secrets to inject
+env: # Environment variables
   - WANDB_API_KEY
   - HF_TOKEN
 ```

@@ -276,27 +274,15 @@ axolotl lm-eval config.yml --cloud cloud_config.yml
 ### Cloud Configuration Options

 ```yaml
 provider: # compute provider, currently only `modal` is supported
 gpu: # GPU type to use
 gpu_count: # Number of GPUs (default: 1)
 memory: # RAM in GB (default: 128)
 timeout: # Maximum runtime in seconds
 timeout_preprocess: # Preprocessing timeout
 branch: # Git branch to use
 docker_tag: # Custom Docker image tag
 volumes: # List of persistent storage volumes
-# Environment variables to pass. Can be specified in two ways:
-# 1. As a string: Will load the value from the host computer's environment variables
-# 2. As a key-value pair: Will use the specified value directly
-# Example:
-# env:
-#   - CUSTOM_VAR  # Loads from host's $CUSTOM_VAR
-#   - {CUSTOM_VAR: "value"}  # Uses "value" directly
-env:
-
-# Secrets to inject. Same input format as `env` but for sensitive data.
-secrets:
-#   - HF_TOKEN
-#   - WANDB_API_KEY
+env: # Environment variables to pass
+secrets: # Secrets to inject
 ```
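The comment block removed above documented the two item shapes that `env` accepts (a bare string whose value is read from the host's environment, or an inline key/value pair), with `secrets` taking the same shapes for sensitive values. A short sketch combining both forms, assuming that documented behavior still holds after the simplification (`MY_CUSTOM_VAR` and `DEBUG_MODE` are made-up names for illustration):

```yaml
# cloud_config.yml sketch, assuming the item shapes described in the removed comments
provider: modal
gpu: a100
env:
  - MY_CUSTOM_VAR        # string form: value loaded from the host's $MY_CUSTOM_VAR
  - {DEBUG_MODE: "1"}    # key/value form: literal value used directly
secrets:                 # same input format as env, but for sensitive data
  - HF_TOKEN
  - WANDB_API_KEY
```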
(file name not shown)

@@ -238,10 +238,10 @@ simpo_gamma: 0.5 # Target reward margin for the SimPO loss
 # grpo
 trl:
   use_vllm: # Optional[bool]. Whether to use VLLM for RL training.
-  vllm_server_host: # Optional[str]. Host of the vLLM server to connect to.
-  vllm_server_port: # Optional[int]. Port of the vLLM server to connect to.
-  vllm_server_timeout: # Optional[int]. Total timeout (in seconds) to wait for the vLLM server to respond.
-  vllm_guided_decoding_regex: # Optional[str]. Regex for vLLM guided decoding.
+  vllm_device: # Optional[str]. Device to use for VLLM.
+  vllm_gpu_memory_utilization: # Optional[float]. GPU memory utilization for VLLM.
+  vllm_max_model_len: # Optional[int]. Maximum length of the model for VLLM.
+  vllm_dtype: # Optional[str]. Data type for VLLM.

   beta: # Optional[float]. Beta parameter for the RL training. Same as `rl_beta`. Use
   max_completion_length: # Optional[int]. Maximum length of the completion for RL training.

@@ -320,13 +320,9 @@ total_num_tokens:
 sample_packing_group_size: 100000
 # The number of samples which can be packed into one sequence. Increase if using a large sequence_len with many short samples.
 sample_packing_bin_size: 200
-sample_pack_sequentially: # Optional[bool]. Whether to pack samples sequentially.
-
 # whether to concatenate samples during pretraining
 pretraining_sample_concatenation:

-curriculum_sampling: # Optional[bool]. Whether to use sequential sampling for curriculum learning
-
 # Use batch flattening for speedups when not using sample_packing
 batch_flattening:

@@ -358,27 +354,7 @@ lora_target_modules:
 #  - down_proj
 #  - up_proj
 lora_target_linear: # If true, will target all linear modules
-# List[int] | int. # The layer indices to transform, otherwise, apply to all layers
-# https://huggingface.co/docs/peft/v0.15.0/en/package_reference/lora#peft.LoraConfig.layers_to_transform
-peft_layers_to_transform:
-
-# Optional[bool]. Whether to use DoRA.
-# https://huggingface.co/docs/peft/v0.15.0/en/developer_guides/lora#weight-decomposed-low-rank-adaptation-dora
-peft_use_dora:
-
-# Optional[bool]. Whether to use RSLoRA.
-# https://huggingface.co/docs/peft/v0.15.0/en/developer_guides/lora#rank-stabilized-lora
-peft_use_rslora:
-
-# Optional[list[tuple[int, int]]]. List of layer indices to replicate.
-# https://huggingface.co/docs/peft/v0.15.0/en/developer_guides/lora#memory-efficient-layer-replication-with-lora
-peft_layer_replication:
-
-# bool | Literal["gaussian", "eva", "olora", "pissa", "pissa_niter_[number of iters]", "corda", "loftq"]
-# How to initialize LoRA weights. Default to True which is MS original implementation.
-# https://huggingface.co/docs/peft/v0.15.0/en/developer_guides/lora#initialization
-peft_init_lora_weights:
+peft_layers_to_transform: # The layer indices to transform, otherwise, apply to all layers

 # If you added new tokens to the tokenizer, you may need to save some LoRA modules because they need to know the new tokens.
 # For LLaMA and Mistral, you need to save `embed_tokens` and `lm_head`. It may vary for other models.

@@ -510,8 +486,7 @@ train_on_inputs: false
 # Note that training loss may have an oscillating pattern with this enabled.
 group_by_length: false

-# Whether to use gradient checkpointing. Available options are: true, false, "offload".
-# https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing
+# Whether to use gradient checkpointing https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing
 gradient_checkpointing: false
 # additional kwargs to pass to the trainer for gradient checkpointing
 # gradient_checkpointing_kwargs:

@@ -612,31 +587,26 @@ max_grad_norm:
 # currently only supported on Llama and Mistral
 neftune_noise_alpha:

-# Optional[bool]. Whether to bettertransformers
+# Whether to bettertransformers
 flash_optimum:
-# Note: Only one of the following attention patches can be used at a time.
-# For example, if you set `xformers_attention` to `true`, do not set `flash_attention` to `true`.
-
-# Optional[bool]. Whether to use xformers attention patch https://github.com/facebookresearch/xformers:
+# Whether to use xformers attention patch https://github.com/facebookresearch/xformers:
 xformers_attention:
-# Optional[bool]. Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention:
+# Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention:
 flash_attention:
-flash_attn_cross_entropy: # Optional[bool]. Whether to use flash-attention cross entropy implementation - advanced use only
-flash_attn_rms_norm: # Optional[bool]. Whether to use flash-attention rms norm implementation - advanced use only
-flash_attn_fuse_qkv: # Optional[bool]. Whether to fuse QKV into a single operation
-flash_attn_fuse_mlp: # Optional[bool]. Whether to fuse part of the MLP into a single operation
-# Optional[bool]. Whether to use scaled-dot-product attention
+flash_attn_cross_entropy: # Whether to use flash-attention cross entropy implementation - advanced use only
+flash_attn_rms_norm: # Whether to use flash-attention rms norm implementation - advanced use only
+flash_attn_fuse_qkv: # Whether to fuse QKV into a single operation
+flash_attn_fuse_mlp: # Whether to fuse part of the MLP into a single operation
+# Whether to use scaled-dot-product attention
 # https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
 sdp_attention:
-# Optional[bool]. Shifted-sparse attention (only llama) - https://arxiv.org/pdf/2309.12307.pdf
+# Shifted-sparse attention (only llama) - https://arxiv.org/pdf/2309.12307.pdf
 s2_attention:

 # Optional[bool]. Whether to use low_cpu_mem_usage
 low_cpu_mem_usage:
-# Optional[str]. Resume from a specific checkpoint dir
+# Resume from a specific checkpoint dir
 resume_from_checkpoint:
-# Optional[bool]. If resume_from_checkpoint isn't set and you simply want it to start where it left off.
+# If resume_from_checkpoint isn't set and you simply want it to start where it left off.
 # Be careful with this being turned on between different models.
 auto_resume_from_checkpoints: false

@@ -688,9 +658,6 @@ ddp_broadcast_buffers:
 # subsequences, or set to 4 to split into four equal-sized subsequences.
 # See https://axolotl-ai-cloud.github.io/axolotl/docs/sequence_parallelism.html for more details.
 sequence_parallel_degree:
-# Optional; strides across the key dimension. Larger values use more memory but should make training faster.
-# Must evenly divide the number of KV heads in your model.
-heads_k_stride: 1

 # Path to torch distx for optim 'adamw_anyprecision'
 torchdistx_path:
docs/faq.qmd (12 changes)

@@ -35,22 +35,12 @@ description: Frequently asked questions

 **Q: How to call Axolotl via custom python scripts?**

-> A: Since Axolotl is just Python, please see `src/axolotl/cli/main.py` on how each command is called.
+> A: Yes, since Axolotl is just Python, please see `src/axolotl/cli/main.py` on how each command is called.

 **Q: How to know the value to use for `fsdp_transformer_layer_cls_to_wrap`?**

 > A: This is the class name of the transformer layer to wrap with FSDP. For example, for `LlamaForCausalLM`, the value is `LlamaDecoderLayer`. To find this for a specific model, check the model's `PreTrainedModel` definition and look for `_no_split_modules` variable in the `modeling_<model_name>.py` file within `transformers` library.

-**Q: ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as pad_token**
-
-> A: This is because the tokenizer does not have a padding token. Please add a padding token to the tokenizer via:
-
-> ```yaml
-> special_tokens:
->   # str. If you're not sure, set to same as `eos_token`.
->   pad_token: "..."
-> ```
-
 ### Chat templates

 **Q: `jinja2.exceptions.UndefinedError: 'dict object' has no attribute 'content' / 'role' / ____`**
(file name not shown)

@@ -17,7 +17,6 @@ We currently support several common model architectures, including (but not limi
 - `qwen2`
 - `gemma`
 - `gemma2`
-- `gemma3`

 <details>
(file name not shown)

@@ -18,7 +18,6 @@ Axolotl supports several methods for multi-GPU training:

 - DeepSpeed (recommended)
 - FSDP (Fully Sharded Data Parallel)
-- Sequence parallelism
 - FSDP + QLoRA

 ## DeepSpeed {#sec-deepspeed}

@@ -67,28 +66,6 @@ fsdp_config:
   fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
 ```

-## Sequence parallelism {#sec-sequence-parallelism}
-
-We support sequence parallelism (SP) via the
-[ring-flash-attention](https://github.com/zhuzilin/ring-flash-attention) project. This
-allows one to split up sequences across GPUs, which is useful in the event that a
-single sequence causes OOM errors during model training.
-
-First, install `ring-flash-attn`, recommended via `pip install axolotl[ring-flash-attn]`,
-or from source with `pip install .[ring-flash-attn]`.
-
-Your Axolotl YAML config should contain the following lines:
-
-```{.yaml}
-sequence_parallel_degree: 4  # Split each sequence into 4 parts, one per GPU
-flash_attention: true  # Required with sequence parallelism
-
-# Optional; strides across the key dimension. Larger values use more memory but will make training faster.
-heads_k_stride: 1
-```
-
-See our [dedicated guide](sequence_parallelism.qmd) for more details.
-
 ### FSDP + QLoRA {#sec-fsdp-qlora}

 For combining FSDP with QLoRA, see our [dedicated guide](fsdp_qlora.qmd).
(file name not shown)

@@ -502,48 +502,9 @@ The input format is a simple JSON input with customizable fields based on the ab
 Check out our [GRPO cookbook](https://github.com/axolotl-ai-cloud/axolotl-cookbook/tree/main/grpo#training-an-r1-style-large-language-model-using-grpo).
 :::

-If you have multiple GPUs available, we reccomend using `vLLM` with the `GRPOTrainer` to significantly speedup trajectory generation during training.
-First, launch a `vLLM` server using `trl vllm-serve` - you may use a config file or CLI overrides to configure your vLLM server. In this example, we're
-using 4 GPUs - 2 for training, and 2 for vLLM:
-
-::: {.callout-important}
-Make sure you've installed the correct version of vLLM by including it as an extra when installing axolotl, e.g. `pip install axolotl[vllm]`.
-:::
-
-```yaml
-base_model: Qwen/Qwen2.5-1.5B-Instruct
-
-vllm:
-    host: 0.0.0.0
-    port: 8000
-    tensor_parallel_size: 2
-    gpu_memory_utilization: 0.85
-    dtype: auto
-    # max_model_len: # you may find it useful to set the vLLM model context length if you know this beforehand
-
-rl: grpo
-trl:
-    use_vllm: true
-    vllm_server_host: 0.0.0.0
-    vllm_server_port: 8000
-    vllm_server_timeout: 300
-```
-
-```bash
-CUDA_VISIBLE_DEVICES=2,3 axolotl vllm_serve grpo.yaml
-```
-
-Your `vLLM` instance will now attempt to spin up, and it's time to kick off training utilizing our remaining two GPUs. In another terminal, execute:
-
-```bash
-CUDA_VISIBLE_DEVICES=0,1 axolotl train grpo.yaml --num-processes 2
-```
-
-#### Reward functions
-
 GRPO uses custom reward functions and transformations. Please have them ready locally.

-For example, to load OpenAI's GSM8K and use a random reward for completions:
+For ex, to load OpenAI's GSM8K and use a random reward for completions:

 ```python
 # rewards.py

@@ -569,6 +530,8 @@ trl:
   beta: 0.001
   max_completion_length: 256
   use_vllm: True
+  vllm_device: auto
+  vllm_gpu_memory_utilization: 0.15
   num_generations: 4
   reward_funcs: ["rewards.rand_reward_func"]  # format: '{file_name}.{fn_name}'
   reward_weights: [1.0]
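Together with the config-reference hunk earlier in this compare, the head branch drives vLLM for GRPO through inline `trl` keys (`vllm_device`, `vllm_gpu_memory_utilization`) instead of a separately launched vLLM server. A sketch of the resulting `trl` block, assembled only from the lines shown in the hunk above (not a complete training config):

```yaml
# Sketch assembled from the + and context lines of the hunk above.
trl:
  beta: 0.001
  max_completion_length: 256
  use_vllm: True
  vllm_device: auto                  # added line: device to use for vLLM
  vllm_gpu_memory_utilization: 0.15  # added line: fraction of GPU memory for vLLM
  num_generations: 4
  reward_funcs: ["rewards.rand_reward_func"]  # format: '{file_name}.{fn_name}'
  reward_weights: [1.0]
```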
(file name not shown)

@@ -25,8 +25,6 @@ To enable sequence parallelism, add the following to your configuration file:
 ```yaml
 # Set to a divisor (> 1) of the number of GPUs available
 sequence_parallel_degree: 4  # Split sequences across 4 GPUs
-# Optional; strides across the key dimension. Larger values use more memory but should make training faster.
-heads_k_stride: 1
 ```

 The `sequence_parallel_degree` should be a divisor of the total number of GPUs. For example:

@@ -60,16 +58,11 @@ To use sequence parallelism, you need:
 ## Example

 ```yaml
+# Example config with sequence parallelism
 base_model: meta-llama/Llama-3-8B-Instruct
 sequence_len: 8192
-...
-
-sequence_parallel_degree: 4  # Split each sequence into 4 parts, one per GPU
+sequence_parallel_degree: 2  # Split each sequence into 4 parts
 flash_attention: true  # Required with sequence parallelism
-# Optional; strides across the key dimension. Larger values use more memory but should make training faster.
-heads_k_stride: 1
-
 ...
 ```
(file name not shown)

@@ -8,6 +8,9 @@ tokenizer_type: GPT2Tokenizer
 trust_remote_code: true
 tokenizer_use_fast: true
 tokenizer_legacy: true
+
+load_in_8bit: false
+load_in_4bit: false
 strict: false
 push_dataset_to_hub:
 hf_use_auth_token: true

@@ -31,6 +34,7 @@ lora_alpha:
 lora_dropout:
 lora_target_modules:
 lora_target_linear:
+lora_fan_in_fan_out:

 wandb_project:
 wandb_entity:

@@ -54,12 +58,16 @@ learning_rate: 0.000085
 train_on_inputs: true
 group_by_length: false
 bf16: auto
+fp16:
 tf32: true

 gradient_checkpointing: false
+early_stopping_patience:
 resume_from_checkpoint:
+local_rank:
 logging_steps: 1

+xformers_attention:
 flash_attention: true
 sdp_attention:
 flash_optimum:

@@ -72,6 +80,8 @@ evals_per_epoch: 4
 saves_per_epoch: 1
 save_total_limit:

+debug:
+deepspeed:
 weight_decay: 0.1
 special_tokens:
   pad_token: "<|endoftext|>"
(file name not shown)

@@ -22,6 +22,7 @@ lora_target_modules:
   - c_attn
   - c_proj
 lora_target_linear:
+lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:

@@ -35,10 +36,15 @@ optimizer: paged_adamw_8bit
 torchdistx_path:
 lr_scheduler: cosine
 learning_rate: 0.0002
+train_on_inputs: false
+group_by_length: false
 bf16: auto
+fp16:
 tf32: true
 gradient_checkpointing: true
+early_stopping_patience:
 resume_from_checkpoint:
+local_rank:
 logging_steps: 1
 xformers_attention: true
 flash_attention:

@@ -47,6 +53,10 @@ gptq_model_v1:
 warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
+debug:
+deepspeed:
 weight_decay: 0.1
+fsdp:
+fsdp_config:
 special_tokens:
   pad_token: "<|endoftext|>"
(file name not shown)

@@ -26,6 +26,7 @@ lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_linear: true
+lora_fan_in_fan_out:

 wandb_project:
 wandb_entity:

@@ -40,18 +41,29 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
 bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: true
+early_stopping_patience:
 resume_from_checkpoint:
+local_rank:
 logging_steps: 1
+xformers_attention:
 flash_attention: true
+s2_attention:

 warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
+debug:
+deepspeed:
 weight_decay: 0.0
+fsdp:
+fsdp_config:
 special_tokens:
   bos_token: "<s>"
   eos_token: "</s>"
(file name not shown)

@@ -26,7 +26,9 @@ pad_to_sequence_len: true
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
+lora_target_modules:
 lora_target_linear: true
+lora_fan_in_fan_out:

 wandb_project:
 wandb_entity:

@@ -41,18 +43,28 @@ optimizer: paged_adamw_32bit
 lr_scheduler: cosine
 learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
 bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: true
+early_stopping_patience:
 resume_from_checkpoint:
+local_rank:
 logging_steps: 1
+xformers_attention:
 flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
+debug:
+deepspeed:
 weight_decay: 0.0
+fsdp:
+fsdp_config:
 special_tokens:
   bos_token: "<s>"
   eos_token: "</s>"
(file name not shown)

@@ -26,6 +26,7 @@ lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_linear: true
+lora_fan_in_fan_out:

 wandb_project:
 wandb_entity:

@@ -40,18 +41,29 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
 bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: true
+early_stopping_patience:
 resume_from_checkpoint:
+local_rank:
 logging_steps: 1
+xformers_attention:
 flash_attention: true
+s2_attention:

 warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
+debug:
+deepspeed:
 weight_decay: 0.0
+fsdp:
+fsdp_config:
 special_tokens:
   bos_token: "<s>"
   eos_token: "</s>"
(file name not shown)

@@ -26,7 +26,9 @@ pad_to_sequence_len: true
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
+lora_target_modules:
 lora_target_linear: true
+lora_fan_in_fan_out:

 wandb_project:
 wandb_entity:

@@ -41,18 +43,28 @@ optimizer: paged_adamw_32bit
 lr_scheduler: cosine
 learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
 bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: true
+early_stopping_patience:
 resume_from_checkpoint:
+local_rank:
 logging_steps: 1
+xformers_attention:
 flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
+debug:
+deepspeed:
 weight_decay: 0.0
+fsdp:
+fsdp_config:
 special_tokens:
   bos_token: "<s>"
   eos_token: "</s>"
(file name not shown)

@@ -26,6 +26,7 @@ lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_linear: true
+lora_fan_in_fan_out:

 wandb_project:
 wandb_entity:

@@ -40,18 +41,29 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
 bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: true
+early_stopping_patience:
 resume_from_checkpoint:
+local_rank:
 logging_steps: 1
+xformers_attention:
 flash_attention: true
+s2_attention:

 warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
+debug:
+deepspeed:
 weight_decay: 0.0
+fsdp:
+fsdp_config:
 special_tokens:
   bos_token: "<s>"
   eos_token: "</s>"
(file name not shown)

@@ -26,7 +26,9 @@ pad_to_sequence_len: true
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
+lora_target_modules:
 lora_target_linear: true
+lora_fan_in_fan_out:

 wandb_project:
 wandb_entity:

@@ -41,18 +43,28 @@ optimizer: paged_adamw_32bit
 lr_scheduler: cosine
 learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
 bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: true
+early_stopping_patience:
 resume_from_checkpoint:
+local_rank:
 logging_steps: 1
+xformers_attention:
 flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
+debug:
+deepspeed:
 weight_decay: 0.0
+fsdp:
+fsdp_config:
 special_tokens:
   bos_token: "<s>"
   eos_token: "</s>"
(file name not shown)

@@ -44,16 +44,28 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
 bf16: auto
+fp16:
 tf32: true

 gradient_checkpointing: true
+early_stopping_patience:
 resume_from_checkpoint:
+local_rank:
 logging_steps: 1
+xformers_attention:
 flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch:
+eval_table_size:
+eval_max_new_tokens: 128
 saves_per_epoch: 1
+debug:
+deepspeed:
 weight_decay: 0.0
+fsdp:
+fsdp_config:
 special_tokens:
(file name not shown)

@@ -3,6 +3,9 @@ base_model: LnL-AI/dbrx-base-converted-v2
 # hub_model_id: username/custom_model_name

 trust_remote_code: true
+
+load_in_8bit: false
+load_in_4bit: false
 strict: false

 datasets:

@@ -45,20 +48,26 @@ optimizer: paged_adamw_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
 bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: false  # don't use with fsdp_activation_checkpointing
 gradient_checkpointing_kwargs:
   use_reentrant: false
+early_stopping_patience:
 resume_from_checkpoint:
+local_rank:
 logging_steps: 1
+xformers_attention:
 flash_attention: true

 warmup_steps: 10
 evals_per_epoch:
 saves_per_epoch: 1
+debug:
 weight_decay: 0.0
 fsdp:
   - full_shard
(file name not shown)

@@ -48,20 +48,26 @@ optimizer: paged_adamw_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
 bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: false  # don't use with fsdp_activation_checkpointing
 gradient_checkpointing_kwargs:
   use_reentrant: false
+early_stopping_patience:
 resume_from_checkpoint:
+local_rank:
 logging_steps: 1
+xformers_attention:
 flash_attention: true

 warmup_steps: 10
 evals_per_epoch:
 saves_per_epoch: 1
+debug:
 weight_decay: 0.0
 fsdp:
   - full_shard
(file name not shown)

@@ -3,6 +3,9 @@ base_model: LnL-AI/dbrx-base-converted-v2
 # hub_model_id: username/custom_model_name

 trust_remote_code: true
+
+load_in_8bit: false
+load_in_4bit: false
 strict: false

 datasets:

@@ -32,19 +35,25 @@ optimizer: paged_adamw_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
 bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
   use_reentrant: false
+early_stopping_patience:
 resume_from_checkpoint:
+local_rank:
 logging_steps: 1
+xformers_attention:
 flash_attention: true

 warmup_steps: 10
 evals_per_epoch:
 saves_per_epoch: 1
+debug:
 weight_decay: 0.0
 deepspeed: deepspeed_configs/zero3_bf16.json
@@ -2,6 +2,9 @@ base_model: deepseek-ai/DeepSeek-V2-Lite
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
trust_remote_code: true
+
+load_in_8bit: false
+load_in_4bit: false
strict: false

datasets:

@@ -28,19 +31,27 @@ optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 2e-5

+train_on_inputs: false
+group_by_length: false
bf16: auto
+fp16:
tf32: false

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
+early_stopping_patience:
resume_from_checkpoint:
logging_steps: 1
+xformers_attention:
flash_attention: true

warmup_steps: 100
evals_per_epoch: 2
+eval_table_size:
saves_per_epoch: 1
+debug:
+deepspeed:
weight_decay: 0.0
special_tokens:
fsdp:
@@ -52,19 +52,27 @@ optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 2e-5

+train_on_inputs: false
+group_by_length: false
bf16: auto
+fp16:
tf32: false

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
+early_stopping_patience:
resume_from_checkpoint:
logging_steps: 1
+xformers_attention:
flash_attention: true

warmup_steps: 100
evals_per_epoch: 2
+eval_table_size:
saves_per_epoch: 1
+debug:
+deepspeed:
weight_decay: 0.0
special_tokens:
fsdp:
@@ -25,7 +25,9 @@ max_packed_sequence_len:
lora_r: 16
lora_alpha: 32
lora_dropout: 0.0
+lora_target_modules:
lora_target_linear: true
+lora_fan_in_fan_out:
wandb_project:
wandb_entity:
wandb_watch:

@@ -39,10 +41,15 @@ optimizer: adamw_bnb_8bit
torchdistx_path:
lr_scheduler: cosine
learning_rate: 0.00003
+train_on_inputs: false
+group_by_length: false
bf16: auto
+fp16:
tf32: true
gradient_checkpointing: true
+early_stopping_patience:
resume_from_checkpoint:
+local_rank:
logging_steps: 1
xformers_attention: true
flash_attention:

@@ -51,7 +58,11 @@ gptq_model_v1:
warmup_steps: 40
evals_per_epoch: 4
saves_per_epoch: 1
+debug:
+deepspeed:
weight_decay: 0.0
+fsdp:
+fsdp_config:
special_tokens:
  pad_token: "<|endoftext|>"
  bos_token: "<|endoftext|>"
@@ -38,7 +38,9 @@ lora_alpha: 16
# 0.05 for 33B and 65B models
lora_dropout: 0.05
# add LoRA modules on all linear layers of the base model
+lora_target_modules:
lora_target_linear: true
+lora_fan_in_fan_out:

wandb_project:
wandb_entity:

@@ -65,7 +67,10 @@ lr_scheduler: cosine
# - 2e-4 for 7b & 13b
# - 1e-4 for 33b & 64b
learning_rate: 0.0002
+train_on_inputs: false
+group_by_length: false
bf16: auto
+fp16:
tf32: true
gradient_checkpointing: true
# stop training after this many evaluation losses have increased in a row

@@ -73,6 +78,7 @@ gradient_checkpointing: true
early_stopping_patience: 3
resume_from_checkpoint:
auto_resume_from_checkpoints: true
+local_rank:
logging_steps: 1
xformers_attention: true
flash_attention:

@@ -81,7 +87,11 @@ gptq_model_v1:
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
+debug:
+deepspeed:
weight_decay: 0.000001
+fsdp:
+fsdp_config:
special_tokens:
  pad_token: "<|endoftext|>"
  bos_token: "<|endoftext|>"
@@ -7,6 +7,9 @@ tokenizer_type: AutoTokenizer

# required by falcon custom model code: https://huggingface.co/tiiuae/falcon-7b/tree/main
trust_remote_code: true
+
+load_in_8bit: false
+load_in_4bit: false
gptq: false
strict: false
push_dataset_to_hub:

@@ -22,7 +25,9 @@ max_packed_sequence_len:
lora_r: 64
lora_alpha: 32
lora_dropout: 0.0
+lora_target_modules:
lora_target_linear: true
+lora_fan_in_fan_out:
wandb_project:
wandb_entity:
wandb_watch:

@@ -36,10 +41,15 @@ optimizer: adamw_bnb_8bit
torchdistx_path:
lr_scheduler: cosine
learning_rate: 0.00003
+train_on_inputs: false
+group_by_length: false
bf16: auto
+fp16:
tf32: true
gradient_checkpointing: true
+early_stopping_patience:
resume_from_checkpoint:
+local_rank:
logging_steps: 1
xformers_attention: true
flash_attention:

@@ -48,7 +58,11 @@ gptq_model_v1:
warmup_steps: 40
evals_per_epoch: 4
saves_per_epoch: 1
+debug:
+deepspeed:
weight_decay: 0.0
+fsdp:
+fsdp_config:
special_tokens:
  pad_token: "<|endoftext|>"
  bos_token: "<|endoftext|>"
@@ -42,16 +42,28 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
bf16: auto
+fp16:
tf32: false

gradient_checkpointing: true
+early_stopping_patience:
resume_from_checkpoint:
+local_rank:
logging_steps: 1
+xformers_attention:
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 4
+eval_table_size:
+eval_max_new_tokens: 128
saves_per_epoch: 1
+debug:
+deepspeed:
weight_decay: 0.0
+fsdp:
+fsdp_config:
special_tokens:
@@ -48,16 +48,28 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
bf16: auto
+fp16:
tf32: true

gradient_checkpointing: true
+early_stopping_patience:
resume_from_checkpoint:
+local_rank:
logging_steps: 1
+xformers_attention:
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch:
+eval_table_size:
+eval_max_new_tokens: 128
saves_per_epoch: 1
+debug:
+deepspeed:
weight_decay: 0.0
+fsdp:
+fsdp_config:
special_tokens:
@@ -5,6 +5,9 @@ num_labels: 1
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
+
+load_in_8bit: false
+load_in_4bit: false
strict: false

reward_model: true

@@ -35,6 +38,8 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
bf16: true
fp16:
tf32: true

@@ -42,12 +47,21 @@ tf32: true
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
+early_stopping_patience:
resume_from_checkpoint:
+local_rank:
logging_steps: 1
+xformers_attention:
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch:
+eval_table_size:
+eval_max_new_tokens: 128
saves_per_epoch: 1
+debug:
+deepspeed:
weight_decay: 0.0
+fsdp:
+fsdp_config:
special_tokens:
@@ -2,16 +2,11 @@ base_model: google/gemma-3-4b-it
processor_type: AutoProcessor
strict: false

-load_in_4bit: true
-
# these 3 lines are needed for now to handle vision chat templates w images
skip_prepare_dataset: true
remove_unused_columns: false
sample_packing: false

-# gemma3 doesn't seem to play nice with ddp
-ddp_find_unused_parameters: true
-
chat_template: gemma3
datasets:
  - path: HuggingFaceH4/llava-instruct-mix-vsft

@@ -22,7 +17,7 @@ dataset_prepared_path: last_run_prepared
val_set_size: 0.01
output_dir: ./outputs/out

-adapter: qlora
+adapter: lora
lora_model_dir:

sequence_len: 2048

@@ -46,13 +41,14 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
bf16: true
fp16:
tf32: true

gradient_checkpointing: true
-gradient_checkpointing_kwargs:
-  use_reentrant: false
+local_rank:
logging_steps: 1
flash_attention: true
eager_attention:

@@ -60,4 +56,8 @@ eager_attention:
warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
+debug:
+deepspeed:
weight_decay: 0.0
+fsdp:
+fsdp_config:
@@ -1,61 +0,0 @@
-base_model: google/gemma-3-4b-it
-strict: false
-
-load_in_4bit: true
-
-# gemma3 doesn't seem to play nice with ddp
-ddp_find_unused_parameters: true
-
-chat_template: gemma3
-datasets:
-  - path: cgato/SlimOrcaDedupCleaned
-    type: chat_template
-    field_messages: conversations
-    message_property_mappings:
-      role: from
-      content: value
-
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.01
-output_dir: ./outputs/out
-
-adapter: qlora
-lora_model_dir:
-
-sequence_len: 2048
-sample_packing: true
-pad_to_sequence_len: true
-
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 4
-micro_batch_size: 2
-num_epochs: 1
-optimizer: adamw_bnb_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-bf16: true
-fp16:
-tf32: true
-
-gradient_checkpointing: true
-gradient_checkpointing_kwargs:
-  use_reentrant: false
-logging_steps: 1
-flash_attention: true
-eager_attention:
-
-warmup_ratio: 0.1
-evals_per_epoch: 1
-saves_per_epoch: 1
-weight_decay: 0.0
@@ -5,15 +5,12 @@ tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

-# gemma3 doesn't seem to play nice with ddp
-ddp_find_unused_parameters: true
-
load_in_8bit: false
load_in_4bit: true
strict: false

# huggingface repo
-chat_template: gemma3
+chat_template: gemma3_text
datasets:
  - path: cgato/SlimOrcaDedupCleaned
    type: chat_template

@@ -50,18 +47,28 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
bf16: auto
+fp16:
tf32: true

gradient_checkpointing: true
-gradient_checkpointing_kwargs:
-  use_reentrant: false
+early_stopping_patience:
resume_from_checkpoint:
+local_rank:
logging_steps: 1
+xformers_attention:
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch:
+eval_table_size:
+eval_max_new_tokens: 128
saves_per_epoch: 1
+debug:
+deepspeed:
weight_decay: 0.0
+fsdp:
+fsdp_config:
special_tokens:
@@ -18,7 +18,9 @@ max_packed_sequence_len:
lora_r: 8
lora_alpha: 32
lora_dropout: 0.05
+lora_target_modules:
lora_target_linear: true
+lora_fan_in_fan_out:
wandb_project:
wandb_entity:
wandb_watch:

@@ -32,10 +34,15 @@ optimizer: paged_adamw_8bit
torchdistx_path:
lr_scheduler: cosine
learning_rate: 0.0001
+train_on_inputs: false
+group_by_length: false
bf16: auto
+fp16:
tf32: true
gradient_checkpointing: true
+early_stopping_patience:
resume_from_checkpoint:
+local_rank:
logging_steps: 1
xformers_attention: true
flash_attention:

@@ -44,6 +51,10 @@ gptq_model_v1:
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
+debug:
+deepspeed:
weight_decay: 0.1
+fsdp:
+fsdp_config:
special_tokens:
  pad_token: "<|endoftext|>"
@@ -40,18 +40,26 @@ optimizer: paged_adamw_8bit
lr_scheduler: cosine
learning_rate: 0.00001

+train_on_inputs: false
+group_by_length: false
bf16: auto
+fp16:
tf32: false

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
+early_stopping_patience:
resume_from_checkpoint:
+local_rank:
logging_steps: 1
+xformers_attention:
flash_attention: true

warmup_steps: 10
evals_per_epoch:
saves_per_epoch: 1
+debug:
+deepspeed:
weight_decay: 0.0
special_tokens:
@@ -39,20 +39,26 @@ optimizer: paged_adamw_8bit
lr_scheduler: cosine
learning_rate: 0.00001

+train_on_inputs: false
+group_by_length: false
bf16: auto
+fp16:
tf32: false

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
+early_stopping_patience:
resume_from_checkpoint:
+local_rank:
logging_steps: 1
+xformers_attention:
flash_attention: true

warmup_steps: 10
evals_per_epoch:
saves_per_epoch: 1
+debug:
deepspeed: deepspeed_configs/zero2.json
weight_decay: 0.0
special_tokens:
@@ -39,6 +39,8 @@ optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 0.00001

+train_on_inputs: false
+group_by_length: false
bf16: true
tf32: true

@@ -33,9 +33,13 @@ optimizer: adamw_bnb_8bit
torchdistx_path:
lr_scheduler: cosine
learning_rate: 0.00003
+train_on_inputs: false
+group_by_length: false
bf16: auto
tf32: true
+early_stopping_patience:
resume_from_checkpoint:
+local_rank:
logging_steps: 5
xformers_attention: true
flash_attention:

@@ -44,7 +48,11 @@ gptq_model_v1:
warmup_steps: 20
evals_per_epoch: 4
saves_per_epoch: 1
+debug:
+deepspeed:
weight_decay: 0.1
+fsdp:
+fsdp_config:
tokens:
  bos_token: "<s>"
  eos_token: "</s>"
@@ -4,6 +4,9 @@ model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
+
+load_in_8bit: false
+load_in_4bit: false
strict: false

datasets:

@@ -23,6 +26,7 @@ lora_r:
lora_alpha:
lora_dropout:
lora_target_linear:
+lora_fan_in_fan_out:

wandb_project:
wandb_entity:

@@ -37,12 +41,18 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
bf16: auto
+fp16:
tf32: false

gradient_checkpointing: true
+early_stopping_patience:
resume_from_checkpoint:
+local_rank:
logging_steps: 1
+xformers_attention:
flash_attention: true
flash_attn_cross_entropy: false
flash_attn_rms_norm: true

@@ -51,8 +61,11 @@ flash_attn_fuse_mlp: true

warmup_steps: 100
evals_per_epoch: 4
+eval_table_size:
saves_per_epoch: 1
+debug:
deepspeed: #deepspeed_configs/zero2.json # multi-gpu only
weight_decay: 0.1
+fsdp:
+fsdp_config:
special_tokens:
@@ -10,6 +10,8 @@ gptq_disable_exllama: true

tokenizer_use_fast: true
tokenizer_legacy: true
+load_in_8bit: false
+load_in_4bit: false
strict: false
push_dataset_to_hub:
hf_use_auth_token: true

@@ -31,6 +33,7 @@ lora_target_modules:
  - q_proj
  - v_proj
lora_target_linear:
+lora_fan_in_fan_out:
wandb_project:
wandb_watch:
wandb_name:

@@ -47,19 +50,26 @@ torchdistx_path:
lr_scheduler: cosine
lr_quadratic_warmup: true
learning_rate: 0.000017
+train_on_inputs: false
+group_by_length: false
bf16: false
fp16: false
float16: true
tf32: true
gradient_checkpointing: true
+early_stopping_patience:
resume_from_checkpoint:
+local_rank:
logging_steps: 1
+xformers_attention:
flash_attention:
sdp_attention:
flash_optimum:
warmup_steps: 100
evals_per_epoch: 4
saves_per_epoch: 1
+debug:
+deepspeed:
weight_decay: 0.1
special_tokens:
  bos_token: "<s>"
@@ -4,6 +4,9 @@ model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
+
+load_in_8bit: false
+load_in_4bit: false
strict: false

datasets:

@@ -23,6 +26,7 @@ lora_r:
lora_alpha:
lora_dropout:
lora_target_linear:
+lora_fan_in_fan_out:

lisa_n_layers: 4
lisa_step_interval: 20

@@ -41,12 +45,18 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 5e-5 # recommendation from lisa paper for 7b

+train_on_inputs: false
+group_by_length: false
bf16: auto
+fp16:
tf32: false

gradient_checkpointing: true
+early_stopping_patience:
resume_from_checkpoint:
+local_rank:
logging_steps: 1
+xformers_attention:
flash_attention: true
flash_attn_cross_entropy: false
flash_attn_rms_norm: true

@@ -55,8 +65,13 @@ flash_attn_fuse_mlp: true

warmup_steps: 100
evals_per_epoch: 4
+eval_table_size:
saves_per_epoch: 1
+debug:
+deepspeed:
weight_decay: 0.1
+fsdp:
+fsdp_config:
special_tokens:
  bos_token: "<s>"
  eos_token: "</s>"
@@ -4,6 +4,9 @@ model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
+
+load_in_8bit: false
+load_in_4bit: false
strict: false

datasets:

@@ -23,6 +26,7 @@ lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
+lora_fan_in_fan_out:
peft:
  loftq_config:
    loftq_bits: 4

@@ -40,16 +44,29 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
bf16: auto
+fp16:
tf32: false

gradient_checkpointing: true
+early_stopping_patience:
resume_from_checkpoint:
+local_rank:
logging_steps: 1
+xformers_attention:
flash_attention: true
+s2_attention:

warmup_steps: 10
evals_per_epoch: 4
+eval_table_size:
+eval_max_new_tokens: 128
saves_per_epoch: 1
+debug:
+deepspeed:
weight_decay: 0.0
+fsdp:
+fsdp_config:
special_tokens:
@@ -26,6 +26,7 @@ lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
+lora_fan_in_fan_out:

wandb_project:
wandb_entity:

@@ -40,16 +41,29 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
bf16: auto
+fp16:
tf32: false

gradient_checkpointing: true
+early_stopping_patience:
resume_from_checkpoint:
+local_rank:
logging_steps: 1
+xformers_attention:
flash_attention: true
+s2_attention:

warmup_steps: 10
evals_per_epoch: 4
+eval_table_size:
+eval_max_new_tokens: 128
saves_per_epoch: 1
+debug:
+deepspeed:
weight_decay: 0.0
+fsdp:
+fsdp_config:
special_tokens:
@@ -26,7 +26,9 @@ pad_to_sequence_len: true
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
+lora_target_modules:
lora_target_linear: true
+lora_fan_in_fan_out:

wandb_project:
wandb_entity:

@@ -41,19 +43,28 @@ optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 0.00001

+train_on_inputs: false
+group_by_length: false
bf16: auto
+fp16:
tf32: false

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: true
+early_stopping_patience:
resume_from_checkpoint:
+local_rank:
logging_steps: 1
+xformers_attention:
flash_attention: true

warmup_steps: 10
evals_per_epoch: 4
+eval_table_size:
saves_per_epoch: 1
+debug:
+deepspeed:
weight_decay: 0.0
fsdp:
  - full_shard
@@ -26,7 +26,9 @@ pad_to_sequence_len: true
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
+lora_target_modules:
lora_target_linear: true
+lora_fan_in_fan_out:

wandb_project:
wandb_entity:

@@ -41,16 +43,27 @@ optimizer: paged_adamw_32bit
lr_scheduler: cosine
learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
bf16: auto
+fp16:
tf32: false

gradient_checkpointing: true
+early_stopping_patience:
resume_from_checkpoint:
+local_rank:
logging_steps: 1
+xformers_attention:
flash_attention: true

warmup_steps: 10
evals_per_epoch: 4
+eval_table_size:
saves_per_epoch: 1
+debug:
+deepspeed:
weight_decay: 0.0
+fsdp:
+fsdp_config:
special_tokens:
@@ -24,7 +24,9 @@ pad_to_sequence_len: true
lora_r: 8
lora_alpha: 16
lora_dropout: 0.05
+lora_target_modules:
lora_target_linear: true
+lora_fan_in_fan_out:

relora_steps: 150
relora_warmup_steps: 10

@@ -43,18 +45,28 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
bf16: auto
+fp16:
tf32: false

gradient_checkpointing: true
+early_stopping_patience:
resume_from_checkpoint:
+local_rank:
logging_steps: 1
+xformers_attention:
flash_attention: true

warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
+debug:
+deepspeed:
weight_decay: 0.0
+fsdp:
+fsdp_config:
special_tokens:
  bos_token: "<s>"
  eos_token: "</s>"
@@ -45,11 +45,14 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
bf16: true
fp16:
tf32: true

gradient_checkpointing: true
+local_rank:
logging_steps: 1
flash_attention: true
eager_attention:

@@ -57,4 +60,8 @@ eager_attention:
warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
+debug:
+deepspeed:
weight_decay: 0.0
+fsdp:
+fsdp_config:
@@ -42,19 +42,27 @@ optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 2e-5

+train_on_inputs: false
+group_by_length: false
bf16: auto
+fp16:
tf32: false

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
+early_stopping_patience:
resume_from_checkpoint:
logging_steps: 1
+xformers_attention:
flash_attention: true

warmup_steps: 100
evals_per_epoch: 2
+eval_table_size:
saves_per_epoch: 1
+debug:
+deepspeed:
weight_decay: 0.0
fsdp:
  - full_shard
@@ -1,6 +1,9 @@
base_model: NousResearch/Meta-Llama-3.1-8B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
+
+load_in_8bit: false
+load_in_4bit: false
strict: false

datasets:

@@ -27,19 +30,29 @@ optimizer: paged_adamw_8bit
lr_scheduler: cosine
learning_rate: 2e-5

+train_on_inputs: false
+group_by_length: false
bf16: auto
+fp16:
tf32: false

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
+early_stopping_patience:
resume_from_checkpoint:
logging_steps: 1
+xformers_attention:
flash_attention: true

warmup_steps: 100
evals_per_epoch: 2
+eval_table_size:
saves_per_epoch: 1
+debug:
+deepspeed:
weight_decay: 0.0
+fsdp:
+fsdp_config:
special_tokens:
  pad_token: <|end_of_text|>
|||||||
@@ -42,6 +42,7 @@ lora_r: 32
|
|||||||
lora_alpha: 16
|
lora_alpha: 16
|
||||||
lora_dropout: 0.05
|
lora_dropout: 0.05
|
||||||
lora_target_linear: true
|
lora_target_linear: true
|
||||||
|
lora_fan_in_fan_out:
|
||||||
|
|
||||||
wandb_project:
|
wandb_project:
|
||||||
wandb_entity:
|
wandb_entity:
|
||||||
@@ -56,15 +57,28 @@ optimizer: adamw_bnb_8bit
|
|||||||
lr_scheduler: cosine
|
lr_scheduler: cosine
|
||||||
learning_rate: 0.0002
|
learning_rate: 0.0002
|
||||||
|
|
||||||
|
train_on_inputs: false
|
||||||
|
group_by_length: false
|
||||||
bf16: auto
|
bf16: auto
|
||||||
|
fp16:
|
||||||
tf32: false
|
tf32: false
|
||||||
|
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
|
early_stopping_patience:
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
|
local_rank:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
|
xformers_attention:
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
s2_attention:
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
|
eval_table_size:
|
||||||
|
eval_max_new_tokens: 128
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
|
debug:
|
||||||
|
deepspeed:
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
fsdp:
|
||||||
|
fsdp_config:
|
||||||
|
|||||||
@@ -37,6 +37,7 @@ lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
+lora_fan_in_fan_out:

wandb_project:
wandb_entity:

@@ -51,17 +52,30 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
bf16: auto
+fp16:
tf32: false

gradient_checkpointing: true
+early_stopping_patience:
resume_from_checkpoint:
+local_rank:
logging_steps: 1
+xformers_attention:
flash_attention: true
+s2_attention:

warmup_steps: 10
evals_per_epoch: 4
+eval_table_size:
+eval_max_new_tokens: 128
saves_per_epoch: 1
+debug:
+deepspeed:
weight_decay: 0.0
+fsdp:
+fsdp_config:
special_tokens:
  pad_token: <|end_of_text|>
@@ -58,6 +58,7 @@ lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
+lora_fan_in_fan_out:

wandb_project:
wandb_entity:

@@ -72,15 +73,28 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
bf16: auto
+fp16:
tf32: false

gradient_checkpointing: true
+early_stopping_patience:
resume_from_checkpoint:
+local_rank:
logging_steps: 1
+xformers_attention:
flash_attention: true
+s2_attention:

warmup_steps: 10
evals_per_epoch: 4
+eval_table_size:
+eval_max_new_tokens: 128
saves_per_epoch: 1
+debug:
+deepspeed:
weight_decay: 0.0
+fsdp:
+fsdp_config:
@@ -19,6 +19,7 @@ val_set_size: 0.0
output_dir: ./outputs/lora-out

dataset_exact_deduplication: true
+test_value: true

sequence_len: 4096
sample_packing: true

@@ -31,6 +32,7 @@ lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
+lora_fan_in_fan_out:
lora_modules_to_save:
  - embed_tokens
  - lm_head

@@ -48,17 +50,30 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
bf16: auto
+fp16:
tf32: false

gradient_checkpointing: true
+early_stopping_patience:
resume_from_checkpoint:
+local_rank:
logging_steps: 1
+xformers_attention:
flash_attention: true
+s2_attention:

warmup_steps: 10
evals_per_epoch: 4
+eval_table_size:
+eval_max_new_tokens: 128
saves_per_epoch: 1
+debug:
+deepspeed:
weight_decay: 0.0
+fsdp:
+fsdp_config:
special_tokens:
  pad_token: <|end_of_text|>
@@ -1,6 +1,9 @@
base_model: NousResearch/Llama-3.2-1B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
+
+load_in_8bit: false
+load_in_4bit: false
strict: false

datasets:

@@ -21,6 +24,7 @@ lora_r: 16
lora_alpha: 32
# Currently, we don't support dropout with our custom Triton kernels
# lora_dropout: 0.05
+lora_fan_in_fan_out:
lora_target_modules:
  - gate_proj
  - down_proj

@@ -49,12 +53,18 @@ optimizer: adamw_8bit
lr_scheduler: cosine
learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
bf16: auto
+fp16:
tf32: false

gradient_checkpointing: true
+early_stopping_patience:
resume_from_checkpoint:
+local_rank:
logging_steps: 1
+xformers_attention:
flash_attention: true

loss_watchdog_threshold: 5.0

@@ -63,6 +73,10 @@ loss_watchdog_patience: 3
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
+debug:
+deepspeed:
weight_decay: 0.0
+fsdp:
+fsdp_config:
special_tokens:
  pad_token: "<|end_of_text|>"
@@ -1,6 +1,9 @@
base_model: NousResearch/Llama-3.2-1B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
+
+load_in_8bit: false
+load_in_4bit: false
strict: false

datasets:

@@ -21,6 +24,7 @@ pad_to_sequence_len: true
lora_r: 16
lora_alpha: 32
lora_dropout: 0.05
+lora_fan_in_fan_out:
lora_target_modules:
  - gate_proj
  - down_proj

@@ -43,12 +47,18 @@ optimizer: adamw_8bit
lr_scheduler: cosine
learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
bf16: auto
+fp16:
tf32: false

gradient_checkpointing: true
+early_stopping_patience:
resume_from_checkpoint:
+local_rank:
logging_steps: 1
+xformers_attention:
flash_attention: true

loss_watchdog_threshold: 5.0

@@ -57,9 +67,11 @@ loss_watchdog_patience: 3
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
+debug:
deepspeed: deepspeed_configs/zero3.json
weight_decay: 0.0
+fsdp:
+fsdp_config:
special_tokens:
  pad_token: "<|end_of_text|>"

@@ -1,66 +0,0 @@
-base_model: meta-llama/Llama-3.2-1B
-# optionally might have model_type or tokenizer_type
-model_type: LlamaForCausalLM
-tokenizer_type: AutoTokenizer
-# Automatically upload checkpoint and final model to HF
-# hub_model_id: username/custom_model_name
-
-load_in_8bit: true
-load_in_4bit: false
-strict: false
-
-datasets:
-  - path: mhenrichsen/alpaca_2k_test
-    type: alpaca
-  - path: mhenrichsen/alpaca_2k_test
-    type: alpaca
-dataset_prepared_path:
-val_set_size: 0.0
-output_dir: ./outputs/lora-out
-
-test_value: true
-
-sequence_len: 4096
-sample_packing: true
-sample_packing_sequentially: true
-curriculum_sampling: true
-eval_sample_packing: false
-pad_to_sequence_len: true
-
-adapter: lora
-lora_model_dir:
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_linear: true
-lora_modules_to_save:
-  - embed_tokens
-  - lm_head
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 4
-micro_batch_size: 2
-num_epochs: 4
-optimizer: adamw_bnb_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-bf16: auto
-tf32: false
-
-gradient_checkpointing: true
-resume_from_checkpoint:
-logging_steps: 1
-flash_attention: true
-
-warmup_steps: 10
-evals_per_epoch: 4
-saves_per_epoch: 1
-weight_decay: 0.0
-special_tokens:
-  pad_token: <|end_of_text|>
@@ -1,6 +1,9 @@
base_model: NousResearch/Llama-3.2-1B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
+
+load_in_8bit: false
+load_in_4bit: false
strict: false

datasets:

@@ -21,6 +24,7 @@ pad_to_sequence_len: true
lora_r: 16
lora_alpha: 32
lora_dropout: 0.05
+lora_fan_in_fan_out:
lora_target_modules:
  - gate_proj
  - down_proj

@@ -43,12 +47,18 @@ optimizer: adamw_8bit
lr_scheduler: cosine
learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
bf16: auto
+fp16:
tf32: false

gradient_checkpointing: true
+early_stopping_patience:
resume_from_checkpoint:
+local_rank:
logging_steps: 1
+xformers_attention:
flash_attention: true

loss_watchdog_threshold: 5.0

@@ -57,6 +67,10 @@ loss_watchdog_patience: 3
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
+debug:
+deepspeed:
weight_decay: 0.0
+fsdp:
+fsdp_config:
special_tokens:
  pad_token: "<|end_of_text|>"
@@ -27,6 +27,7 @@ lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
+lora_fan_in_fan_out:
lora_modules_to_save:
  - embed_tokens
  - lm_head

@@ -44,17 +45,30 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
bf16: auto
+fp16:
tf32: false

gradient_checkpointing: true
+early_stopping_patience:
resume_from_checkpoint:
+local_rank:
logging_steps: 1
+xformers_attention:
flash_attention: true
+s2_attention:

warmup_steps: 10
evals_per_epoch: 4
+eval_table_size:
+eval_max_new_tokens: 128
saves_per_epoch: 1
+debug:
+deepspeed:
weight_decay: 0.0
+fsdp:
+fsdp_config:
special_tokens:
  pad_token: <|end_of_text|>

@@ -32,6 +32,7 @@ lora_r: 32
lora_alpha: 64
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:

wandb_project:
wandb_entity:

@@ -46,19 +47,31 @@ optimizer: adamw_8bit
lr_scheduler: cosine
learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true

warmup_steps: 20
evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
  pad_token: "<|end_of_text|>"

@@ -24,6 +24,7 @@ pad_to_sequence_len: true
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_fan_in_fan_out:
lora_target_modules:
  - gate_proj
  - down_proj

@@ -46,12 +47,18 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false

gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true

loss_watchdog_threshold: 5.0

@@ -59,7 +66,13 @@ loss_watchdog_patience: 3

warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
  pad_token: "<|end_of_text|>"

@@ -24,6 +24,7 @@ pad_to_sequence_len: true
lora_r: 16
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules:
lora_target_linear: true

gradient_accumulation_steps: 4

@@ -33,6 +34,8 @@ optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 0.00001

train_on_inputs: false
group_by_length: false
bf16: true
tf32: true

@@ -26,7 +26,9 @@ pad_to_sequence_len: true
lora_r: 8
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules:
lora_target_linear: true
lora_fan_in_fan_out:

wandb_project:
wandb_entity:

@@ -41,19 +43,28 @@ optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 0.00001

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true

warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
  - full_shard

@@ -26,7 +26,9 @@ pad_to_sequence_len: true
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules:
lora_target_linear: true
lora_fan_in_fan_out:

wandb_project:
wandb_entity:

@@ -41,17 +43,28 @@ optimizer: paged_adamw_32bit
lr_scheduler: cosine
learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false

gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true

warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
  pad_token: "<|end_of_text|>"

@@ -41,11 +41,14 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
bf16: true
fp16:
tf32: true

gradient_checkpointing: true
local_rank:
logging_steps: 1
flash_attention: true
eager_attention:

@@ -53,4 +56,8 @@ eager_attention:
warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:

@@ -5,6 +5,9 @@ tokenizer_type: AutoTokenizer
tokenizer_config: EleutherAI/gpt-neox-20b
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false
strict: false

datasets:

@@ -35,17 +38,27 @@ train_on_inputs: false
group_by_length: true

bf16: auto
fp16:
tf32: true

gradient_checkpointing: false
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention:

warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
tokens:
save_safetensors: False

@@ -6,6 +6,9 @@ tokenizer_type: LlamaTokenizer
# hub_model_id: username/custom_model_name

trust_remote_code: true

load_in_8bit: false
load_in_4bit: false
strict: false

unfrozen_parameters:

@@ -37,19 +40,27 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0001

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false

gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true

save_total_limit: 1
save_steps:
debug:
deepspeed: deepspeed_configs/zero3_bf16_cpuoffload_params.json
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
  eos_token: "<|im_end|>"
tokens:

@@ -4,6 +4,9 @@ model_type: MistralForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false
strict: false

datasets:

@@ -31,16 +34,28 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.000005

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false

gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true

warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:

@@ -4,6 +4,9 @@ model_type: MistralForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false
strict: false

datasets:

@@ -25,6 +28,7 @@ lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:
lora_target_modules:
  - gate_proj
  - down_proj

@@ -47,13 +51,18 @@ optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
bf16: auto
fp16: false
tf32: true

gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: false
sdp_attention: true

@@ -62,6 +71,12 @@ loss_watchdog_patience: 3

warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_table_max_new_tokens: 128
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:

@@ -27,6 +27,7 @@ lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:
lora_target_modules:
  - gate_proj
  - down_proj

@@ -49,12 +50,18 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false

gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true

loss_watchdog_threshold: 5.0

@@ -62,6 +69,12 @@ loss_watchdog_patience: 3

warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:

@@ -40,6 +40,7 @@ lora_r: 8
lora_alpha: 16
lora_dropout: 0.2
lora_target_linear: true
lora_fan_in_fan_out:

lora_target_modules:
  - gate_proj

@@ -66,18 +67,31 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0001

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false

gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: false
s2_attention:

warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
  bos_token: "<|im_start|>"
  eos_token: "<|im_end|>"

@@ -32,6 +32,7 @@ lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:

wandb_project:
wandb_entity:

@@ -46,12 +47,18 @@ optimizer: paged_adamw_8bit
lr_scheduler: cosine
learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false

gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true

loss_watchdog_threshold: 5.0

@@ -59,8 +66,10 @@ loss_watchdog_patience: 3

warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
weight_decay: 0.0
fsdp:
  - full_shard

@@ -32,6 +32,7 @@ lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:
lora_target_modules:
  - gate_proj
  - down_proj

@@ -54,12 +55,18 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false

gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true

loss_watchdog_threshold: 5.0

@@ -67,6 +74,12 @@ loss_watchdog_patience: 3

warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:

@@ -43,11 +43,14 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
bf16: true
fp16:
tf32: true

gradient_checkpointing: true
local_rank:
logging_steps: 1
flash_attention: false # PixtralVisionModel does not support Flash Attention 2.0 yet.
eager_attention:

@@ -55,5 +58,9 @@ eager_attention:
warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:

@@ -30,6 +30,7 @@ lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:

wandb_project:
wandb_entity:

@@ -44,12 +45,18 @@ optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: true

gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true

loss_watchdog_threshold: 5.0

@@ -57,8 +64,10 @@ loss_watchdog_patience: 3

warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
weight_decay: 0.0
fsdp:
  - full_shard

@@ -32,6 +32,7 @@ lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:

wandb_project:
wandb_entity:

@@ -46,12 +47,18 @@ optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: true

gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true

loss_watchdog_threshold: 5.0

@@ -59,8 +66,10 @@ loss_watchdog_patience: 3

warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
weight_decay: 0.0
fsdp:
  - full_shard

@@ -41,6 +41,7 @@ lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:
#lora_target_modules:
# - gate
# - q_proj

@@ -64,12 +65,18 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false

gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true

loss_watchdog_threshold: 5.0

@@ -77,8 +84,12 @@ loss_watchdog_patience: 3

warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
deepspeed: deepspeed_configs/zero2.json
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:

@@ -6,6 +6,9 @@ tokenizer_type: LlamaTokenizer
# hub_model_id: username/custom_model_name

trust_remote_code: true

load_in_8bit: false
load_in_4bit: false
strict: false

unfrozen_parameters:

@@ -35,19 +38,27 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0001

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false

gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true

save_total_limit: 1
save_steps:
debug:
deepspeed: deepspeed_configs/zero3_bf16_cpuoffload_all.json
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
  eos_token: "<|im_end|>"
tokens:

@@ -27,6 +27,7 @@ lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:
lora_target_modules:
  - gate_proj
  - down_proj

@@ -49,12 +50,18 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false

gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true

loss_watchdog_threshold: 5.0

@@ -62,6 +69,12 @@ loss_watchdog_patience: 3

warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:

@@ -35,17 +35,26 @@ optimizer: adamw_bnb_8bit
torchdistx_path:
lr_scheduler: cosine
learning_rate: 0.0000002
train_on_inputs: false
group_by_length: false
bf16: auto
tf32: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 5
xformers_attention:
flash_attention:
gptq_groupsize:
gptq_model_v1:
warmup_steps: 20
evals_per_epoch: 4
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0001
fsdp:
fsdp_config:
tokens:
  pad_token: "<|padding|>"
  bos_token: "<|endoftext|>"

@@ -4,6 +4,9 @@ model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false
strict: false
push_dataset_to_hub:
datasets:

@@ -20,6 +23,7 @@ lora_alpha:
lora_dropout:
lora_target_modules:
lora_target_linear:
lora_fan_in_fan_out:
wandb_project:
wandb_entity:
wandb_watch:

@@ -33,20 +37,29 @@ optimizer: adamw_bnb_8bit
torchdistx_path:
lr_scheduler: cosine
learning_rate: 0.000003
train_on_inputs: false
group_by_length: false
float16: true
bf16: false
fp16: false
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
gptq_groupsize:
gptq_model_v1:
warmup_steps: 20
evals_per_epoch: 4
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.1
fsdp:
fsdp_config:
special_tokens:
  bos_token: "<s>"
  eos_token: "</s>"

@@ -29,6 +29,7 @@ lora_target_modules:
  - v_proj
  - k_proj
  - o_proj
lora_fan_in_fan_out:
wandb_project:
wandb_entity:
wandb_watch:

@@ -42,19 +43,29 @@ optimizer: adamw_bnb_8bit
torchdistx_path:
lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: false
fp16: true
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
gptq_groupsize:
s2_attention:
gptq_model_v1:
warmup_steps: 20
evals_per_epoch: 4
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.1
fsdp:
fsdp_config:
special_tokens:
  bos_token: "<s>"
  eos_token: "</s>"

@@ -21,7 +21,9 @@ sample_packing: true
lora_r: 8
lora_alpha: 32
lora_dropout: 0.05
lora_target_modules:
lora_target_linear: true
lora_fan_in_fan_out:
wandb_project:
wandb_entity:
wandb_watch:

@@ -35,19 +37,28 @@ optimizer: paged_adamw_32bit
torchdistx_path:
lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: false
fp16: true
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
gptq_groupsize:
gptq_model_v1:
warmup_steps: 20
evals_per_epoch: 4
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.1
fsdp:
fsdp_config:
special_tokens:
  bos_token: "<s>"
  eos_token: "</s>"

@@ -37,6 +37,7 @@ lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:

wandb_project:
wandb_entity:

@@ -51,16 +52,28 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
bfloat16: true
bf16: true
fp16:
tf32: false

gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
s2_attention:

warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 4
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:

@@ -4,6 +4,9 @@ model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false
strict: false

datasets:

@@ -24,6 +27,7 @@ lora_r:
lora_alpha:
lora_dropout:
lora_target_linear:
lora_fan_in_fan_out:

wandb_project:
wandb_entity:

@@ -41,20 +45,30 @@ max_grad_norm: 1.0
lr_scheduler: cosine
learning_rate: 0.000003

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: True
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true

warmup_steps: 100
evals_per_epoch: 4
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.1
fsdp:
fsdp_config:
resize_token_embeddings_to_32x: true
special_tokens:
  pad_token: "<|endoftext|>"

@@ -27,6 +27,7 @@ lora_r: 64
lora_alpha: 32
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:

wandb_project:
wandb_entity:

@@ -44,20 +45,30 @@ max_grad_norm: 1.0
lr_scheduler: cosine
learning_rate: 0.000003

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: True
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true

warmup_steps: 100
evals_per_epoch: 4
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.1
fsdp:
fsdp_config:
resize_token_embeddings_to_32x: true
special_tokens:
  pad_token: "<|endoftext|>"

@@ -4,6 +4,9 @@ model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false
strict: false

datasets:

@@ -24,6 +27,7 @@ lora_r:
lora_alpha:
lora_dropout:
lora_target_linear:
lora_fan_in_fan_out:

wandb_project:
wandb_entity:

@@ -41,20 +45,30 @@ max_grad_norm: 1.0
lr_scheduler: cosine
learning_rate: 0.000003

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: True
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true

warmup_steps: 100
evals_per_epoch: 4
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.1
fsdp:
fsdp_config:
resize_token_embeddings_to_32x: true
special_tokens:
  pad_token: "<|endoftext|>"

@@ -4,6 +4,9 @@ model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false
strict: false

datasets:

@@ -25,6 +28,7 @@ lora_r:
lora_alpha:
lora_dropout:
lora_target_linear:
lora_fan_in_fan_out:

wandb_project: phi3
wandb_entity:

@@ -42,19 +46,27 @@ max_grad_norm: 1.0
lr_scheduler: cosine
learning_rate: 0.000003

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true

warmup_steps: 100
evals_per_epoch: 4
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.1
fsdp:
  - full_shard

@@ -7,6 +7,9 @@ tokenizer_type: AutoTokenizer
# hub_model_id: username/custom_model_name

chat_template: phi_3

load_in_8bit: false
load_in_4bit: false
strict: false

datasets:

@@ -27,6 +30,7 @@ lora_r: 64
lora_alpha: 32
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:

gradient_accumulation_steps: 1
micro_batch_size: 2

@@ -38,6 +42,8 @@ max_grad_norm: 1.0
lr_scheduler: cosine
learning_rate: 5.0e-6

train_on_inputs: false
group_by_length: false
bf16: auto

gradient_checkpointing: true

@@ -49,9 +55,9 @@ flash_attention: true

eval_steps: 1000
save_steps: 5000
eval_table_size: 2
eval_batch_size: 2
eval_sample_packing: false
eval_table_size: 2
eval_max_new_tokens: 32
eval_causal_lm_metrics: ["perplexity"]
do_causal_lm_eval: true

@@ -41,11 +41,14 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
bf16: true
fp16:
tf32: true

gradient_checkpointing: true
local_rank:
logging_steps: 1
flash_attention: false # PixtralVisionModel does not support Flash Attention 2.0 yet
eager_attention:

@@ -53,6 +56,10 @@ eager_attention:
warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
  pad_token: <pad>

@@ -5,6 +5,9 @@ model_type: GPTNeoXForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false
gptq: false
device_map: auto
datasets:

@@ -19,6 +22,7 @@ max_packed_sequence_len: 2048
lora_r: 64
lora_alpha: 32
lora_dropout: 0.0
lora_target_modules:
lora_target_linear: true
lora_fan_in_fan_out: true # pythia/GPTNeoX lora specific
wandb_project:

@@ -33,10 +37,16 @@ num_epochs: 5
learning_rate: 0.00003
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
train_on_inputs: false
group_by_length: false
bf16: false
fp16: false
float16: true
tf32: true
flash_optimum: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
gradient_checkpointing: true
fsdp:
fsdp_config:

@@ -28,9 +28,13 @@ gradient_accumulation_steps: 1
micro_batch_size: 4
num_epochs: 4
learning_rate: 0.00001
train_on_inputs: false
group_by_length: false
bf16: auto
tf32: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
weight_decay: 0.1
evals_per_epoch: 4
logging_steps: 1

@@ -28,6 +28,7 @@ lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:

wandb_project:
wandb_entity:

@@ -42,16 +43,28 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false

gradient_checkpointing: false
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention:

warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
Some files were not shown because too many files have changed in this diff.
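
Nearly every hunk above makes the same kind of change: it inserts a shared set of optional keys into one of the example configs. A consolidated sketch of that recurring block, with key names collected from the hunks above (the empty values are the placeholders used in the diffs, not values invented here; eval_max_new_tokens is the one key most files set concretely, to 128):

lora_fan_in_fan_out:
train_on_inputs: false
group_by_length: false
fp16:
early_stopping_patience:
local_rank:
xformers_attention:
eval_table_size:
eval_max_new_tokens: 128
debug:
deepspeed:
fsdp:
fsdp_config: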