Compare commits

..

7 Commits

Author SHA1 Message Date
Dan Saunders
103edc7211 refactor build() into smaller fns 2025-05-12 20:36:52 +00:00
Wing Lian
c7b6790614 Various fixes for CI, save_only_model for RL, prevent packing multiprocessing deadlocks (#2661)
* lean mistral ft tests, remove e2e torch 2.4.1 test

* make sure to pass save_only_model for RL

* more tests to make ci leaner, add cleanup to modal ci

* fix module for import in e2e tests

* use mp spawn to prevent deadlocks with packing

* make sure cleanup shell script is executable when cloned out
2025-05-12 10:51:18 -04:00
Dan Saunders
47e0e71bc8 don't sort multipack sampler (#2657)
* don't sort multipack sampler

* increased packing efficiency increases loss

---------

Co-authored-by: Wing Lian <wing@axolotl.ai>
2025-05-09 20:28:58 -04:00
Wing Lian
0f3587174d swap tinymodels that have safetensors for some ci tests (#2641) 2025-05-07 15:06:07 -04:00
xzuyn
25e6c5f9bd Add CAME Optimizer (#2385) 2025-05-07 10:31:46 -04:00
NanoCode012
32f51bca35 fix(doc): clarify instruction to delinearize llama4 similar to cli doc (#2644) [skip ci] 2025-05-07 10:29:47 -04:00
NanoCode012
9daa04da90 Fix: improve error message on failed dataset load (#2637) [skip ci]
* fix(log): clarify error on dataset loading failed

* fix: add path for easy tracking of broken config

* fix: improve error message based on pr feedback
2025-05-07 10:29:05 -04:00
143 changed files with 1863 additions and 1615 deletions

View File

@@ -18,9 +18,96 @@ jobs:
env: env:
SKIP: no-commit-to-branch SKIP: no-commit-to-branch
# Installs axolotl and runs tests/conftest.py so that the Hugging Face hub
# cache can be saved for later jobs; the `pytest` job lists this job in `needs`.
preload-cache:
  name: Preload HF cache
  runs-on: ubuntu-latest
  strategy:
    fail-fast: false
    matrix:
      # Quoted so the versions stay strings ("3.10" must never become 3.1).
      python_version: ["3.11"]
      pytorch_version: ["2.6.0"]
  timeout-minutes: 20

  env:
    # NOTE(review): presumably read by tests/conftest.py to enable the
    # fixture-download path -- confirm against the test suite.
    AXOLOTL_IS_CI_CACHE_PRELOAD: "1"

  steps:
    - name: Check out repository code
      uses: actions/checkout@v4

    - name: Restore HF cache
      id: hf-cache-restore
      uses: actions/cache/restore@v4
      with:
        path: |
          /home/runner/.cache/huggingface/hub/datasets--*
          /home/runner/.cache/huggingface/hub/models--*
        key: ${{ runner.os }}-hf-hub-cache-v2

    - name: Setup Python
      uses: actions/setup-python@v5
      with:
        python-version: ${{ matrix.python_version }}
        cache: 'pip'  # caching pip dependencies

    - name: upgrade pip
      run: |
        pip3 install --upgrade pip
        pip3 install --upgrade packaging==23.2 setuptools==75.8.0 wheel

    - name: Install PyTorch
      run: |
        pip3 install torch==${{ matrix.pytorch_version }}

    - name: Install dependencies
      run: |
        pip3 show torch
        pip3 install --no-build-isolation -U -e .
        python scripts/unsloth_install.py | sh
        python scripts/cutcrossentropy_install.py | sh
        pip3 install -r requirements-dev.txt -r requirements-tests.txt

    - name: Make sure PyTorch version wasn't clobbered
      run: |
        python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"

    - name: Ensure axolotl CLI was installed
      run: |
        axolotl --help

    - name: Pre-Download dataset fixture
      run: |
        huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures

    - name: Run tests
      run: |
        pytest -v tests/conftest.py

    # NOTE(review): the pytest call above passes no coverage flags; confirm
    # that ./coverage.xml is actually produced (e.g. via pytest config) or
    # drop this step. fail_ci_if_error keeps a missing report non-fatal.
    - name: Upload coverage to Codecov
      uses: codecov/codecov-action@v5
      with:
        token: ${{ secrets.CODECOV_TOKEN }}
        files: ./coverage.xml
        flags: unittests,pytorch-${{ matrix.pytorch_version }}
        fail_ci_if_error: false

    - name: cleanup pip cache
      run: |
        find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;

    # The key is fixed and cache entries are immutable, so after an exact hit
    # there is nothing new that could be stored; skip the archive/upload then.
    - name: Save HF cache
      id: hf-cache
      if: steps.hf-cache-restore.outputs.cache-hit != 'true'
      uses: actions/cache/save@v4
      with:
        path: |
          /home/runner/.cache/huggingface/hub/datasets--*
          /home/runner/.cache/huggingface/hub/models--*
        key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
pytest: pytest:
name: PyTest name: PyTest
runs-on: ubuntu-latest runs-on: ubuntu-latest
needs: [preload-cache]
strategy: strategy:
fail-fast: false fail-fast: false
max-parallel: 2 max-parallel: 2

View File

@@ -335,12 +335,6 @@ jobs:
pytorch: 2.6.0 pytorch: 2.6.0
num_gpus: 1 num_gpus: 1
axolotl_extras: llmcompressor axolotl_extras: llmcompressor
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.4.1
num_gpus: 1
axolotl_extras:
- cuda: 124 - cuda: 124
cuda_version: 12.4.1 cuda_version: 12.4.1
python_version: "3.11" python_version: "3.11"
@@ -377,3 +371,43 @@ jobs:
- name: Run tests job on Modal - name: Run tests job on Modal
run: | run: |
modal run cicd.e2e_tests modal run cicd.e2e_tests
# Runs the Modal cleanup app (cicd/cleanup.py) after the e2e test job.
# NOTE(review): with a plain `needs` and no `if:`, this job is skipped when
# docker-e2e-tests fails -- confirm whether cleanup should also run then.
docker-e2e-cleanup:
  runs-on: [self-hosted, modal]
  timeout-minutes: 90
  needs: [docker-e2e-tests]

  strategy:
    fail-fast: false
    matrix:
      include:
        - cuda: 124
          cuda_version: 12.4.1
          python_version: "3.11"
          pytorch: 2.6.0
          num_gpus: 1
          axolotl_extras: vllm

  steps:
    - name: Checkout
      uses: actions/checkout@v4

    - name: Install Python
      uses: actions/setup-python@v5
      with:
        python-version: "3.11"

    - name: Install Modal
      run: |
        python -m pip install --upgrade pip
        pip install modal==0.71.8 jinja2

    # These variables are read by cicd/single_gpu.py when it renders the
    # Dockerfile template. The matrix above defines no `axolotl_args`, so
    # AXOLOTL_ARGS is exported as an explicit empty value rather than through
    # an undefined matrix property (which expands to the same empty string).
    - name: Update env vars
      run: |
        echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> "$GITHUB_ENV"
        echo "PYTORCH_VERSION=${{ matrix.pytorch }}" >> "$GITHUB_ENV"
        echo "AXOLOTL_ARGS=" >> "$GITHUB_ENV"
        echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras }}" >> "$GITHUB_ENV"
        echo "CUDA=${{ matrix.cuda }}" >> "$GITHUB_ENV"
        echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> "$GITHUB_ENV"
        echo "N_GPUS=${{ matrix.num_gpus }}" >> "$GITHUB_ENV"
        echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> "$GITHUB_ENV"

    - name: Run cleanup job on Modal
      run: |
        modal run cicd.cleanup

0
cicd/__init__.py Normal file
View File

View File

@@ -18,7 +18,7 @@ pytest -v --durations=10 \
--cov-append --cov-append
# Run patched tests excluding lora kernels with coverage append # Run patched tests excluding lora kernels with coverage append
pytest -v --durations=10 \ pytest --full-trace -vvv --durations=10 \
--ignore=tests/e2e/patched/lora_kernels \ --ignore=tests/e2e/patched/lora_kernels \
/workspace/axolotl/tests/e2e/patched \ /workspace/axolotl/tests/e2e/patched \
--cov=axolotl \ --cov=axolotl \

19
cicd/cleanup.py Normal file
View File

@@ -0,0 +1,19 @@
"""Modal app to run axolotl GPU cleanup"""
from .single_gpu import VOLUME_CONFIG, app, cicd_image, run_cmd
@app.function(
    volumes=VOLUME_CONFIG,
    memory=128 * 1024,  # 131072
    cpu=8.0,
    timeout=3600,  # 60 * 60
    image=cicd_image,
)
def cleanup():
    """Invoke ``./cicd/cleanup.sh`` from ``/workspace/axolotl`` via ``run_cmd``."""
    script, workdir = "./cicd/cleanup.sh", "/workspace/axolotl"
    run_cmd(script, workdir)
@app.local_entrypoint()
def main():
    """Local entrypoint of the app: dispatch ``cleanup`` through ``.remote()``."""
    cleanup.remote()

6
cicd/cleanup.sh Executable file
View File

@@ -0,0 +1,6 @@
#!/bin/bash
# Delete stale dataset-processing leftovers from the Hugging Face cache.
#
# Usage: cleanup.sh [CACHE_DIR]
#   CACHE_DIR  directory to sweep; defaults to the datasets cache used by CI.
set -euo pipefail

cache_dir="${1:-/workspace/data/huggingface-cache/hub/datasets}"

# A cache that was never populated is not an error: there is nothing to clean,
# and letting `find` fail here would abort the whole job under `set -e`.
if [ ! -d "$cache_dir" ]; then
    echo "cleanup: ${cache_dir} does not exist, nothing to do"
    exit 0
fi

# cleanup old cache files for datasets processing and intermediate mappings
#   -mtime +1  matches files whose age in whole 24-hour periods exceeds 1
#   rm -f      tolerates files that disappear between the match and the delete
#   {} +       batches paths into as few rm invocations as possible
find "$cache_dir" \
    \( -name "cache-*" -o -name "*.lock" \) \
    -type f -mtime +1 -exec rm -f {} +

View File

@@ -1,69 +1,6 @@
"""Modal app to run axolotl GPU tests""" """Modal app to run axolotl GPU tests"""
# pylint: disable=duplicate-code from .single_gpu import GPU_CONFIG, VOLUME_CONFIG, app, cicd_image, run_cmd
import os
import pathlib
import tempfile
import jinja2
import modal
from jinja2 import select_autoescape
from modal import App, Image
cicd_path = pathlib.Path(__file__).parent.resolve()
template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
template_env = jinja2.Environment(
loader=template_loader, autoescape=select_autoescape()
)
df_template = template_env.get_template("Dockerfile.jinja")
df_args = {
"AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
"AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
"PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.4.1"),
"BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu121-2.4.1"),
"CUDA": os.environ.get("CUDA", "121"),
"GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
"GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
"NIGHTLY_BUILD": os.environ.get("NIGHTLY_BUILD", ""),
"CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""),
"HF_HOME": "/workspace/data/huggingface-cache/hub",
}
dockerfile_contents = df_template.render(**df_args)
temp_dir = tempfile.mkdtemp()
with open(pathlib.Path(temp_dir) / "Dockerfile", "w", encoding="utf-8") as f:
f.write(dockerfile_contents)
cicd_image = Image.from_dockerfile(
pathlib.Path(temp_dir) / "Dockerfile",
context_mount=None,
force_build=True,
gpu="A10G",
).env(df_args)
app = App("Axolotl CI/CD", secrets=[])
hf_cache_volume = modal.Volume.from_name(
"axolotl-ci-hf-hub-cache", create_if_missing=True
)
VOLUME_CONFIG = {
"/workspace/data/huggingface-cache/hub": hf_cache_volume,
}
N_GPUS = int(os.environ.get("N_GPUS", 1))
GPU_CONFIG = modal.gpu.L40S(count=N_GPUS)
def run_cmd(cmd: str, run_folder: str):
import subprocess # nosec
# Propagate errors from subprocess.
if exit_code := subprocess.call(cmd.split(), cwd=run_folder): # nosec
exit(exit_code) # pylint: disable=consider-using-sys-exit
@app.function( @app.function(

66
cicd/single_gpu.py Normal file
View File

@@ -0,0 +1,66 @@
"""Modal app to run axolotl GPU tests"""
# pylint: disable=duplicate-code
import os
import pathlib
import tempfile
import jinja2
import modal
from jinja2 import select_autoescape
from modal import App, Image
# Directory that holds this module; the Jinja loader searches it for templates.
cicd_path = pathlib.Path(__file__).parent.resolve()

template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
template_env = jinja2.Environment(
    loader=template_loader, autoescape=select_autoescape()
)
df_template = template_env.get_template("Dockerfile.jinja")

# Values handed to the Dockerfile template. Every entry except HF_HOME is read
# from the process environment and falls back to the default shown here.
df_args = {
    "AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
    "AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.4.1"),
    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu121-2.4.1"),
    "CUDA": os.environ.get("CUDA", "121"),
    "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
    "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
    "NIGHTLY_BUILD": os.environ.get("NIGHTLY_BUILD", ""),
    "CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""),
    "HF_HOME": "/workspace/data/huggingface-cache/hub",
}

dockerfile_contents = df_template.render(**df_args)

# The rendered Dockerfile is written into a freshly created temp directory.
# NOTE(review): nothing in this module removes that directory afterwards.
temp_dir = tempfile.mkdtemp()
with open(pathlib.Path(temp_dir) / "Dockerfile", "w", encoding="utf-8") as f:
    f.write(dockerfile_contents)

# Image defined by the rendered Dockerfile; the same df_args mapping is then
# passed to .env() (presumably exporting them inside the image -- confirm
# against the Modal version in use).
cicd_image = Image.from_dockerfile(
    pathlib.Path(temp_dir) / "Dockerfile",
    context_mount=None,
    force_build=True,
    gpu="A10G",
).env(df_args)

app = App("Axolotl CI/CD", secrets=[])

hf_cache_volume = modal.Volume.from_name(
    "axolotl-ci-hf-hub-cache", create_if_missing=True
)
# Keyed by the same path that df_args uses for HF_HOME.
VOLUME_CONFIG = {
    "/workspace/data/huggingface-cache/hub": hf_cache_volume,
}

# GPU count comes from the N_GPUS environment variable and defaults to 1.
N_GPUS = int(os.environ.get("N_GPUS", 1))
GPU_CONFIG = modal.gpu.L40S(count=N_GPUS)
def run_cmd(cmd: str, run_folder: str):
    """Run ``cmd`` in ``run_folder`` and exit the process if the command fails.

    Args:
        cmd: Command line to execute. It is tokenised with ``shlex.split`` so
            quoted arguments stay in one piece; no shell is involved.
        run_folder: Working directory for the child process.

    Raises:
        SystemExit: Carrying the child's exit code when that code is non-zero.
    """
    import shlex
    import subprocess  # nosec
    import sys

    # Propagate errors from subprocess.
    if exit_code := subprocess.call(shlex.split(cmd), cwd=run_folder):  # nosec
        # sys.exit is always available; the bare exit() helper is injected by
        # the site module and is meant for interactive sessions.
        sys.exit(exit_code)

View File

@@ -612,6 +612,7 @@ lr_div_factor: # Learning rate div factor
# - optimi_adamw # - optimi_adamw
# - ao_adamw_8bit # - ao_adamw_8bit
# - ao_adamw_fp8 # - ao_adamw_fp8
# - came_pytorch
optimizer: optimizer:
# Dictionary of arguments to pass to the optimizer # Dictionary of arguments to pass to the optimizer
optim_args: optim_args:

View File

@@ -59,7 +59,9 @@ gradient_checkpointing: false
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
sdp_attention:
flash_optimum:
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:

View File

@@ -39,7 +39,8 @@ tf32: true
gradient_checkpointing: true gradient_checkpointing: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: xformers xformers_attention: true
flash_attention:
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_steps: 10 warmup_steps: 10

View File

@@ -45,8 +45,7 @@ tf32: false
gradient_checkpointing: true gradient_checkpointing: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4

View File

@@ -46,8 +46,7 @@ tf32: false
gradient_checkpointing: true gradient_checkpointing: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4

View File

@@ -45,8 +45,7 @@ tf32: false
gradient_checkpointing: true gradient_checkpointing: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4

View File

@@ -46,8 +46,7 @@ tf32: false
gradient_checkpointing: true gradient_checkpointing: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4

View File

@@ -45,8 +45,7 @@ tf32: false
gradient_checkpointing: true gradient_checkpointing: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4

View File

@@ -46,8 +46,7 @@ tf32: false
gradient_checkpointing: true gradient_checkpointing: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4

View File

@@ -49,8 +49,7 @@ tf32: true
gradient_checkpointing: true gradient_checkpointing: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
warmup_ratio: 0.1 warmup_ratio: 0.1
evals_per_epoch: evals_per_epoch:

View File

@@ -112,7 +112,9 @@
"early_stopping_patience:\n", "early_stopping_patience:\n",
"resume_from_checkpoint:\n", "resume_from_checkpoint:\n",
"logging_steps: 1\n", "logging_steps: 1\n",
"attention: sdpa\n", "xformers_attention:\n",
"flash_attention: false\n",
"sdp_attention: true\n",
"\n", "\n",
"warmup_steps: 1\n", "warmup_steps: 1\n",
"max_steps: 25\n", "max_steps: 25\n",

View File

@@ -52,8 +52,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false use_reentrant: false
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: evals_per_epoch:

View File

@@ -55,8 +55,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false use_reentrant: false
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: evals_per_epoch:

View File

@@ -39,8 +39,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false use_reentrant: false
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: evals_per_epoch:

View File

@@ -35,8 +35,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false use_reentrant: false
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
warmup_steps: 100 warmup_steps: 100
evals_per_epoch: 2 evals_per_epoch: 2

View File

@@ -59,8 +59,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false use_reentrant: false
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
warmup_steps: 100 warmup_steps: 100
evals_per_epoch: 2 evals_per_epoch: 2

View File

@@ -43,7 +43,8 @@ tf32: true
gradient_checkpointing: true gradient_checkpointing: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: xformers xformers_attention: true
flash_attention:
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_steps: 40 warmup_steps: 40

View File

@@ -73,7 +73,8 @@ early_stopping_patience: 3
resume_from_checkpoint: resume_from_checkpoint:
auto_resume_from_checkpoints: true auto_resume_from_checkpoints: true
logging_steps: 1 logging_steps: 1
attention: xformers xformers_attention: true
flash_attention:
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_steps: 10 warmup_steps: 10

View File

@@ -40,7 +40,8 @@ tf32: true
gradient_checkpointing: true gradient_checkpointing: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: xformers xformers_attention: true
flash_attention:
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_steps: 40 warmup_steps: 40

View File

@@ -47,8 +47,7 @@ tf32: false
gradient_checkpointing: true gradient_checkpointing: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
warmup_ratio: 0.1 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4

View File

@@ -53,8 +53,7 @@ tf32: true
gradient_checkpointing: true gradient_checkpointing: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
warmup_ratio: 0.1 warmup_ratio: 0.1
evals_per_epoch: evals_per_epoch:

View File

@@ -43,8 +43,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false use_reentrant: false
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
warmup_ratio: 0.1 warmup_ratio: 0.1
evals_per_epoch: evals_per_epoch:

View File

@@ -57,8 +57,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false use_reentrant: false
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
warmup_ratio: 0.1 warmup_ratio: 0.1
evals_per_epoch: evals_per_epoch:

View File

@@ -51,7 +51,8 @@ gradient_checkpointing: true
gradient_checkpointing_kwargs: gradient_checkpointing_kwargs:
use_reentrant: false use_reentrant: false
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
eager_attention:
warmup_ratio: 0.1 warmup_ratio: 0.1
evals_per_epoch: 1 evals_per_epoch: 1

View File

@@ -53,7 +53,8 @@ gradient_checkpointing: true
gradient_checkpointing_kwargs: gradient_checkpointing_kwargs:
use_reentrant: false use_reentrant: false
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
eager_attention:
warmup_ratio: 0.1 warmup_ratio: 0.1
evals_per_epoch: 1 evals_per_epoch: 1

View File

@@ -36,7 +36,8 @@ tf32: true
gradient_checkpointing: true gradient_checkpointing: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: xformers xformers_attention: true
flash_attention:
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_steps: 10 warmup_steps: 10

View File

@@ -47,8 +47,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false use_reentrant: false
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: evals_per_epoch:

View File

@@ -46,8 +46,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false use_reentrant: false
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: evals_per_epoch:

View File

@@ -45,8 +45,7 @@ gradient_checkpointing: true
gradient_checkpointing_kwargs: gradient_checkpointing_kwargs:
use_reentrant: true use_reentrant: true
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: 1 evals_per_epoch: 1

View File

@@ -37,7 +37,8 @@ bf16: auto
tf32: true tf32: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 5 logging_steps: 5
attention: xformers xformers_attention: true
flash_attention:
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_steps: 20 warmup_steps: 20

View File

@@ -42,8 +42,7 @@ tf32: false
gradient_checkpointing: true gradient_checkpointing: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
flash_attn_cross_entropy: false flash_attn_cross_entropy: false
flash_attn_rms_norm: true flash_attn_rms_norm: true
flash_attn_fuse_qkv: false flash_attn_fuse_qkv: false

View File

@@ -53,7 +53,9 @@ tf32: true
gradient_checkpointing: true gradient_checkpointing: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention:
sdp_attention:
flash_optimum:
warmup_steps: 100 warmup_steps: 100
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1

View File

@@ -46,8 +46,7 @@ tf32: false
gradient_checkpointing: true gradient_checkpointing: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
flash_attn_cross_entropy: false flash_attn_cross_entropy: false
flash_attn_rms_norm: true flash_attn_rms_norm: true
flash_attn_fuse_qkv: false flash_attn_fuse_qkv: false

View File

@@ -45,8 +45,7 @@ tf32: false
gradient_checkpointing: true gradient_checkpointing: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4

View File

@@ -45,8 +45,7 @@ tf32: false
gradient_checkpointing: true gradient_checkpointing: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4

View File

@@ -48,8 +48,7 @@ gradient_checkpointing_kwargs:
use_reentrant: true use_reentrant: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4

View File

@@ -46,8 +46,7 @@ tf32: false
gradient_checkpointing: true gradient_checkpointing: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4

View File

@@ -48,8 +48,7 @@ tf32: false
gradient_checkpointing: true gradient_checkpointing: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4

View File

@@ -50,7 +50,8 @@ tf32: true
gradient_checkpointing: true gradient_checkpointing: true
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
eager_attention:
warmup_ratio: 0.1 warmup_ratio: 0.1
evals_per_epoch: 1 evals_per_epoch: 1

View File

@@ -49,8 +49,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false use_reentrant: false
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
warmup_steps: 100 warmup_steps: 100
evals_per_epoch: 2 evals_per_epoch: 2

View File

@@ -34,8 +34,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false use_reentrant: false
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
warmup_steps: 100 warmup_steps: 100
evals_per_epoch: 2 evals_per_epoch: 2

View File

@@ -61,8 +61,7 @@ tf32: false
gradient_checkpointing: true gradient_checkpointing: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4

View File

@@ -56,8 +56,7 @@ tf32: false
gradient_checkpointing: true gradient_checkpointing: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4

View File

@@ -77,8 +77,7 @@ tf32: false
gradient_checkpointing: true gradient_checkpointing: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4

View File

@@ -53,8 +53,7 @@ tf32: false
gradient_checkpointing: true gradient_checkpointing: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4

View File

@@ -54,8 +54,7 @@ tf32: false
gradient_checkpointing: true gradient_checkpointing: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
loss_watchdog_threshold: 5.0 loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3 loss_watchdog_patience: 3

View File

@@ -48,8 +48,7 @@ tf32: false
gradient_checkpointing: true gradient_checkpointing: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
loss_watchdog_threshold: 5.0 loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3 loss_watchdog_patience: 3

View File

@@ -55,8 +55,7 @@ tf32: false
gradient_checkpointing: true gradient_checkpointing: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4

View File

@@ -48,8 +48,7 @@ tf32: false
gradient_checkpointing: true gradient_checkpointing: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
loss_watchdog_threshold: 5.0 loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3 loss_watchdog_patience: 3

View File

@@ -49,8 +49,7 @@ tf32: false
gradient_checkpointing: true gradient_checkpointing: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4

View File

@@ -53,8 +53,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false use_reentrant: false
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
warmup_steps: 20 warmup_steps: 20
evals_per_epoch: 4 evals_per_epoch: 4

View File

@@ -51,8 +51,7 @@ tf32: false
gradient_checkpointing: true gradient_checkpointing: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
loss_watchdog_threshold: 5.0 loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3 loss_watchdog_patience: 3

View File

@@ -39,8 +39,7 @@ gradient_checkpointing: true
gradient_checkpointing_kwargs: gradient_checkpointing_kwargs:
use_reentrant: true use_reentrant: true
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4

View File

@@ -48,8 +48,7 @@ gradient_checkpointing_kwargs:
use_reentrant: true use_reentrant: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4

View File

@@ -46,8 +46,7 @@ tf32: false
gradient_checkpointing: true gradient_checkpointing: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4

View File

@@ -34,3 +34,5 @@ We provide a script to delinearize Llama 4 linearized models into regular Huggin
```bash ```bash
axolotl delinearize-llama4 --model path/to/model_dir --output path/to/output_dir axolotl delinearize-llama4 --model path/to/model_dir --output path/to/output_dir
``` ```
Note: This only works with the non-quantized linearized model. If you have an adapter, merge it with the *non-quantized linearized* model before delinearizing.

View File

@@ -46,7 +46,8 @@ tf32: true
gradient_checkpointing: true gradient_checkpointing: true
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
eager_attention:
warmup_ratio: 0.1 warmup_ratio: 0.1
evals_per_epoch: 1 evals_per_epoch: 1

View File

@@ -39,7 +39,7 @@ tf32: true
gradient_checkpointing: false gradient_checkpointing: false
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: eager flash_attention:
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4

View File

@@ -42,8 +42,7 @@ tf32: false
gradient_checkpointing: true gradient_checkpointing: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
save_total_limit: 1 save_total_limit: 1
save_steps: save_steps:

View File

@@ -36,8 +36,7 @@ tf32: false
gradient_checkpointing: true gradient_checkpointing: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4

View File

@@ -53,7 +53,8 @@ tf32: true
gradient_checkpointing: true gradient_checkpointing: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: sdpa flash_attention: false
sdp_attention: true
loss_watchdog_threshold: 5.0 loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3 loss_watchdog_patience: 3

View File

@@ -54,8 +54,7 @@ tf32: false
gradient_checkpointing: true gradient_checkpointing: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
loss_watchdog_threshold: 5.0 loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3 loss_watchdog_patience: 3

View File

@@ -71,7 +71,7 @@ tf32: false
gradient_checkpointing: true gradient_checkpointing: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: eager flash_attention: false
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4

View File

@@ -51,8 +51,7 @@ tf32: false
gradient_checkpointing: true gradient_checkpointing: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
loss_watchdog_threshold: 5.0 loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3 loss_watchdog_patience: 3

View File

@@ -59,8 +59,7 @@ tf32: false
gradient_checkpointing: true gradient_checkpointing: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
loss_watchdog_threshold: 5.0 loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3 loss_watchdog_patience: 3

View File

@@ -48,7 +48,9 @@ tf32: true
gradient_checkpointing: true gradient_checkpointing: true
logging_steps: 1 logging_steps: 1
attention: eager # PixtralVisionModel does not support Flash Attention 2.0 yet. flash_attention: false # PixtralVisionModel does not support Flash Attention 2.0 yet.
eager_attention:
warmup_ratio: 0.1 warmup_ratio: 0.1
evals_per_epoch: 1 evals_per_epoch: 1
saves_per_epoch: 1 saves_per_epoch: 1

View File

@@ -49,8 +49,7 @@ tf32: true
gradient_checkpointing: true gradient_checkpointing: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
loss_watchdog_threshold: 5.0 loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3 loss_watchdog_patience: 3

View File

@@ -51,8 +51,7 @@ tf32: true
gradient_checkpointing: true gradient_checkpointing: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
loss_watchdog_threshold: 5.0 loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3 loss_watchdog_patience: 3

View File

@@ -69,8 +69,7 @@ tf32: false
gradient_checkpointing: true gradient_checkpointing: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
loss_watchdog_threshold: 5.0 loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3 loss_watchdog_patience: 3

View File

@@ -40,8 +40,7 @@ tf32: false
gradient_checkpointing: true gradient_checkpointing: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
save_total_limit: 1 save_total_limit: 1
save_steps: save_steps:

View File

@@ -54,8 +54,7 @@ tf32: false
gradient_checkpointing: true gradient_checkpointing: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
loss_watchdog_threshold: 5.0 loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3 loss_watchdog_patience: 3

View File

@@ -39,7 +39,7 @@ bf16: auto
tf32: true tf32: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 5 logging_steps: 5
attention: eager flash_attention:
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_steps: 20 warmup_steps: 20

View File

@@ -39,8 +39,7 @@ tf32: false
gradient_checkpointing: true gradient_checkpointing: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_steps: 20 warmup_steps: 20

View File

@@ -47,8 +47,7 @@ tf32: false
gradient_checkpointing: true gradient_checkpointing: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_steps: 20 warmup_steps: 20

View File

@@ -40,8 +40,7 @@ tf32: false
gradient_checkpointing: true gradient_checkpointing: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_steps: 20 warmup_steps: 20

View File

@@ -48,8 +48,7 @@ gradient_checkpointing_kwargs:
use_reentrant: True use_reentrant: True
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
warmup_steps: 100 warmup_steps: 100
evals_per_epoch: 4 evals_per_epoch: 4

View File

@@ -51,8 +51,7 @@ gradient_checkpointing_kwargs:
use_reentrant: True use_reentrant: True
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
warmup_steps: 100 warmup_steps: 100
evals_per_epoch: 4 evals_per_epoch: 4

View File

@@ -48,8 +48,7 @@ gradient_checkpointing_kwargs:
use_reentrant: True use_reentrant: True
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
warmup_steps: 100 warmup_steps: 100
evals_per_epoch: 4 evals_per_epoch: 4

View File

@@ -49,8 +49,7 @@ gradient_checkpointing_kwargs:
use_reentrant: true use_reentrant: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
warmup_steps: 100 warmup_steps: 100
evals_per_epoch: 4 evals_per_epoch: 4

View File

@@ -44,8 +44,7 @@ gradient_checkpointing_kwargs:
use_reentrant: True use_reentrant: True
early_stopping_patience: 3 early_stopping_patience: 3
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
eval_steps: 1000 eval_steps: 1000
save_steps: 5000 save_steps: 5000

View File

@@ -46,7 +46,8 @@ tf32: true
gradient_checkpointing: true gradient_checkpointing: true
logging_steps: 1 logging_steps: 1
attention: eager # PixtralVisionModel does not support Flash Attention 2.0 yet flash_attention: false # PixtralVisionModel does not support Flash Attention 2.0 yet
eager_attention:
warmup_ratio: 0.1 warmup_ratio: 0.1
evals_per_epoch: 1 evals_per_epoch: 1

View File

@@ -47,7 +47,7 @@ tf32: false
gradient_checkpointing: false gradient_checkpointing: false
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: eager flash_attention:
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4

View File

@@ -47,7 +47,7 @@ tf32: false
gradient_checkpointing: false gradient_checkpointing: false
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention:
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4

View File

@@ -43,8 +43,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false use_reentrant: false
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4

View File

@@ -46,8 +46,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false use_reentrant: false
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4

View File

@@ -46,7 +46,8 @@ tf32: true
gradient_checkpointing: true gradient_checkpointing: true
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
eager_attention:
warmup_ratio: 0.1 warmup_ratio: 0.1
evals_per_epoch: 1 evals_per_epoch: 1

View File

@@ -49,8 +49,7 @@ tf32: false
gradient_checkpointing: true gradient_checkpointing: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4

View File

@@ -47,8 +47,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false use_reentrant: false
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
warmup_ratio: 0.1 warmup_ratio: 0.1
evals_per_epoch: evals_per_epoch:

View File

@@ -47,8 +47,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false use_reentrant: false
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4

View File

@@ -43,8 +43,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false use_reentrant: false
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: flash flash_attention: true
warmup_ratio: 0.1 warmup_ratio: 0.1
evals_per_epoch: evals_per_epoch:

View File

@@ -40,7 +40,7 @@ bf16: auto
tf32: true tf32: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 5 logging_steps: 5
attention: flash flash_attention:
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_steps: 20 warmup_steps: 20

View File

@@ -38,7 +38,7 @@ tf32: true
gradient_checkpointing: gradient_checkpointing:
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
attention: eager flash_attention:
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_steps: 20 warmup_steps: 20

Some files were not shown because too many files have changed in this diff Show More