Compare commits

4 commits: flex_patch ... destroy-pg

| Author | SHA1 | Date |
|---|---|---|
|  | 1defb8a955 |  |
|  | 70b466aa67 |  |
|  | 32ce167404 |  |
|  | 1c4cc639f5 |  |
14  .github/workflows/base.yml (vendored)
@@ -40,24 +40,12 @@ jobs:
python_version: "3.11"
pytorch: 2.6.0
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
- cuda: "126"
cuda_version: 12.6.3
cudnn_version: ""
python_version: "3.11"
pytorch: 2.6.0
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
- cuda: "128"
cuda_version: 12.8.1
cudnn_version: ""
python_version: "3.11"
pytorch: nightly
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
- cuda: "128"
cuda_version: 12.8.1
cudnn_version: ""
python_version: "3.11"
pytorch: next
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
steps:
- name: Checkout
uses: actions/checkout@v4

@@ -79,7 +67,7 @@ jobs:
uses: docker/build-push-action@v4
with:
context: .
file: ${{ matrix.pytorch == 'nightly' && './docker/Dockerfile-base-nightly' || matrix.pytorch == 'next' && './docker/Dockerfile-base-next' || './docker/Dockerfile-base' }}
file: ${{ matrix.pytorch == 'nightly' && './docker/Dockerfile-base-nightly' || './docker/Dockerfile-base' }}
push: ${{ github.event_name != 'pull_request' }}
tags: ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
labels: ${{ steps.metadata.outputs.labels }}
4  .github/workflows/main.yml (vendored)
@@ -25,12 +25,12 @@ jobs:
python_version: "3.11"
pytorch: 2.5.1
axolotl_extras: vllm
is_latest: true
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.6.0
axolotl_extras:
is_latest: true
runs-on: axolotl-gpu-runner
steps:
- name: Checkout

@@ -87,12 +87,12 @@ jobs:
python_version: "3.11"
pytorch: 2.5.1
axolotl_extras:
is_latest: true
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.6.0
axolotl_extras:
is_latest: true
runs-on: axolotl-gpu-runner
steps:
- name: Checkout
15  .github/workflows/multi-gpu-e2e.yml (vendored)
@@ -24,13 +24,6 @@ jobs:
fail-fast: false
matrix:
include:
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.6.0
axolotl_extras: vllm
num_gpus: 2
nightly_build: "true"
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"

@@ -45,6 +38,14 @@ jobs:
axolotl_extras: vllm
num_gpus: 2
nightly_build: "true"
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.6.0
# awaiting vllm#12721
axolotl_extras:
num_gpus: 2
nightly_build: "true"
runs-on: [self-hosted, modal]
timeout-minutes: 120
steps:
23  .github/workflows/tests-nightly.yml (vendored)
@@ -33,15 +33,6 @@ jobs:
- name: Check out repository code
uses: actions/checkout@v4

- name: Restore HF cache
id: hf-cache-restore
uses: actions/cache/restore@v4
with:
path: |
/home/runner/.cache/huggingface/hub/datasets--*
/home/runner/.cache/huggingface/hub/models--*
key: ${{ runner.os }}-hf-hub-cache-v2

- name: Setup Python
uses: actions/setup-python@v5
with:

@@ -55,7 +46,7 @@ jobs:

- name: Install PyTorch
run: |
pip3 install torch==${{ matrix.pytorch_version }}
pip3 install torch==${{ matrix.pytorch_version }} --index-url https://download.pytorch.org/whl/cpu

- name: Update requirements.txt
run: |

@@ -67,7 +58,8 @@ jobs:

- name: Install dependencies
run: |
pip3 show torch
pip3 install --upgrade pip
pip3 install --upgrade packaging==23.2
pip3 install --no-build-isolation -U -e .
python scripts/unsloth_install.py | sh
python scripts/cutcrossentropy_install.py | sh

@@ -81,15 +73,10 @@ jobs:
run: |
axolotl --help

- name: Pre-Download dataset fixture
run: |
huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures

- name: Run tests
run: |
pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/
pytest -v tests/patched/
pytest -v tests/cli/
pytest -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ tests/
pytest tests/patched/

- name: cleanup pip cache
run: |
10  .github/workflows/tests.yml (vendored)
@@ -96,10 +96,6 @@ jobs:
run: |
axolotl --help

- name: Pre-Download dataset fixture
run: |
huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures

- name: Run tests
run: |
pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/

@@ -211,7 +207,7 @@ jobs:
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.6.0
pytorch: 2.5.1
num_gpus: 1
axolotl_extras: vllm
steps:

@@ -258,9 +254,9 @@ jobs:
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.5.1
pytorch: 2.6.0
num_gpus: 1
axolotl_extras: vllm
axolotl_extras:
steps:
- name: Checkout
uses: actions/checkout@v4
@@ -40,7 +40,6 @@ quartodoc:
- cli.preprocess
- cli.sweeps
- cli.utils
- cli.vllm_serve
- cli.cloud.base
- cli.cloud.modal_
- title: Trainers

@@ -231,7 +230,6 @@ website:
- docs/reward_modelling.qmd
- docs/lr_groups.qmd
- docs/lora_optims.qmd
- docs/dataset_loading.qmd

- section: "Core Concepts"
contents:
@@ -68,7 +68,7 @@ def run_cmd(cmd: str, run_folder: str):
@app.function(
image=cicd_image,
gpu=GPU_CONFIG,
timeout=90 * 60,
timeout=60 * 60,
cpu=8.0,
memory=131072 * N_GPUS,
volumes=VOLUME_CONFIG,
@@ -2,5 +2,4 @@
set -e

# only run one test at a time so as not to OOM the GPU
pytest -v --durations=10 -n2 /workspace/axolotl/tests/e2e/multigpu/ --ignore=/workspace/axolotl/tests/e2e/multigpu/solo/
pytest -v --durations=10 -n1 /workspace/axolotl/tests/e2e/multigpu/solo/
pytest -v -n2 /workspace/axolotl/tests/e2e/multigpu/
@@ -20,9 +20,9 @@ WORKDIR /workspace/axolotl

# If AXOLOTL_EXTRAS is set, append it in brackets
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
pip install --no-build-isolation -e .[deepspeed,flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
else \
pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray] $AXOLOTL_ARGS; \
pip install --no-build-isolation -e .[deepspeed,flash-attn,optimizers,ray] $AXOLOTL_ARGS; \
fi

RUN python scripts/unsloth_install.py | sh
@@ -29,7 +29,7 @@ ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
WORKDIR /workspace

RUN python3 -m pip install --upgrade pip && pip3 install -U packaging==23.2 setuptools==75.8.0 wheel && \
python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} torchvision --extra-index-url https://download.pytorch.org/whl/cu$CUDA && \
python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} --extra-index-url https://download.pytorch.org/whl/cu$CUDA && \
python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \
python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main"
@@ -1,38 +0,0 @@
ARG CUDA_VERSION="12.8.1"
ARG CUDNN_VERSION="8"
ARG UBUNTU_VERSION="22.04"
ARG MAX_JOBS=4

FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION AS base-builder

ENV PATH="/root/miniconda3/bin:${PATH}"

ARG PYTHON_VERSION="3.11"
ARG PYTORCH_VERSION="next"
ARG CUDA="128"
ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"

ENV PYTHON_VERSION=$PYTHON_VERSION
ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST

RUN apt-get update \
&& apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev pkg-config && rm -rf /var/lib/apt/lists/* \
&& wget \
https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
&& mkdir /root/.conda \
&& bash Miniconda3-latest-Linux-x86_64.sh -b \
&& rm -f Miniconda3-latest-Linux-x86_64.sh \
&& conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}"

ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"

WORKDIR /workspace

RUN python3 -m pip install --upgrade pip && pip3 install packaging && \
python3 -m pip install --no-cache-dir -U torch==2.7.0 --extra-index-url https://download.pytorch.org/whl/test/cu$CUDA && \
python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \
python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main"

RUN git lfs install --skip-repo && \
pip3 install awscli && \
pip3 install -U --no-cache-dir pydantic==2.10.6
40  docs/cli.qmd
@@ -170,7 +170,7 @@ axolotl merge-sharded-fsdp-weights config.yml

### evaluate

Evaluates a model's performance (loss etc) on the train and eval datasets.
Evaluates a model's performance using metrics specified in the config.

```bash
# Basic evaluation

@@ -197,8 +197,6 @@ lm_eval_batch_size: # Batch size for evaluation
output_dir: # Directory to save evaluation results
```

See [LM Eval Harness](https://github.com/EleutherAI/lm-evaluation-harness) for more details.

## Legacy CLI Usage

While the new Click-based CLI is preferred, Axolotl still supports the legacy module-based CLI:

@@ -237,7 +235,7 @@ Create a cloud config YAML with your Modal settings:
```yaml
# cloud_config.yml
provider: modal
gpu: a100 # Supported: l40s, a100-40gb, a100-80gb, a10g, h100, t4, l4
gpu: a100 # Supported: l40s, a100-40gb, a100-80gb, a10g, h100, t4, l4
gpu_count: 1 # Number of GPUs to use
timeout: 86400 # Maximum runtime in seconds (24 hours)
branch: main # Git branch to use (optional)

@@ -250,7 +248,7 @@ volumes: # Persistent storage volumes
- name: axolotl-artifacts
mount: /workspace/artifacts

secrets: # Secrets to inject
env: # Environment variables
- WANDB_API_KEY
- HF_TOKEN
```

@@ -276,27 +274,15 @@ axolotl lm-eval config.yml --cloud cloud_config.yml
### Cloud Configuration Options

```yaml
provider: # compute provider, currently only `modal` is supported
gpu: # GPU type to use
gpu_count: # Number of GPUs (default: 1)
memory: # RAM in GB (default: 128)
timeout: # Maximum runtime in seconds
provider: # compute provider, currently only `modal` is supported
gpu: # GPU type to use
gpu_count: # Number of GPUs (default: 1)
memory: # RAM in GB (default: 128)
timeout: # Maximum runtime in seconds
timeout_preprocess: # Preprocessing timeout
branch: # Git branch to use
docker_tag: # Custom Docker image tag
volumes: # List of persistent storage volumes

# Environment variables to pass. Can be specified in two ways:
# 1. As a string: Will load the value from the host computer's environment variables
# 2. As a key-value pair: Will use the specified value directly
# Example:
# env:
# - CUSTOM_VAR # Loads from host's $CUSTOM_VAR
# - {CUSTOM_VAR: "value"} # Uses "value" directly
env:

# Secrets to inject. Same input format as `env` but for sensitive data.
secrets:
# - HF_TOKEN
# - WANDB_API_KEY
branch: # Git branch to use
docker_tag: # Custom Docker image tag
volumes: # List of persistent storage volumes
env: # Environment variables to pass
secrets: # Secrets to inject
```
@@ -109,7 +109,7 @@ datasets:
preprocess_shards: # Optional[int] process dataset in N sequential chunks for memory efficiency (exclusive with `shards`)

name: # Optional[str] name of dataset configuration to load
split: train # Optional[str] name of dataset split to load from
train_on_split: train # Optional[str] name of dataset split to load from
revision: # Optional[str] The specific revision of the dataset to use when loading from the Hugging Face Hub. This can be a commit hash, tag, or branch name. If not specified, the latest version will be used. This parameter is ignored for local datasets.
trust_remote_code: # Optional[bool] Trust remote code for untrusted source

@@ -165,9 +165,7 @@ datasets:
content: value
# ...

# Optional[Dict[str, List]]. Roles mapping in the messages.
# The format is {target_role: [source_roles]}. All source roles will be mapped to the target role.
# The default is:
# Optional[Dict[str, List]]. Roles mapping in the messages. The default is:
roles:
user: ["human", "user"]
assistant: ["gpt", "assistant"]

@@ -240,10 +238,10 @@ simpo_gamma: 0.5 # Target reward margin for the SimPO loss
# grpo
trl:
use_vllm: # Optional[bool]. Whether to use VLLM for RL training.
vllm_server_host: # Optional[str]. Host of the vLLM server to connect to.
vllm_server_port: # Optional[int]. Port of the vLLM server to connect to.
vllm_server_timeout: # Optional[int]. Total timeout (in seconds) to wait for the vLLM server to respond.
vllm_guided_decoding_regex: # Optional[str]. Regex for vLLM guided decoding.
vllm_device: # Optional[str]. Device to use for VLLM.
vllm_gpu_memory_utilization: # Optional[float]. GPU memory utilization for VLLM.
vllm_max_model_len: # Optional[int]. Maximum length of the model for VLLM.
vllm_dtype: # Optional[str]. Data type for VLLM.

beta: # Optional[float]. Beta parameter for the RL training. Same as `rl_beta`. Use
max_completion_length: # Optional[int]. Maximum length of the completion for RL training.

@@ -322,13 +320,9 @@ total_num_tokens:
sample_packing_group_size: 100000
# The number of samples which can be packed into one sequence. Increase if using a large sequence_len with many short samples.
sample_packing_bin_size: 200
sample_pack_sequentially: # Optional[bool]. Whether to pack samples sequentially.

# whether to concatenate samples during pretraining
pretraining_sample_concatenation:

curriculum_sampling: # Optional[bool]. Whether to use sequential sampling for curriculum learning

# Use batch flattening for speedups when not using sample_packing
batch_flattening:

@@ -360,27 +354,7 @@ lora_target_modules:
# - down_proj
# - up_proj
lora_target_linear: # If true, will target all linear modules

# List[int] | int. # The layer indices to transform, otherwise, apply to all layers
# https://huggingface.co/docs/peft/v0.15.0/en/package_reference/lora#peft.LoraConfig.layers_to_transform
peft_layers_to_transform:

# Optional[bool]. Whether to use DoRA.
# https://huggingface.co/docs/peft/v0.15.0/en/developer_guides/lora#weight-decomposed-low-rank-adaptation-dora
peft_use_dora:

# Optional[bool]. Whether to use RSLoRA.
# https://huggingface.co/docs/peft/v0.15.0/en/developer_guides/lora#rank-stabilized-lora
peft_use_rslora:

# Optional[list[tuple[int, int]]]. List of layer indices to replicate.
# https://huggingface.co/docs/peft/v0.15.0/en/developer_guides/lora#memory-efficient-layer-replication-with-lora
peft_layer_replication:

# bool | Literal["gaussian", "eva", "olora", "pissa", "pissa_niter_[number of iters]", "corda", "loftq"]
# How to initialize LoRA weights. Default to True which is MS original implementation.
# https://huggingface.co/docs/peft/v0.15.0/en/developer_guides/lora#initialization
peft_init_lora_weights:
peft_layers_to_transform: # The layer indices to transform, otherwise, apply to all layers

# If you added new tokens to the tokenizer, you may need to save some LoRA modules because they need to know the new tokens.
# For LLaMA and Mistral, you need to save `embed_tokens` and `lm_head`. It may vary for other models.

@@ -512,8 +486,7 @@ train_on_inputs: false
# Note that training loss may have an oscillating pattern with this enabled.
group_by_length: false

# Whether to use gradient checkpointing. Available options are: true, false, "offload".
# https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing
# Whether to use gradient checkpointing https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing
gradient_checkpointing: false
# additional kwargs to pass to the trainer for gradient checkpointing
# gradient_checkpointing_kwargs:

@@ -614,31 +587,26 @@ max_grad_norm:
# currently only supported on Llama and Mistral
neftune_noise_alpha:

# Optional[bool]. Whether to bettertransformers
# Whether to bettertransformers
flash_optimum:

# Note: Only one of the following attention patches can be used at a time.
# For example, if you set `xformers_attention` to `true`, do not set `flash_attention` to `true`.

# Optional[bool]. Whether to use xformers attention patch https://github.com/facebookresearch/xformers:
# Whether to use xformers attention patch https://github.com/facebookresearch/xformers:
xformers_attention:
# Optional[bool]. Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention:
# Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention:
flash_attention:
flash_attn_cross_entropy: # Optional[bool]. Whether to use flash-attention cross entropy implementation - advanced use only
flash_attn_rms_norm: # Optional[bool]. Whether to use flash-attention rms norm implementation - advanced use only
flash_attn_fuse_qkv: # Optional[bool]. Whether to fuse QKV into a single operation
flash_attn_fuse_mlp: # Optional[bool]. Whether to fuse part of the MLP into a single operation
# Optional[bool]. Whether to use scaled-dot-product attention
flash_attn_cross_entropy: # Whether to use flash-attention cross entropy implementation - advanced use only
flash_attn_rms_norm: # Whether to use flash-attention rms norm implementation - advanced use only
flash_attn_fuse_qkv: # Whether to fuse QKV into a single operation
flash_attn_fuse_mlp: # Whether to fuse part of the MLP into a single operation
# Whether to use scaled-dot-product attention
# https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
sdp_attention:
# Optional[bool]. Shifted-sparse attention (only llama) - https://arxiv.org/pdf/2309.12307.pdf
# Shifted-sparse attention (only llama) - https://arxiv.org/pdf/2309.12307.pdf
s2_attention:

# Optional[bool]. Whether to use low_cpu_mem_usage
low_cpu_mem_usage:
# Optional[str]. Resume from a specific checkpoint dir
# Resume from a specific checkpoint dir
resume_from_checkpoint:
# Optional[bool]. If resume_from_checkpoint isn't set and you simply want it to start where it left off.
# If resume_from_checkpoint isn't set and you simply want it to start where it left off.
# Be careful with this being turned on between different models.
auto_resume_from_checkpoints: false
@@ -13,13 +13,6 @@ As there are a lot of available options in Axolotl, this guide aims to provide a

Axolotl supports 3 kinds of training methods: pre-training, supervised fine-tuning, and preference-based post-training (e.g. DPO, ORPO, PRMs). Each method has their own dataset format which are described below.

::: {.callout-tip}

This guide will mainly use JSONL as an introduction. Please refer to the [dataset loading docs](../dataset_loading.qmd) to understand how to load datasets from other sources.

For `pretraining_dataset:` specifically, please refer to the [Pre-training section](#pre-training).
:::

## Pre-training

When aiming to train on large corpora of text datasets, pre-training is your go-to choice. Due to the size of these datasets, downloading the entire-datasets before beginning training would be prohibitively time-consuming. Axolotl supports [streaming](https://huggingface.co/docs/datasets/en/stream) to only load batches into memory at a time.
@@ -1,276 +0,0 @@
|
||||
---
|
||||
title: Dataset Loading
|
||||
description: Understanding how to load datasets from different sources
|
||||
back-to-top-navigation: true
|
||||
toc: true
|
||||
toc-depth: 5
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
Datasets can be loaded in a number of different ways depending on the how it is saved (the extension of the file) and where it is stored.
|
||||
|
||||
## Loading Datasets
|
||||
|
||||
We use the `datasets` library to load datasets and a mix of `load_dataset` and `load_from_disk` to load them.
|
||||
|
||||
You may recognize the similar named configs between `load_dataset` and the `datasets` section of the config file.
|
||||
|
||||
```yaml
|
||||
datasets:
|
||||
- path:
|
||||
name:
|
||||
data_files:
|
||||
split:
|
||||
revision:
|
||||
trust_remote_code:
|
||||
```
|
||||
|
||||
::: {.callout-tip}
|
||||
|
||||
Do not feel overwhelmed by the number of options here. A lot of them are optional. In fact, the most common config to use would be `path` and sometimes `data_files`.
|
||||
|
||||
:::
|
||||
|
||||
This matches the API of [`datasets.load_dataset`](https://github.com/huggingface/datasets/blob/0b5998ac62f08e358f8dcc17ec6e2f2a5e9450b6/src/datasets/load.py#L1838-L1858), so if you're familiar with that, you will feel right at home.
|
||||
|
||||
For HuggingFace's guide to load different dataset types, see [here](https://huggingface.co/docs/datasets/loading).
|
||||
|
||||
For full details on the config, see [config.qmd](config.qmd).
|
||||
|
||||
::: {.callout-note}
|
||||
|
||||
You can set multiple datasets in the config file by more than one entry under `datasets`.
|
||||
|
||||
```yaml
|
||||
datasets:
|
||||
- path: /path/to/your/dataset
|
||||
- path: /path/to/your/other/dataset
|
||||
```
|
||||
|
||||
:::
|
||||
|
||||
### Local dataset
|
||||
|
||||
#### Files
|
||||
|
||||
Usually, to load a JSON file, you would do something like this:
|
||||
|
||||
```python
|
||||
from datasets import load_dataset
|
||||
|
||||
dataset = load_dataset("json", data_files="data.json")
|
||||
```
|
||||
|
||||
Which translates to the following config:
|
||||
|
||||
```yaml
|
||||
datasets:
|
||||
- path: json
|
||||
data_files: /path/to/your/file.jsonl
|
||||
```
|
||||
|
||||
However, to make things easier, we have added a few shortcuts for loading local dataset files.
|
||||
|
||||
You can just point the `path` to the file or directory along with the `ds_type` to load the dataset. The below example shows for a JSON file:
|
||||
|
||||
```yaml
|
||||
datasets:
|
||||
- path: /path/to/your/file.jsonl
|
||||
ds_type: json
|
||||
```
|
||||
|
||||
This works for CSV, JSON, Parquet, and Arrow files.
|
||||
|
||||
::: {.callout-tip}
|
||||
|
||||
If `path` points to a file and `ds_type` is not specified, we will automatically infer the dataset type from the file extension, so you could omit `ds_type` if you'd like.
|
||||
|
||||
:::
|
||||
|
||||
#### Directory
|
||||
|
||||
If you're loading a directory, you can point the `path` to the directory.
|
||||
|
||||
Then, you have two options:
|
||||
|
||||
##### Loading entire directory
|
||||
|
||||
You do not need any additional configs.
|
||||
|
||||
We will attempt to load in the following order:
|
||||
- datasets saved with `datasets.save_to_disk`
|
||||
- loading entire directory of files (such as with parquet/arrow files)
|
||||
|
||||
```yaml
|
||||
datasets:
|
||||
- path: /path/to/your/directory
|
||||
```
|
||||
|
||||
##### Loading specific files in directory
|
||||
|
||||
Provide `data_files` with a list of files to load.
|
||||
|
||||
```yaml
|
||||
datasets:
|
||||
# single file
|
||||
- path: /path/to/your/directory
|
||||
ds_type: csv
|
||||
data_files: file1.csv
|
||||
|
||||
# multiple files
|
||||
- path: /path/to/your/directory
|
||||
ds_type: json
|
||||
data_files:
|
||||
- file1.jsonl
|
||||
- file2.jsonl
|
||||
|
||||
# multiple files for parquet
|
||||
- path: /path/to/your/directory
|
||||
ds_type: parquet
|
||||
data_files:
|
||||
- file1.parquet
|
||||
- file2.parquet
|
||||
|
||||
```
|
||||
|
||||
### HuggingFace Hub
|
||||
|
||||
The method you use to load the dataset depends on how the dataset was created, whether a folder was uploaded directly or a HuggingFace Dataset was pushed.
|
||||
|
||||
::: {.callout-note}
|
||||
|
||||
If you're using a private dataset, you will need to enable the `hf_use_auth_token` flag in the root-level of the config file.
|
||||
|
||||
:::
|
||||
|
||||
#### Folder uploaded
|
||||
|
||||
This would mean that the dataset is a single file or file(s) uploaded to the Hub.
|
||||
|
||||
```yaml
|
||||
datasets:
|
||||
- path: org/dataset-name
|
||||
data_files:
|
||||
- file1.jsonl
|
||||
- file2.jsonl
|
||||
```
|
||||
|
||||
#### HuggingFace Dataset
|
||||
|
||||
This means that the dataset is created as a HuggingFace Dataset and pushed to the Hub via `datasets.push_to_hub`.
|
||||
|
||||
```yaml
|
||||
datasets:
|
||||
- path: org/dataset-name
|
||||
```
|
||||
|
||||
::: {.callout-note}
|
||||
|
||||
There are some other configs which may be required like `name`, `split`, `revision`, `trust_remote_code`, etc depending on the dataset.
|
||||
|
||||
:::
|
||||
|
||||
### Remote Filesystems
|
||||
|
||||
Via the `storage_options` config under `load_dataset`, you can load datasets from remote filesystems like S3, GCS, Azure, and OCI.
|
||||
|
||||
::: {.callout-warning}
|
||||
|
||||
This is currently experimental. Please let us know if you run into any issues!
|
||||
|
||||
:::
|
||||
|
||||
The only difference between the providers is that you need to prepend the path with the respective protocols.
|
||||
|
||||
```yaml
|
||||
datasets:
|
||||
# Single file
|
||||
- path: s3://bucket-name/path/to/your/file.jsonl
|
||||
|
||||
# Directory
|
||||
- path: s3://bucket-name/path/to/your/directory
|
||||
```
|
||||
|
||||
For directory, we load via `load_from_disk`.
|
||||
|
||||
#### S3
|
||||
|
||||
Prepend the path with `s3://`.
|
||||
|
||||
The credentials are pulled in the following order:
|
||||
|
||||
- `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, and `AWS_SESSION_TOKEN` environment variables
|
||||
- from the `~/.aws/credentials` file
|
||||
- for nodes on EC2, the IAM metadata provider
|
||||
|
||||
::: {.callout-note}
|
||||
|
||||
We assume you have credentials setup and not using anonymous access. If you want to use anonymous access, let us know! We may have to open a config option for this.
|
||||
|
||||
:::
|
||||
|
||||
Other environment variables that can be set can be found in [boto3 docs](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html#using-environment-variables)
|
||||
|
||||
#### GCS
|
||||
|
||||
Prepend the path with `gs://` or `gcs://`.
|
||||
|
||||
The credentials are loaded in the following order:
|
||||
|
||||
- gcloud credentials
|
||||
- for nodes on GCP, the google metadata service
|
||||
- anonymous access
|
||||
|
||||
#### Azure
|
||||
|
||||
##### Gen 1
|
||||
|
||||
Prepend the path with `adl://`.
|
||||
|
||||
Ensure you have the following environment variables set:
|
||||
|
||||
- `AZURE_STORAGE_TENANT_ID`
|
||||
- `AZURE_STORAGE_CLIENT_ID`
|
||||
- `AZURE_STORAGE_CLIENT_SECRET`
|
||||
|
||||
##### Gen 2
|
||||
|
||||
Prepend the path with `abfs://` or `az://`.
|
||||
|
||||
Ensure you have the following environment variables set:
|
||||
|
||||
- `AZURE_STORAGE_ACCOUNT_NAME`
|
||||
- `AZURE_STORAGE_ACCOUNT_KEY`
|
||||
|
||||
Other environment variables that can be set can be found in [adlfs docs](https://github.com/fsspec/adlfs?tab=readme-ov-file#setting-credentials)
|
||||
|
||||
#### OCI
|
||||
|
||||
Prepend the path with `oci://`.
|
||||
|
||||
It would attempt to read in the following order:
|
||||
|
||||
- `OCIFS_IAM_TYPE`, `OCIFS_CONFIG_LOCATION`, and `OCIFS_CONFIG_PROFILE` environment variables
|
||||
- when on OCI resource, resource principal
|
||||
|
||||
Other environment variables:
|
||||
|
||||
- `OCI_REGION_METADATA`
|
||||
|
||||
Please see the [ocifs docs](https://ocifs.readthedocs.io/en/latest/getting-connected.html#Using-Environment-Variables).
|
||||
|
||||
### HTTPS
|
||||
|
||||
The path should start with `https://`.
|
||||
|
||||
```yaml
|
||||
datasets:
|
||||
- path: https://path/to/your/dataset/file.jsonl
|
||||
```
|
||||
|
||||
This must be publically accessible.
|
||||
|
||||
## Next steps
|
||||
|
||||
Now that you know how to load datasets, you can learn more on how to load your specific dataset format into your target output format [dataset formats docs](dataset-formats).
|
||||
12  docs/faq.qmd
@@ -35,22 +35,12 @@ description: Frequently asked questions

**Q: How to call Axolotl via custom python scripts?**

> A: Since Axolotl is just Python, please see `src/axolotl/cli/main.py` on how each command is called.
> A: Yes, since Axolotl is just Python, please see `src/axolotl/cli/main.py` on how each command is called.
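A minimal sketch of one way to drive the CLI from a custom script, assuming `axolotl` is installed on PATH and a `config.yml` exists; the internal entry points in `src/axolotl/cli/main.py` can also be imported directly, but they are internals and their signatures may change:

```python
# Hedged sketch: invoke the Axolotl CLI from a custom Python script.
# Assumes the `axolotl` console entry point is installed and that
# `config.yml` is a valid training config in the working directory.
import subprocess

subprocess.run(["axolotl", "train", "config.yml"], check=True)
```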
**Q: How to know the value to use for `fsdp_transformer_layer_cls_to_wrap`?**

> A: This is the class name of the transformer layer to wrap with FSDP. For example, for `LlamaForCausalLM`, the value is `LlamaDecoderLayer`. To find this for a specific model, check the model's `PreTrainedModel` definition and look for `_no_split_modules` variable in the `modeling_<model_name>.py` file within `transformers` library.
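A minimal sketch of looking this up programmatically, assuming `transformers` is installed; `LlamaForCausalLM` is only an illustrative model class:

```python
# Hedged sketch: read the class attribute that FSDP layer wrapping is keyed on.
# _no_split_modules is defined on the PreTrainedModel subclass, so no model
# weights need to be downloaded to inspect it.
from transformers import LlamaForCausalLM

print(LlamaForCausalLM._no_split_modules)  # expected: ['LlamaDecoderLayer']
```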
**Q: ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as pad_token**

> A: This is because the tokenizer does not have a padding token. Please add a padding token to the tokenizer via:

> ```yaml
> special_tokens:
>   # str. If you're not sure, set to same as `eos_token`.
>   pad_token: "..."
> ```

### Chat templates

**Q: `jinja2.exceptions.UndefinedError: 'dict object' has no attribute 'content' / 'role' / ____`**
@@ -17,7 +17,6 @@ We currently support several common model architectures, including (but not limi
- `qwen2`
- `gemma`
- `gemma2`
- `gemma3`

<details>
@@ -9,7 +9,6 @@ format:
## Supported Models

- [Mllama](#sec-mllama)
- [Llama4](#sec-llama4)
- [Pixtral](#sec-pixtral)
- [Llava-1.5](#sec-llava-15)
- [Mistral-Small-3.1](#sec-mistral-small-31)

@@ -64,14 +63,6 @@ base_model: meta-llama/Llama-3.2-11B-Vision-Instruct
chat_template: llama3_2_vision
```

### Llama4 {#sec-llama4}

```yaml
base_model: meta-llama/Llama-4-Scout-17B-16E-Instruct

chat_template: llama4
```

### Pixtral {#sec-pixtral}

```yaml
@@ -502,48 +502,9 @@ The input format is a simple JSON input with customizable fields based on the ab
Check out our [GRPO cookbook](https://github.com/axolotl-ai-cloud/axolotl-cookbook/tree/main/grpo#training-an-r1-style-large-language-model-using-grpo).
:::

If you have multiple GPUs available, we reccomend using `vLLM` with the `GRPOTrainer` to significantly speedup trajectory generation during training.
First, launch a `vLLM` server using `trl vllm-serve` - you may use a config file or CLI overrides to configure your vLLM server. In this example, we're
using 4 GPUs - 2 for training, and 2 for vLLM:

::: {.callout-important}
Make sure you've installed the correct version of vLLM by including it as an extra when installing axolotl, e.g. `pip install axolotl[vllm]`.
:::

```yaml
base_model: Qwen/Qwen2.5-1.5B-Instruct

vllm:
host: 0.0.0.0
port: 8000
tensor_parallel_size: 2
gpu_memory_utilization: 0.85
dtype: auto
# max_model_len: # you may find it useful to set the vLLM model context length if you know this beforehand

rl: grpo
trl:
use_vllm: true
vllm_server_host: 0.0.0.0
vllm_server_port: 8000
vllm_server_timeout: 300
```

```bash
CUDA_VISIBLE_DEVICES=2,3 axolotl vllm_serve grpo.yaml
```

Your `vLLM` instance will now attempt to spin up, and it's time to kick off training utilizing our remaining two GPUs. In another terminal, execute:

```bash
CUDA_VISIBLE_DEVICES=0,1 axolotl train grpo.yaml --num-processes 2
```

#### Reward functions

GRPO uses custom reward functions and transformations. Please have them ready locally.

For example, to load OpenAI's GSM8K and use a random reward for completions:
For ex, to load OpenAI's GSM8K and use a random reward for completions:

```python
# rewards.py

@@ -569,6 +530,8 @@ trl:
beta: 0.001
max_completion_length: 256
use_vllm: True
vllm_device: auto
vllm_gpu_memory_utilization: 0.15
num_generations: 4
reward_funcs: ["rewards.rand_reward_func"] # format: '{file_name}.{fn_name}'
reward_weights: [1.0]
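A hedged sketch of a reward function matching the `reward_funcs: ["rewards.rand_reward_func"]` entry above, assuming TRL's convention that a reward callable receives the batch's `completions` and returns one float per completion (the actual `rewards.py` contents are not shown in this hunk):

```python
# rewards.py (hypothetical sketch, not the file referenced by the diff)
import random


def rand_reward_func(completions, **kwargs):
    """Return a random reward in [0, 1) for each completion."""
    return [random.random() for _ in completions]
```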
@@ -8,6 +8,9 @@ tokenizer_type: GPT2Tokenizer
|
||||
trust_remote_code: true
|
||||
tokenizer_use_fast: true
|
||||
tokenizer_legacy: true
|
||||
|
||||
load_in_8bit: false
|
||||
load_in_4bit: false
|
||||
strict: false
|
||||
push_dataset_to_hub:
|
||||
hf_use_auth_token: true
|
||||
@@ -31,6 +34,7 @@ lora_alpha:
|
||||
lora_dropout:
|
||||
lora_target_modules:
|
||||
lora_target_linear:
|
||||
lora_fan_in_fan_out:
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
@@ -54,12 +58,16 @@ learning_rate: 0.000085
|
||||
train_on_inputs: true
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: true
|
||||
|
||||
gradient_checkpointing: false
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
sdp_attention:
|
||||
flash_optimum:
|
||||
@@ -72,6 +80,8 @@ evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
save_total_limit:
|
||||
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.1
|
||||
special_tokens:
|
||||
pad_token: "<|endoftext|>"
|
||||
|
||||
@@ -22,6 +22,7 @@ lora_target_modules:
|
||||
- c_attn
|
||||
- c_proj
|
||||
lora_target_linear:
|
||||
lora_fan_in_fan_out:
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
@@ -35,10 +36,15 @@ optimizer: paged_adamw_8bit
|
||||
torchdistx_path:
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: true
|
||||
gradient_checkpointing: true
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention: true
|
||||
flash_attention:
|
||||
@@ -47,6 +53,10 @@ gptq_model_v1:
|
||||
warmup_steps: 10
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.1
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
pad_token: "<|endoftext|>"
|
||||
|
||||
@@ -26,6 +26,7 @@ lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_linear: true
|
||||
lora_fan_in_fan_out:
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
@@ -40,18 +41,29 @@ optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
s2_attention:
|
||||
|
||||
warmup_steps: 10
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
bos_token: "<s>"
|
||||
eos_token: "</s>"
|
||||
|
||||
@@ -26,7 +26,9 @@ pad_to_sequence_len: true
|
||||
lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_modules:
|
||||
lora_target_linear: true
|
||||
lora_fan_in_fan_out:
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
@@ -41,18 +43,28 @@ optimizer: paged_adamw_32bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 10
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
bos_token: "<s>"
|
||||
eos_token: "</s>"
|
||||
|
||||
@@ -26,6 +26,7 @@ lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_linear: true
|
||||
lora_fan_in_fan_out:
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
@@ -40,18 +41,29 @@ optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
s2_attention:
|
||||
|
||||
warmup_steps: 10
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
bos_token: "<s>"
|
||||
eos_token: "</s>"
|
||||
|
||||
@@ -26,7 +26,9 @@ pad_to_sequence_len: true
|
||||
lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_modules:
|
||||
lora_target_linear: true
|
||||
lora_fan_in_fan_out:
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
@@ -41,18 +43,28 @@ optimizer: paged_adamw_32bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 10
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
bos_token: "<s>"
|
||||
eos_token: "</s>"
|
||||
|
||||
@@ -26,6 +26,7 @@ lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_linear: true
|
||||
lora_fan_in_fan_out:
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
@@ -40,18 +41,29 @@ optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
s2_attention:
|
||||
|
||||
warmup_steps: 10
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
bos_token: "<s>"
|
||||
eos_token: "</s>"
|
||||
|
||||
@@ -26,7 +26,9 @@ pad_to_sequence_len: true
|
||||
lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_modules:
|
||||
lora_target_linear: true
|
||||
lora_fan_in_fan_out:
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
@@ -41,18 +43,28 @@ optimizer: paged_adamw_32bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 10
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
bos_token: "<s>"
|
||||
eos_token: "</s>"
|
||||
|
||||
@@ -44,16 +44,28 @@ optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: true
|
||||
|
||||
gradient_checkpointing: true
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch:
|
||||
eval_table_size:
|
||||
eval_max_new_tokens: 128
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
|
||||
@@ -3,6 +3,9 @@ base_model: LnL-AI/dbrx-base-converted-v2
|
||||
# hub_model_id: username/custom_model_name
|
||||
|
||||
trust_remote_code: true
|
||||
|
||||
load_in_8bit: false
|
||||
load_in_4bit: false
|
||||
strict: false
|
||||
|
||||
datasets:
|
||||
@@ -45,20 +48,26 @@ optimizer: paged_adamw_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: false # don't use with fsdp_activation_checkpointing
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 10
|
||||
evals_per_epoch:
|
||||
saves_per_epoch: 1
|
||||
|
||||
debug:
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
- full_shard
|
||||
|
||||
@@ -48,20 +48,26 @@ optimizer: paged_adamw_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: false # don't use with fsdp_activation_checkpointing
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 10
|
||||
evals_per_epoch:
|
||||
saves_per_epoch: 1
|
||||
|
||||
debug:
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
- full_shard
|
||||
|
||||
@@ -3,6 +3,9 @@ base_model: LnL-AI/dbrx-base-converted-v2
|
||||
# hub_model_id: username/custom_model_name
|
||||
|
||||
trust_remote_code: true
|
||||
|
||||
load_in_8bit: false
|
||||
load_in_4bit: false
|
||||
strict: false
|
||||
|
||||
datasets:
|
||||
@@ -32,19 +35,25 @@ optimizer: paged_adamw_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 10
|
||||
evals_per_epoch:
|
||||
saves_per_epoch: 1
|
||||
|
||||
debug:
|
||||
weight_decay: 0.0
|
||||
deepspeed: deepspeed_configs/zero3_bf16.json
|
||||
|
||||
@@ -2,6 +2,9 @@ base_model: deepseek-ai/DeepSeek-V2-Lite
|
||||
# Automatically upload checkpoint and final model to HF
|
||||
# hub_model_id: username/custom_model_name
|
||||
trust_remote_code: true
|
||||
|
||||
load_in_8bit: false
|
||||
load_in_4bit: false
|
||||
strict: false
|
||||
|
||||
datasets:
|
||||
@@ -28,19 +31,27 @@ optimizer: adamw_torch_fused
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 2e-5
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 100
|
||||
evals_per_epoch: 2
|
||||
eval_table_size:
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.0
|
||||
special_tokens:
|
||||
fsdp:
|
||||
|
||||
@@ -52,19 +52,27 @@ optimizer: adamw_torch_fused
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 2e-5
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 100
|
||||
evals_per_epoch: 2
|
||||
eval_table_size:
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.0
|
||||
special_tokens:
|
||||
fsdp:
|
||||
|
||||
@@ -25,7 +25,9 @@ max_packed_sequence_len:
|
||||
lora_r: 16
|
||||
lora_alpha: 32
|
||||
lora_dropout: 0.0
|
||||
lora_target_modules:
|
||||
lora_target_linear: true
|
||||
lora_fan_in_fan_out:
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
@@ -39,10 +41,15 @@ optimizer: adamw_bnb_8bit
|
||||
torchdistx_path:
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.00003
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: true
|
||||
gradient_checkpointing: true
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention: true
|
||||
flash_attention:
|
||||
@@ -51,7 +58,11 @@ gptq_model_v1:
|
||||
warmup_steps: 40
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
pad_token: "<|endoftext|>"
|
||||
bos_token: "<|endoftext|>"
|
||||
|
||||
@@ -38,7 +38,9 @@ lora_alpha: 16
|
||||
# 0.05 for 33B and 65B models
|
||||
lora_dropout: 0.05
|
||||
# add LoRA modules on all linear layers of the base model
|
||||
lora_target_modules:
|
||||
lora_target_linear: true
|
||||
lora_fan_in_fan_out:
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
@@ -65,7 +67,10 @@ lr_scheduler: cosine
|
||||
# - 2e-4 for 7b & 13b
|
||||
# - 1e-4 for 33b & 64b
|
||||
learning_rate: 0.0002
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: true
|
||||
gradient_checkpointing: true
|
||||
# stop training after this many evaluation losses have increased in a row
|
||||
@@ -73,6 +78,7 @@ gradient_checkpointing: true
|
||||
early_stopping_patience: 3
|
||||
resume_from_checkpoint:
|
||||
auto_resume_from_checkpoints: true
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention: true
|
||||
flash_attention:
|
||||
@@ -81,7 +87,11 @@ gptq_model_v1:
|
||||
warmup_steps: 10
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.000001
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
pad_token: "<|endoftext|>"
|
||||
bos_token: "<|endoftext|>"
|
||||
|
||||
@@ -7,6 +7,9 @@ tokenizer_type: AutoTokenizer
|
||||
|
||||
# required by falcon custom model code: https://huggingface.co/tiiuae/falcon-7b/tree/main
|
||||
trust_remote_code: true
|
||||
|
||||
load_in_8bit: false
|
||||
load_in_4bit: false
|
||||
gptq: false
|
||||
strict: false
|
||||
push_dataset_to_hub:
|
||||
@@ -22,7 +25,9 @@ max_packed_sequence_len:
|
||||
lora_r: 64
|
||||
lora_alpha: 32
|
||||
lora_dropout: 0.0
|
||||
lora_target_modules:
|
||||
lora_target_linear: true
|
||||
lora_fan_in_fan_out:
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
@@ -36,10 +41,15 @@ optimizer: adamw_bnb_8bit
|
||||
torchdistx_path:
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.00003
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: true
|
||||
gradient_checkpointing: true
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention: true
|
||||
flash_attention:
|
||||
@@ -48,7 +58,11 @@ gptq_model_v1:
|
||||
warmup_steps: 40
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
pad_token: "<|endoftext|>"
|
||||
bos_token: "<|endoftext|>"
|
||||
|
||||
@@ -42,16 +42,28 @@ optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 4
|
||||
eval_table_size:
|
||||
eval_max_new_tokens: 128
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
|
||||
@@ -48,16 +48,28 @@ optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: true
|
||||
|
||||
gradient_checkpointing: true
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch:
|
||||
eval_table_size:
|
||||
eval_max_new_tokens: 128
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
|
||||
@@ -5,6 +5,9 @@ num_labels: 1
|
||||
tokenizer_type: AutoTokenizer
|
||||
# Automatically upload checkpoint and final model to HF
|
||||
# hub_model_id: username/custom_model_name
|
||||
|
||||
load_in_8bit: false
|
||||
load_in_4bit: false
|
||||
strict: false
|
||||
|
||||
reward_model: true
|
||||
@@ -35,6 +38,8 @@ optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: true
|
||||
fp16:
|
||||
tf32: true
|
||||
@@ -42,12 +47,21 @@ tf32: true
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch:
|
||||
eval_table_size:
|
||||
eval_max_new_tokens: 128
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
|
||||
@@ -5,9 +5,6 @@ tokenizer_type: AutoTokenizer
|
||||
# Automatically upload checkpoint and final model to HF
|
||||
# hub_model_id: username/custom_model_name
|
||||
|
||||
# gemma3 doesn't seem to play nice with ddp
|
||||
ddp_find_unused_parameters: true
|
||||
|
||||
load_in_8bit: false
|
||||
load_in_4bit: true
|
||||
strict: false
|
||||
@@ -50,18 +47,28 @@ optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: true
|
||||
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch:
|
||||
eval_table_size:
|
||||
eval_max_new_tokens: 128
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
|
||||
@@ -2,16 +2,11 @@ base_model: google/gemma-3-4b-it
processor_type: AutoProcessor
strict: false

load_in_4bit: true

# these 3 lines are needed for now to handle vision chat templates w images
skip_prepare_dataset: true
remove_unused_columns: false
sample_packing: false

# gemma3 doesn't seem to play nice with ddp
ddp_find_unused_parameters: true

chat_template: gemma3
datasets:
  - path: HuggingFaceH4/llava-instruct-mix-vsft
@@ -22,7 +17,7 @@ dataset_prepared_path: last_run_prepared
val_set_size: 0.01
output_dir: ./outputs/out

adapter: qlora
adapter: lora
lora_model_dir:

sequence_len: 2048
@@ -46,13 +41,14 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
bf16: true
fp16:
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
local_rank:
logging_steps: 1
flash_attention: true
eager_attention:
@@ -60,4 +56,8 @@ eager_attention:
warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
@@ -1,61 +0,0 @@
base_model: google/gemma-3-4b-it
strict: false

load_in_4bit: true

# gemma3 doesn't seem to play nice with ddp
ddp_find_unused_parameters: true

chat_template: gemma3
datasets:
  - path: cgato/SlimOrcaDedupCleaned
    type: chat_template
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value

dataset_prepared_path: last_run_prepared
val_set_size: 0.01
output_dir: ./outputs/out

adapter: qlora
lora_model_dir:

sequence_len: 2048
sample_packing: true
pad_to_sequence_len: true

lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: true
fp16:
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
logging_steps: 1
flash_attention: true
eager_attention:

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0
@@ -18,7 +18,9 @@ max_packed_sequence_len:
lora_r: 8
lora_alpha: 32
lora_dropout: 0.05
lora_target_modules:
lora_target_linear: true
lora_fan_in_fan_out:
wandb_project:
wandb_entity:
wandb_watch:
@@ -32,10 +34,15 @@ optimizer: paged_adamw_8bit
torchdistx_path:
lr_scheduler: cosine
learning_rate: 0.0001
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: true
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention: true
flash_attention:
@@ -44,6 +51,10 @@ gptq_model_v1:
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.1
fsdp:
fsdp_config:
special_tokens:
  pad_token: "<|endoftext|>"

@@ -40,18 +40,26 @@ optimizer: paged_adamw_8bit
lr_scheduler: cosine
learning_rate: 0.00001

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true

warmup_steps: 10
evals_per_epoch:
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
special_tokens:

@@ -39,20 +39,26 @@ optimizer: paged_adamw_8bit
lr_scheduler: cosine
learning_rate: 0.00001

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true

warmup_steps: 10
evals_per_epoch:
saves_per_epoch: 1

debug:
deepspeed: deepspeed_configs/zero2.json
weight_decay: 0.0
special_tokens:

@@ -39,6 +39,8 @@ optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 0.00001

train_on_inputs: false
group_by_length: false
bf16: true
tf32: true

@@ -33,9 +33,13 @@ optimizer: adamw_bnb_8bit
torchdistx_path:
lr_scheduler: cosine
learning_rate: 0.00003
train_on_inputs: false
group_by_length: false
bf16: auto
tf32: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 5
xformers_attention: true
flash_attention:
@@ -44,7 +48,11 @@ gptq_model_v1:
warmup_steps: 20
evals_per_epoch: 4
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.1
fsdp:
fsdp_config:
tokens:
  bos_token: "<s>"
  eos_token: "</s>"

@@ -4,6 +4,9 @@ model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false
strict: false

datasets:
@@ -23,6 +26,7 @@ lora_r:
lora_alpha:
lora_dropout:
lora_target_linear:
lora_fan_in_fan_out:

wandb_project:
wandb_entity:
@@ -37,12 +41,18 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false

gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
flash_attn_cross_entropy: false
flash_attn_rms_norm: true
@@ -51,8 +61,11 @@ flash_attn_fuse_mlp: true

warmup_steps: 100
evals_per_epoch: 4
eval_table_size:
saves_per_epoch: 1

debug:
deepspeed: #deepspeed_configs/zero2.json # multi-gpu only
weight_decay: 0.1
fsdp:
fsdp_config:
special_tokens:

@@ -10,6 +10,8 @@ gptq_disable_exllama: true

tokenizer_use_fast: true
tokenizer_legacy: true
load_in_8bit: false
load_in_4bit: false
strict: false
push_dataset_to_hub:
hf_use_auth_token: true
@@ -31,6 +33,7 @@ lora_target_modules:
  - q_proj
  - v_proj
lora_target_linear:
lora_fan_in_fan_out:
wandb_project:
wandb_watch:
wandb_name:
@@ -47,19 +50,26 @@ torchdistx_path:
lr_scheduler: cosine
lr_quadratic_warmup: true
learning_rate: 0.000017
train_on_inputs: false
group_by_length: false
bf16: false
fp16: false
float16: true
tf32: true
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention:
sdp_attention:
flash_optimum:
warmup_steps: 100
evals_per_epoch: 4
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.1
special_tokens:
  bos_token: "<s>"

@@ -4,6 +4,9 @@ model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false
strict: false

datasets:
@@ -23,6 +26,7 @@ lora_r:
lora_alpha:
lora_dropout:
lora_target_linear:
lora_fan_in_fan_out:

lisa_n_layers: 4
lisa_step_interval: 20
@@ -41,12 +45,18 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 5e-5 # recommendation from lisa paper for 7b

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false

gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
flash_attn_cross_entropy: false
flash_attn_rms_norm: true
@@ -55,8 +65,13 @@ flash_attn_fuse_mlp: true

warmup_steps: 100
evals_per_epoch: 4
eval_table_size:
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.1
fsdp:
fsdp_config:
special_tokens:
  bos_token: "<s>"
  eos_token: "</s>"

@@ -4,6 +4,9 @@ model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false
strict: false

datasets:
@@ -23,6 +26,7 @@ lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:
peft:
  loftq_config:
    loftq_bits: 4
@@ -40,16 +44,29 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false

gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
s2_attention:

warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:

@@ -26,6 +26,7 @@ lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:

wandb_project:
wandb_entity:
@@ -40,16 +41,29 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false

gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
s2_attention:

warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:

@@ -26,7 +26,9 @@ pad_to_sequence_len: true
|
||||
lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_modules:
|
||||
lora_target_linear: true
|
||||
lora_fan_in_fan_out:
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
@@ -41,19 +43,28 @@ optimizer: adamw_torch_fused
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.00001
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: true
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 10
|
||||
evals_per_epoch: 4
|
||||
eval_table_size:
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
- full_shard
|
||||
|
||||
@@ -26,7 +26,9 @@ pad_to_sequence_len: true
|
||||
lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_modules:
|
||||
lora_target_linear: true
|
||||
lora_fan_in_fan_out:
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
@@ -41,16 +43,27 @@ optimizer: paged_adamw_32bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 10
|
||||
evals_per_epoch: 4
|
||||
eval_table_size:
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
|
||||
@@ -24,7 +24,9 @@ pad_to_sequence_len: true
|
||||
lora_r: 8
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_modules:
|
||||
lora_target_linear: true
|
||||
lora_fan_in_fan_out:
|
||||
|
||||
relora_steps: 150
|
||||
relora_warmup_steps: 10
|
||||
@@ -43,18 +45,28 @@ optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 10
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
bos_token: "<s>"
|
||||
eos_token: "</s>"
|
||||
|
||||
@@ -45,11 +45,14 @@ optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: true
|
||||
fp16:
|
||||
tf32: true
|
||||
|
||||
gradient_checkpointing: true
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
eager_attention:
|
||||
@@ -57,4 +60,8 @@ eager_attention:
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 1
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
|
||||
@@ -42,19 +42,27 @@ optimizer: adamw_torch_fused
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 2e-5
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 100
|
||||
evals_per_epoch: 2
|
||||
eval_table_size:
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
- full_shard
|
||||
|
||||
@@ -1,6 +1,9 @@
|
||||
base_model: NousResearch/Meta-Llama-3.1-8B
|
||||
# Automatically upload checkpoint and final model to HF
|
||||
# hub_model_id: username/custom_model_name
|
||||
|
||||
load_in_8bit: false
|
||||
load_in_4bit: false
|
||||
strict: false
|
||||
|
||||
datasets:
|
||||
@@ -27,19 +30,29 @@ optimizer: paged_adamw_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 2e-5
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 100
|
||||
evals_per_epoch: 2
|
||||
eval_table_size:
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
pad_token: <|end_of_text|>
|
||||
|
||||
@@ -42,6 +42,7 @@ lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_linear: true
|
||||
lora_fan_in_fan_out:
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
@@ -56,15 +57,28 @@ optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
s2_attention:
|
||||
|
||||
warmup_steps: 10
|
||||
evals_per_epoch: 4
|
||||
eval_table_size:
|
||||
eval_max_new_tokens: 128
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
|
||||
@@ -37,6 +37,7 @@ lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_linear: true
|
||||
lora_fan_in_fan_out:
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
@@ -51,17 +52,30 @@ optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
s2_attention:
|
||||
|
||||
warmup_steps: 10
|
||||
evals_per_epoch: 4
|
||||
eval_table_size:
|
||||
eval_max_new_tokens: 128
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
pad_token: <|end_of_text|>
|
||||
|
||||
@@ -58,6 +58,7 @@ lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_linear: true
|
||||
lora_fan_in_fan_out:
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
@@ -72,15 +73,28 @@ optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
s2_attention:
|
||||
|
||||
warmup_steps: 10
|
||||
evals_per_epoch: 4
|
||||
eval_table_size:
|
||||
eval_max_new_tokens: 128
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
|
||||
@@ -31,6 +31,7 @@ lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_linear: true
|
||||
lora_fan_in_fan_out:
|
||||
lora_modules_to_save:
|
||||
- embed_tokens
|
||||
- lm_head
|
||||
@@ -48,17 +49,30 @@ optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
s2_attention:
|
||||
|
||||
warmup_steps: 10
|
||||
evals_per_epoch: 4
|
||||
eval_table_size:
|
||||
eval_max_new_tokens: 128
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
pad_token: <|end_of_text|>
|
||||
|
||||
@@ -1,6 +1,9 @@
|
||||
base_model: NousResearch/Llama-3.2-1B
|
||||
# Automatically upload checkpoint and final model to HF
|
||||
# hub_model_id: username/custom_model_name
|
||||
|
||||
load_in_8bit: false
|
||||
load_in_4bit: false
|
||||
strict: false
|
||||
|
||||
datasets:
|
||||
@@ -21,6 +24,7 @@ lora_r: 16
|
||||
lora_alpha: 32
|
||||
# Currently, we don't support dropout with our custom Triton kernels
|
||||
# lora_dropout: 0.05
|
||||
lora_fan_in_fan_out:
|
||||
lora_target_modules:
|
||||
- gate_proj
|
||||
- down_proj
|
||||
@@ -49,12 +53,18 @@ optimizer: adamw_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
|
||||
loss_watchdog_threshold: 5.0
|
||||
@@ -63,6 +73,10 @@ loss_watchdog_patience: 3
|
||||
warmup_steps: 10
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
pad_token: "<|end_of_text|>"
|
||||
|
||||
@@ -1,6 +1,9 @@
|
||||
base_model: NousResearch/Llama-3.2-1B
|
||||
# Automatically upload checkpoint and final model to HF
|
||||
# hub_model_id: username/custom_model_name
|
||||
|
||||
load_in_8bit: false
|
||||
load_in_4bit: false
|
||||
strict: false
|
||||
|
||||
datasets:
|
||||
@@ -21,6 +24,7 @@ pad_to_sequence_len: true
|
||||
lora_r: 16
|
||||
lora_alpha: 32
|
||||
lora_dropout: 0.05
|
||||
lora_fan_in_fan_out:
|
||||
lora_target_modules:
|
||||
- gate_proj
|
||||
- down_proj
|
||||
@@ -43,12 +47,18 @@ optimizer: adamw_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
|
||||
loss_watchdog_threshold: 5.0
|
||||
@@ -57,9 +67,11 @@ loss_watchdog_patience: 3
|
||||
warmup_steps: 10
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
|
||||
debug:
|
||||
deepspeed: deepspeed_configs/zero3.json
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
pad_token: "<|end_of_text|>"
|
||||
|
||||
|
||||
@@ -1,66 +0,0 @@
|
||||
base_model: meta-llama/Llama-3.2-1B
|
||||
# optionally might have model_type or tokenizer_type
|
||||
model_type: LlamaForCausalLM
|
||||
tokenizer_type: AutoTokenizer
|
||||
# Automatically upload checkpoint and final model to HF
|
||||
# hub_model_id: username/custom_model_name
|
||||
|
||||
load_in_8bit: true
|
||||
load_in_4bit: false
|
||||
strict: false
|
||||
|
||||
datasets:
|
||||
- path: mhenrichsen/alpaca_2k_test
|
||||
type: alpaca
|
||||
- path: mhenrichsen/alpaca_2k_test
|
||||
type: alpaca
|
||||
dataset_prepared_path:
|
||||
val_set_size: 0.0
|
||||
output_dir: ./outputs/lora-out
|
||||
|
||||
test_value: true
|
||||
|
||||
sequence_len: 4096
|
||||
sample_packing: true
|
||||
sample_packing_sequentially: true
|
||||
curriculum_sampling: true
|
||||
eval_sample_packing: false
|
||||
pad_to_sequence_len: true
|
||||
|
||||
adapter: lora
|
||||
lora_model_dir:
|
||||
lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_linear: true
|
||||
lora_modules_to_save:
|
||||
- embed_tokens
|
||||
- lm_head
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
gradient_accumulation_steps: 4
|
||||
micro_batch_size: 2
|
||||
num_epochs: 4
|
||||
optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
bf16: auto
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 10
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
weight_decay: 0.0
|
||||
special_tokens:
|
||||
pad_token: <|end_of_text|>
|
||||
@@ -1,6 +1,9 @@
|
||||
base_model: NousResearch/Llama-3.2-1B
|
||||
# Automatically upload checkpoint and final model to HF
|
||||
# hub_model_id: username/custom_model_name
|
||||
|
||||
load_in_8bit: false
|
||||
load_in_4bit: false
|
||||
strict: false
|
||||
|
||||
datasets:
|
||||
@@ -21,6 +24,7 @@ pad_to_sequence_len: true
|
||||
lora_r: 16
|
||||
lora_alpha: 32
|
||||
lora_dropout: 0.05
|
||||
lora_fan_in_fan_out:
|
||||
lora_target_modules:
|
||||
- gate_proj
|
||||
- down_proj
|
||||
@@ -43,12 +47,18 @@ optimizer: adamw_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
|
||||
loss_watchdog_threshold: 5.0
|
||||
@@ -57,6 +67,10 @@ loss_watchdog_patience: 3
|
||||
warmup_steps: 10
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
pad_token: "<|end_of_text|>"
|
||||
|
||||
@@ -27,6 +27,7 @@ lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_linear: true
|
||||
lora_fan_in_fan_out:
|
||||
lora_modules_to_save:
|
||||
- embed_tokens
|
||||
- lm_head
|
||||
@@ -44,17 +45,30 @@ optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
s2_attention:
|
||||
|
||||
warmup_steps: 10
|
||||
evals_per_epoch: 4
|
||||
eval_table_size:
|
||||
eval_max_new_tokens: 128
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
pad_token: <|end_of_text|>
|
||||
|
||||
@@ -32,6 +32,7 @@ lora_r: 32
|
||||
lora_alpha: 64
|
||||
lora_dropout: 0.05
|
||||
lora_target_linear: true
|
||||
lora_fan_in_fan_out:
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
@@ -46,19 +47,31 @@ optimizer: adamw_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: true
|
||||
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 20
|
||||
evals_per_epoch: 4
|
||||
eval_table_size:
|
||||
eval_max_new_tokens: 128
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
pad_token: "<|end_of_text|>"
|
||||
|
||||
@@ -24,6 +24,7 @@ pad_to_sequence_len: true
|
||||
lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_fan_in_fan_out:
|
||||
lora_target_modules:
|
||||
- gate_proj
|
||||
- down_proj
|
||||
@@ -46,12 +47,18 @@ optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
|
||||
loss_watchdog_threshold: 5.0
|
||||
@@ -59,7 +66,13 @@ loss_watchdog_patience: 3
|
||||
|
||||
warmup_steps: 10
|
||||
evals_per_epoch: 4
|
||||
eval_table_size:
|
||||
eval_max_new_tokens: 128
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
pad_token: "<|end_of_text|>"
|
||||
|
||||
@@ -24,6 +24,7 @@ pad_to_sequence_len: true
|
||||
lora_r: 16
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_modules:
|
||||
lora_target_linear: true
|
||||
|
||||
gradient_accumulation_steps: 4
|
||||
@@ -33,6 +34,8 @@ optimizer: adamw_torch_fused
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.00001
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: true
|
||||
tf32: true
|
||||
|
||||
|
||||
@@ -26,7 +26,9 @@ pad_to_sequence_len: true
|
||||
lora_r: 8
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_modules:
|
||||
lora_target_linear: true
|
||||
lora_fan_in_fan_out:
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
@@ -41,19 +43,28 @@ optimizer: adamw_torch_fused
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.00001
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: true
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 10
|
||||
evals_per_epoch: 4
|
||||
eval_table_size:
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
- full_shard
|
||||
|
||||
@@ -26,7 +26,9 @@ pad_to_sequence_len: true
|
||||
lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_modules:
|
||||
lora_target_linear: true
|
||||
lora_fan_in_fan_out:
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
@@ -41,17 +43,28 @@ optimizer: paged_adamw_32bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 10
|
||||
evals_per_epoch: 4
|
||||
eval_table_size:
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
pad_token: "<|end_of_text|>"
|
||||
|
||||
@@ -1,10 +0,0 @@
|
||||
# Llama 4 by Meta AI
|
||||
|
||||
## Available Examples
|
||||
|
||||
### Llama 4 Scout 17Bx16Experts (109B)
|
||||
- [Multi-Modal/Vision QLoRA w/ FSDP1](./scout-vision-qlora-fsdp.yaml)
|
||||
- [Text Single GPU (H100) QLoRA](./scout-qlora-single-h100.yaml)
|
||||
- [Text Multi GPU QLoRA w/ FSDP1](./scout-qlora-fsdp1.yaml)
|
||||
|
||||
Our Single GPU implementation for Llama 4 Scout uses only 68.5GB VRAM for post-training with 4k context length @ 546 tokens/second.
|
||||
@@ -1,93 +0,0 @@
|
||||
base_model: axolotl-quants/Llama-4-Scout-17B-16E-Linearized-bnb-nf4-bf16
|
||||
model_type: Llama4ForConditionalGeneration
|
||||
# Automatically upload checkpoint and final model to HF
|
||||
# hub_model_id: username/custom_model_name
|
||||
|
||||
strict: false
|
||||
|
||||
# torch_compile: true
|
||||
plugins:
|
||||
- axolotl.integrations.liger.LigerPlugin
|
||||
|
||||
liger_glu_activation: true
|
||||
liger_rms_norm: true
|
||||
liger_layer_norm: true
|
||||
|
||||
llama4_linearized_experts: true
|
||||
load_in_4bit: true
|
||||
adapter: qlora
|
||||
lora_r: 32
|
||||
lora_alpha: 64
|
||||
lora_target_modules:
|
||||
- self_attn.q_proj
|
||||
- self_attn.k_proj
|
||||
- self_attn.v_proj
|
||||
- self_attn.o_proj
|
||||
- shared_expert.gate_proj
|
||||
- shared_expert.up_proj
|
||||
- shared_expert.down_proj
|
||||
# - experts.gate_projs.[0-9]+$
|
||||
# - experts.up_projs.[0-9]+$
|
||||
# - experts.down_projs.[0-9]+$
|
||||
lora_modules_to_save:
|
||||
- lm_head
|
||||
- embed_tokens
|
||||
|
||||
chat_template: llama4
|
||||
datasets:
|
||||
- path: mlabonne/FineTome-100k
|
||||
type: chat_template
|
||||
split: train[:20%]
|
||||
field_messages: conversations
|
||||
message_property_mappings:
|
||||
role: from
|
||||
content: value
|
||||
|
||||
dataset_prepared_path: last_run_prepared
|
||||
val_set_size: 0.0
|
||||
output_dir: ./outputs/out
|
||||
|
||||
sequence_len: 4096
|
||||
sample_packing: true
|
||||
pad_to_sequence_len: true
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
gradient_accumulation_steps: 1
|
||||
micro_batch_size: 1
|
||||
num_epochs: 1
|
||||
optimizer: adamw_torch_fused
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 2e-5
|
||||
|
||||
bf16: true
|
||||
tf32: true
|
||||
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 100
|
||||
evals_per_epoch: 1
|
||||
saves_per_epoch: 1
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
- auto_wrap
|
||||
- full_shard
|
||||
fsdp_config:
|
||||
fsdp_transformer_layer_cls_to_wrap: Llama4TextDecoderLayer
|
||||
fsdp_limit_all_gathers: true
|
||||
fsdp_sync_module_states: true
|
||||
fsdp_offload_params: true
|
||||
fsdp_use_orig_params: false
|
||||
fsdp_cpu_ram_efficient_loading: true
|
||||
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||
fsdp_state_dict_type: FULL_STATE_DICT
|
||||
fsdp_sharding_strategy: FULL_SHARD
|
||||
fsdp_activation_checkpointing: true
|
||||
special_tokens:
|
||||
pad_token: <|finetune_right_pad_id|>
|
||||
eos_token: <|eot|>
|
||||
@@ -1,86 +0,0 @@
|
||||
base_model: axolotl-quants/Llama-4-Scout-17B-16E-Linearized-bnb-nf4-bf16
|
||||
model_type: Llama4ForConditionalGeneration
|
||||
# Automatically upload checkpoint and final model to HF
|
||||
# hub_model_id: username/custom_model_name
|
||||
|
||||
strict: false
|
||||
|
||||
plugins:
|
||||
- axolotl.integrations.liger.LigerPlugin
|
||||
|
||||
liger_glu_activation: true
|
||||
liger_rms_norm: true
|
||||
liger_layer_norm: true
|
||||
|
||||
llama4_linearized_experts: true
|
||||
load_in_4bit: true
|
||||
adapter: qlora
|
||||
lora_r: 32
|
||||
lora_alpha: 64
|
||||
lora_target_modules:
|
||||
- self_attn.q_proj
|
||||
- self_attn.k_proj
|
||||
- self_attn.v_proj
|
||||
- self_attn.o_proj
|
||||
- shared_expert.gate_proj
|
||||
- shared_expert.up_proj
|
||||
- shared_expert.down_proj
|
||||
# - experts.gate_projs.[0-9]+$
|
||||
# - experts.up_projs.[0-9]+$
|
||||
# - experts.down_projs.[0-9]+$
|
||||
lora_modules_to_save:
|
||||
# - lm_head
|
||||
# - embed_tokens
|
||||
|
||||
lora_mlp_kernel: true
|
||||
lora_qkv_kernel: true
|
||||
lora_o_kernel: true
|
||||
|
||||
chat_template: llama4
|
||||
datasets:
|
||||
- path: mlabonne/FineTome-100k
|
||||
type: chat_template
|
||||
split: train[:20%]
|
||||
field_messages: conversations
|
||||
message_property_mappings:
|
||||
role: from
|
||||
content: value
|
||||
|
||||
dataset_prepared_path: last_run_prepared
|
||||
val_set_size: 0.0
|
||||
output_dir: ./outputs/out
|
||||
|
||||
sequence_len: 4096 # up to 8k will work on a single H100
|
||||
sample_packing: true
|
||||
pad_to_sequence_len: true
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
gradient_accumulation_steps: 1
|
||||
micro_batch_size: 1
|
||||
num_epochs: 1
|
||||
optimizer: adamw_torch_4bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 1e-4
|
||||
|
||||
bf16: true
|
||||
tf32: true
|
||||
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
gradient_checkpointing: offload
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
|
||||
warmup_steps: 20
|
||||
evals_per_epoch: 1
|
||||
saves_per_epoch: 1
|
||||
weight_decay: 0.0
|
||||
special_tokens:
|
||||
pad_token: <|finetune_right_pad_id|>
|
||||
eos_token: <|eot|>
|
||||
@@ -1,89 +0,0 @@
|
||||
base_model: axolotl-quants/Llama-4-Scout-17B-16E-Linearized-bnb-nf4-bf16
|
||||
model_type: Llama4ForConditionalGeneration
|
||||
processor_type: Llama4Processor
|
||||
# Automatically upload checkpoint and final model to HF
|
||||
# hub_model_id: username/custom_model_name
|
||||
|
||||
strict: false
|
||||
|
||||
# these 3 lines are needed for now to handle vision chat templates w images
|
||||
skip_prepare_dataset: true
|
||||
remove_unused_columns: false
|
||||
sample_packing: false
|
||||
|
||||
sequence_len: 4096
|
||||
|
||||
plugins:
|
||||
- axolotl.integrations.liger.LigerPlugin
|
||||
|
||||
liger_glu_activation: true
|
||||
liger_rms_norm: true
|
||||
liger_layer_norm: true
|
||||
|
||||
llama4_linearized_experts: true # use Axolotl's customized model
|
||||
load_in_4bit: true
|
||||
adapter: qlora
|
||||
lora_r: 32
|
||||
lora_alpha: 64
|
||||
lora_target_modules:
|
||||
- self_attn.q_proj
|
||||
- self_attn.k_proj
|
||||
- self_attn.v_proj
|
||||
- self_attn.o_proj
|
||||
- shared_expert.gate_proj
|
||||
- shared_expert.up_proj
|
||||
- shared_expert.down_proj
|
||||
- vision_adapter.mlp.fc1
|
||||
- vision_adapter.mlp.fc2
|
||||
# - experts.gate_projs.[0-9]+$
|
||||
# - experts.up_projs.[0-9]+$
|
||||
# - experts.down_projs.[0-9]+$
|
||||
lora_modules_to_save:
|
||||
- lm_head
|
||||
- embed_tokens
|
||||
|
||||
chat_template: llama4
|
||||
datasets:
|
||||
- path: HuggingFaceH4/llava-instruct-mix-vsft
|
||||
type: chat_template
|
||||
split: train[:1%]
|
||||
field_messages: messages
|
||||
|
||||
dataset_prepared_path: last_run_prepared
|
||||
val_set_size: 0.0
|
||||
output_dir: ./outputs/out
|
||||
|
||||
gradient_accumulation_steps: 1
|
||||
micro_batch_size: 1
|
||||
num_epochs: 1
|
||||
optimizer: adamw_torch_4bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 2e-5
|
||||
|
||||
bf16: true
|
||||
tf32: true
|
||||
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 100
|
||||
evals_per_epoch: 1
|
||||
saves_per_epoch: 1
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
- auto_wrap
|
||||
- full_shard
|
||||
fsdp_config:
|
||||
fsdp_transformer_layer_cls_to_wrap: Llama4TextDecoderLayer
|
||||
fsdp_limit_all_gathers: true
|
||||
fsdp_sync_module_states: true
|
||||
fsdp_offload_params: true
|
||||
fsdp_use_orig_params: false
|
||||
fsdp_cpu_ram_efficient_loading: true
|
||||
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||
fsdp_state_dict_type: FULL_STATE_DICT
|
||||
fsdp_sharding_strategy: FULL_SHARD
|
||||
fsdp_activation_checkpointing: true
|
||||
special_tokens:
|
||||
pad_token: <|finetune_right_pad_id|>
|
||||
eos_token: <|eot|>
|
||||
@@ -41,11 +41,14 @@ optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: true
|
||||
fp16:
|
||||
tf32: true
|
||||
|
||||
gradient_checkpointing: true
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
eager_attention:
|
||||
@@ -53,4 +56,8 @@ eager_attention:
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 1
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
|
||||
@@ -5,6 +5,9 @@ tokenizer_type: AutoTokenizer
|
||||
tokenizer_config: EleutherAI/gpt-neox-20b
|
||||
# Automatically upload checkpoint and final model to HF
|
||||
# hub_model_id: username/custom_model_name
|
||||
|
||||
load_in_8bit: false
|
||||
load_in_4bit: false
|
||||
strict: false
|
||||
|
||||
datasets:
|
||||
@@ -35,17 +38,27 @@ train_on_inputs: false
|
||||
group_by_length: true
|
||||
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: true
|
||||
|
||||
gradient_checkpointing: false
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention:
|
||||
|
||||
warmup_steps: 10
|
||||
evals_per_epoch: 4
|
||||
eval_table_size:
|
||||
eval_max_new_tokens: 128
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
tokens:
|
||||
save_safetensors: False
|
||||
|
||||
@@ -6,6 +6,9 @@ tokenizer_type: LlamaTokenizer
|
||||
# hub_model_id: username/custom_model_name
|
||||
|
||||
trust_remote_code: true
|
||||
|
||||
load_in_8bit: false
|
||||
load_in_4bit: false
|
||||
strict: false
|
||||
|
||||
unfrozen_parameters:
|
||||
@@ -37,19 +40,27 @@ optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0001
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
|
||||
save_total_limit: 1
|
||||
save_steps:
|
||||
|
||||
debug:
|
||||
deepspeed: deepspeed_configs/zero3_bf16_cpuoffload_params.json
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
eos_token: "<|im_end|>"
|
||||
tokens:
|
||||
|
||||
@@ -4,6 +4,9 @@ model_type: MistralForCausalLM
|
||||
tokenizer_type: LlamaTokenizer
|
||||
# Automatically upload checkpoint and final model to HF
|
||||
# hub_model_id: username/custom_model_name
|
||||
|
||||
load_in_8bit: false
|
||||
load_in_4bit: false
|
||||
strict: false
|
||||
|
||||
datasets:
|
||||
@@ -31,16 +34,28 @@ optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.000005
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 10
|
||||
evals_per_epoch: 4
|
||||
eval_table_size:
|
||||
eval_max_new_tokens: 128
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
|
||||
@@ -4,6 +4,9 @@ model_type: MistralForCausalLM
|
||||
tokenizer_type: LlamaTokenizer
|
||||
# Automatically upload checkpoint and final model to HF
|
||||
# hub_model_id: username/custom_model_name
|
||||
|
||||
load_in_8bit: false
|
||||
load_in_4bit: false
|
||||
strict: false
|
||||
|
||||
datasets:
|
||||
@@ -25,6 +28,7 @@ lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_linear: true
|
||||
lora_fan_in_fan_out:
|
||||
lora_target_modules:
|
||||
- gate_proj
|
||||
- down_proj
|
||||
@@ -47,13 +51,18 @@ optimizer: adamw_torch_fused
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16: false
|
||||
tf32: true
|
||||
|
||||
gradient_checkpointing: true
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: false
|
||||
sdp_attention: true
|
||||
|
||||
@@ -62,6 +71,12 @@ loss_watchdog_patience: 3
|
||||
|
||||
warmup_steps: 10
|
||||
evals_per_epoch: 4
|
||||
eval_table_size:
|
||||
eval_table_max_new_tokens: 128
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
|
||||
@@ -27,6 +27,7 @@ lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_linear: true
|
||||
lora_fan_in_fan_out:
|
||||
lora_target_modules:
|
||||
- gate_proj
|
||||
- down_proj
|
||||
@@ -49,12 +50,18 @@ optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
|
||||
loss_watchdog_threshold: 5.0
|
||||
@@ -62,6 +69,12 @@ loss_watchdog_patience: 3
|
||||
|
||||
warmup_steps: 10
|
||||
evals_per_epoch: 4
|
||||
eval_table_size:
|
||||
eval_max_new_tokens: 128
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
|
||||
@@ -40,6 +40,7 @@ lora_r: 8
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.2
|
||||
lora_target_linear: true
|
||||
lora_fan_in_fan_out:
|
||||
|
||||
lora_target_modules:
|
||||
- gate_proj
|
||||
@@ -66,18 +67,31 @@ optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0001
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: false
|
||||
s2_attention:
|
||||
|
||||
warmup_steps: 10
|
||||
evals_per_epoch: 4
|
||||
eval_table_size:
|
||||
eval_max_new_tokens: 128
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
bos_token: "<|im_start|>"
|
||||
eos_token: "<|im_end|>"
|
||||
|
||||
@@ -32,6 +32,7 @@ lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_linear: true
|
||||
lora_fan_in_fan_out:
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
@@ -46,12 +47,18 @@ optimizer: paged_adamw_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
|
||||
loss_watchdog_threshold: 5.0
|
||||
@@ -59,8 +66,10 @@ loss_watchdog_patience: 3
|
||||
|
||||
warmup_steps: 10
|
||||
evals_per_epoch: 4
|
||||
eval_table_size:
|
||||
eval_max_new_tokens: 128
|
||||
saves_per_epoch: 1
|
||||
|
||||
debug:
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
- full_shard
|
||||
|
||||
@@ -32,6 +32,7 @@ lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_linear: true
|
||||
lora_fan_in_fan_out:
|
||||
lora_target_modules:
|
||||
- gate_proj
|
||||
- down_proj
|
||||
@@ -54,12 +55,18 @@ optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
|
||||
loss_watchdog_threshold: 5.0
|
||||
@@ -67,6 +74,12 @@ loss_watchdog_patience: 3
|
||||
|
||||
warmup_steps: 10
|
||||
evals_per_epoch: 4
|
||||
eval_table_size:
|
||||
eval_max_new_tokens: 128
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
|
||||
@@ -43,11 +43,14 @@ optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: true
|
||||
fp16:
|
||||
tf32: true
|
||||
|
||||
gradient_checkpointing: true
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
flash_attention: false # PixtralVisionModel does not support Flash Attention 2.0 yet.
|
||||
eager_attention:
|
||||
@@ -55,5 +58,9 @@ eager_attention:
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 1
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
|
||||
@@ -30,6 +30,7 @@ lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_linear: true
|
||||
lora_fan_in_fan_out:
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
@@ -44,12 +45,18 @@ optimizer: adamw_torch_fused
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: true
|
||||
|
||||
gradient_checkpointing: true
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
|
||||
loss_watchdog_threshold: 5.0
|
||||
@@ -57,8 +64,10 @@ loss_watchdog_patience: 3
|
||||
|
||||
warmup_steps: 10
|
||||
evals_per_epoch: 4
|
||||
eval_table_size:
|
||||
eval_max_new_tokens: 128
|
||||
saves_per_epoch: 1
|
||||
|
||||
debug:
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
- full_shard
|
||||
|
||||
@@ -32,6 +32,7 @@ lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_linear: true
|
||||
lora_fan_in_fan_out:
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
@@ -46,12 +47,18 @@ optimizer: adamw_torch_fused
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: true
|
||||
|
||||
gradient_checkpointing: true
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
|
||||
loss_watchdog_threshold: 5.0
|
||||
@@ -59,8 +66,10 @@ loss_watchdog_patience: 3
|
||||
|
||||
warmup_steps: 10
|
||||
evals_per_epoch: 4
|
||||
eval_table_size:
|
||||
eval_max_new_tokens: 128
|
||||
saves_per_epoch: 1
|
||||
|
||||
debug:
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
- full_shard
|
||||
|
||||
@@ -41,6 +41,7 @@ lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_linear: true
|
||||
lora_fan_in_fan_out:
|
||||
#lora_target_modules:
|
||||
# - gate
|
||||
# - q_proj
|
||||
@@ -64,12 +65,18 @@ optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
|
||||
loss_watchdog_threshold: 5.0
|
||||
@@ -77,8 +84,12 @@ loss_watchdog_patience: 3
|
||||
|
||||
warmup_steps: 10
|
||||
evals_per_epoch: 4
|
||||
eval_table_size:
|
||||
eval_max_new_tokens: 128
|
||||
saves_per_epoch: 1
|
||||
|
||||
debug:
|
||||
deepspeed: deepspeed_configs/zero2.json
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
|
||||
@@ -6,6 +6,9 @@ tokenizer_type: LlamaTokenizer
|
||||
# hub_model_id: username/custom_model_name
|
||||
|
||||
trust_remote_code: true
|
||||
|
||||
load_in_8bit: false
|
||||
load_in_4bit: false
|
||||
strict: false
|
||||
|
||||
unfrozen_parameters:
|
||||
@@ -35,19 +38,27 @@ optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0001
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
|
||||
save_total_limit: 1
|
||||
save_steps:
|
||||
|
||||
debug:
|
||||
deepspeed: deepspeed_configs/zero3_bf16_cpuoffload_all.json
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
eos_token: "<|im_end|>"
|
||||
tokens:
|
||||
|
||||
@@ -27,6 +27,7 @@ lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_linear: true
|
||||
lora_fan_in_fan_out:
|
||||
lora_target_modules:
|
||||
- gate_proj
|
||||
- down_proj
|
||||
@@ -49,12 +50,18 @@ optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
|
||||
loss_watchdog_threshold: 5.0
|
||||
@@ -62,6 +69,12 @@ loss_watchdog_patience: 3
|
||||
|
||||
warmup_steps: 10
|
||||
evals_per_epoch: 4
|
||||
eval_table_size:
|
||||
eval_max_new_tokens: 128
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
|
||||
@@ -35,17 +35,26 @@ optimizer: adamw_bnb_8bit
|
||||
torchdistx_path:
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0000002
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
tf32: true
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 5
|
||||
xformers_attention:
|
||||
flash_attention:
|
||||
gptq_groupsize:
|
||||
gptq_model_v1:
|
||||
warmup_steps: 20
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.0001
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
tokens:
|
||||
pad_token: "<|padding|>"
|
||||
bos_token: "<|endoftext|>"
|
||||
|
||||
@@ -4,6 +4,9 @@ model_type: LlamaForCausalLM
|
||||
tokenizer_type: LlamaTokenizer
|
||||
# Automatically upload checkpoint and final model to HF
|
||||
# hub_model_id: username/custom_model_name
|
||||
|
||||
load_in_8bit: false
|
||||
load_in_4bit: false
|
||||
strict: false
|
||||
push_dataset_to_hub:
|
||||
datasets:
|
||||
@@ -20,6 +23,7 @@ lora_alpha:
|
||||
lora_dropout:
|
||||
lora_target_modules:
|
||||
lora_target_linear:
|
||||
lora_fan_in_fan_out:
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
@@ -33,20 +37,29 @@ optimizer: adamw_bnb_8bit
|
||||
torchdistx_path:
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.000003
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
float16: true
|
||||
bf16: false
|
||||
fp16: false
|
||||
tf32: false
|
||||
gradient_checkpointing: true
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
gptq_groupsize:
|
||||
gptq_model_v1:
|
||||
warmup_steps: 20
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.1
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
bos_token: "<s>"
|
||||
eos_token: "</s>"
|
||||
|
||||
@@ -29,6 +29,7 @@ lora_target_modules:
|
||||
- v_proj
|
||||
- k_proj
|
||||
- o_proj
|
||||
lora_fan_in_fan_out:
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
@@ -42,19 +43,29 @@ optimizer: adamw_bnb_8bit
|
||||
torchdistx_path:
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: false
|
||||
fp16: true
|
||||
tf32: false
|
||||
gradient_checkpointing: true
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
gptq_groupsize:
|
||||
s2_attention:
|
||||
gptq_model_v1:
|
||||
warmup_steps: 20
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.1
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
bos_token: "<s>"
|
||||
eos_token: "</s>"
|
||||
|
||||
@@ -21,7 +21,9 @@ sample_packing: true
|
||||
lora_r: 8
|
||||
lora_alpha: 32
|
||||
lora_dropout: 0.05
|
||||
lora_target_modules:
|
||||
lora_target_linear: true
|
||||
lora_fan_in_fan_out:
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
@@ -35,19 +37,28 @@ optimizer: paged_adamw_32bit
|
||||
torchdistx_path:
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: false
|
||||
fp16: true
|
||||
tf32: false
|
||||
gradient_checkpointing: true
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
gptq_groupsize:
|
||||
gptq_model_v1:
|
||||
warmup_steps: 20
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.1
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
bos_token: "<s>"
|
||||
eos_token: "</s>"
|
||||
|
||||
@@ -37,6 +37,7 @@ lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:

wandb_project:
wandb_entity:
@@ -51,16 +52,28 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
bfloat16: true
bf16: true
fp16:
tf32: false

gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
s2_attention:

warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 4
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:

@@ -4,6 +4,9 @@ model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false
strict: false

datasets:
@@ -24,6 +27,7 @@ lora_r:
lora_alpha:
lora_dropout:
lora_target_linear:
lora_fan_in_fan_out:

wandb_project:
wandb_entity:
@@ -41,20 +45,30 @@ max_grad_norm: 1.0
lr_scheduler: cosine
learning_rate: 0.000003

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: True
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true

warmup_steps: 100
evals_per_epoch: 4
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.1
fsdp:
fsdp_config:
resize_token_embeddings_to_32x: true
special_tokens:
  pad_token: "<|endoftext|>"

@@ -27,6 +27,7 @@ lora_r: 64
lora_alpha: 32
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:

wandb_project:
wandb_entity:
@@ -44,20 +45,30 @@ max_grad_norm: 1.0
lr_scheduler: cosine
learning_rate: 0.000003

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: True
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true

warmup_steps: 100
evals_per_epoch: 4
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.1
fsdp:
fsdp_config:
resize_token_embeddings_to_32x: true
special_tokens:
  pad_token: "<|endoftext|>"

@@ -4,6 +4,9 @@ model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false
strict: false

datasets:
@@ -24,6 +27,7 @@ lora_r:
lora_alpha:
lora_dropout:
lora_target_linear:
lora_fan_in_fan_out:

wandb_project:
wandb_entity:
@@ -41,20 +45,30 @@ max_grad_norm: 1.0
lr_scheduler: cosine
learning_rate: 0.000003

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: True
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true

warmup_steps: 100
evals_per_epoch: 4
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.1
fsdp:
fsdp_config:
resize_token_embeddings_to_32x: true
special_tokens:
  pad_token: "<|endoftext|>"

@@ -4,6 +4,9 @@ model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false
strict: false

datasets:
@@ -25,6 +28,7 @@ lora_r:
lora_alpha:
lora_dropout:
lora_target_linear:
lora_fan_in_fan_out:

wandb_project: phi3
wandb_entity:
@@ -42,19 +46,27 @@ max_grad_norm: 1.0
lr_scheduler: cosine
learning_rate: 0.000003

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true

warmup_steps: 100
evals_per_epoch: 4
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.1
fsdp:
  - full_shard

Some files were not shown because too many files have changed in this diff.