Compare commits: smaller-ra...peft-updat

8 Commits

| Author | SHA1 | Date |
|---|---|---|
|  | 320553850a |  |
|  | b38f70e068 |  |
|  | cf4c84e21d |  |
|  | 98d98ea1dd |  |
|  | 0cf42ab8a3 |  |
|  | 3d0ab75a0c |  |
|  | d375be90ff |  |
|  | 98827e8f3b |  |
.coveragerc (14 changes)

````diff
@@ -1,14 +0,0 @@
-[run]
-source = axolotl
-omit =
-    */tests/*
-    setup.py
-
-[report]
-exclude_lines =
-    pragma: no cover
-    def __repr__
-    raise NotImplementedError
-    if __name__ == .__main__.:
-    pass
-    raise ImportError
````
.github/workflows/main.yml (2 changes)

````diff
@@ -29,7 +29,7 @@ jobs:
            cuda_version: 12.4.1
            python_version: "3.11"
            pytorch: 2.6.0
-           axolotl_extras: vllm
+           axolotl_extras:
            is_latest: true
     runs-on: axolotl-gpu-runner
     steps:
````
.github/workflows/tests.yml (13 changes)

````diff
@@ -102,16 +102,9 @@ jobs:
 
       - name: Run tests
        run: |
-          pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/ --cov=axolotl --cov-report=xml
-          pytest -v tests/patched/ --cov=axolotl --cov-append --cov-report=xml
-          pytest -v tests/cli/ --cov=axolotl --cov-append --cov-report=xml
-
-      - name: Upload coverage to Codecov
-        uses: codecov/codecov-action@v5
-        with:
-          files: ./coverage.xml
-          flags: unittests,pytorch-${{ matrix.pytorch_version }}
-          fail_ci_if_error: false
+          pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/
+          pytest -v tests/patched/
+          pytest -v tests/cli/
 
       - name: cleanup pip cache
        run: |
````
README.md (23 changes)

````diff
@@ -9,7 +9,6 @@
 <p align="center">
     <img src="https://img.shields.io/github/license/axolotl-ai-cloud/axolotl.svg?color=blue" alt="GitHub License">
     <img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/tests.yml/badge.svg" alt="tests">
-    <a href="https://codecov.io/gh/axolotl-ai-cloud/axolotl"><img src="https://codecov.io/gh/axolotl-ai-cloud/axolotl/branch/main/graph/badge.svg" alt="codecov"></a>
     <a href="https://github.com/axolotl-ai-cloud/axolotl/releases"><img src="https://img.shields.io/github/release/axolotl-ai-cloud/axolotl.svg" alt="Releases"></a>
     <br/>
     <a href="https://github.com/axolotl-ai-cloud/axolotl/graphs/contributors"><img src="https://img.shields.io/github/contributors-anon/axolotl-ai-cloud/axolotl?color=yellow&style=flat-square" alt="contributors" style="height: 20px;"></a>
@@ -64,7 +63,7 @@ axolotl fetch examples
 axolotl fetch deepspeed_configs # OPTIONAL
 ```
 
-Other installation approaches are described [here](https://docs.axolotl.ai/docs/installation.html).
+Other installation approaches are described [here](https://axolotl-ai-cloud.github.io/axolotl/docs/installation.html).
 
 ### Your First Fine-tune
 
@@ -79,7 +78,7 @@ axolotl fetch examples --dest path/to/folder
 axolotl train examples/llama-3/lora-1b.yml
 ```
 
-That's it! Check out our [Getting Started Guide](https://docs.axolotl.ai/docs/getting-started.html) for a more detailed walkthrough.
+That's it! Check out our [Getting Started Guide](https://axolotl-ai-cloud.github.io/axolotl/docs/getting-started.html) for a more detailed walkthrough.
 
 ## ✨ Key Features
 
@@ -92,20 +91,20 @@ That's it! Check out our [Getting Started Guide](https://docs.axolotl.ai/docs/ge
 
 ## 📚 Documentation
 
-- [Installation Options](https://docs.axolotl.ai/docs/installation.html) - Detailed setup instructions for different environments
-- [Configuration Guide](https://docs.axolotl.ai/docs/config.html) - Full configuration options and examples
-- [Dataset Guide](https://docs.axolotl.ai/docs/dataset-formats/) - Supported formats and how to use them
-- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
-- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
-- [Multipacking](https://docs.axolotl.ai/docs/multipack.html)
-- [API Reference](https://docs.axolotl.ai/docs/api/) - Auto-generated code documentation
-- [FAQ](https://docs.axolotl.ai/docs/faq.html) - Frequently asked questions
+- [Installation Options](https://axolotl-ai-cloud.github.io/axolotl/docs/installation.html) - Detailed setup instructions for different environments
+- [Configuration Guide](https://axolotl-ai-cloud.github.io/axolotl/docs/config.html) - Full configuration options and examples
+- [Dataset Guide](https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/) - Supported formats and how to use them
+- [Multi-GPU Training](https://axolotl-ai-cloud.github.io/axolotl/docs/multi-gpu.html)
+- [Multi-Node Training](https://axolotl-ai-cloud.github.io/axolotl/docs/multi-node.html)
+- [Multipacking](https://axolotl-ai-cloud.github.io/axolotl/docs/multipack.html)
+- [API Reference](https://axolotl-ai-cloud.github.io/axolotl/docs/api/) - Auto-generated code documentation
+- [FAQ](https://axolotl-ai-cloud.github.io/axolotl/docs/faq.html) - Frequently asked questions
 
 ## 🤝 Getting Help
 
 - Join our [Discord community](https://discord.gg/HhrNrHJPRb) for support
 - Check out our [Examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/) directory
-- Read our [Debugging Guide](https://docs.axolotl.ai/docs/debugging.html)
+- Read our [Debugging Guide](https://axolotl-ai-cloud.github.io/axolotl/docs/debugging.html)
 - Need dedicated support? Please contact [✉️wing@axolotl.ai](mailto:wing@axolotl.ai) for options
 
 ## 🌟 Contributing
````

````diff
@@ -231,7 +231,6 @@ website:
       - docs/reward_modelling.qmd
       - docs/lr_groups.qmd
       - docs/lora_optims.qmd
-      - docs/dataset_loading.qmd
 
     - section: "Core Concepts"
       contents:
````
cicd/cicd.sh (63 changes)

````diff
@@ -3,59 +3,10 @@ set -e
 
 python -c "import torch; assert '$PYTORCH_VERSION' in torch.__version__"
 
-# Run unit tests with initial coverage report
-pytest -v --durations=10 -n8 \
-    --ignore=tests/e2e/ \
-    --ignore=tests/patched/ \
-    --ignore=tests/cli \
-    /workspace/axolotl/tests/ \
-    --cov=axolotl \
-    --cov-report=xml:coverage.xml
-
-# Run lora kernels tests with coverage append
-pytest -v --durations=10 \
-    /workspace/axolotl/tests/e2e/patched/lora_kernels \
-    --cov=axolotl \
-    --cov-append
-
-# Run patched tests excluding lora kernels with coverage append
-pytest -v --durations=10 \
-    --ignore=tests/e2e/patched/lora_kernels \
-    /workspace/axolotl/tests/e2e/patched \
-    --cov=axolotl \
-    --cov-append
-
-# Run solo tests with coverage append
-pytest -v --durations=10 -n1 \
-    /workspace/axolotl/tests/e2e/solo/ \
-    --cov=axolotl \
-    --cov-append
-
-# Run integration tests with coverage append
-pytest -v --durations=10 \
-    /workspace/axolotl/tests/e2e/integrations/ \
-    --cov=axolotl \
-    --cov-append
-
-pytest -v --durations=10 /workspace/axolotl/tests/cli \
-    --cov=axolotl \
-    --cov-append
-
-# Run remaining e2e tests with coverage append and final report
-pytest -v --durations=10 \
-    --ignore=tests/e2e/solo/ \
-    --ignore=tests/e2e/patched/ \
-    --ignore=tests/e2e/multigpu/ \
-    --ignore=tests/e2e/integrations/ \
-    --ignore=tests/cli \
-    /workspace/axolotl/tests/e2e/ \
-    --cov=axolotl \
-    --cov-append \
-    --cov-report=xml:coverage.xml
-
-# Upload coverage to Codecov
-if [ -f e2e-coverage.xml ]; then
-    codecov -f e2e-coverage.xml -F e2e,pytorch-${PYTORCH_VERSION}
-else
-    echo "Coverage file not found. Coverage report may have failed."
-fi
+pytest -v --durations=10 -n8 --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli /workspace/axolotl/tests/
+pytest -v --durations=10 /workspace/axolotl/tests/e2e/patched/lora_kernels # running these with the other patches causes a failure
+pytest -v --durations=10 --ignore=tests/e2e/patched/lora_kernels /workspace/axolotl/tests/e2e/patched
+pytest -v --durations=10 -n1 /workspace/axolotl/tests/e2e/solo/
+pytest -v --durations=10 /workspace/axolotl/tests/e2e/integrations/
+pytest -v --durations=10 /workspace/axolotl/tests/cli
+pytest -v --durations=10 --ignore=tests/e2e/solo/ --ignore=tests/e2e/patched/ --ignore=tests/e2e/multigpu/ --ignore=tests/e2e/integrations/ --ignore=tests/cli /workspace/axolotl/tests/e2e/
````
````diff
@@ -68,7 +68,7 @@ def run_cmd(cmd: str, run_folder: str):
 @app.function(
     image=cicd_image,
     gpu=GPU_CONFIG,
-    timeout=90 * 60,
+    timeout=60 * 60,
     cpu=8.0,
     memory=131072 * N_GPUS,
     volumes=VOLUME_CONFIG,
````
````diff
@@ -4,22 +4,3 @@ set -e
 # only run one test at a time so as not to OOM the GPU
 pytest -v --durations=10 -n2 /workspace/axolotl/tests/e2e/multigpu/ --ignore=/workspace/axolotl/tests/e2e/multigpu/solo/
 pytest -v --durations=10 -n1 /workspace/axolotl/tests/e2e/multigpu/solo/
-
-# Only run two tests at a time to avoid OOM on GPU (with coverage collection)
-pytest -v -n2 \
-    --ignore=/workspace/axolotl/tests/e2e/multigpu/solo/ \
-    /workspace/axolotl/tests/e2e/multigpu/ \
-    --cov=axolotl \
-    --cov-report=xml:multigpu-coverage.xml
-
-pytest -v --durations=10 -n1 /workspace/axolotl/tests/e2e/multigpu/solo/ \
-    --cov=axolotl \
-    --cov-append \
-    --cov-report=xml:multigpu-coverage.xml
-
-# Upload coverage to Codecov
-if [ -f multigpu-coverage.xml ]; then
-    codecov -f multigpu-coverage.xml -F multigpu,docker-tests,pytorch-${PYTORCH_VERSION}
-else
-    echo "Coverage file not found. Coverage report may have failed."
-fi
````
codecov.yml (51 changes)

````diff
@@ -1,51 +0,0 @@
-codecov:
-  require_ci_to_pass: yes
-
-coverage:
-  precision: 2
-  round: down
-  range: "70...100"
-  status:
-    project:
-      default:
-        # basic
-        target: auto
-        threshold: 0%
-        base: auto
-        # advanced
-        branches: null
-        if_no_uploads: error
-        if_not_found: success
-        if_ci_failed: error
-        only_pulls: false
-        flags: null
-        paths: null
-    patch:
-      default:
-        # basic
-        target: auto
-        threshold: 0%
-        base: auto
-        # advanced
-        branches: null
-        if_no_uploads: error
-        if_not_found: success
-        if_ci_failed: error
-        only_pulls: false
-        flags: null
-        paths: null
-
-parsers:
-  gcov:
-    branch_detection:
-      conditional: yes
-      loop: yes
-      method: no
-      macro: no
-
-comment:
-  layout: "reach,diff,flags,files,footer"
-  behavior: default
-  require_changes: no
-  require_base: no
-  require_head: yes
````
````diff
@@ -29,7 +29,7 @@ ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
 WORKDIR /workspace
 
 RUN python3 -m pip install --upgrade pip && pip3 install -U packaging==23.2 setuptools==75.8.0 wheel && \
-    python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} torchvision --extra-index-url https://download.pytorch.org/whl/cu$CUDA && \
+    python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} --extra-index-url https://download.pytorch.org/whl/cu$CUDA && \
     python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \
     python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main"
````
````diff
@@ -90,7 +90,7 @@ lora_on_cpu: true
 
 # List[str]. Add plugins to extend the pipeline.
 # See `src/axolotl/integrations` for the available plugins or doc below for more details.
-# https://docs.axolotl.ai/docs/custom_integrations.html
+# https://axolotl-ai-cloud.github.io/axolotl/docs/custom_integrations.html
 plugins:
   # - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
 
@@ -109,7 +109,7 @@ datasets:
     preprocess_shards: # Optional[int] process dataset in N sequential chunks for memory efficiency (exclusive with `shards`)
 
     name: # Optional[str] name of dataset configuration to load
-    split: train # Optional[str] name of dataset split to load from
+    train_on_split: train # Optional[str] name of dataset split to load from
     revision: # Optional[str] The specific revision of the dataset to use when loading from the Hugging Face Hub. This can be a commit hash, tag, or branch name. If not specified, the latest version will be used. This parameter is ignored for local datasets.
     trust_remote_code: # Optional[bool] Trust remote code for untrusted source
 
@@ -165,9 +165,7 @@ datasets:
         content: value
       # ...
 
-    # Optional[Dict[str, List]]. Roles mapping in the messages.
-    # The format is {target_role: [source_roles]}. All source roles will be mapped to the target role.
-    # The default is:
+    # Optional[Dict[str, List]]. Roles mapping in the messages. The default is:
     roles:
       user: ["human", "user"]
       assistant: ["gpt", "assistant"]
@@ -394,7 +392,7 @@ lora_fan_in_fan_out: false
 
 # Apply custom LoRA autograd functions and activation function Triton kernels for
 # speed and memory savings
-# See: https://docs.axolotl.ai/docs/lora_optims.html
+# See: https://axolotl-ai-cloud.github.io/axolotl/docs/lora_optims.html
 lora_mlp_kernel: true
 lora_qkv_kernel: true
 lora_o_kernel: true
@@ -688,14 +686,11 @@ ddp_broadcast_buffers:
 # Use in long context training to prevent OOM when sequences cannot fit into a single GPU's VRAM.
 # E.g., if 4 GPUs are available, set this value to 2 to split each sequence into two equal-sized
 # subsequences, or set to 4 to split into four equal-sized subsequences.
-# See https://docs.axolotl.ai/docs/sequence_parallelism.html for more details.
+# See https://axolotl-ai-cloud.github.io/axolotl/docs/sequence_parallelism.html for more details.
 sequence_parallel_degree:
 # Optional; strides across the key dimension. Larger values use more memory but should make training faster.
 # Must evenly divide the number of KV heads in your model.
 heads_k_stride: 1
-# One of "varlen_llama3", "batch_ring", "batch_zigzag", "batch_stripe". Defaults to "varlen_llama3"
-# in the sample packing case, and "batch_ring" in the non-sample packing case.
-ring_attn_func:
 
 # Path to torch distx for optim 'adamw_anyprecision'
 torchdistx_path:
````
````diff
@@ -13,13 +13,6 @@ As there are a lot of available options in Axolotl, this guide aims to provide a
 
 Axolotl supports 3 kinds of training methods: pre-training, supervised fine-tuning, and preference-based post-training (e.g. DPO, ORPO, PRMs). Each method has their own dataset format which are described below.
 
-::: {.callout-tip}
-
-This guide will mainly use JSONL as an introduction. Please refer to the [dataset loading docs](../dataset_loading.qmd) to understand how to load datasets from other sources.
-
-For `pretraining_dataset:` specifically, please refer to the [Pre-training section](#pre-training).
-:::
-
 ## Pre-training
 
 When aiming to train on large corpora of text datasets, pre-training is your go-to choice. Due to the size of these datasets, downloading the entire-datasets before beginning training would be prohibitively time-consuming. Axolotl supports [streaming](https://huggingface.co/docs/datasets/en/stream) to only load batches into memory at a time.
 
@@ -457,7 +450,10 @@ datasets:
     type: alpaca
 ```
 
-Axolotl supports many kinds of instruction dataset. All of them can be found in the [Instruction Dataset Documentation](inst_tune.qmd) with their respective type and sample row format.
+Axolotl supports many kinds of instruction dataset. All of them can be found here (https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/inst_tune.html) with their respective type and sample row format.
 
+
+Reference: [Instruction Dataset Documentation](inst_tune.qmd).
+
 #### Custom Instruct Prompt Format
 
````
docs/dataset_loading.qmd (deleted, 276 lines)

````diff
@@ -1,276 +0,0 @@
----
-title: Dataset Loading
-description: Understanding how to load datasets from different sources
-back-to-top-navigation: true
-toc: true
-toc-depth: 5
----
-
-## Overview
-
-Datasets can be loaded in a number of different ways depending on the how it is saved (the extension of the file) and where it is stored.
-
-## Loading Datasets
-
-We use the `datasets` library to load datasets and a mix of `load_dataset` and `load_from_disk` to load them.
-
-You may recognize the similar named configs between `load_dataset` and the `datasets` section of the config file.
-
-```yaml
-datasets:
-  - path:
-    name:
-    data_files:
-    split:
-    revision:
-    trust_remote_code:
-```
-
-::: {.callout-tip}
-
-Do not feel overwhelmed by the number of options here. A lot of them are optional. In fact, the most common config to use would be `path` and sometimes `data_files`.
-
-:::
-
-This matches the API of [`datasets.load_dataset`](https://github.com/huggingface/datasets/blob/0b5998ac62f08e358f8dcc17ec6e2f2a5e9450b6/src/datasets/load.py#L1838-L1858), so if you're familiar with that, you will feel right at home.
-
-For HuggingFace's guide to load different dataset types, see [here](https://huggingface.co/docs/datasets/loading).
-
-For full details on the config, see [config.qmd](config.qmd).
-
-::: {.callout-note}
-
-You can set multiple datasets in the config file by more than one entry under `datasets`.
-
-```yaml
-datasets:
-  - path: /path/to/your/dataset
-  - path: /path/to/your/other/dataset
-```
-
-:::
-
-### Local dataset
-
-#### Files
-
-Usually, to load a JSON file, you would do something like this:
-
-```python
-from datasets import load_dataset
-
-dataset = load_dataset("json", data_files="data.json")
-```
-
-Which translates to the following config:
-
-```yaml
-datasets:
-  - path: json
-    data_files: /path/to/your/file.jsonl
-```
-
-However, to make things easier, we have added a few shortcuts for loading local dataset files.
-
-You can just point the `path` to the file or directory along with the `ds_type` to load the dataset. The below example shows for a JSON file:
-
-```yaml
-datasets:
-  - path: /path/to/your/file.jsonl
-    ds_type: json
-```
-
-This works for CSV, JSON, Parquet, and Arrow files.
-
-::: {.callout-tip}
-
-If `path` points to a file and `ds_type` is not specified, we will automatically infer the dataset type from the file extension, so you could omit `ds_type` if you'd like.
-
-:::
-
-#### Directory
-
-If you're loading a directory, you can point the `path` to the directory.
-
-Then, you have two options:
-
-##### Loading entire directory
-
-You do not need any additional configs.
-
-We will attempt to load in the following order:
-
-- datasets saved with `datasets.save_to_disk`
-- loading entire directory of files (such as with parquet/arrow files)
-
-```yaml
-datasets:
-  - path: /path/to/your/directory
-```
-
-##### Loading specific files in directory
-
-Provide `data_files` with a list of files to load.
-
-```yaml
-datasets:
-  # single file
-  - path: /path/to/your/directory
-    ds_type: csv
-    data_files: file1.csv
-
-  # multiple files
-  - path: /path/to/your/directory
-    ds_type: json
-    data_files:
-      - file1.jsonl
-      - file2.jsonl
-
-  # multiple files for parquet
-  - path: /path/to/your/directory
-    ds_type: parquet
-    data_files:
-      - file1.parquet
-      - file2.parquet
-
-```
-
-### HuggingFace Hub
-
-The method you use to load the dataset depends on how the dataset was created, whether a folder was uploaded directly or a HuggingFace Dataset was pushed.
-
-::: {.callout-note}
-
-If you're using a private dataset, you will need to enable the `hf_use_auth_token` flag in the root-level of the config file.
-
-:::
-
-#### Folder uploaded
-
-This would mean that the dataset is a single file or file(s) uploaded to the Hub.
-
-```yaml
-datasets:
-  - path: org/dataset-name
-    data_files:
-      - file1.jsonl
-      - file2.jsonl
-```
-
-#### HuggingFace Dataset
-
-This means that the dataset is created as a HuggingFace Dataset and pushed to the Hub via `datasets.push_to_hub`.
-
-```yaml
-datasets:
-  - path: org/dataset-name
-```
-
-::: {.callout-note}
-
-There are some other configs which may be required like `name`, `split`, `revision`, `trust_remote_code`, etc depending on the dataset.
-
-:::
-
-### Remote Filesystems
-
-Via the `storage_options` config under `load_dataset`, you can load datasets from remote filesystems like S3, GCS, Azure, and OCI.
-
-::: {.callout-warning}
-
-This is currently experimental. Please let us know if you run into any issues!
-
-:::
-
-The only difference between the providers is that you need to prepend the path with the respective protocols.
-
-```yaml
-datasets:
-  # Single file
-  - path: s3://bucket-name/path/to/your/file.jsonl
-
-  # Directory
-  - path: s3://bucket-name/path/to/your/directory
-```
-
-For directory, we load via `load_from_disk`.
-
-#### S3
-
-Prepend the path with `s3://`.
-
-The credentials are pulled in the following order:
-
-- `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, and `AWS_SESSION_TOKEN` environment variables
-- from the `~/.aws/credentials` file
-- for nodes on EC2, the IAM metadata provider
-
-::: {.callout-note}
-
-We assume you have credentials setup and not using anonymous access. If you want to use anonymous access, let us know! We may have to open a config option for this.
-
-:::
-
-Other environment variables that can be set can be found in [boto3 docs](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html#using-environment-variables)
-
-#### GCS
-
-Prepend the path with `gs://` or `gcs://`.
-
-The credentials are loaded in the following order:
-
-- gcloud credentials
-- for nodes on GCP, the google metadata service
-- anonymous access
-
-#### Azure
-
-##### Gen 1
-
-Prepend the path with `adl://`.
-
-Ensure you have the following environment variables set:
-
-- `AZURE_STORAGE_TENANT_ID`
-- `AZURE_STORAGE_CLIENT_ID`
-- `AZURE_STORAGE_CLIENT_SECRET`
-
-##### Gen 2
-
-Prepend the path with `abfs://` or `az://`.
-
-Ensure you have the following environment variables set:
-
-- `AZURE_STORAGE_ACCOUNT_NAME`
-- `AZURE_STORAGE_ACCOUNT_KEY`
-
-Other environment variables that can be set can be found in [adlfs docs](https://github.com/fsspec/adlfs?tab=readme-ov-file#setting-credentials)
-
-#### OCI
-
-Prepend the path with `oci://`.
-
-It would attempt to read in the following order:
-
-- `OCIFS_IAM_TYPE`, `OCIFS_CONFIG_LOCATION`, and `OCIFS_CONFIG_PROFILE` environment variables
-- when on OCI resource, resource principal
-
-Other environment variables:
-
-- `OCI_REGION_METADATA`
-
-Please see the [ocifs docs](https://ocifs.readthedocs.io/en/latest/getting-connected.html#Using-Environment-Variables).
-
-### HTTPS
-
-The path should start with `https://`.
-
-```yaml
-datasets:
-  - path: https://path/to/your/dataset/file.jsonl
-```
-
-This must be publically accessible.
-
-## Next steps
-
-Now that you know how to load datasets, you can learn more on how to load your specific dataset format into your target output format [dataset formats docs](dataset-formats).
````
````diff
@@ -36,9 +36,6 @@ deepspeed: deepspeed_configs/zero1.json
 ### Usage {#sec-deepspeed-usage}
 
 ```{.bash}
-# Fetch deepspeed configs (if not already present)
-axolotl fetch deepspeed_configs
-
 # Passing arg via config
 axolotl train config.yml
 
@@ -51,20 +48,10 @@ axolotl train config.yml --deepspeed deepspeed_configs/zero1.json
 We provide default configurations for:
 
 - ZeRO Stage 1 (`zero1.json`)
-- ZeRO Stage 1 with torch compile (`zero1_torch_compile.json`)
 - ZeRO Stage 2 (`zero2.json`)
 - ZeRO Stage 3 (`zero3.json`)
-- ZeRO Stage 3 with bf16 (`zero3_bf16.json`)
-- ZeRO Stage 3 with bf16 and CPU offload params(`zero3_bf16_cpuoffload_params.json`)
-- ZeRO Stage 3 with bf16 and CPU offload params and optimizer (`zero3_bf16_cpuoffload_all.json`)
 
-::: {.callout-tip}
-
-Choose the configuration that offloads the least amount to memory while still being able to fit on VRAM for best performance.
-
-Start from Stage 1 -> Stage 2 -> Stage 3.
-
-:::
-
+Choose based on your memory requirements and performance needs.
+
 ## FSDP {#sec-fsdp}
 
````
````diff
@@ -9,7 +9,6 @@ format:
 ## Supported Models
 
 - [Mllama](#sec-mllama)
-- [Llama4](#sec-llama4)
 - [Pixtral](#sec-pixtral)
 - [Llava-1.5](#sec-llava-15)
 - [Mistral-Small-3.1](#sec-mistral-small-31)
@@ -64,14 +63,6 @@ base_model: meta-llama/Llama-3.2-11B-Vision-Instruct
 chat_template: llama3_2_vision
 ```
 
-### Llama4 {#sec-llama4}
-
-```yaml
-base_model: meta-llama/Llama-4-Scout-17B-16E-Instruct
-
-chat_template: llama4
-```
-
 ### Pixtral {#sec-pixtral}
 
 ```yaml
````
````diff
@@ -530,7 +530,7 @@ trl:
 ```
 
 ```bash
-CUDA_VISIBLE_DEVICES=2,3 axolotl vllm-serve grpo.yaml
+CUDA_VISIBLE_DEVICES=2,3 axolotl vllm_serve grpo.yaml
 ```
 
 Your `vLLM` instance will now attempt to spin up, and it's time to kick off training utilizing our remaining two GPUs. In another terminal, execute:
````
````diff
@@ -27,9 +27,6 @@ To enable sequence parallelism, add the following to your configuration file:
 sequence_parallel_degree: 4 # Split sequences across 4 GPUs
 # Optional; strides across the key dimension. Larger values use more memory but should make training faster.
 heads_k_stride: 1
-# Optional; one of "varlen_llama3", "batch_ring", "batch_zigzag", "batch_stripe". Defaults to
-# "varlen_llama3" when `sample_packing: true`, and "batch_ring" otherwise.
-ring_attn_func:
 ```
 
 The `sequence_parallel_degree` should be a divisor of the total number of GPUs. For example:
````
````diff
@@ -8,6 +8,7 @@ tokenizer_type: GPT2Tokenizer
 trust_remote_code: true
 tokenizer_use_fast: true
 tokenizer_legacy: true
+strict: false
 push_dataset_to_hub:
 hf_use_auth_token: true
 datasets:
````

````diff
@@ -4,6 +4,7 @@ base_model: cerebras/Cerebras-GPT-1.3B
 
 load_in_8bit: false
 load_in_4bit: true
+strict: false
 push_dataset_to_hub:
 datasets:
   - path: teknium/GPT4-LLM-Cleaned
````

````diff
@@ -7,6 +7,7 @@ tokenizer_type: CodeLlamaTokenizer
 
 load_in_8bit: true
 load_in_4bit: false
+strict: false
 
 datasets:
   - path: mhenrichsen/alpaca_2k_test
````

````diff
@@ -7,6 +7,7 @@ tokenizer_type: CodeLlamaTokenizer
 
 load_in_8bit: false
 load_in_4bit: true
+strict: false
 
 datasets:
   - path: mhenrichsen/alpaca_2k_test
````

````diff
@@ -7,6 +7,7 @@ tokenizer_type: CodeLlamaTokenizer
 
 load_in_8bit: true
 load_in_4bit: false
+strict: false
 
 datasets:
   - path: mhenrichsen/alpaca_2k_test
````

````diff
@@ -7,6 +7,7 @@ tokenizer_type: CodeLlamaTokenizer
 
 load_in_8bit: false
 load_in_4bit: true
+strict: false
 
 datasets:
   - path: mhenrichsen/alpaca_2k_test
````

````diff
@@ -7,6 +7,7 @@ tokenizer_type: CodeLlamaTokenizer
 
 load_in_8bit: true
 load_in_4bit: false
+strict: false
 
 datasets:
   - path: mhenrichsen/alpaca_2k_test
````

````diff
@@ -7,6 +7,7 @@ tokenizer_type: CodeLlamaTokenizer
 
 load_in_8bit: false
 load_in_4bit: true
+strict: false
 
 datasets:
   - path: mhenrichsen/alpaca_2k_test
````

````diff
@@ -4,6 +4,7 @@ tokenizer_type: AutoTokenizer
 
 load_in_8bit: false
 load_in_4bit: true
+strict: false
 
 # huggingface repo
 chat_template: cohere
````

````diff
@@ -3,6 +3,7 @@ base_model: LnL-AI/dbrx-base-converted-v2
 # hub_model_id: username/custom_model_name
 
 trust_remote_code: true
+strict: false
 
 datasets:
   - path: tatsu-lab/alpaca
````

````diff
@@ -6,6 +6,7 @@ trust_remote_code: true
 
 load_in_8bit: true
 load_in_4bit: false
+strict: false
 
 datasets:
   - path: tatsu-lab/alpaca
````

````diff
@@ -3,6 +3,7 @@ base_model: LnL-AI/dbrx-base-converted-v2
 # hub_model_id: username/custom_model_name
 
 trust_remote_code: true
+strict: false
 
 datasets:
   - path: tatsu-lab/alpaca
````
````diff
@@ -1,58 +0,0 @@
-base_model: agentica-org/DeepCoder-14B-Preview
-# Automatically upload checkpoint and final model to HF
-# hub_model_id: username/custom_model_name
-
-load_in_8bit: true
-load_in_4bit: false
-strict: false
-
-datasets:
-  - path: fozziethebeat/alpaca_messages_2k_test
-    type: chat_template
-    field_messages: messages
-    message_property_mappings:
-      role: role
-      content: content
-
-dataset_prepared_path:
-val_set_size: 0.05
-output_dir: ./outputs/lora-out
-
-sequence_len: 4096
-sample_packing: true
-eval_sample_packing: false
-pad_to_sequence_len: true
-
-adapter: lora
-lora_model_dir:
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_linear: true
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 2
-micro_batch_size: 2
-num_epochs: 4
-optimizer: adamw_bnb_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-bf16: auto
-tf32: true
-
-gradient_checkpointing: true
-resume_from_checkpoint:
-logging_steps: 1
-flash_attention: true
-
-warmup_steps: 10
-evals_per_epoch: 1
-saves_per_epoch: 1
-weight_decay: 0.0
-special_tokens:
````

````diff
@@ -1,58 +0,0 @@
-base_model: deepcogito/cogito-v1-preview-llama-3B
-# Automatically upload checkpoint and final model to HF
-# hub_model_id: username/custom_model_name
-
-load_in_8bit: true
-load_in_4bit: false
-strict: false
-
-datasets:
-  - path: fozziethebeat/alpaca_messages_2k_test
-    type: chat_template
-    field_messages: messages
-    message_property_mappings:
-      role: role
-      content: content
-
-dataset_prepared_path:
-val_set_size: 0.05
-output_dir: ./outputs/lora-out
-
-sequence_len: 4096
-sample_packing: true
-eval_sample_packing: false
-pad_to_sequence_len: true
-
-adapter: lora
-lora_model_dir:
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_linear: true
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 2
-micro_batch_size: 2
-num_epochs: 1
-optimizer: adamw_bnb_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-bf16: auto
-tf32: true
-
-gradient_checkpointing: true
-resume_from_checkpoint:
-logging_steps: 1
-flash_attention: true
-
-warmup_steps: 10
-evals_per_epoch: 1
-saves_per_epoch: 1
-weight_decay: 0.0
-special_tokens:
````

````diff
@@ -1,58 +0,0 @@
-base_model: deepcogito/cogito-v1-preview-qwen-14B
-# Automatically upload checkpoint and final model to HF
-# hub_model_id: username/custom_model_name
-
-load_in_8bit: true
-load_in_4bit: false
-strict: false
-
-datasets:
-  - path: fozziethebeat/alpaca_messages_2k_test
-    type: chat_template
-    field_messages: messages
-    message_property_mappings:
-      role: role
-      content: content
-
-dataset_prepared_path:
-val_set_size: 0.05
-output_dir: ./outputs/lora-out
-
-sequence_len: 4096
-sample_packing: true
-eval_sample_packing: false
-pad_to_sequence_len: true
-
-adapter: lora
-lora_model_dir:
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_linear: true
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 2
-micro_batch_size: 2
-num_epochs: 1
-optimizer: adamw_bnb_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-bf16: auto
-tf32: true
-
-gradient_checkpointing: true
-resume_from_checkpoint:
-logging_steps: 1
-flash_attention: true
-
-warmup_steps: 10
-evals_per_epoch: 1
-saves_per_epoch: 1
-weight_decay: 0.0
-special_tokens:
````
@@ -2,6 +2,7 @@ base_model: deepseek-ai/DeepSeek-V2-Lite
|
|||||||
# Automatically upload checkpoint and final model to HF
|
# Automatically upload checkpoint and final model to HF
|
||||||
# hub_model_id: username/custom_model_name
|
# hub_model_id: username/custom_model_name
|
||||||
trust_remote_code: true
|
trust_remote_code: true
|
||||||
|
strict: false
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: tatsu-lab/alpaca
|
- path: tatsu-lab/alpaca
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ trust_remote_code: true
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
|
strict: false
|
||||||
|
|
||||||
|
|
||||||
plugins:
|
plugins:
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ trust_remote_code: true
|
|||||||
load_in_8bit: true
|
load_in_8bit: true
|
||||||
load_in_4bit: false
|
load_in_4bit: false
|
||||||
gptq: false
|
gptq: false
|
||||||
|
strict: false
|
||||||
push_dataset_to_hub:
|
push_dataset_to_hub:
|
||||||
datasets:
|
datasets:
|
||||||
- path: teknium/GPT4-LLM-Cleaned
|
- path: teknium/GPT4-LLM-Cleaned
|
||||||
|
|||||||
@@ -15,6 +15,7 @@ load_in_8bit: false
|
|||||||
# enable 4bit for QLoRA
|
# enable 4bit for QLoRA
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
gptq: false
|
gptq: false
|
||||||
|
strict: false
|
||||||
push_dataset_to_hub:
|
push_dataset_to_hub:
|
||||||
datasets:
|
datasets:
|
||||||
- path: QingyiSi/Alpaca-CoT
|
- path: QingyiSi/Alpaca-CoT
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ tokenizer_type: AutoTokenizer
|
|||||||
# required by falcon custom model code: https://huggingface.co/tiiuae/falcon-7b/tree/main
|
# required by falcon custom model code: https://huggingface.co/tiiuae/falcon-7b/tree/main
|
||||||
trust_remote_code: true
|
trust_remote_code: true
|
||||||
gptq: false
|
gptq: false
|
||||||
|
strict: false
|
||||||
push_dataset_to_hub:
|
push_dataset_to_hub:
|
||||||
datasets:
|
datasets:
|
||||||
- path: teknium/GPT4-LLM-Cleaned
|
- path: teknium/GPT4-LLM-Cleaned
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ tokenizer_type: AutoTokenizer
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
|
strict: false
|
||||||
|
|
||||||
# huggingface repo
|
# huggingface repo
|
||||||
datasets:
|
datasets:
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ tokenizer_type: AutoTokenizer
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
|
strict: false
|
||||||
|
|
||||||
# huggingface repo
|
# huggingface repo
|
||||||
chat_template: gemma
|
chat_template: gemma
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ num_labels: 1
|
|||||||
tokenizer_type: AutoTokenizer
|
tokenizer_type: AutoTokenizer
|
||||||
# Automatically upload checkpoint and final model to HF
|
# Automatically upload checkpoint and final model to HF
|
||||||
# hub_model_id: username/custom_model_name
|
# hub_model_id: username/custom_model_name
|
||||||
|
strict: false
|
||||||
|
|
||||||
reward_model: true
|
reward_model: true
|
||||||
chat_template: gemma
|
chat_template: gemma
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ ddp_find_unused_parameters: true
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
|
strict: false
|
||||||
|
|
||||||
# huggingface repo
|
# huggingface repo
|
||||||
chat_template: gemma3
|
chat_template: gemma3
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
base_model: google/gemma-3-4b-it
|
base_model: google/gemma-3-4b-it
|
||||||
|
strict: false
|
||||||
|
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
base_model: google/gemma-3-4b-it
|
base_model: google/gemma-3-4b-it
|
||||||
processor_type: AutoProcessor
|
processor_type: AutoProcessor
|
||||||
|
strict: false
|
||||||
|
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
|
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ base_model: EleutherAI/gpt-j-6b
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
|
strict: false
|
||||||
push_dataset_to_hub:
|
push_dataset_to_hub:
|
||||||
datasets:
|
datasets:
|
||||||
- path: teknium/GPT4-LLM-Cleaned
|
- path: teknium/GPT4-LLM-Cleaned
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ trust_remote_code: true
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
|
strict: false
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ trust_remote_code: true
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
|
strict: false
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ tokenizer_type: AutoTokenizer
|
|||||||
# hub_model_id: username/custom_model_name
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
|
strict: false
|
||||||
use_tensorboard: true
|
use_tensorboard: true
|
||||||
chat_template: jamba
|
chat_template: jamba
|
||||||
datasets:
|
datasets:
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ model_type: LlamaForCausalLM
|
|||||||
tokenizer_type: LlamaTokenizer
|
tokenizer_type: LlamaTokenizer
|
||||||
# Automatically upload checkpoint and final model to HF
|
# Automatically upload checkpoint and final model to HF
|
||||||
# hub_model_id: username/custom_model_name
|
# hub_model_id: username/custom_model_name
|
||||||
|
strict: false
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ gptq_disable_exllama: true
|
|||||||
|
|
||||||
tokenizer_use_fast: true
|
tokenizer_use_fast: true
|
||||||
tokenizer_legacy: true
|
tokenizer_legacy: true
|
||||||
|
strict: false
|
||||||
push_dataset_to_hub:
|
push_dataset_to_hub:
|
||||||
hf_use_auth_token: true
|
hf_use_auth_token: true
|
||||||
datasets:
|
datasets:
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ model_type: LlamaForCausalLM
|
|||||||
tokenizer_type: LlamaTokenizer
|
tokenizer_type: LlamaTokenizer
|
||||||
# Automatically upload checkpoint and final model to HF
|
# Automatically upload checkpoint and final model to HF
|
||||||
# hub_model_id: username/custom_model_name
|
# hub_model_id: username/custom_model_name
|
||||||
|
strict: false
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: teknium/GPT4-LLM-Cleaned
|
- path: teknium/GPT4-LLM-Cleaned
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ model_type: LlamaForCausalLM
|
|||||||
tokenizer_type: LlamaTokenizer
|
tokenizer_type: LlamaTokenizer
|
||||||
# Automatically upload checkpoint and final model to HF
|
# Automatically upload checkpoint and final model to HF
|
||||||
# hub_model_id: username/custom_model_name
|
# hub_model_id: username/custom_model_name
|
||||||
|
strict: false
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ tokenizer_type: LlamaTokenizer
|
|||||||
|
|
||||||
load_in_8bit: true
|
load_in_8bit: true
|
||||||
load_in_4bit: false
|
load_in_4bit: false
|
||||||
|
strict: false
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ tokenizer_type: LlamaTokenizer
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
|
strict: false
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: yahma/alpaca-cleaned
|
- path: yahma/alpaca-cleaned
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ tokenizer_type: LlamaTokenizer
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
|
strict: false
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ tokenizer_type: LlamaTokenizer
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
|
strict: false
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: teknium/GPT4-LLM-Cleaned
|
- path: teknium/GPT4-LLM-Cleaned
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ processor_type: AutoProcessor
|
|||||||
# Automatically upload checkpoint and final model to HF
|
# Automatically upload checkpoint and final model to HF
|
||||||
# hub_model_id: username/custom_model_name
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
|
strict: false
|
||||||
|
|
||||||
# these 3 lines are needed for now to handle vision chat templates w images
|
# these 3 lines are needed for now to handle vision chat templates w images
|
||||||
skip_prepare_dataset: true
|
skip_prepare_dataset: true
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ liger_rms_norm: true
|
|||||||
liger_glu_activation: true
|
liger_glu_activation: true
|
||||||
liger_fused_linear_cross_entropy: true
|
liger_fused_linear_cross_entropy: true
|
||||||
|
|
||||||
|
strict: false
|
||||||
|
|
||||||
chat_template: llama3
|
chat_template: llama3
|
||||||
datasets:
|
datasets:
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
base_model: NousResearch/Meta-Llama-3.1-8B
|
base_model: NousResearch/Meta-Llama-3.1-8B
|
||||||
# Automatically upload checkpoint and final model to HF
|
# Automatically upload checkpoint and final model to HF
|
||||||
# hub_model_id: username/custom_model_name
|
# hub_model_id: username/custom_model_name
|
||||||
|
strict: false
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: tatsu-lab/alpaca
|
- path: tatsu-lab/alpaca
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ tokenizer_type: AutoTokenizer
|
|||||||
|
|
||||||
load_in_8bit: true
|
load_in_8bit: true
|
||||||
load_in_4bit: false
|
load_in_4bit: false
|
||||||
|
strict: false
|
||||||
|
|
||||||
chat_template: llama3
|
chat_template: llama3
|
||||||
rl: dpo
|
rl: dpo
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ tokenizer_type: AutoTokenizer
|
|||||||
|
|
||||||
load_in_8bit: true
|
load_in_8bit: true
|
||||||
load_in_4bit: false
|
load_in_4bit: false
|
||||||
|
strict: false
|
||||||
|
|
||||||
chat_template: llama3
|
chat_template: llama3
|
||||||
datasets:
|
datasets:
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ tokenizer_type: AutoTokenizer
|
|||||||
|
|
||||||
load_in_8bit: true
|
load_in_8bit: true
|
||||||
load_in_4bit: false
|
load_in_4bit: false
|
||||||
|
strict: false
|
||||||
|
|
||||||
chat_template: llama3
|
chat_template: llama3
|
||||||
rl: dpo
|
rl: dpo
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ tokenizer_type: AutoTokenizer
|
|||||||
|
|
||||||
load_in_8bit: true
|
load_in_8bit: true
|
||||||
load_in_4bit: false
|
load_in_4bit: false
|
||||||
|
strict: false
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
|
@@ -1,6 +1,7 @@
base_model: NousResearch/Llama-3.2-1B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
+strict: false

datasets:
  - path: teknium/GPT4-LLM-Cleaned
@@ -1,6 +1,7 @@
base_model: NousResearch/Llama-3.2-1B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
+strict: false

datasets:
  - path: teknium/GPT4-LLM-Cleaned
@@ -7,6 +7,7 @@ tokenizer_type: AutoTokenizer

load_in_8bit: true
load_in_4bit: false
+strict: false

datasets:
  - path: mhenrichsen/alpaca_2k_test
@@ -1,6 +1,7 @@
base_model: NousResearch/Llama-3.2-1B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
+strict: false

datasets:
  - path: teknium/GPT4-LLM-Cleaned
@@ -7,6 +7,7 @@ tokenizer_type: AutoTokenizer

load_in_8bit: true
load_in_4bit: false
+strict: false

datasets:
  - path: mhenrichsen/alpaca_2k_test
@@ -4,6 +4,7 @@ base_model: meta-llama/Llama-3.2-1B

load_in_8bit: false
load_in_4bit: true
+strict: false

rl: kto
rl_beta: 0.5
@@ -4,6 +4,7 @@ base_model: NousResearch/Llama-3.2-1B

load_in_8bit: false
load_in_4bit: true
+strict: false

datasets:
  - path: teknium/GPT4-LLM-Cleaned
@@ -5,6 +5,7 @@ tokenizer_type: AutoTokenizer
# hub_model_id: username/custom_model_name

load_in_4bit: true
+strict: false

datasets:
  - path: tatsu-lab/alpaca
@@ -7,6 +7,7 @@ tokenizer_type: AutoTokenizer # PreTrainedTokenizerFast

load_in_8bit: false
load_in_4bit: true
+strict: false

datasets:
  - path: tatsu-lab/alpaca
@@ -7,6 +7,7 @@ tokenizer_type: AutoTokenizer

load_in_8bit: false
load_in_4bit: true
+strict: false

datasets:
  - path: aaditya/alpaca_subset_1
@@ -1,28 +0,0 @@
-# Llama 4 by Meta AI
-
-## Flash Attention vs Flex Attention
-
-While Flash Attention support is "enabled" for Llama-4, the upstream implementation is not correct, and use of Flex Attention is recommended.
-
-## Available Examples
-
-### Llama 4 Scout 17Bx16Experts (109B)
-
-Flex Attention
-- [Text Single GPU (H100) QLoRA](./scout-qlora-single-h100-flex.yaml)
-- [Text Multi GPU QLoRA w/ FSDP2](./scout-qlora-flexattn-fsdp2.yaml)
-
-[//]: # (Flash Attention (Do not use))
-
-[//]: # (- [Multi-Modal/Vision QLoRA w/ FSDP1](./scout-vision-qlora-fsdp.yaml))
-
-[//]: # (- [Text Single GPU (H100) QLoRA](./scout-qlora-single-h100.yaml))
-
-[//]: # (- [Text Multi GPU QLoRA w/ FSDP1](./scout-qlora-fsdp1.yaml))
-
-Our single-H100 implementation for Llama 4 Scout uses only 64.5GB VRAM for post-training with 4k context length @ 519 tokens/second. [WandB logs here](https://wandb.ai/axolotl-ai/llama4-flexattn-qlora/runs/wpie7dkj)
-Multi-GPU (4xH100) for Llama 4 Scout uses 62.8GB VRAM/GPU @ 4k context length @ 280 tps/GPU. [WandB logs here](https://wandb.ai/axolotl-ai/llama4-flexattn-qlora/runs/2lkezdj8)
-
-### Llama 4 Maverick 17Bx128Experts (400B)
-
-Coming Soon
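The removed README above recommends Flex over Flash Attention for Llama 4. In the example configs deleted below, that recommendation boils down to two keys; a minimal excerpt:

flex_attention: true
flex_attn_compile_kwargs:
  dynamic: false
  mode: max-autotune-no-cudagraphs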
@@ -1,88 +0,0 @@
-base_model: axolotl-quants/Llama-4-Maverick-17B-128E-Linearized-bnb-nf4-bf16
-model_type: Llama4ForConditionalGeneration
-# Automatically upload checkpoint and final model to HF
-# hub_model_id: username/custom_model_name
-
-
-plugins:
-  - axolotl.integrations.liger.LigerPlugin
-
-liger_glu_activation: true
-liger_rms_norm: true
-liger_layer_norm: true
-
-llama4_linearized_experts: true
-load_in_4bit: true
-adapter: qlora
-lora_r: 32
-lora_alpha: 64
-lora_target_modules:
-  - self_attn.q_proj
-  - self_attn.k_proj
-  - self_attn.v_proj
-  - self_attn.o_proj
-  - shared_expert.gate_proj
-  - shared_expert.up_proj
-  - shared_expert.down_proj
-  # - experts.gate_projs.[0-9]+$
-  # - experts.up_projs.[0-9]+$
-  # - experts.down_projs.[0-9]+$
-lora_modules_to_save:
-  # - lm_head
-  # - embed_tokens
-
-chat_template: llama4
-datasets:
-  - path: mlabonne/FineTome-100k
-    type: chat_template
-    split: train[:20%]
-    field_messages: conversations
-    message_property_mappings:
-      role: from
-      content: value
-
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.0
-output_dir: ./outputs/out
-
-sequence_len: 4096
-sample_packing: true
-pad_to_sequence_len: true
-
-gradient_accumulation_steps: 1
-micro_batch_size: 1
-num_epochs: 1
-optimizer: adamw_torch_fused
-lr_scheduler: cosine
-learning_rate: 1e-4
-
-bf16: true
-tf32: true
-
-logging_steps: 1
-flash_attention: true
-
-gradient_checkpointing: offload
-gradient_checkpointing_kwargs:
-  use_reentrant: false
-
-warmup_steps: 20
-evals_per_epoch: 1
-saves_per_epoch: 1
-weight_decay: 0.0
-fsdp:
-  - auto_wrap
-  - full_shard
-fsdp_config:
-  fsdp_transformer_layer_cls_to_wrap: Llama4TextDecoderLayer
-  fsdp_limit_all_gathers: true
-  fsdp_sync_module_states: true
-  fsdp_offload_params: true
-  fsdp_use_orig_params: false
-  fsdp_cpu_ram_efficient_loading: true
-  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
-  fsdp_state_dict_type: FULL_STATE_DICT
-  fsdp_sharding_strategy: FULL_SHARD
-special_tokens:
-  pad_token: <|finetune_right_pad_id|>
-  eos_token: <|eot|>
@@ -1,92 +0,0 @@
-base_model: axolotl-quants/Llama-4-Scout-17B-16E-Linearized-bnb-nf4-bf16
-model_type: Llama4ForConditionalGeneration
-# Automatically upload checkpoint and final model to HF
-# hub_model_id: username/custom_model_name
-
-
-# torch_compile: true
-plugins:
-  - axolotl.integrations.liger.LigerPlugin
-
-liger_glu_activation: true
-liger_rms_norm: true
-liger_layer_norm: true
-
-llama4_linearized_experts: true
-load_in_4bit: true
-adapter: qlora
-lora_r: 32
-lora_alpha: 64
-lora_target_modules:
-  - self_attn.q_proj
-  - self_attn.k_proj
-  - self_attn.v_proj
-  - self_attn.o_proj
-  - shared_expert.gate_proj
-  - shared_expert.up_proj
-  - shared_expert.down_proj
-  # - experts.gate_projs.[0-9]+$
-  # - experts.up_projs.[0-9]+$
-  # - experts.down_projs.[0-9]+$
-lora_modules_to_save:
-  - lm_head
-  - embed_tokens
-
-chat_template: llama4
-datasets:
-  - path: mlabonne/FineTome-100k
-    type: chat_template
-    split: train[:20%]
-    field_messages: conversations
-    message_property_mappings:
-      role: from
-      content: value
-
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.0
-output_dir: ./outputs/out
-
-sequence_len: 4096
-sample_packing: true
-pad_to_sequence_len: true
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 1
-micro_batch_size: 1
-num_epochs: 1
-optimizer: adamw_torch_fused
-lr_scheduler: cosine
-learning_rate: 2e-5
-
-bf16: true
-tf32: true
-
-logging_steps: 1
-flash_attention: true
-
-warmup_steps: 100
-evals_per_epoch: 1
-saves_per_epoch: 1
-weight_decay: 0.0
-fsdp:
-  - auto_wrap
-  - full_shard
-fsdp_config:
-  fsdp_transformer_layer_cls_to_wrap: Llama4TextDecoderLayer
-  fsdp_limit_all_gathers: true
-  fsdp_sync_module_states: true
-  fsdp_offload_params: true
-  fsdp_use_orig_params: false
-  fsdp_cpu_ram_efficient_loading: true
-  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
-  fsdp_state_dict_type: FULL_STATE_DICT
-  fsdp_sharding_strategy: FULL_SHARD
-  fsdp_activation_checkpointing: true
-special_tokens:
-  pad_token: <|finetune_right_pad_id|>
-  eos_token: <|eot|>
@@ -1,85 +0,0 @@
-base_model: axolotl-quants/Llama-4-Scout-17B-16E-Linearized-bnb-nf4-bf16
-model_type: Llama4ForConditionalGeneration
-# Automatically upload checkpoint and final model to HF
-# hub_model_id: username/custom_model_name
-
-
-plugins:
-  - axolotl.integrations.liger.LigerPlugin
-
-liger_glu_activation: true
-liger_rms_norm: true
-liger_layer_norm: true
-
-llama4_linearized_experts: true
-load_in_4bit: true
-adapter: qlora
-lora_r: 32
-lora_alpha: 64
-lora_target_modules:
-  - self_attn.q_proj
-  - self_attn.k_proj
-  - self_attn.v_proj
-  - self_attn.o_proj
-  - shared_expert.gate_proj
-  - shared_expert.up_proj
-  - shared_expert.down_proj
-  # - experts.gate_projs.[0-9]+$
-  # - experts.up_projs.[0-9]+$
-  # - experts.down_projs.[0-9]+$
-lora_modules_to_save:
-  # - lm_head
-  # - embed_tokens
-
-lora_mlp_kernel: true
-lora_qkv_kernel: true
-lora_o_kernel: true
-
-chat_template: llama4
-datasets:
-  - path: mlabonne/FineTome-100k
-    type: chat_template
-    split: train[:20%]
-    field_messages: conversations
-    message_property_mappings:
-      role: from
-      content: value
-
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.0
-output_dir: ./outputs/out
-
-sequence_len: 4096 # up to 8k will work on a single H100
-sample_packing: true
-pad_to_sequence_len: true
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 1
-micro_batch_size: 1
-num_epochs: 1
-optimizer: adamw_torch_4bit
-lr_scheduler: cosine
-learning_rate: 1e-4
-
-bf16: true
-tf32: true
-
-logging_steps: 1
-flash_attention: true
-
-gradient_checkpointing: offload
-gradient_checkpointing_kwargs:
-  use_reentrant: false
-
-warmup_steps: 20
-evals_per_epoch: 1
-saves_per_epoch: 1
-weight_decay: 0.0
-special_tokens:
-  pad_token: <|finetune_right_pad_id|>
-  eos_token: <|eot|>
@@ -1,88 +0,0 @@
-base_model: axolotl-quants/Llama-4-Scout-17B-16E-Linearized-bnb-nf4-bf16
-model_type: Llama4ForConditionalGeneration
-processor_type: Llama4Processor
-# Automatically upload checkpoint and final model to HF
-# hub_model_id: username/custom_model_name
-
-
-# these 3 lines are needed for now to handle vision chat templates w images
-skip_prepare_dataset: true
-remove_unused_columns: false
-sample_packing: false
-
-sequence_len: 4096
-
-plugins:
-  - axolotl.integrations.liger.LigerPlugin
-
-liger_glu_activation: true
-liger_rms_norm: true
-liger_layer_norm: true
-
-llama4_linearized_experts: true # use Axolotl's customized model
-load_in_4bit: true
-adapter: qlora
-lora_r: 32
-lora_alpha: 64
-lora_target_modules:
-  - self_attn.q_proj
-  - self_attn.k_proj
-  - self_attn.v_proj
-  - self_attn.o_proj
-  - shared_expert.gate_proj
-  - shared_expert.up_proj
-  - shared_expert.down_proj
-  - vision_adapter.mlp.fc1
-  - vision_adapter.mlp.fc2
-  # - experts.gate_projs.[0-9]+$
-  # - experts.up_projs.[0-9]+$
-  # - experts.down_projs.[0-9]+$
-lora_modules_to_save:
-  - lm_head
-  - embed_tokens
-
-chat_template: llama4
-datasets:
-  - path: HuggingFaceH4/llava-instruct-mix-vsft
-    type: chat_template
-    split: train[:1%]
-    field_messages: messages
-
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.0
-output_dir: ./outputs/out
-
-gradient_accumulation_steps: 1
-micro_batch_size: 1
-num_epochs: 1
-optimizer: adamw_torch_4bit
-lr_scheduler: cosine
-learning_rate: 2e-5
-
-bf16: true
-tf32: true
-
-logging_steps: 1
-flash_attention: true
-
-warmup_steps: 100
-evals_per_epoch: 1
-saves_per_epoch: 1
-weight_decay: 0.0
-fsdp:
-  - auto_wrap
-  - full_shard
-fsdp_config:
-  fsdp_transformer_layer_cls_to_wrap: Llama4TextDecoderLayer
-  fsdp_limit_all_gathers: true
-  fsdp_sync_module_states: true
-  fsdp_offload_params: true
-  fsdp_use_orig_params: false
-  fsdp_cpu_ram_efficient_loading: true
-  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
-  fsdp_state_dict_type: FULL_STATE_DICT
-  fsdp_sharding_strategy: FULL_SHARD
-  fsdp_activation_checkpointing: true
-special_tokens:
-  pad_token: <|finetune_right_pad_id|>
-  eos_token: <|eot|>
@@ -1,86 +0,0 @@
-base_model: axolotl-quants/Llama-4-Scout-17B-16E-Linearized-bnb-nf4-bf16
-model_type: Llama4ForConditionalGeneration
-# Automatically upload checkpoint and final model to HF
-# hub_model_id: username/custom_model_name
-
-plugins:
-  - axolotl.integrations.liger.LigerPlugin
-
-liger_glu_activation: true
-liger_rms_norm: true
-liger_layer_norm: true
-
-llama4_linearized_experts: true
-load_in_4bit: true
-adapter: qlora
-lora_r: 32
-lora_alpha: 64
-lora_target_modules:
-  - self_attn.q_proj
-  - self_attn.k_proj
-  - self_attn.v_proj
-  - self_attn.o_proj
-  - shared_expert.gate_proj
-  - shared_expert.up_proj
-  - shared_expert.down_proj
-  # - experts.gate_projs.[0-9]+$
-  # - experts.up_projs.[0-9]+$
-  # - experts.down_projs.[0-9]+$
-lora_modules_to_save:
-  # - lm_head
-  # - embed_tokens
-
-chat_template: llama4
-datasets:
-  - path: mlabonne/FineTome-100k
-    type: chat_template
-    split: train[:20%]
-    field_messages: conversations
-    message_property_mappings:
-      role: from
-      content: value
-
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.0
-output_dir: ./outputs/out
-
-sequence_len: 4096
-sample_packing: true
-pad_to_sequence_len: true
-
-gradient_accumulation_steps: 1
-micro_batch_size: 2
-num_epochs: 3
-optimizer: adamw_torch_4bit
-lr_scheduler: cosine
-learning_rate: 1e-4
-
-bf16: true
-tf32: true
-
-logging_steps: 1
-flex_attention: true
-flex_attn_compile_kwargs:
-  dynamic: false
-  mode: max-autotune-no-cudagraphs
-
-warmup_steps: 10
-evals_per_epoch: 1
-saves_per_epoch: 1
-weight_decay: 0.0
-fsdp:
-  - auto_wrap
-  - full_shard
-fsdp_config:
-  fsdp_version: 2
-  fsdp_offload_params: false
-  fsdp_cpu_ram_efficient_loading: true
-  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
-  fsdp_transformer_layer_cls_to_wrap: Llama4TextDecoderLayer
-  fsdp_state_dict_type: SHARDED_STATE_DICT
-  fsdp_sharding_strategy: FULL_SHARD
-  fsdp_reshard_after_forward: true
-  fsdp_activation_checkpointing: true
-special_tokens:
-  pad_token: <|finetune_right_pad_id|>
-  eos_token: <|eot|>
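Note the FSDP split across the deleted examples: the flash-attention configs use FSDP1-style settings, while the flex-attention configs (such as the one above) switch to FSDP2. The differing `fsdp_config` keys, excerpted from the deleted files:

# FSDP1 (flash-attention examples)
fsdp_config:
  fsdp_offload_params: true
  fsdp_use_orig_params: false
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_activation_checkpointing: true
# FSDP2 (flex-attention examples) instead sets:
#   fsdp_version: 2
#   fsdp_offload_params: false
#   fsdp_state_dict_type: SHARDED_STATE_DICT
#   fsdp_reshard_after_forward: true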
@@ -1,85 +0,0 @@
-base_model: axolotl-quants/Llama-4-Scout-17B-16E-Linearized-bnb-nf4-bf16
-model_type: Llama4ForConditionalGeneration
-# Automatically upload checkpoint and final model to HF
-# hub_model_id: username/custom_model_name
-
-plugins:
-  - axolotl.integrations.liger.LigerPlugin
-  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
-
-liger_glu_activation: true
-liger_rms_norm: true
-liger_layer_norm: true
-cut_cross_entropy: true
-
-llama4_linearized_experts: true # needed with custom linearized experts model
-load_in_4bit: true
-adapter: qlora
-lora_r: 32
-lora_alpha: 64
-lora_target_modules:
-  - self_attn.q_proj
-  - self_attn.k_proj
-  - self_attn.v_proj
-  - self_attn.o_proj
-  - shared_expert.gate_proj
-  - shared_expert.up_proj
-  - shared_expert.down_proj
-  # - experts.gate_projs.[0-9]+$ # optionally train the moe experts
-  # - experts.up_projs.[0-9]+$
-  # - experts.down_projs.[0-9]+$
-lora_modules_to_save:
-  # - lm_head # needed if modifying vocabulary
-  # - embed_tokens
-
-lora_mlp_kernel: true
-lora_qkv_kernel: true
-lora_o_kernel: true
-
-chat_template: llama4
-datasets:
-  - path: mlabonne/FineTome-100k
-    type: chat_template
-    split: train[:20%]
-    field_messages: conversations
-    message_property_mappings:
-      role: from
-      content: value
-
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.0
-output_dir: ./outputs/out
-
-sequence_len: 4096 # up to 8k will work on a single H100
-sample_packing: true
-pad_to_sequence_len: true
-
-gradient_accumulation_steps: 1
-micro_batch_size: 1
-num_epochs: 1
-optimizer: adamw_torch_4bit
-lr_scheduler: cosine
-learning_rate: 1e-4
-
-bf16: true
-tf32: true
-
-torch_compile: true
-flex_attention: true
-flex_attn_compile_kwargs:
-  dynamic: false
-  mode: max-autotune-no-cudagraphs
-
-gradient_checkpointing: offload
-gradient_checkpointing_kwargs:
-  use_reentrant: false
-
-logging_steps: 1
-warmup_steps: 20
-evals_per_epoch: 1
-saves_per_epoch: 1
-
-weight_decay: 0.0
-special_tokens:
-  pad_token: <|finetune_right_pad_id|>
-  eos_token: <|eot|>
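This single-H100 flex config is the one the removed README credits with 64.5GB VRAM at 519 tokens/second for 4k-context post-training. Its memory-oriented settings, excerpted (the trailing comments are editorial glosses, not from the file):

load_in_4bit: true              # QLoRA: NF4-quantized base weights
gradient_checkpointing: offload
optimizer: adamw_torch_4bit     # assumed: quantized optimizer states
cut_cross_entropy: true         # fused cross-entropy via the CCE plugin
torch_compile: true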
@@ -1,89 +0,0 @@
-base_model: axolotl-quants/Llama-4-Scout-17B-16E-Linearized-bnb-nf4-bf16
-model_type: Llama4ForConditionalGeneration
-processor_type: Llama4Processor
-# Automatically upload checkpoint and final model to HF
-# hub_model_id: username/custom_model_name
-
-# these 3 lines are needed for now to handle vision chat templates w images
-skip_prepare_dataset: true
-remove_unused_columns: false
-sample_packing: false
-
-sequence_len: 4096
-
-plugins:
-  - axolotl.integrations.liger.LigerPlugin
-
-liger_glu_activation: true
-liger_rms_norm: true
-liger_layer_norm: true
-
-llama4_linearized_experts: true # use Axolotl's customized model
-load_in_4bit: true
-adapter: qlora
-lora_r: 32
-lora_alpha: 64
-lora_target_modules:
-  - self_attn.q_proj
-  - self_attn.k_proj
-  - self_attn.v_proj
-  - self_attn.o_proj
-  - shared_expert.gate_proj
-  - shared_expert.up_proj
-  - shared_expert.down_proj
-  - vision_adapter.mlp.fc1
-  - vision_adapter.mlp.fc2
-  # - experts.gate_projs.[0-9]+$
-  # - experts.up_projs.[0-9]+$
-  # - experts.down_projs.[0-9]+$
-lora_modules_to_save:
-  - lm_head
-  - embed_tokens
-
-chat_template: llama4
-datasets:
-  - path: HuggingFaceH4/llava-instruct-mix-vsft
-    type: chat_template
-    split: train[:1%]
-    field_messages: messages
-
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.0
-output_dir: ./outputs/out
-
-gradient_accumulation_steps: 1
-micro_batch_size: 1
-num_epochs: 1
-optimizer: adamw_torch_4bit
-lr_scheduler: cosine
-learning_rate: 1e-4
-
-bf16: true
-tf32: true
-
-logging_steps: 1
-flex_attention: true
-flex_attn_compile_kwargs:
-  dynamic: false
-  mode: max-autotune-no-cudagraphs
-
-warmup_steps: 10
-evals_per_epoch: 1
-saves_per_epoch: 1
-weight_decay: 0.0
-fsdp:
-  - auto_wrap
-  - full_shard
-fsdp_config:
-  fsdp_version: 2
-  fsdp_offload_params: false
-  fsdp_cpu_ram_efficient_loading: true
-  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
-  fsdp_transformer_layer_cls_to_wrap: Llama4TextDecoderLayer
-  fsdp_state_dict_type: SHARDED_STATE_DICT
-  fsdp_sharding_strategy: FULL_SHARD
-  fsdp_reshard_after_forward: true
-  fsdp_activation_checkpointing: true
-special_tokens:
-  pad_token: <|finetune_right_pad_id|>
-  eos_token: <|eot|>
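Relative to the text-only Scout examples, the deleted vision variants differ by only a handful of keys, excerpted here (the `lora_target_modules` list is trimmed to the vision-specific entries):

processor_type: Llama4Processor
# these 3 lines are needed for now to handle vision chat templates w images
skip_prepare_dataset: true
remove_unused_columns: false
sample_packing: false
lora_target_modules:            # adds the vision adapter to the LoRA targets
  - vision_adapter.mlp.fc1
  - vision_adapter.mlp.fc2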
@@ -1,5 +1,6 @@
base_model: llava-hf/llava-1.5-7b-hf
processor_type: AutoProcessor
+strict: false

# these 3 lines are needed for now to handle vision chat templates w images
skip_prepare_dataset: true
@@ -5,6 +5,7 @@ tokenizer_type: AutoTokenizer
tokenizer_config: EleutherAI/gpt-neox-20b
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
+strict: false

datasets:
  - path: mhenrichsen/alpaca_2k_test
@@ -6,6 +6,7 @@ tokenizer_type: LlamaTokenizer
# hub_model_id: username/custom_model_name

trust_remote_code: true
+strict: false

unfrozen_parameters:
  - ^lm_head.weight$
@@ -4,6 +4,7 @@ model_type: MistralForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
+strict: false

datasets:
  - path: mhenrichsen/alpaca_2k_test
@@ -4,6 +4,7 @@ model_type: MistralForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
+strict: false

datasets:
  - path: mhenrichsen/alpaca_2k_test
@@ -7,6 +7,7 @@ tokenizer_type: LlamaTokenizer

load_in_8bit: true
load_in_4bit: false
+strict: false

datasets:
  - path: mhenrichsen/alpaca_2k_test
@@ -12,6 +12,7 @@ tokenizer_type: LlamaTokenizer

load_in_8bit: false
load_in_4bit: true
+strict: false

chat_template: chatml
rl: dpo
@@ -9,6 +9,7 @@ trust_remote_code: true

load_in_8bit: false
load_in_4bit: true
+strict: false

datasets:
  - path: tatsu-lab/alpaca
@@ -7,6 +7,7 @@ tokenizer_type: LlamaTokenizer

load_in_8bit: false
load_in_4bit: true
+strict: false

rl: orpo
orpo_alpha: 0.1
@@ -1,5 +1,6 @@
base_model: mistralai/Mistral-Small-3.1-24B-Instruct-2503
processor_type: AutoProcessor
+strict: false

load_in_8bit: true
@@ -7,6 +7,7 @@ tokenizer_type: LlamaTokenizer

load_in_8bit: false
load_in_4bit: true
+strict: false

datasets:
  - path: tatsu-lab/alpaca
@@ -9,6 +9,7 @@ trust_remote_code: true

load_in_8bit: false
load_in_4bit: true
+strict: false

datasets:
  - path: tatsu-lab/alpaca
@@ -9,6 +9,7 @@ trust_remote_code: true

load_in_8bit: false
load_in_4bit: true
+strict: false

datasets:
  - path: tatsu-lab/alpaca
@@ -6,6 +6,7 @@ tokenizer_type: LlamaTokenizer
# hub_model_id: username/custom_model_name

trust_remote_code: true
+strict: false

unfrozen_parameters:
  - ^lm_head.weight$
@@ -7,6 +7,7 @@ tokenizer_type: LlamaTokenizer

load_in_8bit: false
load_in_4bit: true
+strict: false

datasets:
  - path: mhenrichsen/alpaca_2k_test
@@ -4,6 +4,7 @@ model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
+strict: false
push_dataset_to_hub:
datasets:
  - path: teknium/GPT4-LLM-Cleaned
@@ -7,6 +7,7 @@ tokenizer_type: LlamaTokenizer

load_in_8bit: true
load_in_4bit: false
+strict: false
push_dataset_to_hub:
datasets:
  - path: teknium/GPT4-LLM-Cleaned
@@ -7,6 +7,7 @@ tokenizer_type: LlamaTokenizer

load_in_8bit: false
load_in_4bit: true
+strict: false
push_dataset_to_hub:
datasets:
  - path: teknium/GPT4-LLM-Cleaned
@@ -7,6 +7,7 @@ tokenizer_type: AutoTokenizer

load_in_8bit: true
load_in_4bit: false
+strict: false

chat_template: phi_3
datasets:
Some files were not shown because too many files have changed in this diff.