Compare commits


44 Commits

Author | SHA1 | Message | Date
Dan Saunders | 4ac65462f0 | precommit | 2025-03-21 16:43:14 +00:00
Dan Saunders | ce35b2a95f | precommit | 2025-03-21 16:36:56 +00:00
Dan Saunders | ab3b36339a | fix tests | 2025-03-21 16:36:54 +00:00
Dan Saunders | 22cfa42961 | small updates | 2025-03-21 16:36:34 +00:00
Dan Saunders | 0b2c2ed68c | refactors, SP mixin | 2025-03-21 16:36:34 +00:00
Dan Saunders | 2f0b4626b9 | review comments, docstrings | 2025-03-21 16:36:32 +00:00
Dan Saunders | a26985c53c | small changes | 2025-03-21 16:36:17 +00:00
Dan Saunders | c1a58339e8 | add SP doc, review comments | 2025-03-21 16:36:17 +00:00
Dan Saunders | 411df76a97 | bugfix | 2025-03-21 16:36:17 +00:00
Dan Saunders | a09d1ccbf2 | removing print statement | 2025-03-21 16:36:17 +00:00
Dan Saunders | 2727d86544 | non-seq2se1 collator fix | 2025-03-21 16:36:17 +00:00
Dan Saunders | 64c203cdef | sampler / dataloader refactor | 2025-03-21 16:36:17 +00:00
Dan Saunders | 7d7042f602 | test fix | 2025-03-21 16:36:17 +00:00
Dan Saunders | d187f1f8e2 | using field validator instead of model validator | 2025-03-21 16:36:17 +00:00
Dan Saunders | 1cced52719 | rename file, delete another | 2025-03-21 16:36:17 +00:00
Dan Saunders | 11321b17e7 | removing flash-attn from requirements.txt (in setup.py extras already) | 2025-03-21 16:36:17 +00:00
Wing Lian | 7a1a211c99 | move ring flash attn to extras with flash-attn (#2414) | 2025-03-21 16:36:17 +00:00
Dan Saunders | e1a02a32b5 | fix | 2025-03-21 16:36:17 +00:00
Dan Saunders | a6ef6c7764 | fix | 2025-03-21 16:36:17 +00:00
Dan Saunders | cb3a9e99a3 | gracefully handle no ring-flash-attn | 2025-03-21 16:36:17 +00:00
Dan Saunders | 3ae47ec7de | actually isolate CLI tests | 2025-03-21 16:36:17 +00:00
Dan Saunders | e36dc763ab | isolate cli tests | 2025-03-21 16:36:17 +00:00
Dan Saunders | 03027cf6bf | pernicious Fire CLI bugfix | 2025-03-21 16:36:16 +00:00
Dan Saunders | 0ade60d455 | another import scoping change | 2025-03-21 16:35:56 +00:00
Dan Saunders | 02e1a42f04 | scoping down problematic import | 2025-03-21 16:35:56 +00:00
Dan Saunders | 919b88f11b | update config.qmd and rename option | 2025-03-21 16:35:55 +00:00
Dan Saunders | 345a9dd831 | removing some obvious comments | 2025-03-21 16:35:38 +00:00
Dan Saunders | 4ff97bc9d4 | eval dataloader and sampler changes | 2025-03-21 16:35:38 +00:00
Dan Saunders | d0e178d52f | remove debug logs and simplify | 2025-03-21 16:35:38 +00:00
Dan Saunders | 5731cdc0cf | fixing sample packing | 2025-03-21 16:35:38 +00:00
Dan Saunders | b7738d57c4 | working multi-group SP | 2025-03-21 16:35:38 +00:00
Dan Saunders | 698e599bf7 | precommit fixes | 2025-03-21 16:35:38 +00:00
Dan Saunders | 1d339e4007 | fixes | 2025-03-21 16:35:38 +00:00
Dan Saunders | 4190ad0647 | updates | 2025-03-21 16:35:36 +00:00
Dan Saunders | b44a207248 | update | 2025-03-21 16:35:10 +00:00
Dan Saunders | 51c326150b | pytest | 2025-03-21 16:35:10 +00:00
Dan Saunders | 14baaf6e0a | updates | 2025-03-21 16:35:10 +00:00
Dan Saunders | f487910444 | removing unused code | 2025-03-21 16:35:08 +00:00
Dan Saunders | c5071dfd8a | fix req | 2025-03-21 16:34:12 +00:00
Dan Saunders | e323145ba9 | remove errant file | 2025-03-21 16:34:12 +00:00
Dan Saunders | 7efc787ac8 | cleanup | 2025-03-21 16:34:12 +00:00
Dan Saunders | dce61cdab1 | progress on ring attn impl | 2025-03-21 16:34:12 +00:00
Dan Saunders | bd952de9d2 | progress on ring attn impl | 2025-03-21 16:34:10 +00:00
Dan Saunders | 3f8a43cab6 | adding easy_context as integration for now | 2025-03-21 16:33:46 +00:00
222 changed files with 2505 additions and 7058 deletions

View File

@@ -40,24 +40,12 @@ jobs:
python_version: "3.11"
pytorch: 2.6.0
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
- cuda: "126"
cuda_version: 12.6.3
cudnn_version: ""
python_version: "3.11"
pytorch: 2.6.0
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
- cuda: "128" - cuda: "128"
cuda_version: 12.8.1 cuda_version: 12.8.1
cudnn_version: "" cudnn_version: ""
python_version: "3.11" python_version: "3.11"
pytorch: nightly pytorch: nightly
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
- cuda: "128"
cuda_version: 12.8.1
cudnn_version: ""
python_version: "3.11"
pytorch: next
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
steps:
- name: Checkout
uses: actions/checkout@v4
@@ -79,7 +67,7 @@ jobs:
uses: docker/build-push-action@v4
with:
context: .
- file: ${{ matrix.pytorch == 'nightly' && './docker/Dockerfile-base-nightly' || matrix.pytorch == 'next' && './docker/Dockerfile-base-next' || './docker/Dockerfile-base' }}
+ file: ${{ matrix.pytorch == 'nightly' && './docker/Dockerfile-base-nightly' || './docker/Dockerfile-base' }}
push: ${{ github.event_name != 'pull_request' }}
tags: ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
labels: ${{ steps.metadata.outputs.labels }}

View File

@@ -23,7 +23,6 @@ jobs:
- name: Install dependencies
run: |
python3 -m pip install jupyter quartodoc
python3 -m pip install -e . --no-deps
- name: Build autodoc
run: quartodoc build
- name: Publish to GitHub Pages (and render)

View File

@@ -25,12 +25,12 @@ jobs:
python_version: "3.11"
pytorch: 2.5.1
axolotl_extras: vllm
is_latest: true
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.6.0
axolotl_extras:
is_latest: true
runs-on: axolotl-gpu-runner
steps:
- name: Checkout
@@ -87,12 +87,12 @@ jobs:
python_version: "3.11"
pytorch: 2.5.1
axolotl_extras:
is_latest: true
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.6.0
axolotl_extras:
is_latest: true
runs-on: axolotl-gpu-runner
steps:
- name: Checkout

View File

@@ -24,13 +24,6 @@ jobs:
fail-fast: false
matrix:
include:
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.6.0
axolotl_extras: vllm
num_gpus: 2
nightly_build: "true"
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
@@ -45,6 +38,14 @@ jobs:
axolotl_extras: vllm
num_gpus: 2
nightly_build: "true"
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.6.0
# awaiting vllm#12721
axolotl_extras:
num_gpus: 2
nightly_build: "true"
runs-on: [self-hosted, modal]
timeout-minutes: 120
steps:

View File

@@ -33,15 +33,6 @@ jobs:
- name: Check out repository code
uses: actions/checkout@v4
- name: Restore HF cache
id: hf-cache-restore
uses: actions/cache/restore@v4
with:
path: |
/home/runner/.cache/huggingface/hub/datasets--*
/home/runner/.cache/huggingface/hub/models--*
key: ${{ runner.os }}-hf-hub-cache-v2
- name: Setup Python
uses: actions/setup-python@v5
with:
@@ -55,7 +46,7 @@ jobs:
- name: Install PyTorch
run: |
- pip3 install torch==${{ matrix.pytorch_version }}
+ pip3 install torch==${{ matrix.pytorch_version }} --index-url https://download.pytorch.org/whl/cpu
- name: Update requirements.txt
run: |
@@ -67,7 +58,8 @@ jobs:
- name: Install dependencies
run: |
- pip3 show torch
+ pip3 install --upgrade pip
pip3 install --upgrade packaging==23.2
pip3 install --no-build-isolation -U -e .
python scripts/unsloth_install.py | sh
python scripts/cutcrossentropy_install.py | sh
@@ -81,15 +73,10 @@ jobs:
run: |
axolotl --help
- name: Pre-Download dataset fixture
run: |
huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
- name: Run tests
run: |
- pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/
+ pytest -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ tests/
- pytest -v tests/patched/
+ pytest tests/patched/
pytest -v tests/cli/
- name: cleanup pip cache
run: |
@@ -149,4 +136,4 @@ jobs:
echo "NIGHTLY_BUILD=${{ matrix.nightly_build }}" >> $GITHUB_ENV echo "NIGHTLY_BUILD=${{ matrix.nightly_build }}" >> $GITHUB_ENV
- name: Run tests job on Modal - name: Run tests job on Modal
run: | run: |
modal run cicd.e2e_tests modal run cicd.tests

View File

@@ -63,7 +63,7 @@ jobs:
path: |
/home/runner/.cache/huggingface/hub/datasets--*
/home/runner/.cache/huggingface/hub/models--*
- key: ${{ runner.os }}-hf-hub-cache-v2
+ key: ${{ runner.os }}-hf-hub-cache-${{ hashFiles('**/conftest.py') }}
- name: Setup Python
uses: actions/setup-python@v5
@@ -96,10 +96,6 @@ jobs:
run: |
axolotl --help
- name: Pre-Download dataset fixture
run: |
huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
- name: Run tests
run: |
pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/
@@ -141,7 +137,7 @@ jobs:
path: |
/home/runner/.cache/huggingface/hub/datasets--*
/home/runner/.cache/huggingface/hub/models--*
- key: ${{ runner.os }}-hf-hub-cache-v2
+ key: ${{ runner.os }}-hf-hub-cache-${{ hashFiles('**/conftest.py') }}
- name: Setup Python
uses: actions/setup-python@v5
@@ -175,9 +171,6 @@ jobs:
run: |
axolotl --help
- name: Show HF cache
run: huggingface-cli scan-cache
- name: Run tests
run: |
pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/
@@ -208,53 +201,6 @@ jobs:
fail-fast: false
matrix:
include:
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.6.0
num_gpus: 1
axolotl_extras: vllm
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Install Python
uses: actions/setup-python@v5
with:
python-version: "3.11"
- name: Install Modal
run: |
python -m pip install --upgrade pip
pip install modal==0.71.8 jinja2
- name: Update env vars
run: |
echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
- name: Run tests job on Modal
run: |
modal run cicd.e2e_tests
docker-e2e-tests:
if: github.repository_owner == 'axolotl-ai-cloud'
# this job needs to be run on self-hosted GPU runners...
runs-on: [self-hosted, modal]
timeout-minutes: 90
needs: [pre-commit, pytest, docker-e2e-tests-1st]
strategy:
fail-fast: false
matrix:
include:
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.4.1
num_gpus: 1
axolotl_extras:
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
@@ -283,4 +229,51 @@ jobs:
echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
- name: Run tests job on Modal - name: Run tests job on Modal
run: | run: |
modal run cicd.e2e_tests modal run cicd.tests
docker-e2e-tests:
if: github.repository_owner == 'axolotl-ai-cloud'
# this job needs to be run on self-hosted GPU runners...
runs-on: [self-hosted, modal]
timeout-minutes: 90
needs: [pre-commit, pytest, docker-e2e-tests-1st]
strategy:
fail-fast: false
matrix:
include:
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.4.1
num_gpus: 1
axolotl_extras:
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.6.0
num_gpus: 1
axolotl_extras:
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Install Python
uses: actions/setup-python@v5
with:
python-version: "3.11"
- name: Install Modal
run: |
python -m pip install --upgrade pip
pip install modal==0.71.8 jinja2
- name: Update env vars
run: |
echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
- name: Run tests job on Modal
run: |
modal run cicd.tests

View File

@@ -1,4 +1,3 @@
[settings]
profile=black
known_third_party=wandb,comet_ml
known_local_folder=src,tests

View File

@@ -40,7 +40,6 @@ quartodoc:
- cli.preprocess
- cli.sweeps
- cli.utils
- cli.vllm_serve
- cli.cloud.base
- cli.cloud.modal_
- title: Trainers
@@ -134,7 +133,6 @@ quartodoc:
- utils.schemas.datasets
- utils.schemas.peft
- utils.schemas.trl
- utils.schemas.multimodal
- utils.schemas.integrations
- utils.schemas.enums
- utils.schemas.utils
@@ -244,7 +242,6 @@ website:
- docs/unsloth.qmd
- docs/torchao.qmd
- docs/custom_integrations.qmd
- docs/sequence_parallelism.qmd
- section: "Troubleshooting"
contents:

View File

@@ -2,5 +2,4 @@
set -e
# only run one test at a time so as not to OOM the GPU
- pytest -v --durations=10 -n2 /workspace/axolotl/tests/e2e/multigpu/ --ignore=/workspace/axolotl/tests/e2e/multigpu/solo/
+ pytest -v -n2 /workspace/axolotl/tests/e2e/multigpu/
pytest -v --durations=10 -n1 /workspace/axolotl/tests/e2e/multigpu/solo/

View File

@@ -20,9 +20,9 @@ WORKDIR /workspace/axolotl
# If AXOLOTL_EXTRAS is set, append it in brackets
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
- pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
+ pip install --no-build-isolation -e .[deepspeed,flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
else \
- pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray] $AXOLOTL_ARGS; \
+ pip install --no-build-isolation -e .[deepspeed,flash-attn,optimizers,ray] $AXOLOTL_ARGS; \
fi
RUN python scripts/unsloth_install.py | sh

View File

@@ -1,38 +0,0 @@
ARG CUDA_VERSION="12.8.1"
ARG CUDNN_VERSION="8"
ARG UBUNTU_VERSION="22.04"
ARG MAX_JOBS=4
FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION AS base-builder
ENV PATH="/root/miniconda3/bin:${PATH}"
ARG PYTHON_VERSION="3.11"
ARG PYTORCH_VERSION="next"
ARG CUDA="128"
ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
ENV PYTHON_VERSION=$PYTHON_VERSION
ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
RUN apt-get update \
&& apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev pkg-config && rm -rf /var/lib/apt/lists/* \
&& wget \
https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
&& mkdir /root/.conda \
&& bash Miniconda3-latest-Linux-x86_64.sh -b \
&& rm -f Miniconda3-latest-Linux-x86_64.sh \
&& conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}"
ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
WORKDIR /workspace
RUN python3 -m pip install --upgrade pip && pip3 install packaging && \
python3 -m pip install --no-cache-dir -U torch==2.7.0 --extra-index-url https://download.pytorch.org/whl/test/cu$CUDA && \
python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \
python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main"
RUN git lfs install --skip-repo && \
pip3 install awscli && \
pip3 install -U --no-cache-dir pydantic==2.10.6

View File

@@ -170,7 +170,7 @@ axolotl merge-sharded-fsdp-weights config.yml
### evaluate
- Evaluates a model's performance (loss etc) on the train and eval datasets.
+ Evaluates a model's performance using metrics specified in the config.
```bash
# Basic evaluation
@@ -197,8 +197,6 @@ lm_eval_batch_size: # Batch size for evaluation
output_dir: # Directory to save evaluation results
```
See [LM Eval Harness](https://github.com/EleutherAI/lm-evaluation-harness) for more details.
## Legacy CLI Usage
While the new Click-based CLI is preferred, Axolotl still supports the legacy module-based CLI:
@@ -237,7 +235,7 @@ Create a cloud config YAML with your Modal settings:
```yaml
# cloud_config.yml
provider: modal
gpu: a100 # Supported: l40s, a100-40gb, a100-80gb, a10g, h100, t4, l4
gpu_count: 1 # Number of GPUs to use
timeout: 86400 # Maximum runtime in seconds (24 hours)
branch: main # Git branch to use (optional)
@@ -250,7 +248,7 @@ volumes: # Persistent storage volumes
- name: axolotl-artifacts
mount: /workspace/artifacts
- secrets: # Secrets to inject
+ env: # Environment variables
- WANDB_API_KEY
- HF_TOKEN
```
@@ -276,27 +274,15 @@ axolotl lm-eval config.yml --cloud cloud_config.yml
### Cloud Configuration Options
```yaml
provider: # compute provider, currently only `modal` is supported
gpu: # GPU type to use
gpu_count: # Number of GPUs (default: 1)
memory: # RAM in GB (default: 128)
timeout: # Maximum runtime in seconds
timeout_preprocess: # Preprocessing timeout
branch: # Git branch to use
docker_tag: # Custom Docker image tag
volumes: # List of persistent storage volumes
env: # Environment variables to pass
- # Environment variables to pass. Can be specified in two ways:
+ secrets: # Secrets to inject
# 1. As a string: Will load the value from the host computer's environment variables
# 2. As a key-value pair: Will use the specified value directly
# Example:
# env:
# - CUSTOM_VAR # Loads from host's $CUSTOM_VAR
# - {CUSTOM_VAR: "value"} # Uses "value" directly
env:
# Secrets to inject. Same input format as `env` but for sensitive data.
secrets:
# - HF_TOKEN
# - WANDB_API_KEY
```
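The removed comments above describe two accepted formats for `env` entries (and note that `secrets` takes the same input). A minimal sketch combining them, with illustrative variable names that are not required by Axolotl, might look like:

```yaml
# cloud_config.yml (illustrative sketch)
provider: modal
gpu: a100
env:
  - CUSTOM_VAR                  # string form: value is read from the host's $CUSTOM_VAR
  - {CUSTOM_VAR: "value"}       # key-value form: the given value is used directly
secrets:                        # same input format as `env`, but for sensitive data
  - HF_TOKEN
  - WANDB_API_KEY
```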

View File

@@ -238,10 +238,10 @@ simpo_gamma: 0.5 # Target reward margin for the SimPO loss
# grpo
trl:
use_vllm: # Optional[bool]. Whether to use VLLM for RL training.
- vllm_server_host: # Optional[str]. Host of the vLLM server to connect to.
+ vllm_device: # Optional[str]. Device to use for VLLM.
- vllm_server_port: # Optional[int]. Port of the vLLM server to connect to.
+ vllm_gpu_memory_utilization: # Optional[float]. GPU memory utilization for VLLM.
- vllm_server_timeout: # Optional[int]. Total timeout (in seconds) to wait for the vLLM server to respond.
+ vllm_max_model_len: # Optional[int]. Maximum length of the model for VLLM.
- vllm_guided_decoding_regex: # Optional[str]. Regex for vLLM guided decoding.
+ vllm_dtype: # Optional[str]. Data type for VLLM.
beta: # Optional[float]. Beta parameter for the RL training. Same as `rl_beta`. Use
max_completion_length: # Optional[int]. Maximum length of the completion for RL training.
@@ -320,13 +320,9 @@ total_num_tokens:
sample_packing_group_size: 100000
# The number of samples which can be packed into one sequence. Increase if using a large sequence_len with many short samples.
sample_packing_bin_size: 200
sample_pack_sequentially: # Optional[bool]. Whether to pack samples sequentially.
# whether to concatenate samples during pretraining
pretraining_sample_concatenation:
curriculum_sampling: # Optional[bool]. Whether to use sequential sampling for curriculum learning
# Use batch flattening for speedups when not using sample_packing
batch_flattening:
@@ -358,27 +354,7 @@ lora_target_modules:
# - down_proj
# - up_proj
lora_target_linear: # If true, will target all linear modules
peft_layers_to_transform: # The layer indices to transform, otherwise, apply to all layers
# List[int] | int. # The layer indices to transform, otherwise, apply to all layers
# https://huggingface.co/docs/peft/v0.15.0/en/package_reference/lora#peft.LoraConfig.layers_to_transform
peft_layers_to_transform:
# Optional[bool]. Whether to use DoRA.
# https://huggingface.co/docs/peft/v0.15.0/en/developer_guides/lora#weight-decomposed-low-rank-adaptation-dora
peft_use_dora:
# Optional[bool]. Whether to use RSLoRA.
# https://huggingface.co/docs/peft/v0.15.0/en/developer_guides/lora#rank-stabilized-lora
peft_use_rslora:
# Optional[list[tuple[int, int]]]. List of layer indices to replicate.
# https://huggingface.co/docs/peft/v0.15.0/en/developer_guides/lora#memory-efficient-layer-replication-with-lora
peft_layer_replication:
# bool | Literal["gaussian", "eva", "olora", "pissa", "pissa_niter_[number of iters]", "corda", "loftq"]
# How to initialize LoRA weights. Default to True which is MS original implementation.
# https://huggingface.co/docs/peft/v0.15.0/en/developer_guides/lora#initialization
peft_init_lora_weights:
# If you added new tokens to the tokenizer, you may need to save some LoRA modules because they need to know the new tokens.
# For LLaMA and Mistral, you need to save `embed_tokens` and `lm_head`. It may vary for other models.
@@ -490,7 +466,6 @@ auto_find_batch_size: # Optional[bool]
eval_table_size: # Approximate number of predictions sent to wandb depending on batch size. Enabled above 0. Default is 0
eval_max_new_tokens: # Total number of tokens generated for predictions sent to wandb. Default is 128
do_causal_lm_eval: # Whether to run causal language model evaluation for metrics in `eval_causal_lm_metrics`.
eval_causal_lm_metrics: # HF evaluate metrics used during evaluation. Default is ["sacrebleu", "comet", "ter", "chrf", "perplexity"]
profiler_steps: # enable the pytorch profiler to capture the first N steps of training to the output_dir.
@@ -510,8 +485,7 @@ train_on_inputs: false
# Note that training loss may have an oscillating pattern with this enabled.
group_by_length: false
- # Whether to use gradient checkpointing. Available options are: true, false, "offload".
+ # Whether to use gradient checkpointing https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing
# https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing
gradient_checkpointing: false
# additional kwargs to pass to the trainer for gradient checkpointing
# gradient_checkpointing_kwargs:
@@ -532,58 +506,36 @@ lr_div_factor: # Learning rate div factor
# Specify optimizer
# Valid values are driven by the Transformers OptimizerNames class, see:
- # https://github.com/huggingface/transformers/blob/cbf924b76c03828101a34069a96d209314114fd5/src/transformers/training_args.py#L144-L189
+ # https://github.com/huggingface/transformers/blob/95b374952dc27d8511541d6f5a4e22c9ec11fb24/src/transformers/training_args.py#L134
#
# Note that not all optimizers may be available in your environment, ex: 'adamw_anyprecision' is part of
# torchdistx, 'adamw_bnb_8bit' is part of bnb.optim.Adam8bit, etc. When in doubt, it is recommended to start with the optimizer used
# in the examples/ for your model and fine-tuning use case.
#
# Valid values for 'optimizer' include:
# - adamw_hf
# - adamw_torch
# - adamw_torch_fused
# - adamw_torch_xla
# - adamw_torch_npu_fused
# - adamw_apex_fused
# - adopt_adamw (an EXPERIMENTAL optimizer, only for torch version >= 2.5.1)
# - adafactor
# - adamw_anyprecision
# - adamw_torch_4bit
# - ademamix
# - sgd
# - adagrad
# - adamw_bnb_8bit
# - adamw_8bit # alias for adamw_bnb_8bit
# - ademamix_8bit
# - lion_8bit
# - lion_32bit
# - paged_adamw_32bit
# - paged_adamw_8bit
# - paged_ademamix_32bit
# - paged_ademamix_8bit
# - paged_lion_32bit
# - paged_lion_8bit
# - rmsprop
# - rmsprop_bnb
# - rmsprop_bnb_8bit
# - rmsprop_bnb_32bit
# - galore_adamw
# - galore_adamw_8bit
# - galore_adafactor
# - galore_adamw_layerwise
# - galore_adamw_8bit_layerwise
# - galore_adafactor_layerwise
# - lomo
# - adalomo
# - grokadamw
# - schedule_free_adamw
# - schedule_free_sgd
# - apollo_adamw
# - apollo_adamw_layerwise
#
# Additional custom optimizers include:
# - optimi_adamw
# - ao_adamw_8bit
# - ao_adamw_fp8
optimizer:
# Dictionary of arguments to pass to the optimizer
optim_args:
@@ -612,42 +564,29 @@ max_grad_norm:
# currently only supported on Llama and Mistral
neftune_noise_alpha:
- # Optional[bool]. Whether to bettertransformers
+ # Whether to bettertransformers
flash_optimum:
# Whether to use xformers attention patch https://github.com/facebookresearch/xformers:
# Note: Only one of the following attention patches can be used at a time.
# For example, if you set `xformers_attention` to `true`, do not set `flash_attention` to `true`.
# Optional[bool]. Whether to use xformers attention patch https://github.com/facebookresearch/xformers:
xformers_attention:
- # Optional[bool]. Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention:
+ # Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention:
flash_attention:
- flash_attn_cross_entropy: # Optional[bool]. Whether to use flash-attention cross entropy implementation - advanced use only
+ flash_attn_cross_entropy: # Whether to use flash-attention cross entropy implementation - advanced use only
- flash_attn_rms_norm: # Optional[bool]. Whether to use flash-attention rms norm implementation - advanced use only
+ flash_attn_rms_norm: # Whether to use flash-attention rms norm implementation - advanced use only
- flash_attn_fuse_qkv: # Optional[bool]. Whether to fuse QKV into a single operation
+ flash_attn_fuse_qkv: # Whether to fuse QKV into a single operation
- flash_attn_fuse_mlp: # Optional[bool]. Whether to fuse part of the MLP into a single operation
+ flash_attn_fuse_mlp: # Whether to fuse part of the MLP into a single operation
- # Optional[bool]. Whether to use scaled-dot-product attention
+ # Whether to use scaled-dot-product attention
# https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
sdp_attention:
- # Optional[bool]. Shifted-sparse attention (only llama) - https://arxiv.org/pdf/2309.12307.pdf
+ # Shifted-sparse attention (only llama) - https://arxiv.org/pdf/2309.12307.pdf
s2_attention:
# Optional[bool]. Whether to use low_cpu_mem_usage
low_cpu_mem_usage:
- # Optional[str]. Resume from a specific checkpoint dir
+ # Resume from a specific checkpoint dir
resume_from_checkpoint:
- # Optional[bool]. If resume_from_checkpoint isn't set and you simply want it to start where it left off.
+ # If resume_from_checkpoint isn't set and you simply want it to start where it left off.
# Be careful with this being turned on between different models.
auto_resume_from_checkpoints: false
## Multimodal section
# int | tuple[int, int] | None . Size to resize images to, width x height.
# Will read from model/processor config if not set.
image_size:
# str. Algorithm to use for image resizing. "bilinear", "bicubic", "lanczos". Default is "bilinear".
image_resize_algorithm: 'bilinear'
## End of multimodal section
# Don't mess with this, it's here for accelerate and torchrun
local_rank:
@@ -688,9 +627,6 @@ ddp_broadcast_buffers:
# subsequences, or set to 4 to split into four equal-sized subsequences.
# See https://axolotl-ai-cloud.github.io/axolotl/docs/sequence_parallelism.html for more details.
sequence_parallel_degree:
# Optional; strides across the key dimension. Larger values use more memory but should make training faster.
# Must evenly divide the number of KV heads in your model.
heads_k_stride: 1
# Path to torch distx for optim 'adamw_anyprecision'
torchdistx_path:

View File

@@ -103,7 +103,8 @@ This uses the same tags as the [`main` image](#sec-main-tags).
- `JUPYTER_DISABLE`: Disable Jupyter lab.
- `JUPYTER_PASSWORD`: Set a password for the Jupyter lab.
- - `PUBLIC_KEY` / `SSH_KEY`: Add a public key for the SSH service.
+ - `PUBLIC_KEY`: Add a public key for the SSH service.
- `SSH_KEY`: Add a private key for the SSH service.
#### Volume mounts

View File

@@ -35,22 +35,12 @@ description: Frequently asked questions
**Q: How to call Axolotl via custom python scripts?**
- > A: Since Axolotl is just Python, please see `src/axolotl/cli/main.py` on how each command is called.
+ > A: Yes, since Axolotl is just Python, please see `src/axolotl/cli/main.py` on how each command is called.
**Q: How to know the value to use for `fsdp_transformer_layer_cls_to_wrap`?**
> A: This is the class name of the transformer layer to wrap with FSDP. For example, for `LlamaForCausalLM`, the value is `LlamaDecoderLayer`. To find this for a specific model, check the model's `PreTrainedModel` definition and look for `_no_split_modules` variable in the `modeling_<model_name>.py` file within `transformers` library.
**Q: ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as pad_token**
> A: This is because the tokenizer does not have a padding token. Please add a padding token to the tokenizer via:
> ```yaml
> special_tokens:
> # str. If you're not sure, set to same as `eos_token`.
> pad_token: "..."
> ```
### Chat templates
**Q: `jinja2.exceptions.UndefinedError: 'dict object' has no attribute 'content' / 'role' / ____`**

View File

@@ -17,7 +17,6 @@ We currently support several common model architectures, including (but not limi
- `qwen2`
- `gemma`
- `gemma2`
- `gemma3`
<details>

View File

@@ -18,7 +18,6 @@ Axolotl supports several methods for multi-GPU training:
- DeepSpeed (recommended)
- FSDP (Fully Sharded Data Parallel)
- Sequence parallelism
- FSDP + QLoRA
## DeepSpeed {#sec-deepspeed}
@@ -67,28 +66,6 @@ fsdp_config:
fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
```
## Sequence parallelism {#sec-sequence-parallelism}
We support sequence parallelism (SP) via the
[ring-flash-attention](https://github.com/zhuzilin/ring-flash-attention) project. This
allows one to split up sequences across GPUs, which is useful in the event that a
single sequence causes OOM errors during model training.
First, install `ring-flash-attn`, recommended via `pip install axolotl[ring-flash-attn]`,
or from source with `pip install .[ring-flash-attn]`.
Your Axolotl YAML config should contain the following lines:
```{.yaml}
sequence_parallel_degree: 4 # Split each sequence into 4 parts, one per GPU
flash_attention: true # Required with sequence parallelism
# Optional; strides across the key dimension. Larger values use more memory but will make training faster.
heads_k_stride: 1
```
See our [dedicated guide](sequence_parallelism.qmd) for more details.
### FSDP + QLoRA {#sec-fsdp-qlora}
For combining FSDP with QLoRA, see our [dedicated guide](fsdp_qlora.qmd).
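The FSDP hunk above references `fsdp_transformer_layer_cls_to_wrap`, and the FAQ section earlier explains how to find its value. As a hedged sketch of where that setting lives in an Axolotl config (using the Llama class name given in the FAQ; other architectures use their own decoder-layer class):

```yaml
fsdp:
  - full_shard
fsdp_config:
  # class name from the model's _no_split_modules (LlamaDecoderLayer for LlamaForCausalLM)
  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
```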

View File

@@ -1,171 +1,28 @@
- ---
+ # MultiModal / Vision Language Models (BETA)
title: MultiModal / Vision Language Models (BETA)
format:
html:
toc: true
toc-depth: 3
---
- ## Supported Models
+ ### Supported Models
- - [Mllama](#sec-mllama)
+ - Mllama, i.e. llama with vision models
- [Pixtral](#sec-pixtral)
- [Llava-1.5](#sec-llava-15)
- [Mistral-Small-3.1](#sec-mistral-small-31)
- [Gemma-3](#sec-gemma-3)
- [Qwen2-VL](#sec-qwen2-vl)
- [Qwen2.5-VL](#sec-qwen25-vl)
- ## Usage
+ ### Usage
- Multimodal support is limited and doesn't have full feature parity.
+ Currently multimodal support is limited and doesn't have full feature parity. To finetune a multimodal Llama w/ LoRA,
you'll need to use the following in YAML in combination with the rest of the required hyperparams.
Here are the hyperparams you'll need to use to finetune a multimodal model.
```yaml
base_model: alpindale/Llama-3.2-11B-Vision-Instruct
processor_type: AutoProcessor
skip_prepare_dataset: true
remove_unused_columns: false # leave columns in place as they are needed to handle image embeddings during training
sample_packing: false # not yet supported with multimodal
- chat_template: # see in next section
+ chat_template: llama3_2_vision
# example dataset
datasets:
- path: HuggingFaceH4/llava-instruct-mix-vsft
type: chat_template
split: train[:1%]
field_messages: messages
remove_unused_columns: false
sample_packing: false
- # (optional) if doing lora, only finetune the Language model,
+ # only finetune the Language model, leave the vision model and vision tower frozen
# leave the vision model and vision tower frozen
# load_in_8bit: true
adapter: lora
lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
# (optional) if you want to resize images to a set size
image_size: 512
image_resize_algorithm: bilinear
```
Please see [examples](https://github.com/axolotl-ai/axolotl/tree/main/examples) folder for full configs.
::: {.callout-warning}
Some of our chat_templates have been extended to support broader dataset types. This should not break any existing configs.
:::
### Mllama {#sec-mllama}
```yaml
base_model: meta-llama/Llama-3.2-11B-Vision-Instruct
chat_template: llama3_2_vision
```
### Pixtral {#sec-pixtral}
```yaml
base_model: mistralai/Pixtral-12B-2409
chat_template: pixtral
```
### Llava-1.5 {#sec-llava-15}
```yaml
base_model: llava-hf/llava-1.5-7b-hf
chat_template: llava
```
### Mistral-Small-3.1 {#sec-mistral-small-31}
```yaml
base_model: mistralai/Mistral-Small-3.1-24B-Instruct-2503
chat_template: mistral_v7_tekken
```
### Gemma-3 {#sec-gemma-3}
::: {.callout-tip}
The Gemma3-1B model is a text-only model, so please train as regular text model.
:::
For multi-modal 4B/12B/27B models, use the following config:
```yaml
base_model: google/gemma-3-4b-it
chat_template: gemma3
```
### Qwen2-VL {#sec-qwen2-vl}
```yaml
base_model: Qwen/Qwen2-VL-7B-Instruct
chat_template: qwen2_vl
```
### Qwen2.5-VL {#sec-qwen25-vl}
```yaml
base_model: Qwen/Qwen2.5-VL-7B-Instruct
chat_template: qwen2_vl # same as qwen2-vl
```
## Dataset Format
For multi-modal datasets, we adopt an extended `chat_template` format similar to OpenAI's Message format.
- A message is a list of `role` and `content`.
- `role` can be `system`, `user`, `assistant`, etc.
- `content` is a list of `type` and (`text` or `image` or `path` or `url` or `base64`).
::: {.callout-note}
For backwards compatibility:
- If the dataset has a `images` or `image` column of `list[Image]`, it will be appended to the first `content` list as `{"type": "image", "image": ...}`. However, if the content already has a `{"type": "image"}` but no `image` key, it will be set the `image` key.
- If `content` is a string, it will be converted to a list with `type` as `text`.
:::
::: {.callout-tip}
For image loading, you can use the following keys within `content` alongside `"type": "image"`:
- `"path": "/path/to/image.jpg"`
- `"url": "https://example.com/image.jpg"`
- `"base64": "..."`
- `"image": PIL.Image`
:::
Here is an example of a multi-modal dataset:
```json
[
{
"messages": [
{
"role": "system",
"content": [
{"type": "text", "text": "You are a helpful assistant."}
]
},
{
"role": "user",
"content": [
{"type": "image", "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"},
{"type": "text", "text": "Describe this image in detail."}
]
},
{
"role": "assistant",
"content": [
{"type": "text", "text": "The image is a bee."}
]
}
]
}
]
```
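Putting the pieces of this file together, a hedged sketch of a minimal multimodal LoRA config for one of the models listed above (Gemma-3 4B here; any of the base_model/chat_template pairs from the per-model sections could be substituted, and the `lora_target_modules` regex is the generic one from the usage block, whose exact module names depend on the model):

```yaml
base_model: google/gemma-3-4b-it
chat_template: gemma3
processor_type: AutoProcessor
skip_prepare_dataset: true
remove_unused_columns: false
sample_packing: false  # not yet supported with multimodal

datasets:
  - path: HuggingFaceH4/llava-instruct-mix-vsft
    type: chat_template
    split: train[:1%]
    field_messages: messages

# (optional) LoRA on the language model only; vision model and vision tower stay frozen
adapter: lora
lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'

# (optional) resize images before training
image_size: 512
image_resize_algorithm: bilinear
```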

View File

@@ -502,48 +502,9 @@ The input format is a simple JSON input with customizable fields based on the ab
Check out our [GRPO cookbook](https://github.com/axolotl-ai-cloud/axolotl-cookbook/tree/main/grpo#training-an-r1-style-large-language-model-using-grpo).
:::
If you have multiple GPUs available, we reccomend using `vLLM` with the `GRPOTrainer` to significantly speedup trajectory generation during training.
First, launch a `vLLM` server using `trl vllm-serve` - you may use a config file or CLI overrides to configure your vLLM server. In this example, we're
using 4 GPUs - 2 for training, and 2 for vLLM:
::: {.callout-important}
Make sure you've installed the correct version of vLLM by including it as an extra when installing axolotl, e.g. `pip install axolotl[vllm]`.
:::
```yaml
base_model: Qwen/Qwen2.5-1.5B-Instruct
vllm:
host: 0.0.0.0
port: 8000
tensor_parallel_size: 2
gpu_memory_utilization: 0.85
dtype: auto
# max_model_len: # you may find it useful to set the vLLM model context length if you know this beforehand
rl: grpo
trl:
use_vllm: true
vllm_server_host: 0.0.0.0
vllm_server_port: 8000
vllm_server_timeout: 300
```
```bash
CUDA_VISIBLE_DEVICES=2,3 axolotl vllm_serve grpo.yaml
```
Your `vLLM` instance will now attempt to spin up, and it's time to kick off training utilizing our remaining two GPUs. In another terminal, execute:
```bash
CUDA_VISIBLE_DEVICES=0,1 axolotl train grpo.yaml --num-processes 2
```
#### Reward functions
GRPO uses custom reward functions and transformations. Please have them ready locally.
- For example, to load OpenAI's GSM8K and use a random reward for completions:
+ For ex, to load OpenAI's GSM8K and use a random reward for completions:
```python
# rewards.py
@@ -569,6 +530,8 @@ trl:
beta: 0.001
max_completion_length: 256
use_vllm: True
vllm_device: auto
vllm_gpu_memory_utilization: 0.15
num_generations: 4
reward_funcs: ["rewards.rand_reward_func"] # format: '{file_name}.{fn_name}'
reward_weights: [1.0]

View File

@@ -25,8 +25,6 @@ To enable sequence parallelism, add the following to your configuration file:
```yaml
# Set to a divisor (> 1) of the number of GPUs available
sequence_parallel_degree: 4 # Split sequences across 4 GPUs
# Optional; strides across the key dimension. Larger values use more memory but should make training faster.
heads_k_stride: 1
```
The `sequence_parallel_degree` should be a divisor of the total number of GPUs. For example:
@@ -60,16 +58,11 @@ To use sequence parallelism, you need:
## Example
```yaml
# Example config with sequence parallelism
base_model: meta-llama/Llama-3-8B-Instruct
sequence_len: 8192
sequence_parallel_degree: 2 # Split each sequence into 4 parts
...
sequence_parallel_degree: 4 # Split each sequence into 4 parts, one per GPU
flash_attention: true # Required with sequence parallelism
# Optional; strides across the key dimension. Larger values use more memory but should make training faster.
heads_k_stride: 1
...
```
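To illustrate the divisor rule stated above, a hedged sketch for an 8-GPU run (the values are illustrative and not taken from this diff):

```yaml
# 8 GPUs total: each sequence is split into 8 chunks, one per GPU
sequence_parallel_degree: 8
flash_attention: true  # required with sequence parallelism
# optional; per the removed comment above, must evenly divide the model's KV heads
heads_k_stride: 1
```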

View File

@@ -8,6 +8,9 @@ tokenizer_type: GPT2Tokenizer
trust_remote_code: true
tokenizer_use_fast: true
tokenizer_legacy: true
load_in_8bit: false
load_in_4bit: false
strict: false
push_dataset_to_hub:
hf_use_auth_token: true
@@ -31,6 +34,7 @@ lora_alpha:
lora_dropout:
lora_target_modules:
lora_target_linear:
lora_fan_in_fan_out:
wandb_project:
wandb_entity:
@@ -54,12 +58,16 @@ learning_rate: 0.000085
train_on_inputs: true
group_by_length: false
bf16: auto
fp16:
tf32: true
gradient_checkpointing: false
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
sdp_attention:
flash_optimum:
@@ -72,6 +80,8 @@ evals_per_epoch: 4
saves_per_epoch: 1
save_total_limit:
debug:
deepspeed:
weight_decay: 0.1
special_tokens:
pad_token: "<|endoftext|>"

View File

@@ -22,6 +22,7 @@ lora_target_modules:
- c_attn
- c_proj
lora_target_linear:
lora_fan_in_fan_out:
wandb_project:
wandb_entity:
wandb_watch:
@@ -35,10 +36,15 @@ optimizer: paged_adamw_8bit
torchdistx_path:
lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: true
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention: true
flash_attention:
@@ -47,6 +53,10 @@ gptq_model_v1:
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.1
fsdp:
fsdp_config:
special_tokens:
pad_token: "<|endoftext|>"

View File

@@ -26,6 +26,7 @@ lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:
wandb_project:
wandb_entity:
@@ -40,18 +41,29 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
s2_attention:
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
bos_token: "<s>"
eos_token: "</s>"

View File

@@ -26,7 +26,9 @@ pad_to_sequence_len: true
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules:
lora_target_linear: true
lora_fan_in_fan_out:
wandb_project:
wandb_entity:
@@ -41,18 +43,28 @@ optimizer: paged_adamw_32bit
lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
bos_token: "<s>"
eos_token: "</s>"

View File

@@ -26,6 +26,7 @@ lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:
wandb_project:
wandb_entity:
@@ -40,18 +41,29 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
s2_attention:
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
bos_token: "<s>"
eos_token: "</s>"

View File

@@ -26,7 +26,9 @@ pad_to_sequence_len: true
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules:
lora_target_linear: true
lora_fan_in_fan_out:
wandb_project:
wandb_entity:
@@ -41,18 +43,28 @@ optimizer: paged_adamw_32bit
lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
bos_token: "<s>"
eos_token: "</s>"

View File

@@ -26,6 +26,7 @@ lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:
wandb_project:
wandb_entity:
@@ -40,18 +41,29 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
s2_attention:
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
bos_token: "<s>"
eos_token: "</s>"

View File

@@ -26,7 +26,9 @@ pad_to_sequence_len: true
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules:
lora_target_linear: true
lora_fan_in_fan_out:
wandb_project:
wandb_entity:
@@ -41,18 +43,28 @@ optimizer: paged_adamw_32bit
lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
bos_token: "<s>"
eos_token: "</s>"

View File

@@ -1,59 +0,0 @@
base_model: CohereForAI/c4ai-command-r7b-12-2024
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
load_in_8bit: false
load_in_4bit: true
strict: false
# huggingface repo
chat_template: cohere
datasets:
- path: cgato/SlimOrcaDedupCleaned
type: chat_template
field_messages: conversations
message_property_mappings:
role: from
content: value
val_set_size: 0.0
output_dir: ./outputs/out
adapter: qlora
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
sequence_len: 2048
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002
bf16: auto
tf32: true
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_ratio: 0.1
evals_per_epoch:
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:

View File

@@ -3,6 +3,9 @@ base_model: LnL-AI/dbrx-base-converted-v2
# hub_model_id: username/custom_model_name
trust_remote_code: true
load_in_8bit: false
load_in_4bit: false
strict: false
datasets:
@@ -45,20 +48,26 @@ optimizer: paged_adamw_8bit
lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: false # don't use with fsdp_activation_checkpointing
gradient_checkpointing_kwargs:
use_reentrant: false
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
warmup_steps: 10
evals_per_epoch:
saves_per_epoch: 1
debug:
weight_decay: 0.0
fsdp:
- full_shard


@@ -48,20 +48,26 @@ optimizer: paged_adamw_8bit
lr_scheduler: cosine lr_scheduler: cosine
learning_rate: 0.0002 learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto bf16: auto
fp16:
tf32: false tf32: false
gradient_checkpointing: false # don't use with fsdp_activation_checkpointing gradient_checkpointing: false # don't use with fsdp_activation_checkpointing
gradient_checkpointing_kwargs: gradient_checkpointing_kwargs:
use_reentrant: false use_reentrant: false
early_stopping_patience:
resume_from_checkpoint: resume_from_checkpoint:
local_rank:
logging_steps: 1 logging_steps: 1
xformers_attention:
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: evals_per_epoch:
saves_per_epoch: 1 saves_per_epoch: 1
debug:
weight_decay: 0.0 weight_decay: 0.0
fsdp: fsdp:
- full_shard - full_shard


@@ -3,6 +3,9 @@ base_model: LnL-AI/dbrx-base-converted-v2
# hub_model_id: username/custom_model_name # hub_model_id: username/custom_model_name
trust_remote_code: true trust_remote_code: true
load_in_8bit: false
load_in_4bit: false
strict: false strict: false
datasets: datasets:
@@ -32,19 +35,25 @@ optimizer: paged_adamw_8bit
lr_scheduler: cosine lr_scheduler: cosine
learning_rate: 0.0002 learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto bf16: auto
fp16:
tf32: false tf32: false
gradient_checkpointing: true gradient_checkpointing: true
gradient_checkpointing_kwargs: gradient_checkpointing_kwargs:
use_reentrant: false use_reentrant: false
early_stopping_patience:
resume_from_checkpoint: resume_from_checkpoint:
local_rank:
logging_steps: 1 logging_steps: 1
xformers_attention:
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: evals_per_epoch:
saves_per_epoch: 1 saves_per_epoch: 1
debug:
weight_decay: 0.0 weight_decay: 0.0
deepspeed: deepspeed_configs/zero3_bf16.json deepspeed: deepspeed_configs/zero3_bf16.json


@@ -2,6 +2,9 @@ base_model: deepseek-ai/DeepSeek-V2-Lite
# Automatically upload checkpoint and final model to HF # Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name # hub_model_id: username/custom_model_name
trust_remote_code: true trust_remote_code: true
load_in_8bit: false
load_in_4bit: false
strict: false strict: false
datasets: datasets:
@@ -28,19 +31,27 @@ optimizer: adamw_torch_fused
lr_scheduler: cosine lr_scheduler: cosine
learning_rate: 2e-5 learning_rate: 2e-5
train_on_inputs: false
group_by_length: false
bf16: auto bf16: auto
fp16:
tf32: false tf32: false
gradient_checkpointing: true gradient_checkpointing: true
gradient_checkpointing_kwargs: gradient_checkpointing_kwargs:
use_reentrant: false use_reentrant: false
early_stopping_patience:
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
xformers_attention:
flash_attention: true flash_attention: true
warmup_steps: 100 warmup_steps: 100
evals_per_epoch: 2 evals_per_epoch: 2
eval_table_size:
saves_per_epoch: 1 saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0 weight_decay: 0.0
special_tokens: special_tokens:
fsdp: fsdp:


@@ -52,19 +52,27 @@ optimizer: adamw_torch_fused
lr_scheduler: cosine lr_scheduler: cosine
learning_rate: 2e-5 learning_rate: 2e-5
train_on_inputs: false
group_by_length: false
bf16: auto bf16: auto
fp16:
tf32: false tf32: false
gradient_checkpointing: true gradient_checkpointing: true
gradient_checkpointing_kwargs: gradient_checkpointing_kwargs:
use_reentrant: false use_reentrant: false
early_stopping_patience:
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
xformers_attention:
flash_attention: true flash_attention: true
warmup_steps: 100 warmup_steps: 100
evals_per_epoch: 2 evals_per_epoch: 2
eval_table_size:
saves_per_epoch: 1 saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0 weight_decay: 0.0
special_tokens: special_tokens:
fsdp: fsdp:


@@ -25,7 +25,9 @@ max_packed_sequence_len:
lora_r: 16 lora_r: 16
lora_alpha: 32 lora_alpha: 32
lora_dropout: 0.0 lora_dropout: 0.0
lora_target_modules:
lora_target_linear: true lora_target_linear: true
lora_fan_in_fan_out:
wandb_project: wandb_project:
wandb_entity: wandb_entity:
wandb_watch: wandb_watch:
@@ -39,10 +41,15 @@ optimizer: adamw_bnb_8bit
torchdistx_path: torchdistx_path:
lr_scheduler: cosine lr_scheduler: cosine
learning_rate: 0.00003 learning_rate: 0.00003
train_on_inputs: false
group_by_length: false
bf16: auto bf16: auto
fp16:
tf32: true tf32: true
gradient_checkpointing: true gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint: resume_from_checkpoint:
local_rank:
logging_steps: 1 logging_steps: 1
xformers_attention: true xformers_attention: true
flash_attention: flash_attention:
@@ -51,7 +58,11 @@ gptq_model_v1:
warmup_steps: 40 warmup_steps: 40
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0 weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens: special_tokens:
pad_token: "<|endoftext|>" pad_token: "<|endoftext|>"
bos_token: "<|endoftext|>" bos_token: "<|endoftext|>"


@@ -38,7 +38,9 @@ lora_alpha: 16
# 0.05 for 33B and 65B models # 0.05 for 33B and 65B models
lora_dropout: 0.05 lora_dropout: 0.05
# add LoRA modules on all linear layers of the base model # add LoRA modules on all linear layers of the base model
lora_target_modules:
lora_target_linear: true lora_target_linear: true
lora_fan_in_fan_out:
wandb_project: wandb_project:
wandb_entity: wandb_entity:
@@ -65,7 +67,10 @@ lr_scheduler: cosine
# - 2e-4 for 7b & 13b # - 2e-4 for 7b & 13b
# - 1e-4 for 33b & 64b # - 1e-4 for 33b & 64b
learning_rate: 0.0002 learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto bf16: auto
fp16:
tf32: true tf32: true
gradient_checkpointing: true gradient_checkpointing: true
# stop training after this many evaluation losses have increased in a row # stop training after this many evaluation losses have increased in a row
@@ -73,6 +78,7 @@ gradient_checkpointing: true
early_stopping_patience: 3 early_stopping_patience: 3
resume_from_checkpoint: resume_from_checkpoint:
auto_resume_from_checkpoints: true auto_resume_from_checkpoints: true
local_rank:
logging_steps: 1 logging_steps: 1
xformers_attention: true xformers_attention: true
flash_attention: flash_attention:
@@ -81,7 +87,11 @@ gptq_model_v1:
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.000001 weight_decay: 0.000001
fsdp:
fsdp_config:
special_tokens: special_tokens:
pad_token: "<|endoftext|>" pad_token: "<|endoftext|>"
bos_token: "<|endoftext|>" bos_token: "<|endoftext|>"


@@ -7,6 +7,9 @@ tokenizer_type: AutoTokenizer
# required by falcon custom model code: https://huggingface.co/tiiuae/falcon-7b/tree/main # required by falcon custom model code: https://huggingface.co/tiiuae/falcon-7b/tree/main
trust_remote_code: true trust_remote_code: true
load_in_8bit: false
load_in_4bit: false
gptq: false gptq: false
strict: false strict: false
push_dataset_to_hub: push_dataset_to_hub:
@@ -22,7 +25,9 @@ max_packed_sequence_len:
lora_r: 64 lora_r: 64
lora_alpha: 32 lora_alpha: 32
lora_dropout: 0.0 lora_dropout: 0.0
lora_target_modules:
lora_target_linear: true lora_target_linear: true
lora_fan_in_fan_out:
wandb_project: wandb_project:
wandb_entity: wandb_entity:
wandb_watch: wandb_watch:
@@ -36,10 +41,15 @@ optimizer: adamw_bnb_8bit
torchdistx_path: torchdistx_path:
lr_scheduler: cosine lr_scheduler: cosine
learning_rate: 0.00003 learning_rate: 0.00003
train_on_inputs: false
group_by_length: false
bf16: auto bf16: auto
fp16:
tf32: true tf32: true
gradient_checkpointing: true gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint: resume_from_checkpoint:
local_rank:
logging_steps: 1 logging_steps: 1
xformers_attention: true xformers_attention: true
flash_attention: flash_attention:
@@ -48,7 +58,11 @@ gptq_model_v1:
warmup_steps: 40 warmup_steps: 40
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0 weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens: special_tokens:
pad_token: "<|endoftext|>" pad_token: "<|endoftext|>"
bos_token: "<|endoftext|>" bos_token: "<|endoftext|>"


@@ -42,16 +42,28 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine lr_scheduler: cosine
learning_rate: 0.0002 learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto bf16: auto
fp16:
tf32: false tf32: false
gradient_checkpointing: true gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint: resume_from_checkpoint:
local_rank:
logging_steps: 1 logging_steps: 1
xformers_attention:
flash_attention: true flash_attention: true
warmup_ratio: 0.1 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1 saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0 weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens: special_tokens:


@@ -48,16 +48,28 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine lr_scheduler: cosine
learning_rate: 0.0002 learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto bf16: auto
fp16:
tf32: true tf32: true
gradient_checkpointing: true gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint: resume_from_checkpoint:
local_rank:
logging_steps: 1 logging_steps: 1
xformers_attention:
flash_attention: true flash_attention: true
warmup_ratio: 0.1 warmup_ratio: 0.1
evals_per_epoch: evals_per_epoch:
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1 saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0 weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens: special_tokens:


@@ -5,6 +5,9 @@ num_labels: 1
tokenizer_type: AutoTokenizer tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF # Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name # hub_model_id: username/custom_model_name
load_in_8bit: false
load_in_4bit: false
strict: false strict: false
reward_model: true reward_model: true
@@ -35,6 +38,8 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine lr_scheduler: cosine
learning_rate: 0.0002 learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: true bf16: true
fp16: fp16:
tf32: true tf32: true
@@ -42,12 +47,21 @@ tf32: true
gradient_checkpointing: true gradient_checkpointing: true
gradient_checkpointing_kwargs: gradient_checkpointing_kwargs:
use_reentrant: false use_reentrant: false
early_stopping_patience:
resume_from_checkpoint: resume_from_checkpoint:
local_rank:
logging_steps: 1 logging_steps: 1
xformers_attention:
flash_attention: true flash_attention: true
warmup_ratio: 0.1 warmup_ratio: 0.1
evals_per_epoch: evals_per_epoch:
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1 saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0 weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens: special_tokens:


@@ -1,67 +0,0 @@
base_model: google/gemma-3-1b-it
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
# gemma3 doesn't seem to play nice with ddp
ddp_find_unused_parameters: true
load_in_8bit: false
load_in_4bit: true
strict: false
# huggingface repo
chat_template: gemma3
datasets:
- path: cgato/SlimOrcaDedupCleaned
type: chat_template
field_messages: conversations
message_property_mappings:
role: from
content: value
val_set_size: 0.0
output_dir: ./outputs/out
adapter: qlora
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
sequence_len: 2048
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002
bf16: auto
tf32: true
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_ratio: 0.1
evals_per_epoch:
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:


@@ -1,61 +0,0 @@
base_model: google/gemma-3-4b-it
strict: false
load_in_4bit: true
# gemma3 doesn't seem to play nice with ddp
ddp_find_unused_parameters: true
chat_template: gemma3
datasets:
- path: cgato/SlimOrcaDedupCleaned
type: chat_template
field_messages: conversations
message_property_mappings:
role: from
content: value
dataset_prepared_path: last_run_prepared
val_set_size: 0.01
output_dir: ./outputs/out
adapter: qlora
lora_model_dir:
sequence_len: 2048
sample_packing: true
pad_to_sequence_len: true
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002
bf16: true
fp16:
tf32: true
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
logging_steps: 1
flash_attention: true
eager_attention:
warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0


@@ -1,63 +0,0 @@
base_model: google/gemma-3-4b-it
processor_type: AutoProcessor
strict: false
load_in_4bit: true
# these 3 lines are needed for now to handle vision chat templates w images
skip_prepare_dataset: true
remove_unused_columns: false
sample_packing: false
# gemma3 doesn't seem to play nice with ddp
ddp_find_unused_parameters: true
chat_template: gemma3
datasets:
- path: HuggingFaceH4/llava-instruct-mix-vsft
type: chat_template
split: train[:1%]
field_messages: messages
dataset_prepared_path: last_run_prepared
val_set_size: 0.01
output_dir: ./outputs/out
adapter: qlora
lora_model_dir:
sequence_len: 2048
pad_to_sequence_len: false
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002
bf16: true
fp16:
tf32: true
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
logging_steps: 1
flash_attention: true
eager_attention:
warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0


@@ -18,7 +18,9 @@ max_packed_sequence_len:
lora_r: 8 lora_r: 8
lora_alpha: 32 lora_alpha: 32
lora_dropout: 0.05 lora_dropout: 0.05
lora_target_modules:
lora_target_linear: true lora_target_linear: true
lora_fan_in_fan_out:
wandb_project: wandb_project:
wandb_entity: wandb_entity:
wandb_watch: wandb_watch:
@@ -32,10 +34,15 @@ optimizer: paged_adamw_8bit
torchdistx_path: torchdistx_path:
lr_scheduler: cosine lr_scheduler: cosine
learning_rate: 0.0001 learning_rate: 0.0001
train_on_inputs: false
group_by_length: false
bf16: auto bf16: auto
fp16:
tf32: true tf32: true
gradient_checkpointing: true gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint: resume_from_checkpoint:
local_rank:
logging_steps: 1 logging_steps: 1
xformers_attention: true xformers_attention: true
flash_attention: flash_attention:
@@ -44,6 +51,10 @@ gptq_model_v1:
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.1 weight_decay: 0.1
fsdp:
fsdp_config:
special_tokens: special_tokens:
pad_token: "<|endoftext|>" pad_token: "<|endoftext|>"


@@ -40,18 +40,26 @@ optimizer: paged_adamw_8bit
lr_scheduler: cosine lr_scheduler: cosine
learning_rate: 0.00001 learning_rate: 0.00001
train_on_inputs: false
group_by_length: false
bf16: auto bf16: auto
fp16:
tf32: false tf32: false
gradient_checkpointing: true gradient_checkpointing: true
gradient_checkpointing_kwargs: gradient_checkpointing_kwargs:
use_reentrant: false use_reentrant: false
early_stopping_patience:
resume_from_checkpoint: resume_from_checkpoint:
local_rank:
logging_steps: 1 logging_steps: 1
xformers_attention:
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: evals_per_epoch:
saves_per_epoch: 1 saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0 weight_decay: 0.0
special_tokens: special_tokens:


@@ -39,20 +39,26 @@ optimizer: paged_adamw_8bit
lr_scheduler: cosine lr_scheduler: cosine
learning_rate: 0.00001 learning_rate: 0.00001
train_on_inputs: false
group_by_length: false
bf16: auto bf16: auto
fp16:
tf32: false tf32: false
gradient_checkpointing: true gradient_checkpointing: true
gradient_checkpointing_kwargs: gradient_checkpointing_kwargs:
use_reentrant: false use_reentrant: false
early_stopping_patience:
resume_from_checkpoint: resume_from_checkpoint:
local_rank:
logging_steps: 1 logging_steps: 1
xformers_attention:
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: evals_per_epoch:
saves_per_epoch: 1 saves_per_epoch: 1
debug:
deepspeed: deepspeed_configs/zero2.json deepspeed: deepspeed_configs/zero2.json
weight_decay: 0.0 weight_decay: 0.0
special_tokens: special_tokens:


@@ -39,6 +39,8 @@ optimizer: adamw_torch_fused
lr_scheduler: cosine lr_scheduler: cosine
learning_rate: 0.00001 learning_rate: 0.00001
train_on_inputs: false
group_by_length: false
bf16: true bf16: true
tf32: true tf32: true


@@ -33,9 +33,13 @@ optimizer: adamw_bnb_8bit
torchdistx_path: torchdistx_path:
lr_scheduler: cosine lr_scheduler: cosine
learning_rate: 0.00003 learning_rate: 0.00003
train_on_inputs: false
group_by_length: false
bf16: auto bf16: auto
tf32: true tf32: true
early_stopping_patience:
resume_from_checkpoint: resume_from_checkpoint:
local_rank:
logging_steps: 5 logging_steps: 5
xformers_attention: true xformers_attention: true
flash_attention: flash_attention:
@@ -44,7 +48,11 @@ gptq_model_v1:
warmup_steps: 20 warmup_steps: 20
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.1 weight_decay: 0.1
fsdp:
fsdp_config:
tokens: tokens:
bos_token: "<s>" bos_token: "<s>"
eos_token: "</s>" eos_token: "</s>"


@@ -4,6 +4,9 @@ model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF # Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name # hub_model_id: username/custom_model_name
load_in_8bit: false
load_in_4bit: false
strict: false strict: false
datasets: datasets:
@@ -23,6 +26,7 @@ lora_r:
lora_alpha: lora_alpha:
lora_dropout: lora_dropout:
lora_target_linear: lora_target_linear:
lora_fan_in_fan_out:
wandb_project: wandb_project:
wandb_entity: wandb_entity:
@@ -37,12 +41,18 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine lr_scheduler: cosine
learning_rate: 0.0002 learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto bf16: auto
fp16:
tf32: false tf32: false
gradient_checkpointing: true gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint: resume_from_checkpoint:
local_rank:
logging_steps: 1 logging_steps: 1
xformers_attention:
flash_attention: true flash_attention: true
flash_attn_cross_entropy: false flash_attn_cross_entropy: false
flash_attn_rms_norm: true flash_attn_rms_norm: true
@@ -51,8 +61,11 @@ flash_attn_fuse_mlp: true
warmup_steps: 100 warmup_steps: 100
evals_per_epoch: 4 evals_per_epoch: 4
eval_table_size:
saves_per_epoch: 1 saves_per_epoch: 1
debug:
deepspeed: #deepspeed_configs/zero2.json # multi-gpu only deepspeed: #deepspeed_configs/zero2.json # multi-gpu only
weight_decay: 0.1 weight_decay: 0.1
fsdp:
fsdp_config:
special_tokens: special_tokens:


@@ -10,6 +10,8 @@ gptq_disable_exllama: true
tokenizer_use_fast: true tokenizer_use_fast: true
tokenizer_legacy: true tokenizer_legacy: true
load_in_8bit: false
load_in_4bit: false
strict: false strict: false
push_dataset_to_hub: push_dataset_to_hub:
hf_use_auth_token: true hf_use_auth_token: true
@@ -31,6 +33,7 @@ lora_target_modules:
- q_proj - q_proj
- v_proj - v_proj
lora_target_linear: lora_target_linear:
lora_fan_in_fan_out:
wandb_project: wandb_project:
wandb_watch: wandb_watch:
wandb_name: wandb_name:
@@ -47,19 +50,26 @@ torchdistx_path:
lr_scheduler: cosine lr_scheduler: cosine
lr_quadratic_warmup: true lr_quadratic_warmup: true
learning_rate: 0.000017 learning_rate: 0.000017
train_on_inputs: false
group_by_length: false
bf16: false bf16: false
fp16: false fp16: false
float16: true float16: true
tf32: true tf32: true
gradient_checkpointing: true gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint: resume_from_checkpoint:
local_rank:
logging_steps: 1 logging_steps: 1
xformers_attention:
flash_attention: flash_attention:
sdp_attention: sdp_attention:
flash_optimum: flash_optimum:
warmup_steps: 100 warmup_steps: 100
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.1 weight_decay: 0.1
special_tokens: special_tokens:
bos_token: "<s>" bos_token: "<s>"


@@ -4,6 +4,9 @@ model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF # Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name # hub_model_id: username/custom_model_name
load_in_8bit: false
load_in_4bit: false
strict: false strict: false
datasets: datasets:
@@ -23,6 +26,7 @@ lora_r:
lora_alpha: lora_alpha:
lora_dropout: lora_dropout:
lora_target_linear: lora_target_linear:
lora_fan_in_fan_out:
lisa_n_layers: 4 lisa_n_layers: 4
lisa_step_interval: 20 lisa_step_interval: 20
@@ -41,12 +45,18 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine lr_scheduler: cosine
learning_rate: 5e-5 # recommendation from lisa paper for 7b learning_rate: 5e-5 # recommendation from lisa paper for 7b
train_on_inputs: false
group_by_length: false
bf16: auto bf16: auto
fp16:
tf32: false tf32: false
gradient_checkpointing: true gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint: resume_from_checkpoint:
local_rank:
logging_steps: 1 logging_steps: 1
xformers_attention:
flash_attention: true flash_attention: true
flash_attn_cross_entropy: false flash_attn_cross_entropy: false
flash_attn_rms_norm: true flash_attn_rms_norm: true
@@ -55,8 +65,13 @@ flash_attn_fuse_mlp: true
warmup_steps: 100 warmup_steps: 100
evals_per_epoch: 4 evals_per_epoch: 4
eval_table_size:
saves_per_epoch: 1 saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.1 weight_decay: 0.1
fsdp:
fsdp_config:
special_tokens: special_tokens:
bos_token: "<s>" bos_token: "<s>"
eos_token: "</s>" eos_token: "</s>"


@@ -4,6 +4,9 @@ model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF # Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name # hub_model_id: username/custom_model_name
load_in_8bit: false
load_in_4bit: false
strict: false strict: false
datasets: datasets:
@@ -23,6 +26,7 @@ lora_r: 32
lora_alpha: 16 lora_alpha: 16
lora_dropout: 0.05 lora_dropout: 0.05
lora_target_linear: true lora_target_linear: true
lora_fan_in_fan_out:
peft: peft:
loftq_config: loftq_config:
loftq_bits: 4 loftq_bits: 4
@@ -40,16 +44,29 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine lr_scheduler: cosine
learning_rate: 0.0002 learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto bf16: auto
fp16:
tf32: false tf32: false
gradient_checkpointing: true gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint: resume_from_checkpoint:
local_rank:
logging_steps: 1 logging_steps: 1
xformers_attention:
flash_attention: true flash_attention: true
s2_attention:
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1 saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0 weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens: special_tokens:


@@ -26,6 +26,7 @@ lora_r: 32
lora_alpha: 16 lora_alpha: 16
lora_dropout: 0.05 lora_dropout: 0.05
lora_target_linear: true lora_target_linear: true
lora_fan_in_fan_out:
wandb_project: wandb_project:
wandb_entity: wandb_entity:
@@ -40,16 +41,29 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine lr_scheduler: cosine
learning_rate: 0.0002 learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto bf16: auto
fp16:
tf32: false tf32: false
gradient_checkpointing: true gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint: resume_from_checkpoint:
local_rank:
logging_steps: 1 logging_steps: 1
xformers_attention:
flash_attention: true flash_attention: true
s2_attention:
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1 saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0 weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens: special_tokens:


@@ -26,7 +26,9 @@ pad_to_sequence_len: true
lora_r: 32 lora_r: 32
lora_alpha: 16 lora_alpha: 16
lora_dropout: 0.05 lora_dropout: 0.05
lora_target_modules:
lora_target_linear: true lora_target_linear: true
lora_fan_in_fan_out:
wandb_project: wandb_project:
wandb_entity: wandb_entity:
@@ -41,19 +43,28 @@ optimizer: adamw_torch_fused
lr_scheduler: cosine lr_scheduler: cosine
learning_rate: 0.00001 learning_rate: 0.00001
train_on_inputs: false
group_by_length: false
bf16: auto bf16: auto
fp16:
tf32: false tf32: false
gradient_checkpointing: true gradient_checkpointing: true
gradient_checkpointing_kwargs: gradient_checkpointing_kwargs:
use_reentrant: true use_reentrant: true
early_stopping_patience:
resume_from_checkpoint: resume_from_checkpoint:
local_rank:
logging_steps: 1 logging_steps: 1
xformers_attention:
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4
eval_table_size:
saves_per_epoch: 1 saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0 weight_decay: 0.0
fsdp: fsdp:
- full_shard - full_shard


@@ -26,7 +26,9 @@ pad_to_sequence_len: true
lora_r: 32 lora_r: 32
lora_alpha: 16 lora_alpha: 16
lora_dropout: 0.05 lora_dropout: 0.05
lora_target_modules:
lora_target_linear: true lora_target_linear: true
lora_fan_in_fan_out:
wandb_project: wandb_project:
wandb_entity: wandb_entity:
@@ -41,16 +43,27 @@ optimizer: paged_adamw_32bit
lr_scheduler: cosine lr_scheduler: cosine
learning_rate: 0.0002 learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto bf16: auto
fp16:
tf32: false tf32: false
gradient_checkpointing: true gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint: resume_from_checkpoint:
local_rank:
logging_steps: 1 logging_steps: 1
xformers_attention:
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4
eval_table_size:
saves_per_epoch: 1 saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0 weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens: special_tokens:


@@ -24,7 +24,9 @@ pad_to_sequence_len: true
lora_r: 8 lora_r: 8
lora_alpha: 16 lora_alpha: 16
lora_dropout: 0.05 lora_dropout: 0.05
lora_target_modules:
lora_target_linear: true lora_target_linear: true
lora_fan_in_fan_out:
relora_steps: 150 relora_steps: 150
relora_warmup_steps: 10 relora_warmup_steps: 10
@@ -43,18 +45,28 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine lr_scheduler: cosine
learning_rate: 0.0002 learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto bf16: auto
fp16:
tf32: false tf32: false
gradient_checkpointing: true gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint: resume_from_checkpoint:
local_rank:
logging_steps: 1 logging_steps: 1
xformers_attention:
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0 weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens: special_tokens:
bos_token: "<s>" bos_token: "<s>"
eos_token: "</s>" eos_token: "</s>"


@@ -45,11 +45,14 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine lr_scheduler: cosine
learning_rate: 0.0002 learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: true bf16: true
fp16: fp16:
tf32: true tf32: true
gradient_checkpointing: true gradient_checkpointing: true
local_rank:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
eager_attention: eager_attention:
@@ -57,4 +60,8 @@ eager_attention:
warmup_ratio: 0.1 warmup_ratio: 0.1
evals_per_epoch: 1 evals_per_epoch: 1
saves_per_epoch: 1 saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0 weight_decay: 0.0
fsdp:
fsdp_config:


@@ -42,19 +42,27 @@ optimizer: adamw_torch_fused
lr_scheduler: cosine lr_scheduler: cosine
learning_rate: 2e-5 learning_rate: 2e-5
train_on_inputs: false
group_by_length: false
bf16: auto bf16: auto
fp16:
tf32: false tf32: false
gradient_checkpointing: true gradient_checkpointing: true
gradient_checkpointing_kwargs: gradient_checkpointing_kwargs:
use_reentrant: false use_reentrant: false
early_stopping_patience:
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
xformers_attention:
flash_attention: true flash_attention: true
warmup_steps: 100 warmup_steps: 100
evals_per_epoch: 2 evals_per_epoch: 2
eval_table_size:
saves_per_epoch: 1 saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0 weight_decay: 0.0
fsdp: fsdp:
- full_shard - full_shard


@@ -1,6 +1,9 @@
base_model: NousResearch/Meta-Llama-3.1-8B base_model: NousResearch/Meta-Llama-3.1-8B
# Automatically upload checkpoint and final model to HF # Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name # hub_model_id: username/custom_model_name
load_in_8bit: false
load_in_4bit: false
strict: false strict: false
datasets: datasets:
@@ -27,19 +30,29 @@ optimizer: paged_adamw_8bit
lr_scheduler: cosine lr_scheduler: cosine
learning_rate: 2e-5 learning_rate: 2e-5
train_on_inputs: false
group_by_length: false
bf16: auto bf16: auto
fp16:
tf32: false tf32: false
gradient_checkpointing: true gradient_checkpointing: true
gradient_checkpointing_kwargs: gradient_checkpointing_kwargs:
use_reentrant: false use_reentrant: false
early_stopping_patience:
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
xformers_attention:
flash_attention: true flash_attention: true
warmup_steps: 100 warmup_steps: 100
evals_per_epoch: 2 evals_per_epoch: 2
eval_table_size:
saves_per_epoch: 1 saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0 weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens: special_tokens:
pad_token: <|end_of_text|> pad_token: <|end_of_text|>


@@ -42,6 +42,7 @@ lora_r: 32
lora_alpha: 16 lora_alpha: 16
lora_dropout: 0.05 lora_dropout: 0.05
lora_target_linear: true lora_target_linear: true
lora_fan_in_fan_out:
wandb_project: wandb_project:
wandb_entity: wandb_entity:
@@ -56,15 +57,28 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine lr_scheduler: cosine
learning_rate: 0.0002 learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto bf16: auto
fp16:
tf32: false tf32: false
gradient_checkpointing: true gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint: resume_from_checkpoint:
local_rank:
logging_steps: 1 logging_steps: 1
xformers_attention:
flash_attention: true flash_attention: true
s2_attention:
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1 saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0 weight_decay: 0.0
fsdp:
fsdp_config:


@@ -37,6 +37,7 @@ lora_r: 32
lora_alpha: 16 lora_alpha: 16
lora_dropout: 0.05 lora_dropout: 0.05
lora_target_linear: true lora_target_linear: true
lora_fan_in_fan_out:
wandb_project: wandb_project:
wandb_entity: wandb_entity:
@@ -51,17 +52,30 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine lr_scheduler: cosine
learning_rate: 0.0002 learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto bf16: auto
fp16:
tf32: false tf32: false
gradient_checkpointing: true gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint: resume_from_checkpoint:
local_rank:
logging_steps: 1 logging_steps: 1
xformers_attention:
flash_attention: true flash_attention: true
s2_attention:
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1 saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0 weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens: special_tokens:
pad_token: <|end_of_text|> pad_token: <|end_of_text|>


@@ -58,6 +58,7 @@ lora_r: 32
lora_alpha: 16 lora_alpha: 16
lora_dropout: 0.05 lora_dropout: 0.05
lora_target_linear: true lora_target_linear: true
lora_fan_in_fan_out:
wandb_project: wandb_project:
wandb_entity: wandb_entity:
@@ -72,15 +73,28 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine lr_scheduler: cosine
learning_rate: 0.0002 learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto bf16: auto
fp16:
tf32: false tf32: false
gradient_checkpointing: true gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint: resume_from_checkpoint:
local_rank:
logging_steps: 1 logging_steps: 1
xformers_attention:
flash_attention: true flash_attention: true
s2_attention:
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1 saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0 weight_decay: 0.0
fsdp:
fsdp_config:


@@ -19,6 +19,7 @@ val_set_size: 0.0
output_dir: ./outputs/lora-out output_dir: ./outputs/lora-out
dataset_exact_deduplication: true dataset_exact_deduplication: true
test_value: true
sequence_len: 4096 sequence_len: 4096
sample_packing: true sample_packing: true
@@ -31,6 +32,7 @@ lora_r: 32
lora_alpha: 16 lora_alpha: 16
lora_dropout: 0.05 lora_dropout: 0.05
lora_target_linear: true lora_target_linear: true
lora_fan_in_fan_out:
lora_modules_to_save: lora_modules_to_save:
- embed_tokens - embed_tokens
- lm_head - lm_head
@@ -48,17 +50,30 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine lr_scheduler: cosine
learning_rate: 0.0002 learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto bf16: auto
fp16:
tf32: false tf32: false
gradient_checkpointing: true gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint: resume_from_checkpoint:
local_rank:
logging_steps: 1 logging_steps: 1
xformers_attention:
flash_attention: true flash_attention: true
s2_attention:
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1 saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0 weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens: special_tokens:
pad_token: <|end_of_text|> pad_token: <|end_of_text|>


@@ -1,6 +1,9 @@
base_model: NousResearch/Llama-3.2-1B base_model: NousResearch/Llama-3.2-1B
# Automatically upload checkpoint and final model to HF # Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name # hub_model_id: username/custom_model_name
load_in_8bit: false
load_in_4bit: false
strict: false strict: false
datasets: datasets:
@@ -21,6 +24,7 @@ lora_r: 16
lora_alpha: 32 lora_alpha: 32
# Currently, we don't support dropout with our custom Triton kernels # Currently, we don't support dropout with our custom Triton kernels
# lora_dropout: 0.05 # lora_dropout: 0.05
lora_fan_in_fan_out:
lora_target_modules: lora_target_modules:
- gate_proj - gate_proj
- down_proj - down_proj
@@ -49,12 +53,18 @@ optimizer: adamw_8bit
lr_scheduler: cosine lr_scheduler: cosine
learning_rate: 0.0002 learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto bf16: auto
fp16:
tf32: false tf32: false
gradient_checkpointing: true gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint: resume_from_checkpoint:
local_rank:
logging_steps: 1 logging_steps: 1
xformers_attention:
flash_attention: true flash_attention: true
loss_watchdog_threshold: 5.0 loss_watchdog_threshold: 5.0
@@ -63,6 +73,10 @@ loss_watchdog_patience: 3
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0 weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens: special_tokens:
pad_token: "<|end_of_text|>" pad_token: "<|end_of_text|>"


@@ -1,6 +1,9 @@
base_model: NousResearch/Llama-3.2-1B base_model: NousResearch/Llama-3.2-1B
# Automatically upload checkpoint and final model to HF # Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name # hub_model_id: username/custom_model_name
load_in_8bit: false
load_in_4bit: false
strict: false strict: false
datasets: datasets:
@@ -21,6 +24,7 @@ pad_to_sequence_len: true
lora_r: 16 lora_r: 16
lora_alpha: 32 lora_alpha: 32
lora_dropout: 0.05 lora_dropout: 0.05
lora_fan_in_fan_out:
lora_target_modules: lora_target_modules:
- gate_proj - gate_proj
- down_proj - down_proj
@@ -43,12 +47,18 @@ optimizer: adamw_8bit
lr_scheduler: cosine lr_scheduler: cosine
learning_rate: 0.0002 learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto bf16: auto
fp16:
tf32: false tf32: false
gradient_checkpointing: true gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint: resume_from_checkpoint:
local_rank:
logging_steps: 1 logging_steps: 1
xformers_attention:
flash_attention: true flash_attention: true
loss_watchdog_threshold: 5.0 loss_watchdog_threshold: 5.0
@@ -57,9 +67,11 @@ loss_watchdog_patience: 3
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
debug:
deepspeed: deepspeed_configs/zero3.json deepspeed: deepspeed_configs/zero3.json
weight_decay: 0.0 weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens: special_tokens:
pad_token: "<|end_of_text|>" pad_token: "<|end_of_text|>"


@@ -1,66 +0,0 @@
base_model: meta-llama/Llama-3.2-1B
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
load_in_8bit: true
load_in_4bit: false
strict: false
datasets:
- path: mhenrichsen/alpaca_2k_test
type: alpaca
- path: mhenrichsen/alpaca_2k_test
type: alpaca
dataset_prepared_path:
val_set_size: 0.0
output_dir: ./outputs/lora-out
test_value: true
sequence_len: 4096
sample_packing: true
sample_packing_sequentially: true
curriculum_sampling: true
eval_sample_packing: false
pad_to_sequence_len: true
adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_modules_to_save:
- embed_tokens
- lm_head
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002
bf16: auto
tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:
pad_token: <|end_of_text|>


@@ -1,6 +1,9 @@
base_model: NousResearch/Llama-3.2-1B base_model: NousResearch/Llama-3.2-1B
# Automatically upload checkpoint and final model to HF # Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name # hub_model_id: username/custom_model_name
load_in_8bit: false
load_in_4bit: false
strict: false strict: false
datasets: datasets:
@@ -21,6 +24,7 @@ pad_to_sequence_len: true
lora_r: 16 lora_r: 16
lora_alpha: 32 lora_alpha: 32
lora_dropout: 0.05 lora_dropout: 0.05
lora_fan_in_fan_out:
lora_target_modules: lora_target_modules:
- gate_proj - gate_proj
- down_proj - down_proj
@@ -43,12 +47,18 @@ optimizer: adamw_8bit
lr_scheduler: cosine lr_scheduler: cosine
learning_rate: 0.0002 learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto bf16: auto
fp16:
tf32: false tf32: false
gradient_checkpointing: true gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint: resume_from_checkpoint:
local_rank:
logging_steps: 1 logging_steps: 1
xformers_attention:
flash_attention: true flash_attention: true
loss_watchdog_threshold: 5.0 loss_watchdog_threshold: 5.0
@@ -57,6 +67,10 @@ loss_watchdog_patience: 3
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0 weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens: special_tokens:
pad_token: "<|end_of_text|>" pad_token: "<|end_of_text|>"


@@ -27,6 +27,7 @@ lora_r: 32
lora_alpha: 16 lora_alpha: 16
lora_dropout: 0.05 lora_dropout: 0.05
lora_target_linear: true lora_target_linear: true
lora_fan_in_fan_out:
lora_modules_to_save: lora_modules_to_save:
- embed_tokens - embed_tokens
- lm_head - lm_head
@@ -44,17 +45,30 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine lr_scheduler: cosine
learning_rate: 0.0002 learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto bf16: auto
fp16:
tf32: false tf32: false
gradient_checkpointing: true gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint: resume_from_checkpoint:
local_rank:
logging_steps: 1 logging_steps: 1
xformers_attention:
flash_attention: true flash_attention: true
s2_attention:
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1 saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0 weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens: special_tokens:
pad_token: <|end_of_text|> pad_token: <|end_of_text|>


@@ -32,6 +32,7 @@ lora_r: 32
lora_alpha: 64 lora_alpha: 64
lora_dropout: 0.05 lora_dropout: 0.05
lora_target_linear: true lora_target_linear: true
lora_fan_in_fan_out:
wandb_project: wandb_project:
wandb_entity: wandb_entity:
@@ -46,19 +47,31 @@ optimizer: adamw_8bit
lr_scheduler: cosine lr_scheduler: cosine
learning_rate: 0.0002 learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto bf16: auto
fp16:
tf32: true tf32: true
gradient_checkpointing: true gradient_checkpointing: true
gradient_checkpointing_kwargs: gradient_checkpointing_kwargs:
use_reentrant: false use_reentrant: false
early_stopping_patience:
resume_from_checkpoint: resume_from_checkpoint:
local_rank:
logging_steps: 1 logging_steps: 1
xformers_attention:
flash_attention: true flash_attention: true
warmup_steps: 20 warmup_steps: 20
evals_per_epoch: 4 evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1 saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0 weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens: special_tokens:
pad_token: "<|end_of_text|>" pad_token: "<|end_of_text|>"


@@ -24,6 +24,7 @@ pad_to_sequence_len: true
lora_r: 32 lora_r: 32
lora_alpha: 16 lora_alpha: 16
lora_dropout: 0.05 lora_dropout: 0.05
lora_fan_in_fan_out:
lora_target_modules: lora_target_modules:
- gate_proj - gate_proj
- down_proj - down_proj
@@ -46,12 +47,18 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine lr_scheduler: cosine
learning_rate: 0.0002 learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto bf16: auto
fp16:
tf32: false tf32: false
gradient_checkpointing: true gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint: resume_from_checkpoint:
local_rank:
logging_steps: 1 logging_steps: 1
xformers_attention:
flash_attention: true flash_attention: true
loss_watchdog_threshold: 5.0 loss_watchdog_threshold: 5.0
@@ -59,7 +66,13 @@ loss_watchdog_patience: 3
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1 saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0 weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens: special_tokens:
pad_token: "<|end_of_text|>" pad_token: "<|end_of_text|>"


@@ -24,6 +24,7 @@ pad_to_sequence_len: true
lora_r: 16 lora_r: 16
lora_alpha: 16 lora_alpha: 16
lora_dropout: 0.05 lora_dropout: 0.05
lora_target_modules:
lora_target_linear: true lora_target_linear: true
gradient_accumulation_steps: 4 gradient_accumulation_steps: 4
@@ -33,6 +34,8 @@ optimizer: adamw_torch_fused
lr_scheduler: cosine lr_scheduler: cosine
learning_rate: 0.00001 learning_rate: 0.00001
train_on_inputs: false
group_by_length: false
bf16: true bf16: true
tf32: true tf32: true


@@ -26,7 +26,9 @@ pad_to_sequence_len: true
lora_r: 8 lora_r: 8
lora_alpha: 16 lora_alpha: 16
lora_dropout: 0.05 lora_dropout: 0.05
lora_target_modules:
lora_target_linear: true lora_target_linear: true
lora_fan_in_fan_out:
wandb_project: wandb_project:
wandb_entity: wandb_entity:
@@ -41,19 +43,28 @@ optimizer: adamw_torch_fused
lr_scheduler: cosine lr_scheduler: cosine
learning_rate: 0.00001 learning_rate: 0.00001
train_on_inputs: false
group_by_length: false
bf16: auto bf16: auto
fp16:
tf32: false tf32: false
gradient_checkpointing: true gradient_checkpointing: true
gradient_checkpointing_kwargs: gradient_checkpointing_kwargs:
use_reentrant: true use_reentrant: true
early_stopping_patience:
resume_from_checkpoint: resume_from_checkpoint:
local_rank:
logging_steps: 1 logging_steps: 1
xformers_attention:
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4
eval_table_size:
saves_per_epoch: 1 saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0 weight_decay: 0.0
fsdp: fsdp:
- full_shard - full_shard


@@ -26,7 +26,9 @@ pad_to_sequence_len: true
lora_r: 32 lora_r: 32
lora_alpha: 16 lora_alpha: 16
lora_dropout: 0.05 lora_dropout: 0.05
lora_target_modules:
lora_target_linear: true lora_target_linear: true
lora_fan_in_fan_out:
wandb_project: wandb_project:
wandb_entity: wandb_entity:
@@ -41,17 +43,28 @@ optimizer: paged_adamw_32bit
lr_scheduler: cosine lr_scheduler: cosine
learning_rate: 0.0002 learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto bf16: auto
fp16:
tf32: false tf32: false
gradient_checkpointing: true gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint: resume_from_checkpoint:
local_rank:
logging_steps: 1 logging_steps: 1
xformers_attention:
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4
eval_table_size:
saves_per_epoch: 1 saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0 weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens: special_tokens:
pad_token: "<|end_of_text|>" pad_token: "<|end_of_text|>"


@@ -1,75 +0,0 @@
base_model: meta-llama/Llama-4-Scout-17B-16E
model_type: Llama4ForConditionalGeneration
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
strict: false
# torch_compile: true
adapter: lora
lora_r: 32
lora_alpha: 64
lora_target_modules:
- self_attn.q_proj
- self_attn.k_proj
- self_attn.v_proj
- self_attn.o_proj
lora_modules_to_save:
- lm_head
- embed_tokens
chat_template: llama4
datasets:
- path: mlabonne/FineTome-100k
type: chat_template
split: train[:20%]
field_messages: conversations
message_property_mappings:
role: from
content: value
dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/out
sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true
gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_torch_8bit
lr_scheduler: cosine
learning_rate: 2e-5
bf16: true
tf32: true
# gradient_checkpointing: true
# gradient_checkpointing_kwargs:
# use_reentrant: false
logging_steps: 1
flash_attention: true
warmup_steps: 100
evals_per_epoch: 2
saves_per_epoch: 1
weight_decay: 0.0
fsdp:
- auto_wrap
- full_shard
fsdp_config:
fsdp_version: 2
fsdp_offload_params: false
fsdp_cpu_ram_efficient_loading: true
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
fsdp_transformer_layer_cls_to_wrap: Llama4TextDecoderLayer
fsdp_state_dict_type: SHARDED_STATE_DICT
fsdp_sharding_strategy: FULL_SHARD
fsdp_reshard_after_forward: true
fsdp_activation_checkpointing: true
special_tokens:
pad_token: <|finetune_right_pad_id|>
eos_token: <|eot|>


@@ -1,56 +0,0 @@
base_model: llava-hf/llava-1.5-7b-hf
processor_type: AutoProcessor
strict: false
# these 3 lines are needed for now to handle vision chat templates w images
skip_prepare_dataset: true
remove_unused_columns: false
sample_packing: false
chat_template: llava
datasets:
- path: HuggingFaceH4/llava-instruct-mix-vsft
type: chat_template
split: train[:1%]
field_messages: messages
dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/out
adapter: lora
lora_model_dir:
sequence_len: 8192
pad_to_sequence_len: false
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002
bf16: true
fp16:
tf32: true
gradient_checkpointing: true
logging_steps: 1
flash_attention: true
eager_attention:
warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0


@@ -5,6 +5,9 @@ tokenizer_type: AutoTokenizer
tokenizer_config: EleutherAI/gpt-neox-20b tokenizer_config: EleutherAI/gpt-neox-20b
# Automatically upload checkpoint and final model to HF # Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name # hub_model_id: username/custom_model_name
load_in_8bit: false
load_in_4bit: false
strict: false strict: false
datasets: datasets:
@@ -35,17 +38,27 @@ train_on_inputs: false
group_by_length: true group_by_length: true
bf16: auto bf16: auto
fp16:
tf32: true tf32: true
gradient_checkpointing: false gradient_checkpointing: false
early_stopping_patience:
resume_from_checkpoint: resume_from_checkpoint:
local_rank:
logging_steps: 1 logging_steps: 1
xformers_attention:
flash_attention: flash_attention:
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1 saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0 weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens: special_tokens:
tokens: tokens:
save_safetensors: False save_safetensors: False


@@ -6,6 +6,9 @@ tokenizer_type: LlamaTokenizer
# hub_model_id: username/custom_model_name # hub_model_id: username/custom_model_name
trust_remote_code: true trust_remote_code: true
load_in_8bit: false
load_in_4bit: false
strict: false strict: false
unfrozen_parameters: unfrozen_parameters:
@@ -37,19 +40,27 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine lr_scheduler: cosine
learning_rate: 0.0001 learning_rate: 0.0001
train_on_inputs: false
group_by_length: false
bf16: auto bf16: auto
fp16:
tf32: false tf32: false
gradient_checkpointing: true gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint: resume_from_checkpoint:
local_rank:
logging_steps: 1 logging_steps: 1
xformers_attention:
flash_attention: true flash_attention: true
save_total_limit: 1 save_total_limit: 1
save_steps: save_steps:
debug:
deepspeed: deepspeed_configs/zero3_bf16_cpuoffload_params.json deepspeed: deepspeed_configs/zero3_bf16_cpuoffload_params.json
weight_decay: 0.0 weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens: special_tokens:
eos_token: "<|im_end|>" eos_token: "<|im_end|>"
tokens: tokens:


@@ -4,6 +4,9 @@ model_type: MistralForCausalLM
tokenizer_type: LlamaTokenizer tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF # Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name # hub_model_id: username/custom_model_name
load_in_8bit: false
load_in_4bit: false
strict: false strict: false
datasets: datasets:
@@ -31,16 +34,28 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine lr_scheduler: cosine
learning_rate: 0.000005 learning_rate: 0.000005
train_on_inputs: false
group_by_length: false
bf16: auto bf16: auto
fp16:
tf32: false tf32: false
gradient_checkpointing: true gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint: resume_from_checkpoint:
local_rank:
logging_steps: 1 logging_steps: 1
xformers_attention:
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1 saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0 weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens: special_tokens:


@@ -4,6 +4,9 @@ model_type: MistralForCausalLM
tokenizer_type: LlamaTokenizer tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF # Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name # hub_model_id: username/custom_model_name
load_in_8bit: false
load_in_4bit: false
strict: false strict: false
datasets: datasets:
@@ -25,6 +28,7 @@ lora_r: 32
lora_alpha: 16 lora_alpha: 16
lora_dropout: 0.05 lora_dropout: 0.05
lora_target_linear: true lora_target_linear: true
lora_fan_in_fan_out:
lora_target_modules: lora_target_modules:
- gate_proj - gate_proj
- down_proj - down_proj
@@ -47,13 +51,18 @@ optimizer: adamw_torch_fused
lr_scheduler: cosine lr_scheduler: cosine
learning_rate: 0.0002 learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto bf16: auto
fp16: false fp16: false
tf32: true tf32: true
gradient_checkpointing: true gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint: resume_from_checkpoint:
local_rank:
logging_steps: 1 logging_steps: 1
xformers_attention:
flash_attention: false flash_attention: false
sdp_attention: true sdp_attention: true
@@ -62,6 +71,12 @@ loss_watchdog_patience: 3
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4
eval_table_size:
eval_table_max_new_tokens: 128
saves_per_epoch: 1 saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0 weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens: special_tokens:


@@ -27,6 +27,7 @@ lora_r: 32
lora_alpha: 16 lora_alpha: 16
lora_dropout: 0.05 lora_dropout: 0.05
lora_target_linear: true lora_target_linear: true
lora_fan_in_fan_out:
lora_target_modules: lora_target_modules:
- gate_proj - gate_proj
- down_proj - down_proj
@@ -49,12 +50,18 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine lr_scheduler: cosine
learning_rate: 0.0002 learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto bf16: auto
fp16:
tf32: false tf32: false
gradient_checkpointing: true gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint: resume_from_checkpoint:
local_rank:
logging_steps: 1 logging_steps: 1
xformers_attention:
flash_attention: true flash_attention: true
loss_watchdog_threshold: 5.0 loss_watchdog_threshold: 5.0
@@ -62,6 +69,12 @@ loss_watchdog_patience: 3
warmup_steps: 10 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1 saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0 weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens: special_tokens:


@@ -40,6 +40,7 @@ lora_r: 8
lora_alpha: 16
lora_dropout: 0.2
lora_target_linear: true
lora_fan_in_fan_out:
lora_target_modules:
- gate_proj
@@ -66,18 +67,31 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0001
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: false
s2_attention:
warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
bos_token: "<|im_start|>"
eos_token: "<|im_end|>"
@@ -32,6 +32,7 @@ lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:
wandb_project:
wandb_entity:
@@ -46,12 +47,18 @@ optimizer: paged_adamw_8bit
lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
loss_watchdog_threshold: 5.0
@@ -59,8 +66,10 @@ loss_watchdog_patience: 3
warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
weight_decay: 0.0
fsdp:
- full_shard
@@ -32,6 +32,7 @@ lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:
lora_target_modules:
- gate_proj
- down_proj
@@ -54,12 +55,18 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
loss_watchdog_threshold: 5.0
@@ -67,6 +74,12 @@ loss_watchdog_patience: 3
warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
@@ -1,59 +0,0 @@
base_model: mistralai/Mistral-Small-3.1-24B-Instruct-2503
processor_type: AutoProcessor
strict: false
load_in_8bit: true
# these 3 lines are needed for now to handle vision chat templates w images
skip_prepare_dataset: true
remove_unused_columns: false
sample_packing: false
chat_template: mistral_v7_tekken
datasets:
- path: HuggingFaceH4/llava-instruct-mix-vsft
type: chat_template
split: train[:1%]
field_messages: messages
dataset_prepared_path: last_run_prepared
val_set_size: 0.01
output_dir: ./outputs/out
adapter: lora
lora_model_dir:
sequence_len: 2048
pad_to_sequence_len: false
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002
bf16: true
fp16:
tf32: true
gradient_checkpointing: true
logging_steps: 1
flash_attention: false # PixtralVisionModel does not support Flash Attention 2.0 yet.
eager_attention:
warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:
@@ -30,6 +30,7 @@ lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:
wandb_project:
wandb_entity:
@@ -44,12 +45,18 @@ optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: true
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
loss_watchdog_threshold: 5.0
@@ -57,8 +64,10 @@ loss_watchdog_patience: 3
warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
weight_decay: 0.0
fsdp:
- full_shard
@@ -32,6 +32,7 @@ lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:
wandb_project:
wandb_entity:
@@ -46,12 +47,18 @@ optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: true
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
loss_watchdog_threshold: 5.0
@@ -59,8 +66,10 @@ loss_watchdog_patience: 3
warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
weight_decay: 0.0
fsdp:
- full_shard
@@ -41,6 +41,7 @@ lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:
#lora_target_modules:
# - gate
# - q_proj
@@ -64,12 +65,18 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
loss_watchdog_threshold: 5.0
@@ -77,8 +84,12 @@ loss_watchdog_patience: 3
warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
deepspeed: deepspeed_configs/zero2.json
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
@@ -6,6 +6,9 @@ tokenizer_type: LlamaTokenizer
# hub_model_id: username/custom_model_name
trust_remote_code: true
load_in_8bit: false
load_in_4bit: false
strict: false
unfrozen_parameters:
@@ -35,19 +38,27 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0001
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
save_total_limit: 1
save_steps:
debug:
deepspeed: deepspeed_configs/zero3_bf16_cpuoffload_all.json
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
eos_token: "<|im_end|>"
tokens:
@@ -27,6 +27,7 @@ lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:
lora_target_modules:
- gate_proj
- down_proj
@@ -49,12 +50,18 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
loss_watchdog_threshold: 5.0
@@ -62,6 +69,12 @@ loss_watchdog_patience: 3
warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
@@ -35,17 +35,26 @@ optimizer: adamw_bnb_8bit
torchdistx_path:
lr_scheduler: cosine
learning_rate: 0.0000002
train_on_inputs: false
group_by_length: false
bf16: auto
tf32: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 5
xformers_attention:
flash_attention:
gptq_groupsize:
gptq_model_v1:
warmup_steps: 20
evals_per_epoch: 4
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0001
fsdp:
fsdp_config:
tokens:
pad_token: "<|padding|>"
bos_token: "<|endoftext|>"
@@ -4,6 +4,9 @@ model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
load_in_8bit: false
load_in_4bit: false
strict: false
push_dataset_to_hub:
datasets:
@@ -20,6 +23,7 @@ lora_alpha:
lora_dropout:
lora_target_modules:
lora_target_linear:
lora_fan_in_fan_out:
wandb_project:
wandb_entity:
wandb_watch:
@@ -33,20 +37,29 @@ optimizer: adamw_bnb_8bit
torchdistx_path:
lr_scheduler: cosine
learning_rate: 0.000003
train_on_inputs: false
group_by_length: false
float16: true
bf16: false
fp16: false
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
gptq_groupsize:
gptq_model_v1:
warmup_steps: 20
evals_per_epoch: 4
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.1
fsdp:
fsdp_config:
special_tokens:
bos_token: "<s>"
eos_token: "</s>"
@@ -29,6 +29,7 @@ lora_target_modules:
- v_proj
- k_proj
- o_proj
lora_fan_in_fan_out:
wandb_project:
wandb_entity:
wandb_watch:
@@ -42,19 +43,29 @@ optimizer: adamw_bnb_8bit
torchdistx_path:
lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: false
fp16: true
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
gptq_groupsize:
s2_attention:
gptq_model_v1:
warmup_steps: 20
evals_per_epoch: 4
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.1
fsdp:
fsdp_config:
special_tokens:
bos_token: "<s>"
eos_token: "</s>"
@@ -21,7 +21,9 @@ sample_packing: true
lora_r: 8
lora_alpha: 32
lora_dropout: 0.05
lora_target_modules:
lora_target_linear: true
lora_fan_in_fan_out:
wandb_project:
wandb_entity:
wandb_watch:
@@ -35,19 +37,28 @@ optimizer: paged_adamw_32bit
torchdistx_path:
lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: false
fp16: true
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
gptq_groupsize:
gptq_model_v1:
warmup_steps: 20
evals_per_epoch: 4
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.1
fsdp:
fsdp_config:
special_tokens:
bos_token: "<s>"
eos_token: "</s>"
@@ -37,6 +37,7 @@ lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:
wandb_project:
wandb_entity:
@@ -51,16 +52,28 @@ optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bfloat16: true
bf16: true
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
s2_attention:
warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 4
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
@@ -4,6 +4,9 @@ model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
load_in_8bit: false
load_in_4bit: false
strict: false
datasets:
@@ -24,6 +27,7 @@ lora_r:
lora_alpha:
lora_dropout:
lora_target_linear:
lora_fan_in_fan_out:
wandb_project:
wandb_entity:
@@ -41,20 +45,30 @@ max_grad_norm: 1.0
lr_scheduler: cosine
learning_rate: 0.000003
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: true
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: True
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
warmup_steps: 100
evals_per_epoch: 4
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.1
fsdp:
fsdp_config:
resize_token_embeddings_to_32x: true
special_tokens:
pad_token: "<|endoftext|>"
@@ -27,6 +27,7 @@ lora_r: 64
lora_alpha: 32
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:
wandb_project:
wandb_entity:
@@ -44,20 +45,30 @@ max_grad_norm: 1.0
lr_scheduler: cosine
learning_rate: 0.000003
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: true
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: True
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
warmup_steps: 100
evals_per_epoch: 4
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.1
fsdp:
fsdp_config:
resize_token_embeddings_to_32x: true
special_tokens:
pad_token: "<|endoftext|>"
@@ -4,6 +4,9 @@ model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
load_in_8bit: false
load_in_4bit: false
strict: false
datasets:
@@ -24,6 +27,7 @@ lora_r:
lora_alpha:
lora_dropout:
lora_target_linear:
lora_fan_in_fan_out:
wandb_project:
wandb_entity:
@@ -41,20 +45,30 @@ max_grad_norm: 1.0
lr_scheduler: cosine
learning_rate: 0.000003
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: true
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: True
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
warmup_steps: 100
evals_per_epoch: 4
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.1
fsdp:
fsdp_config:
resize_token_embeddings_to_32x: true
special_tokens:
pad_token: "<|endoftext|>"
@@ -4,6 +4,9 @@ model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
load_in_8bit: false
load_in_4bit: false
strict: false
datasets:
@@ -25,6 +28,7 @@ lora_r:
lora_alpha:
lora_dropout:
lora_target_linear:
lora_fan_in_fan_out:
wandb_project: phi3
wandb_entity:
@@ -42,19 +46,27 @@ max_grad_norm: 1.0
lr_scheduler: cosine
learning_rate: 0.000003
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: true
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
warmup_steps: 100
evals_per_epoch: 4
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.1
fsdp:
- full_shard
@@ -7,6 +7,9 @@ tokenizer_type: AutoTokenizer
# hub_model_id: username/custom_model_name
chat_template: phi_3
load_in_8bit: false
load_in_4bit: false
strict: false
datasets:
@@ -27,6 +30,7 @@ lora_r: 64
lora_alpha: 32
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:
gradient_accumulation_steps: 1
micro_batch_size: 2
@@ -38,6 +42,8 @@ max_grad_norm: 1.0
lr_scheduler: cosine
learning_rate: 5.0e-6
train_on_inputs: false
group_by_length: false
bf16: auto
gradient_checkpointing: true
@@ -49,9 +55,9 @@ flash_attention: true
eval_steps: 1000
save_steps: 5000
eval_table_size: 2
eval_batch_size: 2
eval_sample_packing: false
eval_table_size: 2
eval_max_new_tokens: 32
eval_causal_lm_metrics: ["perplexity"]
do_causal_lm_eval: true
Some files were not shown because too many files have changed in this diff