Compare commits


13 Commits

Author | SHA1 | Message | Date
Salman Mohammadi | deb01959d2 | raising value error | 2025-04-09 17:54:24 +01:00
Salman Mohammadi | 76ae4ae238 | Merge branch 'main' into flex_patching_update | 2025-04-09 16:51:05 +01:00
Salman Mohammadi | 2f147cc6ff | fixing tests | 2025-04-08 17:23:21 +01:00
Salman Mohammadi | 6f47b1e896 | merging | 2025-04-08 17:20:53 +01:00
Salman Mohammadi | e1a8dfbe8c | pinning transformers version | 2025-04-08 17:17:23 +01:00
salman | cdb16069af | fixing transformers version | 2025-04-08 09:23:46 -04:00
Sunny Liu | 75c565d476 | add back dynamic=False | 2025-04-08 09:23:46 -04:00
Sunny Liu | bdaaba2784 | remove backend='inductor' in local patch | 2025-04-08 09:23:46 -04:00
Sunny Liu | 04624c5a8d | bump flex patching transformers to v4.51, update torch compile kwargs to be in line with transformers v4.51 | 2025-04-08 09:23:46 -04:00
salman | b98dbafc31 | fixing transformers version | 2025-04-08 11:28:52 +01:00
Sunny Liu | 4d320e2e4d | add back dynamic=False | 2025-04-07 17:06:51 -04:00
Sunny Liu | 421e0ee499 | remove backend='inductor' in local patch | 2025-04-07 17:05:08 -04:00
Sunny Liu | 4e8677027a | bump flex patching transformers to v4.51, update torch compile kwargs to be in line with transformers v4.51 | 2025-04-07 15:12:45 -04:00
287 changed files with 998 additions and 8268 deletions


@@ -1,14 +0,0 @@
[run]
source = axolotl
omit =
*/tests/*
setup.py
[report]
exclude_lines =
pragma: no cover
def __repr__
raise NotImplementedError
if __name__ == .__main__.:
pass
raise ImportError


@@ -22,6 +22,12 @@ jobs:
  fail-fast: false
  matrix:
  include:
+ - cuda: "124"
+ cuda_version: 12.4.1
+ cudnn_version: ""
+ python_version: "3.11"
+ pytorch: 2.4.1
+ torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
  - cuda: "124"
  cuda_version: 12.4.1
  cudnn_version: ""
@@ -40,18 +46,6 @@ jobs:
  python_version: "3.11"
  pytorch: 2.6.0
  torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
- - cuda: "126"
- cuda_version: 12.6.3
- cudnn_version: ""
- python_version: "3.11"
- pytorch: 2.7.0
- torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
- - cuda: "128"
- cuda_version: 12.6.3
- cudnn_version: ""
- python_version: "3.11"
- pytorch: 2.7.0
- torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
  - cuda: "128"
  cuda_version: 12.8.1
  cudnn_version: ""


@@ -18,19 +18,19 @@ jobs:
  - cuda: 124
  cuda_version: 12.4.1
  python_version: "3.11"
- pytorch: 2.5.1
+ pytorch: 2.4.1
  axolotl_extras:
  - cuda: 124
  cuda_version: 12.4.1
  python_version: "3.11"
- pytorch: 2.6.0
+ pytorch: 2.5.1
  axolotl_extras: vllm
- is_latest: true
- - cuda: 126
- cuda_version: 12.6.3
+ - cuda: 124
+ cuda_version: 12.4.1
  python_version: "3.11"
- pytorch: 2.7.0
+ pytorch: 2.6.0
  axolotl_extras:
+ is_latest: true
  runs-on: axolotl-gpu-runner
  steps:
  - name: Checkout
@@ -62,7 +62,6 @@ jobs:
  CUDA=${{ matrix.cuda }}
  PYTORCH_VERSION=${{ matrix.pytorch }}
  AXOLOTL_ARGS=${{ matrix.axolotl_args }}
- AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}
  file: ./docker/Dockerfile
  push: ${{ github.event_name != 'pull_request' }}
  tags: |
@@ -78,6 +77,11 @@ jobs:
  strategy:
  matrix:
  include:
+ - cuda: 124
+ cuda_version: 12.4.1
+ python_version: "3.11"
+ pytorch: 2.4.1
+ axolotl_extras:
  - cuda: 124
  cuda_version: 12.4.1
  python_version: "3.11"
@@ -89,11 +93,6 @@ jobs:
  pytorch: 2.6.0
  axolotl_extras:
  is_latest: true
- - cuda: 126
- cuda_version: 12.6.3
- python_version: "3.11"
- pytorch: 2.7.0
- axolotl_extras:
  runs-on: axolotl-gpu-runner
  steps:
  - name: Checkout
@@ -139,7 +138,7 @@ jobs:
  - cuda: 124
  cuda_version: 12.4.1
  python_version: "3.11"
- pytorch: 2.6.0
+ pytorch: 2.4.1
  axolotl_extras:
  runs-on: axolotl-gpu-runner
  steps:


@@ -8,8 +8,6 @@ on:
  - 'setup.py'
  - 'pyproject.toml'
  - '.github/workflows/multi-gpu-e2e.yml'
- - 'src/axolotl/core/trainers/mixins/sequence_parallel.py'
- - 'src/axolotl/utils/distributed.py'
  workflow_dispatch:
  schedule:
  - cron: '0 0 * * 1,4' # Runs at 00:00 UTC every monday & thursday
@@ -36,15 +34,15 @@ jobs:
  - cuda: 124
  cuda_version: 12.4.1
  python_version: "3.11"
- pytorch: 2.5.1
- axolotl_extras:
+ pytorch: 2.4.1
+ axolotl_extras: # no vllm support for 2.4.1
  num_gpus: 2
  nightly_build: "true"
- - cuda: 126
- cuda_version: 12.6.3
+ - cuda: 124
+ cuda_version: 12.4.1
  python_version: "3.11"
- pytorch: 2.7.0
- axolotl_extras:
+ pytorch: 2.5.1
+ axolotl_extras: vllm
  num_gpus: 2
  nightly_build: "true"
  runs-on: [self-hosted, modal]
@@ -69,7 +67,6 @@ jobs:
  echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
  echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
  echo "NIGHTLY_BUILD=${{ matrix.nightly_build }}" >> $GITHUB_ENV
- echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
  - name: Run tests job on Modal
  run: |
  modal run cicd.multigpu


@@ -12,6 +12,11 @@ jobs:
  fail-fast: false
  matrix:
  include:
+ - cuda: 124
+ cuda_version: 12.4.1
+ python_version: "3.11"
+ pytorch: 2.4.1
+ axolotl_extras:
  - cuda: 124
  cuda_version: 12.4.1
  python_version: "3.11"
@@ -65,6 +70,11 @@ jobs:
  strategy:
  matrix:
  include:
+ - cuda: 124
+ cuda_version: 12.4.1
+ python_version: "3.11"
+ pytorch: 2.4.1
+ axolotl_extras:
  - cuda: 124
  cuda_version: 12.4.1
  python_version: "3.11"


@@ -1,55 +0,0 @@
name: Preview
on:
workflow_dispatch:
pull_request:
types: [opened, synchronize, reopened]
permissions:
checks: write
contents: write
deployments: write
issues: write
discussions: write
pages: write
pull-requests: write
statuses: write
jobs:
preview:
runs-on: ubuntu-latest
steps:
- name: Check out repository
uses: actions/checkout@v4
- name: Set up Quarto
uses: quarto-dev/quarto-actions/setup@v2
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: '3.11'
- name: Install dependencies
run: |
python3 -m pip install jupyter quartodoc
python3 -m pip install -e . --no-deps
- name: Build autodoc
run: quartodoc build
- name: Quarto render
run: quarto render
- name: Netlify Publish
uses: nwtgck/actions-netlify@v3.0
with:
publish-dir: './_site'
enable-pull-request-comment: true
enable-github-deployment: true
github-token: ${{ secrets.GITHUB_TOKEN }}
deploy-message: "Deployed On Netlify"
github-deployment-environment: 'preview'
github-deployment-description: 'Preview Deployment'
env:
NETLIFY_AUTH_TOKEN: ${{ secrets.NETLIFY_AUTH_TOKEN }}
NETLIFY_SITE_ID: ${{ secrets.NETLIFY_SITE_ID }}


@@ -26,7 +26,7 @@ jobs:
  max-parallel: 2
  matrix:
  python_version: ["3.11"]
- pytorch_version: ["2.5.1", "2.6.0", "2.7.0"]
+ pytorch_version: ["2.4.1", "2.5.1", "2.6.0"]
  timeout-minutes: 20
  steps:
@@ -106,6 +106,13 @@ jobs:
  fail-fast: false
  matrix:
  include:
+ - cuda: 124
+ cuda_version: 12.4.1
+ python_version: "3.11"
+ pytorch: 2.4.1
+ num_gpus: 1
+ axolotl_extras:
+ nightly_build: "true"
  - cuda: 124
  cuda_version: 12.4.1
  python_version: "3.11"
@@ -140,7 +147,6 @@ jobs:
  echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
  echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
  echo "NIGHTLY_BUILD=${{ matrix.nightly_build }}" >> $GITHUB_ENV
- echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
  - name: Run tests job on Modal
  run: |
  modal run cicd.e2e_tests


@@ -27,9 +27,6 @@ concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
- env:
- TRANSFORMERS_IS_CI: "yes"
  jobs:
  pre-commit:
  name: pre-commit
@@ -52,7 +49,7 @@ jobs:
  max-parallel: 2
  matrix:
  python_version: ["3.11"]
- pytorch_version: ["2.5.1", "2.6.0", "2.7.0"]
+ pytorch_version: ["2.4.1", "2.5.1", "2.6.0"]
  timeout-minutes: 20
  steps:
@@ -105,17 +102,9 @@ jobs:
  - name: Run tests
  run: |
- pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/ --cov=axolotl --cov-report=xml
- pytest -v tests/patched/ --cov=axolotl --cov-append --cov-report=xml
- pytest -v tests/cli/ --cov=axolotl --cov-append --cov-report=xml
- - name: Upload coverage to Codecov
- uses: codecov/codecov-action@v5
- with:
- token: ${{ secrets.CODECOV_TOKEN }}
- files: ./coverage.xml
- flags: unittests,pytorch-${{ matrix.pytorch_version }}
- fail_ci_if_error: false
+ pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/
+ pytest -v tests/patched/
+ pytest -v tests/cli/
  - name: cleanup pip cache
  run: |
@@ -138,7 +127,7 @@ jobs:
  max-parallel: 1
  matrix:
  python_version: ["3.11"]
- pytorch_version: ["2.5.1", "2.6.0", "2.7.0"]
+ pytorch_version: ["2.4.1", "2.5.1", "2.6.0"]
  timeout-minutes: 20
  steps:
@@ -245,7 +234,6 @@ jobs:
  echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
  echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
  echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
- echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
  - name: Run tests job on Modal
  run: |
  modal run cicd.e2e_tests
@@ -261,12 +249,6 @@ jobs:
  fail-fast: false
  matrix:
  include:
- - cuda: 124
- cuda_version: 12.4.1
- python_version: "3.11"
- pytorch: 2.6.0
- num_gpus: 1
- axolotl_extras: llmcompressor
  - cuda: 124
  cuda_version: 12.4.1
  python_version: "3.11"
@@ -278,13 +260,7 @@ jobs:
  python_version: "3.11"
  pytorch: 2.5.1
  num_gpus: 1
- axolotl_extras:
- - cuda: 126
- cuda_version: 12.6.3
- python_version: "3.11"
- pytorch: 2.7.0
- num_gpus: 1
- axolotl_extras:
+ axolotl_extras: vllm
  steps:
  - name: Checkout
  uses: actions/checkout@v4
@@ -305,7 +281,6 @@ jobs:
  echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
  echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
  echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
- echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
  - name: Run tests job on Modal
  run: |
  modal run cicd.e2e_tests

.runpod/.gitignore (vendored)

@@ -1,161 +0,0 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
pod/scripts/config.yaml


@@ -1,18 +0,0 @@
FROM axolotlai/axolotl-cloud:main-py3.11-cu124-2.6.0
COPY .runpod/requirements.txt /requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install --upgrade pip && \
python3 -m pip install --upgrade -r /requirements.txt
# Environment settings
ARG BASE_VOLUME="/runpod-volume"
ENV BASE_VOLUME=$BASE_VOLUME
ENV HF_DATASETS_CACHE="${BASE_VOLUME}/huggingface-cache/datasets"
ENV HUGGINGFACE_HUB_CACHE="${BASE_VOLUME}/huggingface-cache/hub"
ENV TRANSFORMERS_CACHE="${BASE_VOLUME}/huggingface-cache/hub"
COPY .runpod/src /src
WORKDIR /src
CMD ["python3", "/src/handler.py"]


@@ -1,335 +0,0 @@
<h1>LLM Post-Training: Full Fine-Tune, LoRA, QLoRA, and More for Llama/Mistral/Gemma</h1>
# Configuration Options
This document outlines all available configuration options for training models. The configuration can be provided as a JSON request.
## Usage
You can use these configuration options:
1. As a JSON request body:
```json
{
"input": {
"user_id": "user",
"model_id": "model-name",
"run_id": "run-id",
"credentials": {
"wandb_api_key": "", # add your Weights & biases key. TODO: you will be able to set this in Enviornment variables.
"hf_token": "", # add your HF_token. TODO: you will be able to set this in Enviornment variables.
},
"args": {
"base_model": "NousResearch/Llama-3.2-1B",
// ... other options
}
}
}
```
## Configuration Options
### Model Configuration
| Option | Description | Default |
| ------------------- | --------------------------------------------------------------------------------------------- | -------------------- |
| `base_model` | Path to the base model (local or HuggingFace) | Required |
| `base_model_config` | Configuration path for the base model | Same as base_model |
| `revision_of_model` | Specific model revision from HuggingFace hub | Latest |
| `tokenizer_config` | Custom tokenizer configuration path | Optional |
| `model_type` | Type of model to load | AutoModelForCausalLM |
| `tokenizer_type` | Type of tokenizer to use | AutoTokenizer |
| `hub_model_id` | Repository ID where the model will be pushed on Hugging Face Hub (format: username/repo-name) | Optional |
## Model Family Identification
| Option | Default | Description |
| -------------------------- | ------- | ------------------------------ |
| `is_falcon_derived_model` | `false` | Whether model is Falcon-based |
| `is_llama_derived_model` | `false` | Whether model is LLaMA-based |
| `is_qwen_derived_model` | `false` | Whether model is Qwen-based |
| `is_mistral_derived_model` | `false` | Whether model is Mistral-based |
## Model Configuration Overrides
| Option | Default | Description |
| ----------------------------------------------- | ---------- | ---------------------------------- |
| `overrides_of_model_config.rope_scaling.type` | `"linear"` | RoPE scaling type (linear/dynamic) |
| `overrides_of_model_config.rope_scaling.factor` | `1.0` | RoPE scaling factor |
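For example, a minimal sketch of a RoPE scaling override, following the `overrides_of_model_config` nesting above (the factor value is a placeholder, not a recommendation):

```yaml
# Illustrative RoPE scaling override; 2.0 is a placeholder value.
overrides_of_model_config:
  rope_scaling:
    type: linear   # or "dynamic"
    factor: 2.0
```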
### Model Loading Options
| Option | Description | Default |
| -------------- | ----------------------------- | ------- |
| `load_in_8bit` | Load model in 8-bit precision | false |
| `load_in_4bit` | Load model in 4-bit precision | false |
| `bf16` | Use bfloat16 precision | false |
| `fp16` | Use float16 precision | false |
| `tf32` | Use tensor float 32 precision | false |
## Memory and Device Settings
| Option | Default | Description |
| ------------------ | --------- | ----------------------- |
| `gpu_memory_limit` | `"20GiB"` | GPU memory limit |
| `lora_on_cpu` | `false` | Load LoRA on CPU |
| `device_map` | `"auto"` | Device mapping strategy |
| `max_memory` | `null` | Max memory per device |
## Training Hyperparameters
| Option | Default | Description |
| ----------------------------- | --------- | --------------------------- |
| `gradient_accumulation_steps` | `1` | Gradient accumulation steps |
| `micro_batch_size` | `2` | Batch size per GPU |
| `eval_batch_size` | `null` | Evaluation batch size |
| `num_epochs` | `4` | Number of training epochs |
| `warmup_steps` | `100` | Warmup steps |
| `warmup_ratio` | `0.05` | Warmup ratio |
| `learning_rate` | `0.00003` | Learning rate |
| `lr_quadratic_warmup` | `false` | Quadratic warmup |
| `logging_steps` | `null` | Logging frequency |
| `eval_steps` | `null` | Evaluation frequency |
| `evals_per_epoch` | `null` | Evaluations per epoch |
| `save_strategy` | `"epoch"` | Checkpoint saving strategy |
| `save_steps` | `null` | Saving frequency |
| `saves_per_epoch` | `null` | Saves per epoch |
| `save_total_limit` | `null` | Maximum checkpoints to keep |
| `max_steps` | `null` | Maximum training steps |
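As a rough sketch, a hyperparameter block built from the defaults above might look like the following (values are placeholders; `warmup_steps` and `warmup_ratio` should not both be set):

```yaml
# Illustrative hyperparameter block; tune values for your own run.
gradient_accumulation_steps: 2   # effective batch = micro_batch_size * this * num GPUs
micro_batch_size: 2
num_epochs: 4
learning_rate: 0.00003
warmup_steps: 100                # set either warmup_steps or warmup_ratio, not both
save_strategy: epoch
save_total_limit: 2
```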
### Dataset Configuration
```yaml
datasets:
- path: vicgalle/alpaca-gpt4 # HuggingFace dataset or TODO: You will be able to add the local path.
type: alpaca # Format type (alpaca, gpteacher, oasst, etc.)
ds_type: json # Dataset type
data_files: path/to/data # Source data files
train_on_split: train # Dataset split to use
```
## Chat Template Settings
| Option | Default | Description |
| ------------------------ | -------------------------------- | ---------------------- |
| `chat_template` | `"tokenizer_default"` | Chat template type |
| `chat_template_jinja` | `null` | Custom Jinja template |
| `default_system_message` | `"You are a helpful assistant."` | Default system message |
## Dataset Processing
| Option | Default | Description |
| ----------------------------- | -------------------------- | --------------------------------- |
| `dataset_prepared_path` | `"data/last_run_prepared"` | Path for prepared dataset |
| `push_dataset_to_hub` | `""` | Push dataset to HF hub |
| `dataset_processes` | `4` | Number of preprocessing processes |
| `dataset_keep_in_memory` | `false` | Keep dataset in memory |
| `shuffle_merged_datasets` | `true` | Shuffle merged datasets |
| `dataset_exact_deduplication` | `true` | Deduplicate datasets |
## LoRA Configuration
| Option | Default | Description |
| -------------------------- | ---------------------- | ------------------------------ |
| `adapter` | `"lora"` | Adapter type (lora/qlora) |
| `lora_model_dir` | `""` | Directory with pretrained LoRA |
| `lora_r` | `8` | LoRA attention dimension |
| `lora_alpha` | `16` | LoRA alpha parameter |
| `lora_dropout` | `0.05` | LoRA dropout |
| `lora_target_modules` | `["q_proj", "v_proj"]` | Modules to apply LoRA |
| `lora_target_linear` | `false` | Target all linear modules |
| `peft_layers_to_transform` | `[]` | Layers to transform |
| `lora_modules_to_save` | `[]` | Modules to save |
| `lora_fan_in_fan_out` | `false` | Fan in/out structure |
## Optimization Settings
| Option | Default | Description |
| ------------------------- | ------- | -------------------------- |
| `train_on_inputs` | `false` | Train on input prompts |
| `group_by_length` | `false` | Group by sequence length |
| `gradient_checkpointing` | `false` | Use gradient checkpointing |
| `early_stopping_patience` | `3` | Early stopping patience |
## Learning Rate Scheduling
| Option | Default | Description |
| -------------------------- | ---------- | -------------------- |
| `lr_scheduler` | `"cosine"` | Scheduler type |
| `lr_scheduler_kwargs` | `{}` | Scheduler parameters |
| `cosine_min_lr_ratio` | `null` | Minimum LR ratio |
| `cosine_constant_lr_ratio` | `null` | Constant LR ratio |
| `lr_div_factor` | `null` | LR division factor |
## Optimizer Settings
| Option | Default | Description |
| ---------------------- | ------------ | ------------------- |
| `optimizer` | `"adamw_hf"` | Optimizer choice |
| `optim_args` | `{}` | Optimizer arguments |
| `optim_target_modules` | `[]` | Target modules |
| `weight_decay` | `null` | Weight decay |
| `adam_beta1` | `null` | Adam beta1 |
| `adam_beta2` | `null` | Adam beta2 |
| `adam_epsilon` | `null` | Adam epsilon |
| `max_grad_norm` | `null` | Gradient clipping |
## Attention Implementations
| Option | Default | Description |
| -------------------------- | ------- | ----------------------------- |
| `flash_optimum` | `false` | Use better transformers |
| `xformers_attention` | `false` | Use xformers |
| `flash_attention` | `false` | Use flash attention |
| `flash_attn_cross_entropy` | `false` | Flash attention cross entropy |
| `flash_attn_rms_norm` | `false` | Flash attention RMS norm |
| `flash_attn_fuse_qkv` | `false` | Fuse QKV operations |
| `flash_attn_fuse_mlp` | `false` | Fuse MLP operations |
| `sdp_attention` | `false` | Use scaled dot product |
| `s2_attention` | `false` | Use shifted sparse attention |
## Tokenizer Modifications
| Option | Default | Description |
| ---------------- | ------- | ---------------------------- |
| `special_tokens` | - | Special tokens to add/modify |
| `tokens` | `[]` | Additional tokens |
## Distributed Training
| Option | Default | Description |
| ----------------------- | ------- | --------------------- |
| `fsdp` | `null` | FSDP configuration |
| `fsdp_config` | `null` | FSDP config options |
| `deepspeed` | `null` | Deepspeed config path |
| `ddp_timeout` | `null` | DDP timeout |
| `ddp_bucket_cap_mb` | `null` | DDP bucket capacity |
| `ddp_broadcast_buffers` | `null` | DDP broadcast buffers |
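As a minimal sketch, assuming a ZeRO-3 JSON config is available at the path shown (the path is an example, not a guarantee of what ships in your image):

```yaml
# Illustrative: enable DeepSpeed by pointing at a ZeRO config file.
deepspeed: deepspeed/zero3.json
```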
<details>
<summary><h3>Example Configuration Request:</h3></summary>
Here's a complete example for fine-tuning a LLaMA model using LoRA:
```json
{
"input": {
"user_id": "user",
"model_id": "llama-test",
"run_id": "test-run",
"credentials": {
"wandb_api_key": "",
"hf_token": ""
},
"args": {
"base_model": "NousResearch/Llama-3.2-1B",
"load_in_8bit": false,
"load_in_4bit": false,
"strict": false,
"datasets": [
{
"path": "teknium/GPT4-LLM-Cleaned",
"type": "alpaca"
}
],
"dataset_prepared_path": "last_run_prepared",
"val_set_size": 0.1,
"output_dir": "./outputs/lora-out",
"adapter": "lora",
"sequence_len": 2048,
"sample_packing": true,
"eval_sample_packing": true,
"pad_to_sequence_len": true,
"lora_r": 16,
"lora_alpha": 32,
"lora_dropout": 0.05,
"lora_target_modules": [
"gate_proj",
"down_proj",
"up_proj",
"q_proj",
"v_proj",
"k_proj",
"o_proj"
],
"gradient_accumulation_steps": 2,
"micro_batch_size": 2,
"num_epochs": 1,
"optimizer": "adamw_8bit",
"lr_scheduler": "cosine",
"learning_rate": 0.0002,
"train_on_inputs": false,
"group_by_length": false,
"bf16": "auto",
"tf32": false,
"gradient_checkpointing": true,
"logging_steps": 1,
"flash_attention": true,
"loss_watchdog_threshold": 5,
"loss_watchdog_patience": 3,
"warmup_steps": 10,
"evals_per_epoch": 4,
"saves_per_epoch": 1,
"weight_decay": 0,
"hub_model_id": "runpod/llama-fr-lora",
"wandb_name": "test-run-1",
"wandb_project": "test-run-1",
"wandb_entity": "axo-test",
"special_tokens": {
"pad_token": "<|end_of_text|>"
}
}
}
}
```
</details>
### Advanced Features
#### Wandb Integration
- `wandb_project`: Project name for Weights & Biases
- `wandb_entity`: Team name in W&B
- `wandb_watch`: Monitor model with W&B
- `wandb_name`: Name of the W&B run
- `wandb_run_id`: ID for the W&B run
#### Performance Optimization
- `sample_packing`: Enable efficient sequence packing
- `eval_sample_packing`: Use sequence packing during evaluation
- `torch_compile`: Enable PyTorch 2.0 compilation
- `flash_attention`: Use Flash Attention implementation
- `xformers_attention`: Use xFormers attention implementation
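A sketch of how these flags could be combined in the `args` payload (illustrative only; whether each one helps depends on the model and GPU):

```yaml
# Hypothetical performance-oriented settings; not a tested recipe.
sample_packing: true        # pack several short samples into one sequence
eval_sample_packing: false  # turn off if evaluation errors appear with packing
flash_attention: true       # needs a compatible GPU and flash-attn installed
torch_compile: false        # set true to try PyTorch 2.x compilation
```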
### Available Optimizers
The following optimizers are supported:
- `adamw_hf`: HuggingFace's AdamW implementation
- `adamw_torch`: PyTorch's AdamW
- `adamw_torch_fused`: Fused AdamW implementation
- `adamw_torch_xla`: XLA-optimized AdamW
- `adamw_apex_fused`: NVIDIA Apex fused AdamW
- `adafactor`: Adafactor optimizer
- `adamw_anyprecision`: Anyprecision AdamW
- `adamw_bnb_8bit`: 8-bit AdamW from bitsandbytes
- `lion_8bit`: 8-bit Lion optimizer
- `lion_32bit`: 32-bit Lion optimizer
- `sgd`: Stochastic Gradient Descent
- `adagrad`: Adagrad optimizer
## Notes
- Set `load_in_8bit: true` or `load_in_4bit: true` for memory-efficient training
- Enable `flash_attention: true` for faster training on modern GPUs
- Use `gradient_checkpointing: true` to reduce memory usage
- Adjust `micro_batch_size` and `gradient_accumulation_steps` based on your GPU memory
For more detailed information, please refer to the [documentation](https://axolotl-ai-cloud.github.io/axolotl/docs/config.html).
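Putting the notes together, a memory-constrained run might start from a sketch like this (assumed values; adjust to your GPU):

```yaml
# Hypothetical memory-saving combination; not a tuned configuration.
load_in_4bit: true
adapter: qlora
gradient_checkpointing: true
micro_batch_size: 1
gradient_accumulation_steps: 8   # preserves effective batch size at lower peak memory
flash_attention: true
```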
### Errors
- If you run into issues with Flash Attention 2, delete your worker and restart it.


@@ -1,93 +0,0 @@
{
"title": "Axolotl Fine-Tuning",
"description": "Serverless fine-tuning of open-source LLMs with Axolotl. Supports LoRA, QLoRA, DPO, and more using Hugging Face models and datasets.",
"type": "serverless",
"category": "language",
"iconUrl": "https://avatars.githubusercontent.com/u/167502477",
"config": {
"runsOn": "GPU",
"containerDiskInGb": 200,
"gpuCount": 1,
"allowedCudaVersions": [
"12.8",
"12.7",
"12.6",
"12.5",
"12.4"
],
"presets": [],
"env": [
{
"key": "TOKENIZER",
"input": {
"name": "Tokenizer",
"type": "string",
"description": "Name or path of the Hugging Face tokenizer to use.",
"default": "",
"advanced": true
}
},
{
"key": "MAX_NUM_SEQS",
"input": {
"name": "Max Num Seqs",
"type": "number",
"description": "Maximum number of sequences per iteration.",
"default": 256,
"advanced": true
}
},
{
"key": "DISABLE_LOG_STATS",
"input": {
"name": "Disable Log Stats",
"type": "boolean",
"description": "Disable logging statistics.",
"default": false,
"trueValue": "true",
"falseValue": "false"
}
},
{
"key": "LOAD_FORMAT",
"input": {
"name": "Load Format",
"type": "string",
"description": "The format of the model weights to load.",
"default": "auto",
"options": [
{
"label": "auto",
"value": "auto"
},
{
"label": "pt",
"value": "pt"
},
{
"label": "safetensors",
"value": "safetensors"
},
{
"label": "npcache",
"value": "npcache"
},
{
"label": "dummy",
"value": "dummy"
},
{
"label": "tensorizer",
"value": "tensorizer"
},
{
"label": "bitsandbytes",
"value": "bitsandbytes"
}
],
"advanced": true
}
}
]
}
}


@@ -1,7 +0,0 @@
# Required Python packages get listed here, one per line.
# Recommended to lock the version number to avoid unexpected changes.
# You can also install packages from a git repository, e.g.:
# git+https://github.com/runpod/runpod-python.git
# To learn more, see https://pip.pypa.io/en/stable/reference/requirements-file-format/
runpod~=1.7.0


@@ -1,577 +0,0 @@
# # This is the huggingface model that contains *.pt, *.safetensors, or *.bin files
# # This can also be a relative path to a model on disk
# base_model: ./llama-7b-hf
# # You can specify an ignore pattern if the model repo contains more than 1 model type (*.pt, etc)
# base_model_ignore_patterns:
# # If the base_model repo on hf hub doesn't include configuration .json files,
# # You can set that here, or leave this empty to default to base_model
# base_model_config: ./llama-7b-hf
# # You can specify to choose a specific model revision from huggingface hub
# model_revision:
# # Optional tokenizer configuration override in case you want to use a different tokenizer
# # than the one defined in the base model
# tokenizer_config:
# # If you want to specify the type of model to load, AutoModelForCausalLM is a good choice too
# model_type: AutoModelForCausalLM
# # Corresponding tokenizer for the model AutoTokenizer is a good choice
# tokenizer_type: AutoTokenizer
# # Trust remote code for untrusted source
# trust_remote_code:
# # use_fast option for tokenizer loading from_pretrained, default to True
# tokenizer_use_fast:
# # Whether to use the legacy tokenizer setting, defaults to True
# tokenizer_legacy:
# # Resize the model embeddings when new tokens are added to multiples of 32
# # This is reported to improve training speed on some models
# resize_token_embeddings_to_32x:
# # Used to identify what the model is based on
# is_falcon_derived_model:
# is_llama_derived_model:
# # Please note that if you set this to true, `padding_side` will be set to "left" by default
# is_mistral_derived_model:
# is_qwen_derived_model:
# # optional overrides to the base model configuration
# model_config:
# # RoPE Scaling https://github.com/huggingface/transformers/pull/24653
# rope_scaling:
# type: # linear | dynamic
# factor: # float
# # Whether you are training a 4-bit GPTQ quantized model
# gptq: true
# gptq_groupsize: 128 # group size
# gptq_model_v1: false # v1 or v2
# # This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer
# load_in_8bit: true
# # Use bitsandbytes 4 bit
# load_in_4bit:
# # Use CUDA bf16
# bf16: true # bool or 'full' for `bf16_full_eval`. require >=ampere
# # Use CUDA fp16
# fp16: true
# # Use CUDA tf32
# tf32: true # require >=ampere
# # No AMP (automatic mixed precision)
# bfloat16: true # require >=ampere
# float16: true
# # A list of one or more datasets to finetune the model with
# datasets:
# # HuggingFace dataset repo | s3://,gs:// path | "json" for local dataset, make sure to fill data_files
# - path: vicgalle/alpaca-gpt4
# # The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
# type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
# ds_type: # Optional[str] (json|arrow|parquet|text|csv) defines the datatype when path is a file
# data_files: # Optional[str] path to source data files
# shards: # Optional[int] number of shards to split data into
# name: # Optional[str] name of dataset configuration to load
# train_on_split: train # Optional[str] name of dataset split to load from
# # Optional[str] fastchat conversation type, only used with type: sharegpt
# conversation: # Options (see Conversation 'name'): https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
# field_human: # Optional[str]. Human key to use for conversation.
# field_model: # Optional[str]. Assistant key to use for conversation.
# # Custom user prompt
# - path: repo
# type:
# # The below are defaults. only set what's needed.
# system_prompt: ""
# system_format: "{system}"
# field_system: system
# field_instruction: instruction
# field_input: input
# field_output: output
# # Customizable to be single line or multi-line
# # 'format' can include {input}
# format: |-
# User: {instruction} {input}
# Assistant:
# # 'no_input_format' cannot include {input}
# no_input_format: "{instruction} "
# # For `completion` datasets only, uses the provided field instead of `text` column
# field:
# # Axolotl attempts to save the dataset as an arrow after packing the data together so
# # subsequent training attempts load faster, relative path
# dataset_prepared_path: data/last_run_prepared
# # Push prepared dataset to hub
# push_dataset_to_hub: # repo path
# # The maximum number of processes to use while preprocessing your input dataset. This defaults to `os.cpu_count()`
# # if not set.
# dataset_processes: # defaults to os.cpu_count() if not set
# # push checkpoints to hub
# hub_model_id: # repo path to push finetuned model
# # how to push checkpoints to hub
# # https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/trainer#transformers.TrainingArguments.hub_strategy
# hub_strategy:
# # Whether to use hf `use_auth_token` for loading datasets. Useful for fetching private datasets
# # Required to be true when used in combination with `push_dataset_to_hub`
# hf_use_auth_token: # boolean
# # How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 0 for no eval.
# val_set_size: 0.04
# # Num shards for whole dataset
# dataset_shard_num:
# # Index of shard to use for whole dataset
# dataset_shard_idx:
# # The maximum length of an input to train with, this should typically be less than 2048
# # as most models have a token/context limit of 2048
# sequence_len: 2048
# # Pad inputs so each step uses constant sized buffers
# # This will reduce memory fragmentation and may prevent OOMs, by re-using memory more efficiently
# pad_to_sequence_len:
# # Max sequence length to concatenate training samples together up to
# # Inspired by StackLLaMA. see https://huggingface.co/blog/stackllama#supervised-fine-tuning
# # FutureWarning: This will soon be DEPRECATED
# max_packed_sequence_len: 1024
# # Use efficient multi-packing with block diagonal attention and per sequence position_ids. Recommend set to 'true'
# sample_packing:
# # Set to 'false' if getting errors during eval with sample_packing on.
# eval_sample_packing:
# # You can set these packing optimizations AFTER starting a training at least once.
# # The trainer will provide recommended values for these values.
# sample_packing_eff_est:
# total_num_tokens:
# # If you want to use 'lora' or 'qlora' or leave blank to train all parameters in original model
# adapter: lora
# # If you already have a lora model trained that you want to load, put that here.
# # This means after training, if you want to test the model, you should set this to the value of `lora_out_dir`.
# lora_model_dir:
# # LoRA hyperparameters
# # For more details about the following options, see:
# # https://www.anyscale.com/blog/fine-tuning-llms-lora-or-full-parameter-an-in-depth-analysis-with-llama-2
# lora_r: 8
# lora_alpha: 16
# lora_dropout: 0.05
# lora_target_modules:
# - q_proj
# - v_proj
# # - k_proj
# # - o_proj
# # - gate_proj
# # - down_proj
# # - up_proj
# lora_target_linear: # If true, will target all linear layers
# # If you added new tokens to the tokenizer, you may need to save some LoRA modules because they need to know the new tokens.
# # For LLaMA and Mistral, you need to save `embed_tokens` and `lm_head`. It may vary for other models.
# # `embed_tokens` converts tokens to embeddings, and `lm_head` converts embeddings to token probabilities.
# # https://github.com/huggingface/peft/issues/334#issuecomment-1561727994
# lora_modules_to_save:
# # - embed_tokens
# # - lm_head
# # Once you complete training, the model will be saved to the following directory.
# # If you merge the adapter to the base model, a subdirectory `merged` will be created under this directory.
# # Make sure `lora_model_dir` points to this directory if you want to use the trained model.
# lora_out_dir:
# lora_fan_in_fan_out: false
# # ReLoRA configuration
# # Must use either 'lora' or 'qlora' adapter, and does not support fsdp or deepspeed
# relora_steps: # Number of steps per ReLoRA restart
# relora_warmup_steps: # Number of per-restart warmup steps
# relora_cpu_offload: # True to perform lora weight merges on cpu during restarts, for modest gpu memory savings
# # wandb configuration if you're using it
# wandb_mode: # "offline" to save run metadata locally and not sync to the server, "disabled" to turn off wandb
# wandb_project: # Your wandb project name
# wandb_entity: # A wandb Team name if using a Team
# wandb_watch:
# wandb_run_id: # Set the name of your wandb run
# wandb_log_model: # "checkpoint" to log model to wandb Artifacts every `save_steps` or "end" to log only at the end of training
# # Where to save the full-finetuned model to
# output_dir: ./completed-model
# # Whether to use torch.compile and which backend to use
# torch_compile: # bool
# torch_compile_backend: # Optional[str]
# # Training hyperparameters
# # If greater than 1, backpropagation will be skipped and the gradients will be accumulated for the given number of steps.
# gradient_accumulation_steps: 1
# # The number of samples to include in each batch. This is the number of samples sent to each GPU.
# micro_batch_size: 2
# eval_batch_size:
# num_epochs: 4
# warmup_steps: 100 # cannot use with warmup_ratio
# warmup_ratio: 0.05 # cannot use with warmup_steps
# learning_rate: 0.00003
# lr_quadratic_warmup:
# logging_steps:
# save_strategy: # Set to `no` to skip checkpoint saves
# save_steps: # Leave empty to save at each epoch
# eval_steps: # Leave empty to eval at each epoch, integers for every N steps. decimal for fraction of total steps
# save_total_limit: # Checkpoints saved at a time
# # Maximum number of iterations to train for. It precedes num_epochs which means that
# # if both are set, num_epochs will not be guaranteed.
# # e.g., when 1 epoch is 1000 steps => `num_epochs: 2` and `max_steps: 100` will train for 100 steps
# max_steps:
# eval_table_size: # Approximate number of predictions sent to wandb depending on batch size. Enabled above 0. Default is 0
# eval_table_max_new_tokens: # Total number of tokens generated for predictions sent to wandb. Default is 128
# # Save model as safetensors (require safetensors package)
# save_safetensors:
# # Whether to mask out or include the human's prompt from the training labels
# train_on_inputs: false
# # Group similarly sized data to minimize padding.
# # May be slower to start, as it must download and sort the entire dataset.
# # Note that training loss may have an oscillating pattern with this enabled.
# group_by_length: false
# # Whether to use gradient checkpointing https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing
# gradient_checkpointing: false
# # Stop training after this many evaluation losses have increased in a row
# # https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
# early_stopping_patience: 3
# # Specify a scheduler and kwargs to use with the optimizer
# lr_scheduler: # 'one_cycle' | 'log_sweep' | empty for cosine
# lr_scheduler_kwargs:
# # For one_cycle optim
# lr_div_factor: # Learning rate div factor
# # For log_sweep optim
# log_sweep_min_lr:
# log_sweep_max_lr:
# # Specify optimizer
# # Valid values are driven by the Transformers OptimizerNames class, see:
# # https://github.com/huggingface/transformers/blob/95b374952dc27d8511541d6f5a4e22c9ec11fb24/src/transformers/training_args.py#L134
# #
# # Note that not all optimizers may be available in your environment, ex: 'adamw_anyprecision' is part of
# # torchdistx, 'adamw_bnb_8bit' is part of bnb.optim.Adam8bit, etc. When in doubt, it is recommended to start with the optimizer used
# # in the examples/ for your model and fine-tuning use case.
# #
# # Valid values for 'optimizer' include:
# # - adamw_hf
# # - adamw_torch
# # - adamw_torch_fused
# # - adamw_torch_xla
# # - adamw_apex_fused
# # - adafactor
# # - adamw_anyprecision
# # - sgd
# # - adagrad
# # - adamw_bnb_8bit
# # - lion_8bit
# # - lion_32bit
# # - paged_adamw_32bit
# # - paged_adamw_8bit
# # - paged_lion_32bit
# # - paged_lion_8bit
# optimizer:
# # Specify weight decay
# weight_decay:
# # adamw hyperparams
# adam_beta1:
# adam_beta2:
# adam_epsilon:
# # Gradient clipping max norm
# max_grad_norm:
# # Augmentation techniques
# # NEFT https://arxiv.org/abs/2310.05914, set this to a number (paper default is 5) to add noise to embeddings
# # currently only supported on Llama and Mistral
# noisy_embedding_alpha:
# # Whether to use bettertransformers
# flash_optimum:
# # Whether to use xformers attention patch https://github.com/facebookresearch/xformers:
# xformers_attention:
# # Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention:
# flash_attention:
# flash_attn_cross_entropy: # Whether to use flash-attention cross entropy implementation - advanced use only
# flash_attn_rms_norm: # Whether to use flash-attention rms norm implementation - advanced use only
# flash_attn_fuse_qkv: # Whether to fuse QKV into a single operation
# flash_attn_fuse_mlp: # Whether to fuse part of the MLP into a single operation
# # Whether to use scaled-dot-product attention
# # https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
# sdp_attention:
# # Landmark attention (only llama)
# landmark_attention:
# # xpos RoPE see https://github.com/kaiokendev/cutoff-len-is-context-len/blob/main/util/xpos_rope_llama_monkey_patch.py
# # LLaMA only
# xpos_rope:
# # Resume from a specific checkpoint dir
# resume_from_checkpoint:
# # If resume_from_checkpoint isn't set and you simply want it to start where it left off.
# # Be careful with this being turned on between different models.
# auto_resume_from_checkpoints: false
# # Don't mess with this, it's here for accelerate and torchrun
# local_rank:
# # Add or change special tokens.
# # If you add tokens here, you don't need to add them to the `tokens` list.
# special_tokens:
# # bos_token: "<s>"
# # eos_token: "</s>"
# # unk_token: "<unk>"
# # Add extra tokens.
# tokens:
# # FSDP
# fsdp:
# fsdp_config:
# # Deepspeed config path. e.g., deepspeed/zero3.json
# deepspeed:
# # Advanced DDP Arguments
# ddp_timeout:
# ddp_bucket_cap_mb:
# ddp_broadcast_buffers:
# # Path to torch distx for optim 'adamw_anyprecision'
# torchdistx_path:
# # Set to HF dataset for type: 'completion' for streaming instead of pre-tokenize
# pretraining_dataset:
# # Debug mode
# debug:
# # Seed
# seed:
# # Allow overwrite yml config using from cli
# strict:
base_model: ${BASE_MODEL}
base_model_ignore_patterns: ${BASE_MODEL_IGNORE_PATTERNS}
base_model_config: ${BASE_MODEL_CONFIG}
revision_of_model: ${REVISION_OF_MODEL}
tokenizer_config: ${TOKENIZER_CONFIG}
model_type: ${MODEL_TYPE}
tokenizer_type: ${TOKENIZER_TYPE}
trust_remote_code: ${TRUST_REMOTE_CODE}
tokenizer_use_fast: ${TOKENIZER_USE_FAST}
tokenizer_legacy: ${TOKENIZER_LEGACY}
resize_token_embeddings_to_32x: ${RESIZE_TOKEN_EMBEDDINGS_TO_32X}
is_falcon_derived_model: ${IS_FALCON_DERIVED_MODEL}
is_llama_derived_model: ${IS_LLAMA_DERIVED_MODEL}
is_qwen_derived_model: ${IS_QWEN_DERIVED_MODEL}
is_mistral_derived_model: ${IS_MISTRAL_DERIVED_MODEL}
overrides_of_model_config:
rope_scaling:
type: ${ROPE_SCALING_TYPE}
factor: ${ROPE_SCALING_FACTOR}
bnb_config_kwargs:
llm_int8_has_fp16_weight: ${BNB_LLM_INT8_HAS_FP16_WEIGHT}
bnb_4bit_quant_type: ${BNB_4BIT_QUANT_TYPE}
bnb_4bit_use_double_quant: ${BNB_4BIT_USE_DOUBLE_QUANT}
gptq: ${GPTQ}
load_in_8bit: ${LOAD_IN_8BIT}
load_in_4bit: ${LOAD_IN_4BIT}
bf16: ${BF16}
fp16: ${FP16}
tf32: ${TF32}
bfloat16: ${BFLOAT16}
float16: ${FLOAT16}
gpu_memory_limit: ${GPU_MEMORY_LIMIT}
lora_on_cpu: ${LORA_ON_CPU}
datasets:
- path: ${DATASET_PATH}
type: ${DATASET_TYPE}
ds_type: ${DATASET_DS_TYPE}
data_files: ${DATASET_DATA_FILES}
shards: ${DATASET_SHARDS}
name: ${DATASET_NAME}
train_on_split: ${DATASET_TRAIN_ON_SPLIT}
revision: ${DATASET_REVISION}
trust_remote_code: ${DATASET_TRUST_REMOTE_CODE}
rl: ${RL}
dpo_use_weighting: ${DPO_USE_WEIGHTING}
chat_template: ${CHAT_TEMPLATE}
chat_template_jinja: ${CHAT_TEMPLATE_JINJA}
default_system_message: ${DEFAULT_SYSTEM_MESSAGE}
dataset_prepared_path: ${DATASET_PREPARED_PATH}
push_dataset_to_hub: ${PUSH_DATASET_TO_HUB}
dataset_processes: ${DATASET_PROCESSES}
dataset_keep_in_memory: ${DATASET_KEEP_IN_MEMORY}
hub_model_id: ${HUB_MODEL_ID}
hub_strategy: ${HUB_STRATEGY}
hf_use_auth_token: ${HF_USE_AUTH_TOKEN}
val_set_size: ${VAL_SET_SIZE}
dataset_shard_num: ${DATASET_SHARD_NUM}
dataset_shard_idx: ${DATASET_SHARD_IDX}
sequence_len: ${SEQUENCE_LEN}
pad_to_sequence_len: ${PAD_TO_SEQUENCE_LEN}
sample_packing: ${SAMPLE_PACKING}
eval_sample_packing: ${EVAL_SAMPLE_PACKING}
sample_packing_eff_est: ${SAMPLE_PACKING_EFF_EST}
total_num_tokens: ${TOTAL_NUM_TOKENS}
sample_packing_group_size: ${SAMPLE_PACKING_GROUP_SIZE}
sample_packing_bin_size: ${SAMPLE_PACKING_BIN_SIZE}
batch_flattening: ${BATCH_FLATTENING}
device_map: ${DEVICE_MAP}
max_memory: ${MAX_MEMORY}
adapter: ${ADAPTER}
lora_model_dir: ${LORA_MODEL_DIR}
lora_r: ${LORA_R}
lora_alpha: ${LORA_ALPHA}
lora_dropout: ${LORA_DROPOUT}
lora_target_modules:
- ${LORA_TARGET_MODULES}
lora_target_linear: ${LORA_TARGET_LINEAR}
peft_layers_to_transform: ${PEFT_LAYERS_TO_TRANSFORM}
lora_modules_to_save: ${LORA_MODULES_TO_SAVE}
lora_fan_in_fan_out: ${LORA_FAN_IN_FAN_OUT}
loraplus_lr_ratio: ${LORAPLUS_LR_RATIO}
loraplus_lr_embedding: ${LORAPLUS_LR_EMBEDDING}
peft:
loftq_config:
loftq_bits: ${LOFTQ_BITS}
relora_steps: ${RELORA_STEPS}
relora_warmup_steps: ${RELORA_WARMUP_STEPS}
relora_anneal_steps: ${RELORA_ANNEAL_STEPS}
relora_prune_ratio: ${RELORA_PRUNE_RATIO}
relora_cpu_offload: ${RELORA_CPU_OFFLOAD}
wandb_mode: ${WANDB_MODE}
wandb_project: ${WANDB_PROJECT}
wandb_entity: ${WANDB_ENTITY}
wandb_watch: ${WANDB_WATCH}
wandb_name: ${WANDB_NAME}
wandb_run_id: ${WANDB_RUN_ID}
wandb_log_model: ${WANDB_LOG_MODEL}
mlflow_tracking_uri: ${MLFLOW_TRACKING_URI}
mlflow_experiment_name: ${MLFLOW_EXPERIMENT_NAME}
mlflow_run_name: ${MLFLOW_RUN_NAME}
hf_mlflow_log_artifacts: ${HF_MLFLOW_LOG_ARTIFACTS}
use_comet: ${USE_COMET}
comet_api_key: ${COMET_API_KEY}
comet_workspace: ${COMET_WORKSPACE}
comet_project_name: ${COMET_PROJECT_NAME}
comet_experiment_key: ${COMET_EXPERIMENT_KEY}
comet_mode: ${COMET_MODE}
comet_online: ${COMET_ONLINE}
comet_experiment_config: ${COMET_EXPERIMENT_CONFIG}
output_dir: ${OUTPUT_DIR}
torch_compile: ${TORCH_COMPILE}
torch_compile_backend: ${TORCH_COMPILE_BACKEND}
gradient_accumulation_steps: ${GRADIENT_ACCUMULATION_STEPS}
micro_batch_size: ${MICRO_BATCH_SIZE}
eval_batch_size: ${EVAL_BATCH_SIZE}
num_epochs: ${NUM_EPOCHS}
warmup_steps: ${WARMUP_STEPS}
warmup_ratio: ${WARMUP_RATIO}
learning_rate: ${LEARNING_RATE}
lr_quadratic_warmup: ${LR_QUADRATIC_WARMUP}
logging_steps: ${LOGGING_STEPS}
eval_steps: ${EVAL_STEPS}
evals_per_epoch: ${EVALS_PER_EPOCH}
save_strategy: ${SAVE_STRATEGY}
save_steps: ${SAVE_STEPS}
saves_per_epoch: ${SAVES_PER_EPOCH}
save_total_limit: ${SAVE_TOTAL_LIMIT}
max_steps: ${MAX_STEPS}
eval_table_size: ${EVAL_TABLE_SIZE}
eval_max_new_tokens: ${EVAL_MAX_NEW_TOKENS}
eval_causal_lm_metrics: ${EVAL_CAUSAL_LM_METRICS}
profiler_steps: ${PROFILER_STEPS}
loss_watchdog_threshold: ${LOSS_WATCHDOG_THRESHOLD}
loss_watchdog_patience: ${LOSS_WATCHDOG_PATIENCE}
save_safetensors: ${SAVE_SAFETENSORS}
train_on_inputs: ${TRAIN_ON_INPUTS}
group_by_length: ${GROUP_BY_LENGTH}
gradient_checkpointing: ${GRADIENT_CHECKPOINTING}
early_stopping_patience: ${EARLY_STOPPING_PATIENCE}
lr_scheduler: ${LR_SCHEDULER}
lr_scheduler_kwargs: ${LR_SCHEDULER_KWARGS}
cosine_min_lr_ratio: ${COSINE_MIN_LR_RATIO}
cosine_constant_lr_ratio: ${COSINE_CONSTANT_LR_RATIO}
lr_div_factor: ${LR_DIV_FACTOR}
optimizer: ${OPTIMIZER}
optim_args: ${OPTIM_ARGS}
optim_target_modules: ${OPTIM_TARGET_MODULES}
weight_decay: ${WEIGHT_DECAY}
adam_beta1: ${ADAM_BETA1}
adam_beta2: ${ADAM_BETA2}
adam_epsilon: ${ADAM_EPSILON}
max_grad_norm: ${MAX_GRAD_NORM}
neftune_noise_alpha: ${NEFTUNE_NOISE_ALPHA}
flash_optimum: ${FLASH_OPTIMUM}
xformers_attention: ${XFORMERS_ATTENTION}
flash_attention: ${FLASH_ATTENTION}
flash_attn_cross_entropy: ${FLASH_ATTN_CROSS_ENTROPY}
flash_attn_rms_norm: ${FLASH_ATTN_RMS_NORM}
flash_attn_fuse_qkv: ${FLASH_ATTN_FUSE_QKV}
flash_attn_fuse_mlp: ${FLASH_ATTN_FUSE_MLP}
sdp_attention: ${SDP_ATTENTION}
s2_attention: ${S2_ATTENTION}
resume_from_checkpoint: ${RESUME_FROM_CHECKPOINT}
auto_resume_from_checkpoints: ${AUTO_RESUME_FROM_CHECKPOINTS}
local_rank: ${LOCAL_RANK}
special_tokens:
bos_token: ${SPECIAL_TOKEN_BOS}
eos_token: ${SPECIAL_TOKEN_EOS}
unk_token: ${SPECIAL_TOKEN_UNK}
pad_token: ${SPECIAL_TOKEN_PAD}
tokens: ${TOKENS}
fsdp: ${FSDP}
fsdp_config: ${FSDP_CONFIG}
deepspeed: ${DEEPSPEED}
ddp_timeout: ${DDP_TIMEOUT}
ddp_bucket_cap_mb: ${DDP_BUCKET_CAP_MB}
ddp_broadcast_buffers: ${DDP_BROADCAST_BUFFERS}
torchdistx_path: ${TORCHDISTX_PATH}
pretraining_dataset: ${PRETRAINING_DATASET}
debug: ${DEBUG}
seed: ${SEED}
strict: ${STRICT}


@@ -1,64 +0,0 @@
"""
Runpod serverless entrypoint handler
"""
import os
import runpod
import yaml
from huggingface_hub._login import login
from train import train
from utils import get_output_dir
BASE_VOLUME = os.environ.get("BASE_VOLUME", "/runpod-volume")
if not os.path.exists(BASE_VOLUME):
os.makedirs(BASE_VOLUME)
logger = runpod.RunPodLogger()
async def handler(job):
runpod_job_id = job["id"]
inputs = job["input"]
run_id = inputs.get("run_id", "default_run_id")
args = inputs.get("args", {})
# Set output directory
output_dir = os.path.join(BASE_VOLUME, get_output_dir(run_id))
args["output_dir"] = output_dir
# First save args to a temporary config file
config_path = "/workspace/test_config.yaml"
# Add run_name and job_id to args before saving
args["run_name"] = run_id
args["runpod_job_id"] = runpod_job_id
yaml_data = yaml.dump(args, default_flow_style=False)
with open(config_path, "w", encoding="utf-8") as file:
file.write(yaml_data)
# Handle credentials
credentials = inputs.get("credentials", {})
if "wandb_api_key" in credentials:
os.environ["WANDB_API_KEY"] = credentials["wandb_api_key"]
if "hf_token" in credentials:
os.environ["HF_TOKEN"] = credentials["hf_token"]
if os.environ.get("HF_TOKEN"):
login(token=os.environ["HF_TOKEN"])
else:
logger.info("No HF_TOKEN provided. Skipping login.")
logger.info("Starting Training.")
async for result in train(config_path): # Pass the config path instead of args
logger.info(result)
logger.info("Training Complete.")
# Cleanup
del os.environ["WANDB_API_KEY"]
del os.environ["HF_TOKEN"]
runpod.serverless.start({"handler": handler, "return_aggregate_stream": True})


@@ -1,61 +0,0 @@
{
"input": {
"user_id": "user",
"model_id": "llama-test",
"run_id": "llama-test",
"credentials": {
"wandb_api_key": "",
"hf_token": ""
},
"args": {
"base_model": "NousResearch/Meta-Llama-3-8B",
"model_type": "LlamaForCausalLM",
"tokenizer_type": "AutoTokenizer",
"load_in_8bit": true,
"load_in_4bit": false,
"strict": false,
"datasets": [
{
"path": "mhenrichsen/alpaca_2k_test",
"type": "alpaca"
}
],
"val_set_size": 0.05,
"output_dir": "./outputs/lora-out",
"sequence_len": 4096,
"sample_packing": true,
"eval_sample_packing": false,
"pad_to_sequence_len": true,
"adapter": "lora",
"lora_r": 32,
"lora_alpha": 16,
"lora_dropout": 0.05,
"lora_target_linear": true,
"lora_modules_to_save": [
"embed_tokens",
"lm_head"
],
"gradient_accumulation_steps": 4,
"micro_batch_size": 2,
"num_epochs": 1,
"optimizer": "adamw_bnb_8bit",
"lr_scheduler": "cosine",
"learning_rate": 0.0002,
"train_on_inputs": false,
"group_by_length": false,
"bf16": "auto",
"tf32": false,
"gradient_checkpointing": true,
"logging_steps": 1,
"flash_attention": true,
"warmup_steps": 1,
"evals_per_epoch": 1,
"eval_max_new_tokens": 128,
"saves_per_epoch": 1,
"weight_decay": 0.0,
"special_tokens": {
"pad_token": "<|end_of_text|>"
}
}
}
}


@@ -1,45 +0,0 @@
"""
Runpod train entrypoint
"""
import asyncio
async def train(config_path: str, gpu_id: str = "0", preprocess: bool = True):
"""
Run preprocessing (if enabled) and training with the given config file
:param config_path: Path to the YAML config file
:param gpu_id: GPU ID to use (default: "0")
:param preprocess: Whether to run preprocessing (default: True)
"""
# First check if preprocessing is needed
if preprocess:
# Preprocess command
preprocess_cmd = (
f"CUDA_VISIBLE_DEVICES={gpu_id} axolotl preprocess {config_path}"
)
process = await asyncio.create_subprocess_shell(
preprocess_cmd,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.STDOUT,
)
if process.stdout is not None:
async for line in process.stdout:
yield f"Preprocessing: {line.decode().strip()}"
await process.wait()
yield "Preprocessing completed."
else:
yield "Skipping preprocessing step."
# Training command
train_cmd = f"axolotl train {config_path}"
process = await asyncio.create_subprocess_shell(
train_cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.STDOUT
)
if process.stdout is not None:
async for line in process.stdout:
yield f"Training: {line.decode().strip()}"
await process.wait()


@@ -1,89 +0,0 @@
"""
Runpod launcher utils
"""
import os
import yaml
def get_output_dir(run_id):
path = f"fine-tuning/{run_id}"
return path
def make_valid_config(input_args):
"""
Creates and saves updated config file, returns the path to the new config
:param input_args: dict of input args
:return: str, path to the updated config file
"""
# Load default config
with open("config/config.yaml", "r", encoding="utf-8") as fin:
all_args = yaml.safe_load(fin)
if not input_args:
print("No args provided, using defaults")
else:
all_args.update(input_args)
# Create updated config path
updated_config_path = "config/updated_config.yaml"
# Save updated config to new file
with open(updated_config_path, "w", encoding="utf-8") as f:
yaml.dump(all_args, f)
return updated_config_path
def set_config_env_vars(args: dict):
"""
Convert API arguments into environment variables.
Handles nested dictionaries, lists, and special values.
Args:
args (dict): The arguments dictionary from the API request
"""
def process_value(value):
"""Convert Python values to string format for environment variables"""
if value is None:
return ""
if isinstance(value, bool):
return str(value).lower()
if isinstance(value, (list, dict)):
return str(value)
return str(value)
def set_env_vars(data, prefix=""):
"""Recursively set environment variables from nested dictionary"""
for key, value in data.items():
env_key = prefix + key.upper()
# Handle special cases
if isinstance(value, dict):
# For nested dictionaries (like special_tokens)
set_env_vars(value, f"{env_key}_")
elif isinstance(value, list):
# Handle list of dictionaries (like datasets)
if value and isinstance(value[0], dict):
for i, item in enumerate(value):
set_env_vars(item, f"{env_key}_{i}_")
else:
# For simple lists (like lora_target_modules)
os.environ[env_key] = process_value(value)
else:
# Handle all other cases
os.environ[env_key] = process_value(value)
# Clear any existing related environment variables
# This prevents old values from persisting
for key in list(os.environ.keys()):
if key.startswith(
("BASE_MODEL", "MODEL_TYPE", "TOKENIZER_TYPE", "DATASET", "LORA_", "WANDB_")
):
del os.environ[key]
# Set new environment variables
set_env_vars(args)


@@ -1,86 +0,0 @@
{
"input": {
"name": "quick_smoke_test_sft",
"user_id": "user",
"model_id": "llama-test",
"run_id": "llama-test",
"credentials": {
"wandb_api_key": "",
"hf_token": ""
},
"args": {
"base_model": "HuggingFaceTB/SmolLM2-135M",
"model_type": "AutoModelForCausalLM",
"tokenizer_type": "AutoTokenizer",
"load_in_4bit": true,
"strict": false,
"datasets": [
{
"path": "mhenrichsen/alpaca_2k_test",
"type": "alpaca",
"split": "train[:10%]"
}
],
"val_set_size": 0.02,
"output_dir": "./outputs/lora-out",
"sequence_len": 4096,
"sample_packing": true,
"eval_sample_packing": false,
"pad_to_sequence_len": true,
"adapter": "qlora",
"lora_r": 32,
"lora_alpha": 64,
"lora_dropout": 0.05,
"lora_target_linear": true,
"lora_modules_to_save": [
"embed_tokens",
"lm_head"
],
"gradient_accumulation_steps": 2,
"micro_batch_size": 1,
"num_epochs": 1,
"optimizer": "adamw_torch_fused",
"lr_scheduler": "cosine",
"learning_rate": 0.0002,
"train_on_inputs": false,
"group_by_length": false,
"bf16": "auto",
"tf32": true,
"gradient_checkpointing": true,
"logging_steps": 1,
"flash_attention": true,
"warmup_steps": 1,
"evals_per_epoch": 1,
"eval_max_new_tokens": 128,
"saves_per_epoch": 1,
"weight_decay": 0.0,
"special_tokens": {
"pad_token": "<|endoftext|>"
},
"max_steps": 20
},
"timeout": 100000
},
"config": {
"gpuTypeId": "NVIDIA GeForce RTX 4090",
"gpuCount": 1,
"containerDiskInGb": 200,
"env": [
{
"key": "TOKENIZER",
"value": ""
},
{
"key": "DISABLE_LOG_STATS",
"value": "true"
}
],
"allowedCudaVersions": [
"12.8",
"12.7",
"12.6",
"12.5",
"12.4"
]
}
}

View File

@@ -1,90 +0,0 @@
{
"tests": [
{
"name": "quick_smoke_test_sft",
"input": {
"user_id": "user",
"model_id": "llama-test",
"run_id": "llama-test",
"credentials": {
"wandb_api_key": "",
"hf_token": ""
},
"args": {
"base_model": "HuggingFaceTB/SmolLM2-135M",
"model_type": "AutoModelForCausalLM",
"tokenizer_type": "AutoTokenizer",
"load_in_4bit": true,
"strict": false,
"datasets": [
{
"path": "mhenrichsen/alpaca_2k_test",
"type": "alpaca",
"split": "train[:10%]"
}
],
"val_set_size": 0.02,
"output_dir": "./outputs/lora-out",
"sequence_len": 4096,
"sample_packing": true,
"eval_sample_packing": false,
"pad_to_sequence_len": true,
"adapter": "qlora",
"lora_r": 32,
"lora_alpha": 64,
"lora_dropout": 0.05,
"lora_target_linear": true,
"lora_modules_to_save": [
"embed_tokens",
"lm_head"
],
"gradient_accumulation_steps": 2,
"micro_batch_size": 1,
"num_epochs": 1,
"optimizer": "adamw_torch_fused",
"lr_scheduler": "cosine",
"learning_rate": 0.0002,
"train_on_inputs": false,
"group_by_length": false,
"bf16": "auto",
"tf32": true,
"gradient_checkpointing": true,
"logging_steps": 1,
"flash_attention": true,
"warmup_steps": 1,
"evals_per_epoch": 1,
"eval_max_new_tokens": 128,
"saves_per_epoch": 1,
"weight_decay": 0.0,
"special_tokens": {
"pad_token": "<|endoftext|>"
},
"max_steps": 20
}
},
"timeout": 100000
}
],
"config": {
"gpuTypeId": "NVIDIA GeForce RTX 4090",
"gpuCount": 1,
"containerDiskInGb": 200,
"env": [
{
"key": "TOKENIZER",
"value": ""
},
{
"key": "DISABLE_LOG_STATS",
"value": "true"
}
],
"allowedCudaVersions": [
"12.8",
"12.7",
"12.6",
"12.5",
"12.4"
]
}
}

1
CNAME
View File

@@ -1 +0,0 @@
docs.axolotl.ai

View File

@@ -9,7 +9,6 @@
<p align="center"> <p align="center">
<img src="https://img.shields.io/github/license/axolotl-ai-cloud/axolotl.svg?color=blue" alt="GitHub License"> <img src="https://img.shields.io/github/license/axolotl-ai-cloud/axolotl.svg?color=blue" alt="GitHub License">
<img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/tests.yml/badge.svg" alt="tests"> <img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/tests.yml/badge.svg" alt="tests">
<a href="https://codecov.io/gh/axolotl-ai-cloud/axolotl"><img src="https://codecov.io/gh/axolotl-ai-cloud/axolotl/branch/main/graph/badge.svg" alt="codecov"></a>
<a href="https://github.com/axolotl-ai-cloud/axolotl/releases"><img src="https://img.shields.io/github/release/axolotl-ai-cloud/axolotl.svg" alt="Releases"></a> <a href="https://github.com/axolotl-ai-cloud/axolotl/releases"><img src="https://img.shields.io/github/release/axolotl-ai-cloud/axolotl.svg" alt="Releases"></a>
<br/> <br/>
<a href="https://github.com/axolotl-ai-cloud/axolotl/graphs/contributors"><img src="https://img.shields.io/github/contributors-anon/axolotl-ai-cloud/axolotl?color=yellow&style=flat-square" alt="contributors" style="height: 20px;"></a> <a href="https://github.com/axolotl-ai-cloud/axolotl/graphs/contributors"><img src="https://img.shields.io/github/contributors-anon/axolotl-ai-cloud/axolotl?color=yellow&style=flat-square" alt="contributors" style="height: 20px;"></a>
@@ -64,7 +63,7 @@ axolotl fetch examples
axolotl fetch deepspeed_configs # OPTIONAL axolotl fetch deepspeed_configs # OPTIONAL
``` ```
Other installation approaches are described [here](https://docs.axolotl.ai/docs/installation.html). Other installation approaches are described [here](https://axolotl-ai-cloud.github.io/axolotl/docs/installation.html).
### Your First Fine-tune ### Your First Fine-tune
@@ -79,7 +78,7 @@ axolotl fetch examples --dest path/to/folder
axolotl train examples/llama-3/lora-1b.yml axolotl train examples/llama-3/lora-1b.yml
``` ```
That's it! Check out our [Getting Started Guide](https://docs.axolotl.ai/docs/getting-started.html) for a more detailed walkthrough. That's it! Check out our [Getting Started Guide](https://axolotl-ai-cloud.github.io/axolotl/docs/getting-started.html) for a more detailed walkthrough.
## ✨ Key Features ## ✨ Key Features
@@ -92,20 +91,20 @@ That's it! Check out our [Getting Started Guide](https://docs.axolotl.ai/docs/ge
## 📚 Documentation ## 📚 Documentation
- [Installation Options](https://docs.axolotl.ai/docs/installation.html) - Detailed setup instructions for different environments - [Installation Options](https://axolotl-ai-cloud.github.io/axolotl/docs/installation.html) - Detailed setup instructions for different environments
- [Configuration Guide](https://docs.axolotl.ai/docs/config.html) - Full configuration options and examples - [Configuration Guide](https://axolotl-ai-cloud.github.io/axolotl/docs/config.html) - Full configuration options and examples
- [Dataset Guide](https://docs.axolotl.ai/docs/dataset-formats/) - Supported formats and how to use them - [Dataset Guide](https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/) - Supported formats and how to use them
- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html) - [Multi-GPU Training](https://axolotl-ai-cloud.github.io/axolotl/docs/multi-gpu.html)
- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html) - [Multi-Node Training](https://axolotl-ai-cloud.github.io/axolotl/docs/multi-node.html)
- [Multipacking](https://docs.axolotl.ai/docs/multipack.html) - [Multipacking](https://axolotl-ai-cloud.github.io/axolotl/docs/multipack.html)
- [API Reference](https://docs.axolotl.ai/docs/api/) - Auto-generated code documentation - [API Reference](https://axolotl-ai-cloud.github.io/axolotl/docs/api/) - Auto-generated code documentation
- [FAQ](https://docs.axolotl.ai/docs/faq.html) - Frequently asked questions - [FAQ](https://axolotl-ai-cloud.github.io/axolotl/docs/faq.html) - Frequently asked questions
## 🤝 Getting Help ## 🤝 Getting Help
- Join our [Discord community](https://discord.gg/HhrNrHJPRb) for support - Join our [Discord community](https://discord.gg/HhrNrHJPRb) for support
- Check out our [Examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/) directory - Check out our [Examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/) directory
- Read our [Debugging Guide](https://docs.axolotl.ai/docs/debugging.html) - Read our [Debugging Guide](https://axolotl-ai-cloud.github.io/axolotl/docs/debugging.html)
- Need dedicated support? Please contact [wing@axolotl.ai](mailto:wing@axolotl.ai) for options - Need dedicated support? Please contact [wing@axolotl.ai](mailto:wing@axolotl.ai) for options
## 🌟 Contributing ## 🌟 Contributing

View File

@@ -3,53 +3,10 @@ set -e
python -c "import torch; assert '$PYTORCH_VERSION' in torch.__version__" python -c "import torch; assert '$PYTORCH_VERSION' in torch.__version__"
# Run unit tests with initial coverage report pytest -v --durations=10 -n8 --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli /workspace/axolotl/tests/
pytest -v --durations=10 -n8 \ pytest -v --durations=10 /workspace/axolotl/tests/e2e/patched/lora_kernels # running these with the other patches causes a failure
--ignore=tests/e2e/ \ pytest -v --durations=10 --ignore=tests/e2e/patched/lora_kernels /workspace/axolotl/tests/e2e/patched
--ignore=tests/patched/ \ pytest -v --durations=10 -n1 /workspace/axolotl/tests/e2e/solo/
--ignore=tests/cli \ pytest -v --durations=10 /workspace/axolotl/tests/e2e/integrations/
/workspace/axolotl/tests/ \ pytest -v --durations=10 /workspace/axolotl/tests/cli
--cov=axolotl pytest -v --durations=10 --ignore=tests/e2e/solo/ --ignore=tests/e2e/patched/ --ignore=tests/e2e/multigpu/ --ignore=tests/e2e/integrations/ --ignore=tests/cli /workspace/axolotl/tests/e2e/
# Run lora kernels tests with coverage append
pytest -v --durations=10 \
/workspace/axolotl/tests/e2e/patched/lora_kernels \
--cov=axolotl \
--cov-append
# Run patched tests excluding lora kernels with coverage append
pytest -v --durations=10 \
--ignore=tests/e2e/patched/lora_kernels \
/workspace/axolotl/tests/e2e/patched \
--cov=axolotl \
--cov-append
# Run solo tests with coverage append
pytest -v --durations=10 -n1 \
/workspace/axolotl/tests/e2e/solo/ \
--cov=axolotl \
--cov-append
# Run integration tests with coverage append
pytest -v --durations=10 \
/workspace/axolotl/tests/e2e/integrations/ \
--cov=axolotl \
--cov-append
pytest -v --durations=10 /workspace/axolotl/tests/cli \
--cov=axolotl \
--cov-append
# Run remaining e2e tests with coverage append and final report
pytest -v --durations=10 \
--ignore=tests/e2e/solo/ \
--ignore=tests/e2e/patched/ \
--ignore=tests/e2e/multigpu/ \
--ignore=tests/e2e/integrations/ \
--ignore=tests/cli \
/workspace/axolotl/tests/e2e/ \
--cov=axolotl \
--cov-append \
--cov-report=xml:e2e-coverage.xml
codecov upload-process -t $CODECOV_TOKEN -f e2e-coverage.xml -F e2e,pytorch-${PYTORCH_VERSION} || true

View File

@@ -28,7 +28,6 @@ df_args = {
"GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"), "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
"GITHUB_SHA": os.environ.get("GITHUB_SHA", ""), "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
"NIGHTLY_BUILD": os.environ.get("NIGHTLY_BUILD", ""), "NIGHTLY_BUILD": os.environ.get("NIGHTLY_BUILD", ""),
"CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""),
"HF_HOME": "/workspace/data/huggingface-cache/hub", "HF_HOME": "/workspace/data/huggingface-cache/hub",
} }

View File

@@ -29,7 +29,6 @@ df_args = {
"CUDA": os.environ.get("CUDA", "121"), "CUDA": os.environ.get("CUDA", "121"),
"GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"), "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
"GITHUB_SHA": os.environ.get("GITHUB_SHA", ""), "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
"CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""),
"HF_HOME": "/workspace/data/huggingface-cache/hub", "HF_HOME": "/workspace/data/huggingface-cache/hub",
} }

View File

@@ -1,23 +1,6 @@
#!/bin/bash #!/bin/bash
set -e set -e
# Only run two tests at a time to avoid OOM on GPU (with coverage collection) # only run one test at a time so as not to OOM the GPU
pytest -v -n2 \ pytest -v --durations=10 -n2 /workspace/axolotl/tests/e2e/multigpu/ --ignore=/workspace/axolotl/tests/e2e/multigpu/solo/
--ignore=/workspace/axolotl/tests/e2e/multigpu/solo/ \ pytest -v --durations=10 -n1 /workspace/axolotl/tests/e2e/multigpu/solo/
--ignore=/workspace/axolotl/tests/e2e/multigpu/patched/ \
/workspace/axolotl/tests/e2e/multigpu/ \
--cov=axolotl
# Run solo tests with coverage append
pytest -v --durations=10 -n1 \
/workspace/axolotl/tests/e2e/multigpu/solo/ \
--cov=axolotl \
--cov-append
pytest -v --durations=10 -n1 /workspace/axolotl/tests/e2e/multigpu/patched/ \
--cov=axolotl \
--cov-append \
--cov-report=xml:multigpu-coverage.xml
# Upload coverage to Codecov
codecov upload-process -t "${CODECOV_TOKEN}" -f multigpu-coverage.xml -F multigpu,docker-tests,pytorch-${PYTORCH_VERSION} || true

View File

@@ -1,56 +0,0 @@
codecov:
require_ci_to_pass: yes
notify:
wait_for_ci: true
coverage:
precision: 2
round: down
range: "70...100"
status:
project:
default:
# basic
target: auto
threshold: 0%
base: auto
# advanced
branches: null
if_no_uploads: error
if_not_found: success
if_ci_failed: error
only_pulls: false
flags: null
paths: null
patch:
default:
# basic
target: auto
threshold: 0%
base: auto
# advanced
branches: null
if_no_uploads: error
if_not_found: success
if_ci_failed: error
only_pulls: false
flags: null
paths: null
parsers:
gcov:
branch_detection:
conditional: yes
loop: yes
method: no
macro: no
comment:
layout: "reach,diff,flags,files,footer"
behavior: default
require_changes: no
require_base: no
require_head: yes
github_checks:
annotations: false

View File

@@ -37,7 +37,3 @@ RUN git lfs install --skip-repo && \
pip3 install awscli && \ pip3 install awscli && \
# The base image ships with `pydantic==1.8.2` which is not working # The base image ships with `pydantic==1.8.2` which is not working
pip3 install -U --no-cache-dir pydantic==1.10.10 pip3 install -U --no-cache-dir pydantic==1.10.10
RUN if [ "$PYTORCH_VERSION" = "2.7.0" ] ; then \
pip3 install flash-attn==2.7.4.post1; \
fi

View File

@@ -199,17 +199,6 @@ output_dir: # Directory to save evaluation results
See [LM Eval Harness](https://github.com/EleutherAI/lm-evaluation-harness) for more details. See [LM Eval Harness](https://github.com/EleutherAI/lm-evaluation-harness) for more details.
### delinearize-llama4
Delinearizes a Llama 4 linearized model into a regular HuggingFace Llama 4 model. This only works with the non-quantized linearized model.
```bash
axolotl delinearize-llama4 --model path/to/model_dir --output path/to/output_dir
```
This is necessary for using the model with other frameworks. If you have an adapter, merge it into the non-quantized linearized model before delinearizing.
## Legacy CLI Usage ## Legacy CLI Usage
While the new Click-based CLI is preferred, Axolotl still supports the legacy module-based CLI: While the new Click-based CLI is preferred, Axolotl still supports the legacy module-based CLI:

View File

@@ -90,7 +90,7 @@ lora_on_cpu: true
# List[str]. Add plugins to extend the pipeline. # List[str]. Add plugins to extend the pipeline.
# See `src/axolotl/integrations` for the available plugins or doc below for more details. # See `src/axolotl/integrations` for the available plugins or doc below for more details.
# https://docs.axolotl.ai/docs/custom_integrations.html # https://axolotl-ai-cloud.github.io/axolotl/docs/custom_integrations.html
plugins: plugins:
# - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin # - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
@@ -154,10 +154,6 @@ datasets:
# Key containing the messages (default: "messages") # Key containing the messages (default: "messages")
field_messages: messages field_messages: messages
# Key containing the system message (default: "system")
# If the system message is not present in the dataset sample, it will be loaded from the field_system property.
field_system: system
# Mapping of properties from the input dataset to the chat template. # Mapping of properties from the input dataset to the chat template.
# (default: message_property_mappings={'role':'role', 'content':'content'}) # (default: message_property_mappings={'role':'role', 'content':'content'})
# If a property exists in the template but not in this mapping, the system will attempt # If a property exists in the template but not in this mapping, the system will attempt
@@ -184,14 +180,10 @@ datasets:
# adding a system turn with empty content. # adding a system turn with empty content.
drop_system_message: drop_system_message:
# Optional[bool]. Whether to split the assistant turn based on a reasoning trace inside delimited tags
# defaults to False
split_thinking:
# IMPORTANT: The following fields determine which parts of the conversation to train on. # IMPORTANT: The following fields determine which parts of the conversation to train on.
# Priority order: message_field_training > message_field_training_detail > train_on_inputs or role in roles_to_train # Priority order: message_field_training > message_field_training_detail > train_on_inputs or role in roles_to_train
# See examples at `docs/dataset-formats/conversation.qmd` # See examples at `docs/dataset-formats/conversation.qmd`
# Note: If the below 5 fields are empty, defaults to training only on the last message. # Note: If the below 4 fields are set to empty, defaults to training only on the last message.
# Optional[List[str]]. Roles to train on. The tokens from these roles will be considered for the loss. # Optional[List[str]]. Roles to train on. The tokens from these roles will be considered for the loss.
roles_to_train: ["assistant"] # default roles_to_train: ["assistant"] # default
@@ -200,13 +192,7 @@ datasets:
# - turn (default): train on the EOS token at the end of each trainable turn # - turn (default): train on the EOS token at the end of each trainable turn
# - last: train on the last EOS token in the conversation # - last: train on the last EOS token in the conversation
# TIP: Please make sure that your `tokenizer.eos_token` is same as EOS/EOT token in template. Otherwise, set `eos_token` under `special_tokens`. # TIP: Please make sure that your `tokenizer.eos_token` is same as EOS/EOT token in template. Otherwise, set `eos_token` under `special_tokens`.
train_on_eos: turn train_on_eos: last
# Optional[str]. Which EOT (End-of-Turn) tokens to train on in the conversation. Possible values are:
# - all: train on all EOT tokens
# - turn: train on the EOT token at the end of each trainable turn
# - last: train on the last EOT token in the conversation
# If not specified, defaults to the value of train_on_eos for backward compatibility.
train_on_eot:
# The key in the message turn that indicates via boolean whether tokens of a turn should be considered for training. Useful to selectively train on certain turns besides the `roles_to_train`. # The key in the message turn that indicates via boolean whether tokens of a turn should be considered for training. Useful to selectively train on certain turns besides the `roles_to_train`.
message_field_training: training message_field_training: training
# The key in the message turn that contains the training details. Useful to selectively train on certain tokens in a turn. # The key in the message turn that contains the training details. Useful to selectively train on certain tokens in a turn.
@@ -289,17 +275,8 @@ process_reward_model:
chat_template: tokenizer_default chat_template: tokenizer_default
# custom jinja template for chat template. This will be only used if chat_template is set to `jinja` or `null` (in which case chat_template is automatically set to `jinja`). Default is null. # custom jinja template for chat template. This will be only used if chat_template is set to `jinja` or `null` (in which case chat_template is automatically set to `jinja`). Default is null.
chat_template_jinja: null chat_template_jinja: null
# Optional[List[str]]. Custom EOT (End-of-Turn) tokens to mask/unmask during training. # Changes the default system message. Currently only supports chatml.
# These tokens mark the boundaries between conversation turns. default_system_message: You are a helpful assistant. Please give a long and detailed answer.
# For example: ["/INST", "</s>", "[/SYSTEM_PROMPT]"]
# If not specified, defaults to just the model's eos_token.
# This is useful for templates that use multiple delimiter tokens.
eot_tokens:
# - "</s>"
# - "[/INST]"
# - "[/SYSTEM_PROMPT]"
# Changes the default system message
default_system_message: You are a helpful assistant. Please give a long and detailed answer. # Currently only supports chatml.
# Axolotl attempts to save the dataset as an arrow after packing the data together so # Axolotl attempts to save the dataset as an arrow after packing the data together so
# subsequent training attempts load faster, relative path # subsequent training attempts load faster, relative path
dataset_prepared_path: data/last_run_prepared dataset_prepared_path: data/last_run_prepared
@@ -417,7 +394,7 @@ lora_fan_in_fan_out: false
# Apply custom LoRA autograd functions and activation function Triton kernels for # Apply custom LoRA autograd functions and activation function Triton kernels for
# speed and memory savings # speed and memory savings
# See: https://docs.axolotl.ai/docs/lora_optims.html # See: https://axolotl-ai-cloud.github.io/axolotl/docs/lora_optims.html
lora_mlp_kernel: true lora_mlp_kernel: true
lora_qkv_kernel: true lora_qkv_kernel: true
lora_o_kernel: true lora_o_kernel: true
@@ -684,10 +661,8 @@ special_tokens:
# unk_token: "<unk>" # unk_token: "<unk>"
# pad_token: "[PAD]" # pad_token: "[PAD]"
# Optional[list[str]]. Add extra tokens to the tokenizer. # Add extra tokens.
tokens: tokens:
# - "<|startoftext|>"
# - "<|endoftext|>"
# Mapping token_id to new_token_string to override reserved added_tokens in the tokenizer. # Mapping token_id to new_token_string to override reserved added_tokens in the tokenizer.
# Only works for tokens that are not part of the base vocab (aka are added_tokens). # Only works for tokens that are not part of the base vocab (aka are added_tokens).
@@ -713,14 +688,11 @@ ddp_broadcast_buffers:
# Use in long context training to prevent OOM when sequences cannot fit into a single GPU's VRAM. # Use in long context training to prevent OOM when sequences cannot fit into a single GPU's VRAM.
# E.g., if 4 GPUs are available, set this value to 2 to split each sequence into two equal-sized # E.g., if 4 GPUs are available, set this value to 2 to split each sequence into two equal-sized
# subsequences, or set to 4 to split into four equal-sized subsequences. # subsequences, or set to 4 to split into four equal-sized subsequences.
# See https://docs.axolotl.ai/docs/sequence_parallelism.html for more details. # See https://axolotl-ai-cloud.github.io/axolotl/docs/sequence_parallelism.html for more details.
sequence_parallel_degree: sequence_parallel_degree:
# Optional; strides across the key dimension. Larger values use more memory but should make training faster. # Optional; strides across the key dimension. Larger values use more memory but should make training faster.
# Must evenly divide the number of KV heads in your model. # Must evenly divide the number of KV heads in your model.
heads_k_stride: 1 heads_k_stride: 1
# One of "varlen_llama3", "batch_ring", "batch_zigzag", "batch_stripe". Defaults to "varlen_llama3"
# in the sample packing case, and "batch_ring" in the non-sample packing case.
ring_attn_func:
# Path to torch distx for optim 'adamw_anyprecision' # Path to torch distx for optim 'adamw_anyprecision'
torchdistx_path: torchdistx_path:

View File

@@ -49,8 +49,7 @@ sections = [
("Knowledge Distillation (KD)", "kd"), ("Knowledge Distillation (KD)", "kd"),
("Liger Kernels", "liger"), ("Liger Kernels", "liger"),
("Language Model Evaluation Harness (LM Eval)", "lm_eval"), ("Language Model Evaluation Harness (LM Eval)", "lm_eval"),
("Spectrum", "spectrum"), ("Spectrum", "spectrum")
("LLMCompressor", "llm_compressor")
] ]
for section_name, folder_name in sections: for section_name, folder_name in sections:

View File

@@ -4,6 +4,18 @@ description: Conversation format for supervised fine-tuning.
order: 3 order: 3
--- ---
## sharegpt
::: {.callout-important}
ShareGPT is deprecated! Please see the [chat_template](#chat_template) section below.
:::
## pygmalion
```{.json filename="data.jsonl"}
{"conversations": [{"role": "...", "value": "..."}]}
```
## chat_template ## chat_template
Chat Template strategy uses a jinja2 template that converts a list of messages into a prompt. Support using tokenizer's template, a supported template, or custom jinja2. Chat Template strategy uses a jinja2 template that converts a list of messages into a prompt. Support using tokenizer's template, a supported template, or custom jinja2.
@@ -52,7 +64,7 @@ We recommend checking the below examples for other usecases.
### Examples ### Examples
1. (Legacy) Using the default chat template in the tokenizer_config.json on OpenAI messages format, training on only last message. 1. Using the default chat template in the tokenizer_config.json on OpenAI messages format, training on only last message.
```yaml ```yaml
datasets: datasets:
@@ -97,55 +109,10 @@ datasets:
``` ```
::: {.callout-important} ::: {.callout-important}
Please make sure that your `tokenizer.eos_token` is same as EOS (End-of-Sequence) token in template. Otherwise, set `eos_token` under `special_tokens: `. Please make sure that your `tokenizer.eos_token` is same as EOS/EOT token in template. Otherwise, set `eos_token` under `special_tokens`.
::: :::
5. If you are using a template that has a different EOT (End-of-Turn) token from EOS token or multiple EOT tokens (like Mistral V7 Tekken), set the `eot_tokens: ` config. The handling of EOT tokens follows `train_on_eos: ` which defaults to turn. 5. (Advanced) Using fine-grained control over tokens and turns to train in a conversation
```yaml
eot_tokens:
- "[/INST]"
# - "[/SYSTEM_PROMPT]"
datasets:
- path: ...
type: chat_template
# optional
train_on_eot: turn # defaults read from train_on_eos (which defaults to turn)
```
::: {.callout-tip}
See [config documentation](../config.qmd) for detailed explanations of "turn", "last", and "all" options for training on tokens.
:::
::: {.callout-note}
Using `eot_tokens` requires each token that exists in `chat_template` to be a single token in the tokenizer. Otherwise, the tokenizer will split the token and cause unexpected behavior.
You can add those tokens as new tokens under `tokens: ` or (recommended) override unused added_tokens via `added_tokens_overrides: `. See [config](../config.qmd) for more details.
:::
6. Continuing from the previous example, if you want to train on all EOT token trainable turns but only last EOS token, set `train_on_eos: last`.
```yaml
eot_tokens:
- "[/INST]"
# ...
datasets:
- path: ...
type: chat_template
train_on_eos: last
train_on_eot: turn
```
::: {.callout-tip}
If the EOS token only appears at the end of a prompt, `train_on_eos: last` is equivalent to `train_on_eos: turn`. Therefore, you can generally leave both at their defaults and omit them.
:::
7. (Advanced) Using fine-grained control over tokens and turns to train in a conversation
For a data sample that looks like: For a data sample that looks like:
@@ -195,15 +162,3 @@ datasets:
::: {.callout-tip} ::: {.callout-tip}
It is not necessary to set both `message_field_training` and `message_field_training_detail` at once. It is not necessary to set both `message_field_training` and `message_field_training_detail` at once.
::: :::
## sharegpt
::: {.callout-important}
ShareGPT is deprecated! Please see the [chat_template](#chat_template) section.
:::
## pygmalion
```{.json filename="data.jsonl"}
{"conversations": [{"role": "...", "value": "..."}]}
```

View File

@@ -457,7 +457,10 @@ datasets:
type: alpaca type: alpaca
``` ```
Axolotl supports many kinds of instruction dataset. All of them can be found in the [Instruction Dataset Documentation](inst_tune.qmd) with their respective type and sample row format. Axolotl supports many kinds of instruction dataset. All of them can be found here (https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/inst_tune.html) with their respective type and sample row format.
Reference: [Instruction Dataset Documentation](inst_tune.qmd).
#### Custom Instruct Prompt Format #### Custom Instruct Prompt Format

View File

@@ -28,8 +28,6 @@ main-base-py{python_version}-cu{cuda_version}-{pytorch_version}
Tags examples: Tags examples:
- `main-base-py3.11-cu128-2.7.0`
- `main-base-py3.11-cu126-2.7.0`
- `main-base-py3.11-cu124-2.6.0` - `main-base-py3.11-cu124-2.6.0`
- `main-base-py3.11-cu124-2.5.1` - `main-base-py3.11-cu124-2.5.1`
- `main-base-py3.11-cu124-2.4.1` - `main-base-py3.11-cu124-2.4.1`
@@ -52,7 +50,7 @@ Link: [Docker Hub](https://hub.docker.com/r/axolotlai/axolotl)
# on push to main # on push to main
main-py{python_version}-cu{cuda_version}-{pytorch_version} main-py{python_version}-cu{cuda_version}-{pytorch_version}
# latest main (currently torch 2.6.0, python 3.11, cuda 12.4) # latest main (currently torch 2.5.1, python 3.11, cuda 12.4)
main-latest main-latest
# nightly build # nightly build
@@ -70,7 +68,6 @@ There may be some extra tags appended to the image, like `-vllm` which installs
Tags examples: Tags examples:
- `main-py3.11-cu126-2.7.0`
- `main-py3.11-cu124-2.6.0` - `main-py3.11-cu124-2.6.0`
- `main-py3.11-cu124-2.5.1` - `main-py3.11-cu124-2.5.1`
- `main-py3.11-cu124-2.4.1` - `main-py3.11-cu124-2.4.1`

View File

@@ -73,40 +73,10 @@ description: Frequently asked questions
> A: This is likely an empty turn. > A: This is likely an empty turn.
**Q: The EOS token is incorrectly being masked or not being masked / `EOS token __ not found in chat template`.** **Q: The EOS/EOT token is incorrectly being masked or not being masked.**
> A: There can be two reasons: > A: This is because of the mismatch between `tokenizer.eos_token` and EOS/EOT token in template. Please make sure to set `eos_token` under `special_tokens` to the same EOS/EOT token as in template.
> 1. This is because of the mismatch between `tokenizer.eos_token` and EOS token in template. Please make sure to set `eos_token: ` under `special_tokens: ` to the same EOS token as in template.
> 2. The EOS token is not in the template. Please check if your template is correct. As an example, `phi_35` template does not use its dedicated EOS token `<|endoftext|>` at the end.
**Q: "`chat_template` choice is `tokenizer_default` but tokenizer's `chat_template` is null. Please add a `chat_template` in tokenizer config"** **Q: "`chat_template` choice is `tokenizer_default` but tokenizer's `chat_template` is null. Please add a `chat_template` in tokenizer config"**
> A: This is because the tokenizer does not have a chat template. Please add a chat template in the tokenizer config. See [chat_template](dataset-formats/conversation.qmd#chat-template) for more details. > A: This is because the tokenizer does not have a chat template. Please add a chat template in the tokenizer config. See [chat_template](dataset-formats/conversation.qmd#chat-template) for more details.
**Q: The EOT token(s) are incorrectly being masked or not being masked / `EOT token __ not found in chat template`.**
> A: There can be two reasons:
> 1. The EOT token is different from the EOS token and was not specified under `eot_tokens: `. Please set `eot_tokens: ` to the same EOT token(s) as in template.
> 2. There is more than one EOT token per turn in the template. Please raise an issue with examples as we recognize this as an edge case.
**Q: `EOT token encoding failed. Please check if the token is valid and can be encoded.`**
> A: There could be some issue with the tokenizer or unicode encoding. Please raise an issue with examples with the EOT token & tokenizer causing the issue.
**Q: `EOT token __ is encoded as multiple tokens.`**
> A: This is because the EOT token is encoded as multiple tokens which can cause unexpected behavior. Please add it under `tokens: ` or (recommended) override unused added_tokens via `added_tokens_overrides: `.
**Q: `Conflict between train_on_eos and train_on_eot. eos_token is in eot_tokens and train_on_eos != train_on_eot`**
> A: This happens when the EOS token is included in `eot_tokens: ` while there is a mismatch between `train_on_eos: ` and `train_on_eot: `. This will cause one to override the other. Please ensure that `train_on_eos: ` and `train_on_eot: ` are the same, or remove the EOS token from `eot_tokens: `.
**Q: If `eot_tokens: ` is not provided, what happens?**
> A: If `eot_tokens: ` is not provided, the default behavior is the same as before. EOS tokens used to delimit turns are masked/unmasked depending on whether the turn is trainable.
> Internally, `eot_tokens: tokenizer.eos_token` and `train_on_eot: train_on_eos` (which defaults to `turn`). This transition helps clarify the naming and behavior of EOT/EOS tokens.

View File

@@ -19,12 +19,6 @@ This guide covers all the ways you can install and set up Axolotl for your envir
## Installation Methods {#sec-installation-methods} ## Installation Methods {#sec-installation-methods}
::: {.callout-important}
Please make sure to have Pytorch installed before installing Axolotl in your local environment.
Follow the instructions at: [https://pytorch.org/get-started/locally/](https://pytorch.org/get-started/locally/)
:::
### PyPI Installation (Recommended) {#sec-pypi} ### PyPI Installation (Recommended) {#sec-pypi}
```{.bash} ```{.bash}

View File

@@ -36,9 +36,6 @@ deepspeed: deepspeed_configs/zero1.json
### Usage {#sec-deepspeed-usage} ### Usage {#sec-deepspeed-usage}
```{.bash} ```{.bash}
# Fetch deepspeed configs (if not already present)
axolotl fetch deepspeed_configs
# Passing arg via config # Passing arg via config
axolotl train config.yml axolotl train config.yml
@@ -51,20 +48,10 @@ axolotl train config.yml --deepspeed deepspeed_configs/zero1.json
We provide default configurations for: We provide default configurations for:
- ZeRO Stage 1 (`zero1.json`) - ZeRO Stage 1 (`zero1.json`)
- ZeRO Stage 1 with torch compile (`zero1_torch_compile.json`)
- ZeRO Stage 2 (`zero2.json`) - ZeRO Stage 2 (`zero2.json`)
- ZeRO Stage 3 (`zero3.json`) - ZeRO Stage 3 (`zero3.json`)
- ZeRO Stage 3 with bf16 (`zero3_bf16.json`)
- ZeRO Stage 3 with bf16 and CPU offload params(`zero3_bf16_cpuoffload_params.json`)
- ZeRO Stage 3 with bf16 and CPU offload params and optimizer (`zero3_bf16_cpuoffload_all.json`)
::: {.callout-tip} Choose based on your memory requirements and performance needs.
Choose the configuration that offloads the least amount to memory while still being able to fit on VRAM for best performance.
Start from Stage 1 -> Stage 2 -> Stage 3.
:::
## FSDP {#sec-fsdp} ## FSDP {#sec-fsdp}

View File

@@ -164,7 +164,7 @@ Here is an example of a multi-modal dataset:
{ {
"role": "user", "role": "user",
"content": [ "content": [
{"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"}, {"type": "image", "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"},
{"type": "text", "text": "Describe this image in detail."} {"type": "text", "text": "Describe this image in detail."}
] ]
}, },

View File

@@ -502,7 +502,9 @@ The input format is a simple JSON input with customizable fields based on the ab
Check out our [GRPO cookbook](https://github.com/axolotl-ai-cloud/axolotl-cookbook/tree/main/grpo#training-an-r1-style-large-language-model-using-grpo). Check out our [GRPO cookbook](https://github.com/axolotl-ai-cloud/axolotl-cookbook/tree/main/grpo#training-an-r1-style-large-language-model-using-grpo).
::: :::
In the latest GRPO implementation, `vLLM` is used to significantly speed up trajectory generation during training. In this example, we're using 4 GPUs - 2 for training, and 2 for vLLM: If you have multiple GPUs available, we recommend using `vLLM` with the `GRPOTrainer` to significantly speed up trajectory generation during training.
First, launch a `vLLM` server using `trl vllm-serve` - you may use a config file or CLI overrides to configure your vLLM server. In this example, we're
using 4 GPUs - 2 for training, and 2 for vLLM:
::: {.callout-important} ::: {.callout-important}
Make sure you've installed the correct version of vLLM by including it as an extra when installing axolotl, e.g. `pip install axolotl[vllm]`. Make sure you've installed the correct version of vLLM by including it as an extra when installing axolotl, e.g. `pip install axolotl[vllm]`.
@@ -528,7 +530,7 @@ trl:
``` ```
```bash ```bash
CUDA_VISIBLE_DEVICES=2,3 axolotl vllm-serve grpo.yaml CUDA_VISIBLE_DEVICES=2,3 axolotl vllm_serve grpo.yaml
``` ```
Your `vLLM` instance will now attempt to spin up, and it's time to kick off training utilizing our remaining two GPUs. In another terminal, execute: Your `vLLM` instance will now attempt to spin up, and it's time to kick off training utilizing our remaining two GPUs. In another terminal, execute:
@@ -537,10 +539,6 @@ Your `vLLM` instance will now attempt to spin up, and it's time to kick off trai
CUDA_VISIBLE_DEVICES=0,1 axolotl train grpo.yaml --num-processes 2 CUDA_VISIBLE_DEVICES=0,1 axolotl train grpo.yaml --num-processes 2
``` ```
::: {.callout-note}
Due to TRL's implementation with vLLM, the vLLM instance must use the last N GPUs instead of the first N GPUs. This is why in the example above, we use `CUDA_VISIBLE_DEVICES=2,3` for the vLLM instance.
:::
#### Reward functions #### Reward functions
GRPO uses custom reward functions and transformations. Please have them ready locally. GRPO uses custom reward functions and transformations. Please have them ready locally.
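
For orientation, a minimal sketch of such a local reward function (not part of the changed docs); it assumes the TRL-style convention of receiving completions plus any dataset columns as keyword arguments and returning one float per completion.

```python
# Hypothetical reward function sketch; the signature follows the TRL-style
# convention (completions in, one float per completion out). Adjust to
# whatever format your completions take (plain strings or message lists).
def brevity_reward(completions, **kwargs):
    texts = [
        c if isinstance(c, str) else c[-1]["content"]  # handle chat-style completions
        for c in completions
    ]
    # Toy heuristic: reward shorter answers, floored at 0.
    return [max(0.0, 1.0 - len(t) / 2048) for t in texts]
```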

View File

@@ -27,9 +27,6 @@ To enable sequence parallelism, add the following to your configuration file:
sequence_parallel_degree: 4 # Split sequences across 4 GPUs sequence_parallel_degree: 4 # Split sequences across 4 GPUs
# Optional; strides across the key dimension. Larger values use more memory but should make training faster. # Optional; strides across the key dimension. Larger values use more memory but should make training faster.
heads_k_stride: 1 heads_k_stride: 1
# Optional; one of "varlen_llama3", "batch_ring", "batch_zigzag", "batch_stripe". Defaults to
# "varlen_llama3" when `sample_packing: true`, and "batch_ring" otherwise.
ring_attn_func:
``` ```
The `sequence_parallel_degree` should be a divisor of the total number of GPUs. For example: The `sequence_parallel_degree` should be a divisor of the total number of GPUs. For example:

View File

@@ -8,6 +8,7 @@ tokenizer_type: GPT2Tokenizer
trust_remote_code: true trust_remote_code: true
tokenizer_use_fast: true tokenizer_use_fast: true
tokenizer_legacy: true tokenizer_legacy: true
strict: false
push_dataset_to_hub: push_dataset_to_hub:
hf_use_auth_token: true hf_use_auth_token: true
datasets: datasets:

View File

@@ -4,6 +4,7 @@ base_model: cerebras/Cerebras-GPT-1.3B
load_in_8bit: false load_in_8bit: false
load_in_4bit: true load_in_4bit: true
strict: false
push_dataset_to_hub: push_dataset_to_hub:
datasets: datasets:
- path: teknium/GPT4-LLM-Cleaned - path: teknium/GPT4-LLM-Cleaned

View File

@@ -7,6 +7,7 @@ tokenizer_type: CodeLlamaTokenizer
load_in_8bit: true load_in_8bit: true
load_in_4bit: false load_in_4bit: false
strict: false
datasets: datasets:
- path: mhenrichsen/alpaca_2k_test - path: mhenrichsen/alpaca_2k_test

View File

@@ -7,6 +7,7 @@ tokenizer_type: CodeLlamaTokenizer
load_in_8bit: false load_in_8bit: false
load_in_4bit: true load_in_4bit: true
strict: false
datasets: datasets:
- path: mhenrichsen/alpaca_2k_test - path: mhenrichsen/alpaca_2k_test

View File

@@ -7,6 +7,7 @@ tokenizer_type: CodeLlamaTokenizer
load_in_8bit: true load_in_8bit: true
load_in_4bit: false load_in_4bit: false
strict: false
datasets: datasets:
- path: mhenrichsen/alpaca_2k_test - path: mhenrichsen/alpaca_2k_test

View File

@@ -7,6 +7,7 @@ tokenizer_type: CodeLlamaTokenizer
load_in_8bit: false load_in_8bit: false
load_in_4bit: true load_in_4bit: true
strict: false
datasets: datasets:
- path: mhenrichsen/alpaca_2k_test - path: mhenrichsen/alpaca_2k_test

View File

@@ -7,6 +7,7 @@ tokenizer_type: CodeLlamaTokenizer
load_in_8bit: true load_in_8bit: true
load_in_4bit: false load_in_4bit: false
strict: false
datasets: datasets:
- path: mhenrichsen/alpaca_2k_test - path: mhenrichsen/alpaca_2k_test

View File

@@ -7,6 +7,7 @@ tokenizer_type: CodeLlamaTokenizer
load_in_8bit: false load_in_8bit: false
load_in_4bit: true load_in_4bit: true
strict: false
datasets: datasets:
- path: mhenrichsen/alpaca_2k_test - path: mhenrichsen/alpaca_2k_test

View File

@@ -4,6 +4,7 @@ tokenizer_type: AutoTokenizer
load_in_8bit: false load_in_8bit: false
load_in_4bit: true load_in_4bit: true
strict: false
# huggingface repo # huggingface repo
chat_template: cohere chat_template: cohere

View File

@@ -3,6 +3,7 @@ base_model: LnL-AI/dbrx-base-converted-v2
# hub_model_id: username/custom_model_name # hub_model_id: username/custom_model_name
trust_remote_code: true trust_remote_code: true
strict: false
datasets: datasets:
- path: tatsu-lab/alpaca - path: tatsu-lab/alpaca

View File

@@ -6,6 +6,7 @@ trust_remote_code: true
load_in_8bit: true load_in_8bit: true
load_in_4bit: false load_in_4bit: false
strict: false
datasets: datasets:
- path: tatsu-lab/alpaca - path: tatsu-lab/alpaca

View File

@@ -3,6 +3,7 @@ base_model: LnL-AI/dbrx-base-converted-v2
# hub_model_id: username/custom_model_name # hub_model_id: username/custom_model_name
trust_remote_code: true trust_remote_code: true
strict: false
datasets: datasets:
- path: tatsu-lab/alpaca - path: tatsu-lab/alpaca

View File

@@ -1,58 +0,0 @@
base_model: agentica-org/DeepCoder-14B-Preview
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
load_in_8bit: true
load_in_4bit: false
strict: false
datasets:
- path: fozziethebeat/alpaca_messages_2k_test
type: chat_template
field_messages: messages
message_property_mappings:
role: role
content: content
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/lora-out
sequence_len: 4096
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true
adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 2
micro_batch_size: 2
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002
bf16: auto
tf32: true
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_steps: 10
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:

View File

@@ -1,58 +0,0 @@
base_model: deepcogito/cogito-v1-preview-llama-3B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
load_in_8bit: true
load_in_4bit: false
strict: false
datasets:
- path: fozziethebeat/alpaca_messages_2k_test
type: chat_template
field_messages: messages
message_property_mappings:
role: role
content: content
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/lora-out
sequence_len: 4096
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true
adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 2
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002
bf16: auto
tf32: true
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_steps: 10
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:

View File

@@ -1,58 +0,0 @@
base_model: deepcogito/cogito-v1-preview-qwen-14B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
load_in_8bit: true
load_in_4bit: false
strict: false
datasets:
- path: fozziethebeat/alpaca_messages_2k_test
type: chat_template
field_messages: messages
message_property_mappings:
role: role
content: content
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/lora-out
sequence_len: 4096
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true
adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 2
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002
bf16: auto
tf32: true
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_steps: 10
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:

View File

@@ -2,6 +2,7 @@ base_model: deepseek-ai/DeepSeek-V2-Lite
# Automatically upload checkpoint and final model to HF # Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name # hub_model_id: username/custom_model_name
trust_remote_code: true trust_remote_code: true
strict: false
datasets: datasets:
- path: tatsu-lab/alpaca - path: tatsu-lab/alpaca

View File

@@ -6,6 +6,7 @@ trust_remote_code: true
load_in_8bit: false load_in_8bit: false
load_in_4bit: true load_in_4bit: true
strict: false
plugins: plugins:

View File

@@ -11,6 +11,7 @@ trust_remote_code: true
load_in_8bit: true load_in_8bit: true
load_in_4bit: false load_in_4bit: false
gptq: false gptq: false
strict: false
push_dataset_to_hub: push_dataset_to_hub:
datasets: datasets:
- path: teknium/GPT4-LLM-Cleaned - path: teknium/GPT4-LLM-Cleaned

View File

@@ -15,6 +15,7 @@ load_in_8bit: false
# enable 4bit for QLoRA # enable 4bit for QLoRA
load_in_4bit: true load_in_4bit: true
gptq: false gptq: false
strict: false
push_dataset_to_hub: push_dataset_to_hub:
datasets: datasets:
- path: QingyiSi/Alpaca-CoT - path: QingyiSi/Alpaca-CoT

View File

@@ -8,6 +8,7 @@ tokenizer_type: AutoTokenizer
# required by falcon custom model code: https://huggingface.co/tiiuae/falcon-7b/tree/main # required by falcon custom model code: https://huggingface.co/tiiuae/falcon-7b/tree/main
trust_remote_code: true trust_remote_code: true
gptq: false gptq: false
strict: false
push_dataset_to_hub: push_dataset_to_hub:
datasets: datasets:
- path: teknium/GPT4-LLM-Cleaned - path: teknium/GPT4-LLM-Cleaned

View File

@@ -8,6 +8,7 @@ tokenizer_type: AutoTokenizer
load_in_8bit: false load_in_8bit: false
load_in_4bit: true load_in_4bit: true
strict: false
# huggingface repo # huggingface repo
datasets: datasets:

View File

@@ -7,6 +7,7 @@ tokenizer_type: AutoTokenizer
load_in_8bit: false load_in_8bit: false
load_in_4bit: true load_in_4bit: true
strict: false
# huggingface repo # huggingface repo
chat_template: gemma chat_template: gemma

View File

@@ -5,6 +5,7 @@ num_labels: 1
tokenizer_type: AutoTokenizer tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF # Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name # hub_model_id: username/custom_model_name
strict: false
reward_model: true reward_model: true
chat_template: gemma chat_template: gemma

View File

@@ -10,6 +10,7 @@ ddp_find_unused_parameters: true
load_in_8bit: false load_in_8bit: false
load_in_4bit: true load_in_4bit: true
strict: false
# huggingface repo # huggingface repo
chat_template: gemma3 chat_template: gemma3

View File

@@ -1,4 +1,5 @@
base_model: google/gemma-3-4b-it base_model: google/gemma-3-4b-it
strict: false
load_in_4bit: true load_in_4bit: true

View File

@@ -1,5 +1,6 @@
base_model: google/gemma-3-4b-it base_model: google/gemma-3-4b-it
processor_type: AutoProcessor processor_type: AutoProcessor
strict: false
load_in_4bit: true load_in_4bit: true

View File

@@ -1,62 +0,0 @@
base_model: THUDM/GLM-4-32B-0414
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
load_in_4bit: true
datasets:
- path: teknium/GPT4-LLM-Cleaned
type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0
output_dir: ./outputs/qlora-out
adapter: qlora
lora_model_dir:
sequence_len: 2048
sample_packing: true
eval_sample_packing: true
pad_to_sequence_len: true
lora_r: 16
lora_alpha: 32
lora_dropout: 0.05
lora_target_modules:
- gate_proj
- down_proj
- up_proj
- q_proj
- v_proj
- k_proj
- o_proj
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 2
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_8bit
lr_scheduler: cosine
learning_rate: 0.0002
bf16: auto
tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3
warmup_steps: 10
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:

View File

@@ -4,6 +4,7 @@ base_model: EleutherAI/gpt-j-6b
load_in_8bit: false load_in_8bit: false
load_in_4bit: true load_in_4bit: true
strict: false
push_dataset_to_hub: push_dataset_to_hub:
datasets: datasets:
- path: teknium/GPT4-LLM-Cleaned - path: teknium/GPT4-LLM-Cleaned

View File

@@ -6,6 +6,7 @@ trust_remote_code: true
load_in_8bit: false load_in_8bit: false
load_in_4bit: true load_in_4bit: true
strict: false
datasets: datasets:
- path: mhenrichsen/alpaca_2k_test - path: mhenrichsen/alpaca_2k_test

View File

@@ -5,6 +5,7 @@ trust_remote_code: true
load_in_8bit: false load_in_8bit: false
load_in_4bit: true load_in_4bit: true
strict: false
datasets: datasets:
- path: mhenrichsen/alpaca_2k_test - path: mhenrichsen/alpaca_2k_test

View File

@@ -5,6 +5,7 @@ tokenizer_type: AutoTokenizer
# hub_model_id: username/custom_model_name # hub_model_id: username/custom_model_name
load_in_4bit: true load_in_4bit: true
strict: false
use_tensorboard: true use_tensorboard: true
chat_template: jamba chat_template: jamba
datasets: datasets:

View File

@@ -4,6 +4,7 @@ model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF # Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name # hub_model_id: username/custom_model_name
strict: false
datasets: datasets:
- path: mhenrichsen/alpaca_2k_test - path: mhenrichsen/alpaca_2k_test

View File

@@ -10,6 +10,7 @@ gptq_disable_exllama: true
tokenizer_use_fast: true tokenizer_use_fast: true
tokenizer_legacy: true tokenizer_legacy: true
strict: false
push_dataset_to_hub: push_dataset_to_hub:
hf_use_auth_token: true hf_use_auth_token: true
datasets: datasets:

View File

@@ -4,6 +4,7 @@ model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF # Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name # hub_model_id: username/custom_model_name
strict: false
datasets: datasets:
- path: teknium/GPT4-LLM-Cleaned - path: teknium/GPT4-LLM-Cleaned

View File

@@ -4,6 +4,7 @@ model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF # Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name # hub_model_id: username/custom_model_name
strict: false
datasets: datasets:
- path: mhenrichsen/alpaca_2k_test - path: mhenrichsen/alpaca_2k_test

View File

@@ -7,6 +7,7 @@ tokenizer_type: LlamaTokenizer
load_in_8bit: true load_in_8bit: true
load_in_4bit: false load_in_4bit: false
strict: false
datasets: datasets:
- path: mhenrichsen/alpaca_2k_test - path: mhenrichsen/alpaca_2k_test

View File

@@ -7,6 +7,7 @@ tokenizer_type: LlamaTokenizer
load_in_8bit: false load_in_8bit: false
load_in_4bit: true load_in_4bit: true
strict: false
datasets: datasets:
- path: yahma/alpaca-cleaned - path: yahma/alpaca-cleaned

View File

@@ -7,6 +7,7 @@ tokenizer_type: LlamaTokenizer
load_in_8bit: false load_in_8bit: false
load_in_4bit: true load_in_4bit: true
strict: false
datasets: datasets:
- path: mhenrichsen/alpaca_2k_test - path: mhenrichsen/alpaca_2k_test

View File

@@ -5,6 +5,7 @@ tokenizer_type: LlamaTokenizer
load_in_8bit: false load_in_8bit: false
load_in_4bit: true load_in_4bit: true
strict: false
datasets: datasets:
- path: teknium/GPT4-LLM-Cleaned - path: teknium/GPT4-LLM-Cleaned

View File

@@ -4,6 +4,7 @@ processor_type: AutoProcessor
# Automatically upload checkpoint and final model to HF # Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name # hub_model_id: username/custom_model_name
strict: false
# these 3 lines are needed for now to handle vision chat templates w images # these 3 lines are needed for now to handle vision chat templates w images
skip_prepare_dataset: true skip_prepare_dataset: true

View File

@@ -9,6 +9,7 @@ liger_rms_norm: true
liger_glu_activation: true liger_glu_activation: true
liger_fused_linear_cross_entropy: true liger_fused_linear_cross_entropy: true
strict: false
chat_template: llama3 chat_template: llama3
datasets: datasets:

View File

@@ -1,6 +1,7 @@
base_model: NousResearch/Meta-Llama-3.1-8B base_model: NousResearch/Meta-Llama-3.1-8B
# Automatically upload checkpoint and final model to HF # Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name # hub_model_id: username/custom_model_name
strict: false
datasets: datasets:
- path: tatsu-lab/alpaca - path: tatsu-lab/alpaca

View File

@@ -7,6 +7,7 @@ tokenizer_type: AutoTokenizer
load_in_8bit: true load_in_8bit: true
load_in_4bit: false load_in_4bit: false
strict: false
chat_template: llama3 chat_template: llama3
rl: dpo rl: dpo

View File

@@ -7,6 +7,7 @@ tokenizer_type: AutoTokenizer
load_in_8bit: true load_in_8bit: true
load_in_4bit: false load_in_4bit: false
strict: false
chat_template: llama3 chat_template: llama3
datasets: datasets:

View File

@@ -7,6 +7,7 @@ tokenizer_type: AutoTokenizer
load_in_8bit: true load_in_8bit: true
load_in_4bit: false load_in_4bit: false
strict: false
chat_template: llama3 chat_template: llama3
rl: dpo rl: dpo

View File

@@ -7,6 +7,7 @@ tokenizer_type: AutoTokenizer
load_in_8bit: true load_in_8bit: true
load_in_4bit: false load_in_4bit: false
strict: false
datasets: datasets:
- path: mhenrichsen/alpaca_2k_test - path: mhenrichsen/alpaca_2k_test

View File

@@ -1,6 +1,7 @@
base_model: NousResearch/Llama-3.2-1B base_model: NousResearch/Llama-3.2-1B
# Automatically upload checkpoint and final model to HF # Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name # hub_model_id: username/custom_model_name
strict: false
datasets: datasets:
- path: teknium/GPT4-LLM-Cleaned - path: teknium/GPT4-LLM-Cleaned

View File

@@ -1,6 +1,7 @@
base_model: NousResearch/Llama-3.2-1B base_model: NousResearch/Llama-3.2-1B
# Automatically upload checkpoint and final model to HF # Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name # hub_model_id: username/custom_model_name
strict: false
datasets: datasets:
- path: teknium/GPT4-LLM-Cleaned - path: teknium/GPT4-LLM-Cleaned

View File

@@ -7,6 +7,7 @@ tokenizer_type: AutoTokenizer
load_in_8bit: true load_in_8bit: true
load_in_4bit: false load_in_4bit: false
strict: false
datasets: datasets:
- path: mhenrichsen/alpaca_2k_test - path: mhenrichsen/alpaca_2k_test

View File

@@ -1,6 +1,7 @@
base_model: NousResearch/Llama-3.2-1B base_model: NousResearch/Llama-3.2-1B
# Automatically upload checkpoint and final model to HF # Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name # hub_model_id: username/custom_model_name
strict: false
datasets: datasets:
- path: teknium/GPT4-LLM-Cleaned - path: teknium/GPT4-LLM-Cleaned

View File

@@ -7,6 +7,7 @@ tokenizer_type: AutoTokenizer
load_in_8bit: true load_in_8bit: true
load_in_4bit: false load_in_4bit: false
strict: false
datasets: datasets:
- path: mhenrichsen/alpaca_2k_test - path: mhenrichsen/alpaca_2k_test

View File

@@ -4,6 +4,7 @@ base_model: meta-llama/Llama-3.2-1B
load_in_8bit: false load_in_8bit: false
load_in_4bit: true load_in_4bit: true
strict: false
rl: kto rl: kto
rl_beta: 0.5 rl_beta: 0.5

View File

@@ -4,6 +4,7 @@ base_model: NousResearch/Llama-3.2-1B
load_in_8bit: false load_in_8bit: false
load_in_4bit: true load_in_4bit: true
strict: false
datasets: datasets:
- path: teknium/GPT4-LLM-Cleaned - path: teknium/GPT4-LLM-Cleaned

View File

@@ -5,6 +5,7 @@ tokenizer_type: AutoTokenizer
# hub_model_id: username/custom_model_name # hub_model_id: username/custom_model_name
load_in_4bit: true load_in_4bit: true
strict: false
datasets: datasets:
- path: tatsu-lab/alpaca - path: tatsu-lab/alpaca

View File

@@ -7,6 +7,7 @@ tokenizer_type: AutoTokenizer # PreTrainedTokenizerFast
load_in_8bit: false load_in_8bit: false
load_in_4bit: true load_in_4bit: true
strict: false
datasets: datasets:
- path: tatsu-lab/alpaca - path: tatsu-lab/alpaca

View File

@@ -7,6 +7,7 @@ tokenizer_type: AutoTokenizer
load_in_8bit: false load_in_8bit: false
load_in_4bit: true load_in_4bit: true
strict: false
datasets: datasets:
- path: aaditya/alpaca_subset_1 - path: aaditya/alpaca_subset_1

View File

@@ -1,77 +0,0 @@
base_model: neuralmagic/Sparse-Llama-3.1-8B-2of4
plugins:
- axolotl.integrations.llm_compressor.LLMCompressorPlugin
load_in_8bit: false
load_in_4bit: false
strict: false
datasets:
- path: tatsu-lab/alpaca
type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.05
output_dir: ./outputs/out
sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true
eval_sample_packing: false
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 8
micro_batch_size: 1
num_epochs: 1
optimizer: paged_adamw_8bit
lr_scheduler: cosine
learning_rate: 2e-5
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
early_stopping_patience:
resume_from_checkpoint:
logging_steps: 1
xformers_attention:
flash_attention: true
warmup_steps: 100
evals_per_epoch: 2
eval_table_size:
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
pad_token: <|end_of_text|>
llmcompressor:
recipe:
finetuning_stage:
finetuning_modifiers:
ConstantPruningModifier:
targets: [
're:.*q_proj.weight',
're:.*k_proj.weight',
're:.*v_proj.weight',
're:.*o_proj.weight',
're:.*gate_proj.weight',
're:.*up_proj.weight',
're:.*down_proj.weight',
]
start: 0
save_compressed: true

View File

@@ -1,36 +1,10 @@
# Llama 4 by Meta AI
## Flash Attention vs Flex Attention
While Flash Attention support is "enabled" for Llama-4, the upstream implementation is not correct, and usage of Flex Attention is recommended.
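For reference, the Flex Attention path is enabled through the config keys shown below. This is a minimal sketch copied from the Flex Attention QLoRA config that appears later in this diff; all other settings (model, datasets, FSDP, etc.) are omitted here.

```yaml
# Minimal sketch: enable Flex Attention in an axolotl example config.
# The keys mirror the scout-qlora-flexattn-fsdp2.yaml config shown later in this diff.
flex_attention: true
flex_attn_compile_kwargs:
  dynamic: false
  mode: max-autotune-no-cudagraphs
```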
## Available Examples
### Llama 4 Scout 17Bx16Experts (109B)
- [Multi-Modal/Vision QLoRA w/ FSDP1](./scout-vision-qlora-fsdp.yaml)
- [Text Single GPU (H100) QLoRA](./scout-qlora-single-h100.yaml)
- [Text Multi GPU QLoRA w/ FSDP1](./scout-qlora-fsdp1.yaml)
Flex Attention
Our Single GPU implementation for Llama 4 Scout uses only 68.5GB VRAM for post-training with 4k context length @ 546 tokens/second.
- [Text Single GPU (H100) QLoRA](./scout-qlora-single-h100-flex.yaml)
- [Text Multi GPU QLoRA w/ FSDP2](./scout-qlora-flexattn-fsdp2.yaml)
[//]: # (Flash Attention &#40;Do not use&#41;)
[//]: # (- [Multi-Modal/Vision QLoRA w/ FSDP1]&#40;./scout-vision-qlora-fsdp.yaml&#41;)
[//]: # (- [Text Single GPU &#40;H100&#41; QLoRA]&#40;./scout-qlora-single-h100.yaml&#41;)
[//]: # (- [Text Multi GPU QLoRA w/ FSDP1]&#40;./scout-qlora-fsdp1.yaml&#41;)
Our Single H100 implementation for Llama 4 Scout uses only 64.5GB VRAM for post-training with 4k context length @ 519 tokens/second. [WandB logs here](https://wandb.ai/axolotl-ai/llama4-flexattn-qlora/runs/wpie7dkj)
Multi-GPU (4xH100) for Llama 4 Scout uses 62.8GB VRAM/GPU @ 4k context length @ 280 tps/gpu, [WandB logs here](https://wandb.ai/axolotl-ai/llama4-flexattn-qlora/runs/2lkezdj8)
### Llama 4 Maverick 17Bx128Experts (400B)
Coming Soon
## Delinearized Llama 4 Models
We provide a script to delinearize Llama 4 linearized models into regular HuggingFace Llama 4 models.
```bash
axolotl delinearize-llama4 --model path/to/model_dir --output path/to/output_dir
```
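The linearized checkpoints these examples train against are marked in the configs by the `base_model` and the `llama4_linearized_experts` flag; a two-line sketch, copied from the Scout QLoRA config shown later in this diff, is below. A model trained from such a config is the kind of linearized checkpoint the command above would convert back to a regular HuggingFace Llama 4 model.

```yaml
# Sketch: identifying a linearized-expert Llama 4 checkpoint in a config.
# Both lines are copied from the Scout QLoRA config shown later in this diff.
base_model: axolotl-quants/Llama-4-Scout-17B-16E-Linearized-bnb-nf4-bf16
llama4_linearized_experts: true
```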

View File

@@ -1,88 +0,0 @@
base_model: axolotl-quants/Llama-4-Maverick-17B-128E-Linearized-bnb-nf4-bf16
model_type: Llama4ForConditionalGeneration
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
plugins:
- axolotl.integrations.liger.LigerPlugin
liger_glu_activation: true
liger_rms_norm: true
liger_layer_norm: true
llama4_linearized_experts: true
load_in_4bit: true
adapter: qlora
lora_r: 32
lora_alpha: 64
lora_target_modules:
- self_attn.q_proj
- self_attn.k_proj
- self_attn.v_proj
- self_attn.o_proj
- shared_expert.gate_proj
- shared_expert.up_proj
- shared_expert.down_proj
# - experts.gate_projs.[0-9]+$
# - experts.up_projs.[0-9]+$
# - experts.down_projs.[0-9]+$
lora_modules_to_save:
# - lm_head
# - embed_tokens
chat_template: llama4
datasets:
- path: mlabonne/FineTome-100k
type: chat_template
split: train[:20%]
field_messages: conversations
message_property_mappings:
role: from
content: value
dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/out
sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true
gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 1e-4
bf16: true
tf32: true
logging_steps: 1
flash_attention: true
gradient_checkpointing: offload
gradient_checkpointing_kwargs:
use_reentrant: false
warmup_steps: 20
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0
fsdp:
- auto_wrap
- full_shard
fsdp_config:
fsdp_transformer_layer_cls_to_wrap: Llama4TextDecoderLayer
fsdp_limit_all_gathers: true
fsdp_sync_module_states: true
fsdp_offload_params: true
fsdp_use_orig_params: false
fsdp_cpu_ram_efficient_loading: true
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
fsdp_state_dict_type: FULL_STATE_DICT
fsdp_sharding_strategy: FULL_SHARD
special_tokens:
pad_token: <|finetune_right_pad_id|>
eos_token: <|eot|>

View File

@@ -1,86 +0,0 @@
base_model: axolotl-quants/Llama-4-Scout-17B-16E-Linearized-bnb-nf4-bf16
model_type: Llama4ForConditionalGeneration
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
plugins:
- axolotl.integrations.liger.LigerPlugin
liger_glu_activation: true
liger_rms_norm: true
liger_layer_norm: true
llama4_linearized_experts: true
load_in_4bit: true
adapter: qlora
lora_r: 32
lora_alpha: 64
lora_target_modules:
- self_attn.q_proj
- self_attn.k_proj
- self_attn.v_proj
- self_attn.o_proj
- shared_expert.gate_proj
- shared_expert.up_proj
- shared_expert.down_proj
# - experts.gate_projs.[0-9]+$
# - experts.up_projs.[0-9]+$
# - experts.down_projs.[0-9]+$
lora_modules_to_save:
# - lm_head
# - embed_tokens
chat_template: llama4
datasets:
- path: mlabonne/FineTome-100k
type: chat_template
split: train[:20%]
field_messages: conversations
message_property_mappings:
role: from
content: value
dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/out
sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true
gradient_accumulation_steps: 1
micro_batch_size: 2
num_epochs: 3
optimizer: adamw_torch_4bit
lr_scheduler: cosine
learning_rate: 1e-4
bf16: true
tf32: true
logging_steps: 1
flex_attention: true
flex_attn_compile_kwargs:
dynamic: false
mode: max-autotune-no-cudagraphs
warmup_steps: 10
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0
fsdp:
- auto_wrap
- full_shard
fsdp_config:
fsdp_version: 2
fsdp_offload_params: false
fsdp_cpu_ram_efficient_loading: true
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
fsdp_transformer_layer_cls_to_wrap: Llama4TextDecoderLayer
fsdp_state_dict_type: SHARDED_STATE_DICT
fsdp_sharding_strategy: FULL_SHARD
fsdp_reshard_after_forward: true
fsdp_activation_checkpointing: true
special_tokens:
pad_token: <|finetune_right_pad_id|>
eos_token: <|eot|>

Some files were not shown because too many files have changed in this diff.