chore: refactor normalize_attn to use mapping and loop

fix: set replit mpt model to use eager attention
fixes from PR feedback
2025-05-07 17:10:18 +07:00 · 2025-05-07 17:10:18 +07:00 · 2025-05-07 17:10:18 +07:00 · 2025-05-07 17:10:18 +07:00 · 2025-05-06 23:40:44 -04:00 · 2025-05-06 22:56:00 -04:00
178 changed files with 3414 additions and 667 deletions
--- a/.github/workflows/base.yml
+++ b/.github/workflows/base.yml
@@ -22,12 +22,6 @@ jobs:
      fail-fast: false
      matrix:
        include:
          - cuda: "124"
            cuda_version: 12.4.1
            cudnn_version: ""
            python_version: "3.11"
            pytorch: 2.4.1
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
          - cuda: "124"
            cuda_version: 12.4.1
            cudnn_version: ""
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -15,11 +15,6 @@ jobs:
      fail-fast: false
      matrix:
        include:
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
            pytorch: 2.4.1
            axolotl_extras:
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
@@ -35,7 +30,7 @@ jobs:
            cuda_version: 12.6.3
            python_version: "3.11"
            pytorch: 2.7.0
-            axolotl_extras: vllm
+            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
      - name: Checkout
@@ -67,6 +62,7 @@ jobs:
            CUDA=${{ matrix.cuda }}
            PYTORCH_VERSION=${{ matrix.pytorch }}
            AXOLOTL_ARGS=${{ matrix.axolotl_args }}
            AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}
          file: ./docker/Dockerfile
          push: ${{ github.event_name != 'pull_request' }}
          tags: |
@@ -82,11 +78,6 @@ jobs:
    strategy:
      matrix:
        include:
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
            pytorch: 2.4.1
            axolotl_extras:
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
--- a/.github/workflows/multi-gpu-e2e.yml
+++ b/.github/workflows/multi-gpu-e2e.yml
@@ -9,6 +9,7 @@ on:
      - 'pyproject.toml'
      - '.github/workflows/multi-gpu-e2e.yml'
      - 'src/axolotl/core/trainers/mixins/sequence_parallel.py'
      - 'src/axolotl/utils/distributed.py'
  workflow_dispatch:
  schedule:
    - cron: '0 0 * * 1,4'  # Runs at 00:00 UTC every Monday & Thursday
@@ -32,13 +33,6 @@ jobs:
            axolotl_extras: vllm
            num_gpus: 2
            nightly_build: "true"
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
            pytorch: 2.4.1
            axolotl_extras:  # no vllm support for 2.4.1
            num_gpus: 2
            nightly_build: "true"
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
--- a/.github/workflows/nightlies.yml
+++ b/.github/workflows/nightlies.yml
@@ -12,11 +12,6 @@ jobs:
      fail-fast: false
      matrix:
        include:
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
            pytorch: 2.4.1
            axolotl_extras:
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
@@ -70,11 +65,6 @@ jobs:
    strategy:
      matrix:
        include:
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
            pytorch: 2.4.1
            axolotl_extras:
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
--- a/.github/workflows/preview-docs.yml
+++ b/.github/workflows/preview-docs.yml
@@ -4,6 +4,12 @@ on:
  pull_request:
    types: [opened, synchronize, reopened]
    # Run the workflow only when one of these files changes
    paths:
      - '**/*.md'      # any Markdown file
      - '**/*.qmd'     # any Quarto file
      - '_quarto.yaml'
 permissions:
  checks: write
  contents: write
--- a/.github/workflows/tests-nightly.yml
+++ b/.github/workflows/tests-nightly.yml
@@ -26,7 +26,7 @@ jobs:
      max-parallel: 2
      matrix:
        python_version: ["3.11"]
-        pytorch_version: ["2.4.1", "2.5.1", "2.6.0"]
+        pytorch_version: ["2.5.1", "2.6.0", "2.7.0"]
    timeout-minutes: 20
    steps:
@@ -106,13 +106,6 @@ jobs:
      fail-fast: false
      matrix:
        include:
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
            pytorch: 2.4.1
            num_gpus: 1
            axolotl_extras:
            nightly_build: "true"
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -27,6 +27,9 @@ concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
 env:
  TRANSFORMERS_IS_CI: "yes"
 jobs:
  pre-commit:
    name: pre-commit
@@ -41,15 +44,101 @@ jobs:
        env:
          SKIP: no-commit-to-branch
-  pytest:
+  preload-cache:
-    name: PyTest
+    name: Preload HF cache
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      max-parallel: 2
      matrix:
        python_version: ["3.11"]
-        pytorch_version: ["2.4.1", "2.5.1", "2.6.0", "2.7.0"]
+        pytorch_version: ["2.6.0"]
    timeout-minutes: 20
    env:
      AXOLOTL_IS_CI_CACHE_PRELOAD: "1"
    steps:
      - name: Check out repository code
        uses: actions/checkout@v4
      - name: Restore HF cache
        id: hf-cache-restore
        uses: actions/cache/restore@v4
        with:
          path: |
            /home/runner/.cache/huggingface/hub/datasets--*
            /home/runner/.cache/huggingface/hub/models--*
          key: ${{ runner.os }}-hf-hub-cache-v2
      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python_version }}
          cache: 'pip' # caching pip dependencies
      - name: upgrade pip
        run: |
          pip3 install --upgrade pip
          pip3 install --upgrade packaging==23.2 setuptools==75.8.0 wheel
      - name: Install PyTorch
        run: |
          pip3 install torch==${{ matrix.pytorch_version }}
      - name: Install dependencies
        run: |
          pip3 show torch
          pip3 install --no-build-isolation -U -e .
          python scripts/unsloth_install.py | sh
          python scripts/cutcrossentropy_install.py | sh
          pip3 install -r requirements-dev.txt -r requirements-tests.txt
      - name: Make sure PyTorch version wasn't clobbered
        run: |
          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
      - name: Ensure axolotl CLI was installed
        run: |
          axolotl --help
      - name: Pre-Download dataset fixture
        run: |
          huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
      - name: Run tests
        run: |
          pytest -v tests/conftest.py
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v5
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          files: ./coverage.xml
          flags: unittests,pytorch-${{ matrix.pytorch_version }}
          fail_ci_if_error: false
      - name: cleanup pip cache
        run: |
          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
      - name: Save HF cache
        id: hf-cache
        uses: actions/cache/save@v4
        with:
          path: |
            /home/runner/.cache/huggingface/hub/datasets--*
            /home/runner/.cache/huggingface/hub/models--*
          key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
  pytest:
    name: PyTest
    runs-on: ubuntu-latest
    needs: [preload-cache]
    strategy:
      fail-fast: false
      matrix:
        python_version: ["3.11"]
        pytorch_version: ["2.5.1", "2.6.0", "2.7.0"]
    timeout-minutes: 20
    steps:
@@ -118,24 +207,15 @@ jobs:
        run: |
          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
      - name: Save HF cache
        id: hf-cache
        uses: actions/cache/save@v4
        with:
          path: |
            /home/runner/.cache/huggingface/hub/datasets--*
            /home/runner/.cache/huggingface/hub/models--*
          key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
  pytest-sdist:
    name: PyTest from Source Dist
    runs-on: ubuntu-latest
    needs: [preload-cache]
    strategy:
      fail-fast: false
      max-parallel: 1
      matrix:
        python_version: ["3.11"]
-        pytorch_version: ["2.4.1", "2.5.1", "2.6.0"]
+        pytorch_version: ["2.5.1", "2.6.0", "2.7.0"]
    timeout-minutes: 20
    steps:
@@ -196,15 +276,6 @@ jobs:
        run: |
          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
      - name: Save HF cache
        id: hf-cache
        uses: actions/cache/save@v4
        with:
          path: |
            /home/runner/.cache/huggingface/hub/datasets--*
            /home/runner/.cache/huggingface/hub/models--*
          key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
  docker-e2e-tests-1st:
    if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' }}
    # this job needs to be run on self-hosted GPU runners...
@@ -258,6 +329,12 @@ jobs:
      fail-fast: false
      matrix:
        include:
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
            pytorch: 2.6.0
            num_gpus: 1
            axolotl_extras: llmcompressor
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
--- a/.runpod/test-input.json
+++ b/.runpod/test-input.json
@@ -0,0 +1,86 @@
 {
  "input": {
    "name": "quick_smoke_test_sft",
    "user_id": "user",
    "model_id": "llama-test",
    "run_id": "llama-test",
    "credentials": {
      "wandb_api_key": "",
      "hf_token": ""
    },
    "args": {
      "base_model": "HuggingFaceTB/SmolLM2-135M",
      "model_type": "AutoModelForCausalLM",
      "tokenizer_type": "AutoTokenizer",
      "load_in_4bit": true,
      "strict": false,
      "datasets": [
        {
          "path": "mhenrichsen/alpaca_2k_test",
          "type": "alpaca",
          "split": "train[:10%]"
        }
      ],
      "val_set_size": 0.02,
      "output_dir": "./outputs/lora-out",
      "sequence_len": 4096,
      "sample_packing": true,
      "eval_sample_packing": false,
      "pad_to_sequence_len": true,
      "adapter": "qlora",
      "lora_r": 32,
      "lora_alpha": 64,
      "lora_dropout": 0.05,
      "lora_target_linear": true,
      "lora_modules_to_save": [
        "embed_tokens",
        "lm_head"
      ],
      "gradient_accumulation_steps": 2,
      "micro_batch_size": 1,
      "num_epochs": 1,
      "optimizer": "adamw_torch_fused",
      "lr_scheduler": "cosine",
      "learning_rate": 0.0002,
      "train_on_inputs": false,
      "group_by_length": false,
      "bf16": "auto",
      "tf32": true,
      "gradient_checkpointing": true,
      "logging_steps": 1,
      "flash_attention": true,
      "warmup_steps": 1,
      "evals_per_epoch": 1,
      "eval_max_new_tokens": 128,
      "saves_per_epoch": 1,
      "weight_decay": 0.0,
      "special_tokens": {
        "pad_token": "<|endoftext|>"
      },
      "max_steps": 20
    },
    "timeout": 100000
  },
  "config": {
    "gpuTypeId": "NVIDIA GeForce RTX 4090",
    "gpuCount": 1,
    "containerDiskInGb": 200,
    "env": [
      {
        "key": "TOKENIZER",
        "value": ""
      },
      {
        "key": "DISABLE_LOG_STATS",
        "value": "true"
      }
    ],
    "allowedCudaVersions": [
      "12.8",
      "12.7",
      "12.6",
      "12.5",
      "12.4"
    ]
  }
 }
--- a/.runpod/tests.json
+++ b/.runpod/tests.json
@@ -1,65 +1,70 @@
 {
-  "input": {
+  "tests": [
-    "name": "quick_smoke_test_sft",
+    {
-    "user_id": "user",
+      "name": "quick_smoke_test_sft",
-    "model_id": "llama-test",
+      "input": {
-    "run_id": "llama-test",
+        "user_id": "user",
-    "credentials": {
+        "model_id": "llama-test",
-      "wandb_api_key": "",
+        "run_id": "llama-test",
-      "hf_token": ""
+        "credentials": {
-    },
+          "wandb_api_key": "",
-    "args": {
+          "hf_token": ""
-      "base_model": "HuggingFaceTB/SmolLM2-135M",
+        },
-      "model_type": "AutoModelForCausalLM",
+        "args": {
-      "tokenizer_type": "AutoTokenizer",
+          "base_model": "HuggingFaceTB/SmolLM2-135M",
-      "load_in_8bit": true,
+          "model_type": "AutoModelForCausalLM",
-      "load_in_4bit": false,
+          "tokenizer_type": "AutoTokenizer",
-      "strict": false,
+          "load_in_4bit": true,
-      "datasets": [
+          "strict": false,
-        {
+          "datasets": [
-          "path": "mhenrichsen/alpaca_2k_test",
+            {
-          "type": "alpaca"
+              "path": "mhenrichsen/alpaca_2k_test",
              "type": "alpaca",
              "split": "train[:10%]"
            }
          ],
          "val_set_size": 0.02,
          "output_dir": "./outputs/lora-out",
          "sequence_len": 4096,
          "sample_packing": true,
          "eval_sample_packing": false,
          "pad_to_sequence_len": true,
          "adapter": "qlora",
          "lora_r": 32,
          "lora_alpha": 64,
          "lora_dropout": 0.05,
          "lora_target_linear": true,
          "lora_modules_to_save": [
            "embed_tokens",
            "lm_head"
          ],
          "gradient_accumulation_steps": 2,
          "micro_batch_size": 1,
          "num_epochs": 1,
          "optimizer": "adamw_torch_fused",
          "lr_scheduler": "cosine",
          "learning_rate": 0.0002,
          "train_on_inputs": false,
          "group_by_length": false,
          "bf16": "auto",
          "tf32": true,
          "gradient_checkpointing": true,
          "logging_steps": 1,
          "flash_attention": true,
          "warmup_steps": 1,
          "evals_per_epoch": 1,
          "eval_max_new_tokens": 128,
          "saves_per_epoch": 1,
          "weight_decay": 0.0,
          "special_tokens": {
            "pad_token": "<|endoftext|>"
          },
          "max_steps": 20
        }
-      ],
+      },
-      "val_set_size": 0.05,
+      "timeout": 100000
-      "output_dir": "./outputs/lora-out",
+    }
-      "sequence_len": 4096,
+  ],
      "sample_packing": true,
      "eval_sample_packing": false,
      "pad_to_sequence_len": true,
      "adapter": "lora",
      "lora_r": 32,
      "lora_alpha": 64,
      "lora_dropout": 0.05,
      "lora_target_linear": true,
      "lora_modules_to_save": [
        "embed_tokens",
        "lm_head"
      ],
      "gradient_accumulation_steps": 4,
      "micro_batch_size": 2,
      "num_epochs": 1,
      "optimizer": "adamw_torch_fused",
      "lr_scheduler": "cosine",
      "learning_rate": 0.0002,
      "train_on_inputs": false,
      "group_by_length": false,
      "bf16": "auto",
      "tf32": true,
      "gradient_checkpointing": true,
      "logging_steps": 1,
      "flash_attention": true,
      "warmup_steps": 1,
      "evals_per_epoch": 1,
      "eval_max_new_tokens": 128,
      "saves_per_epoch": 1,
      "weight_decay": 0.0,
      "special_tokens": {
        "pad_token": "<|endoftext|>"
      }
    },
    "timeout": 100000
  },
  "config": {
    "gpuTypeId": "NVIDIA GeForce RTX 4090",
    "gpuCount": 1,
--- a/docs/config.qmd
+++ b/docs/config.qmd
@@ -32,6 +32,8 @@ tokenizer_legacy:
 resize_token_embeddings_to_32x:
 # Optional[bool] Whether to shrink the embeddings to len(tokenizer). By default, we won't shrink.
 shrink_embeddings:
 # Optional[bool] Don't upcast the embeddings to float32 when using PEFT. Useful for low-VRAM GPUs
 embeddings_skip_upcast:
 # Whether to load the model with randomly initialized weights. Useful for
 # pre-training a model from scratch or debugging purposes.
 random_init_weights:
@@ -73,11 +75,12 @@ load_in_8bit: true
 load_in_4bit:
 # Use CUDA bf16
-bf16: true # bool or 'full' for `bf16_full_eval`. require >=ampere
+bf16: true # bool or 'full' for `bf16_full_eval`, or 'auto' for automatic detection. require >=ampere
 # Use CUDA fp16
 fp16: true
 # Use CUDA tf32
 tf32: true # require >=ampere
 # Note: if bf16 is set to 'auto' and fp16 is set to true, we will prefer the explicit fp16 setting
 # No AMP (automatic mixed precision)
 bfloat16: true # require >=ampere
@@ -184,6 +187,10 @@ datasets:
    # adding a system turn with empty content.
    drop_system_message:
    # Optional[bool]. (for Qwen3 template only) Whether to split the assistant content based on a reasoning trace inside delimited tags
    # See example at `docs/dataset-formats/conversation.qmd`
    split_thinking:
    # IMPORTANT: The following fields determine which parts of the conversation to train on.
    # Priority order: message_field_training > message_field_training_detail > train_on_inputs or role in roles_to_train
    # See examples at `docs/dataset-formats/conversation.qmd`
@@ -543,7 +550,7 @@ gradient_checkpointing: false
 early_stopping_patience: 3
 # Specify a scheduler and kwargs to use with the optimizer
-lr_scheduler: # 'one_cycle' | 'rex' | 'log_sweep' | empty for cosine
+lr_scheduler: # 'one_cycle' | 'rex' | 'log_sweep' | 'linear' | 'cosine_with_restarts' | 'polynomial' | 'constant' | 'constant_with_warmup' | 'inverse_sqrt' | 'reduce_lr_on_plateau' | 'cosine_with_min_lr' | 'warmup_stable_decay' | empty for cosine
 lr_scheduler_kwargs:
 cosine_min_lr_ratio: # decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of peak lr
 cosine_constant_lr_ratio: # freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means start cosine_min_lr at 80% of training step (https://arxiv.org/pdf/2308.04014.pdf)
--- a/docs/custom_integrations.qmd
+++ b/docs/custom_integrations.qmd
@@ -49,7 +49,8 @@ sections = [
    ("Knowledge Distillation (KD)", "kd"),
    ("Liger Kernels", "liger"),
    ("Language Model Evaluation Harness (LM Eval)", "lm_eval"),
-    ("Spectrum", "spectrum")
+    ("Spectrum", "spectrum"),
    ("LLMCompressor", "llm_compressor")
 ]
 for section_name, folder_name in sections:
--- a/docs/dataset-formats/conversation.qmd
+++ b/docs/dataset-formats/conversation.qmd
@@ -196,6 +196,34 @@ datasets:
 It is not necessary to set both `message_field_training` and `message_field_training_detail` at once.
 :::
 8. (For Qwen3 template only) Enable reasoning split, where the reasoning is split from the content and passed as a separate field into the template.
 ```yaml
 datasets:
  - path: ...
    type: chat_template
    chat_template: qwen3
    split_thinking: true
 ```
 For example, the content may look like:
 ```json
 {
  "content": "<think>Some thinking outputs</think>Output after thinking."
 }
 ```
 After splitting, it will look like:
 ```json
 {
  "reasoning_content": "Some thinking outputs",
  "content": "Output after thinking."
 }
 ```
 ## sharegpt
 ::: {.callout-important}
--- a/docs/multimodal.qmd
+++ b/docs/multimodal.qmd
@@ -164,7 +164,7 @@ Here is an example of a multi-modal dataset:
        {
            "role": "user",
            "content": [
-                {"type": "image", "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"},
+                {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"},
                {"type": "text", "text": "Describe this image in detail."}
            ]
        },
--- a/examples/cerebras/btlm-ft.yml
+++ b/examples/cerebras/btlm-ft.yml
@@ -59,9 +59,7 @@ gradient_checkpointing: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 sdp_attention:
 flash_optimum:
 gptq_groupsize:
 gptq_model_v1:
--- a/examples/cerebras/qlora.yml
+++ b/examples/cerebras/qlora.yml
@@ -39,8 +39,7 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-xformers_attention: true
+attention: xformers
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 10
--- a/examples/code-llama/13b/lora.yml
+++ b/examples/code-llama/13b/lora.yml
@@ -45,7 +45,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/code-llama/13b/qlora.yml
+++ b/examples/code-llama/13b/qlora.yml
@@ -46,7 +46,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/code-llama/34b/lora.yml
+++ b/examples/code-llama/34b/lora.yml
@@ -45,7 +45,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/code-llama/34b/qlora.yml
+++ b/examples/code-llama/34b/qlora.yml
@@ -46,7 +46,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/code-llama/7b/lora.yml
+++ b/examples/code-llama/7b/lora.yml
@@ -45,7 +45,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/code-llama/7b/qlora.yml
+++ b/examples/code-llama/7b/qlora.yml
@@ -46,7 +46,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/cohere/command-r-7b-qlora.yml
+++ b/examples/cohere/command-r-7b-qlora.yml
@@ -49,7 +49,8 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 warmup_ratio: 0.1
 evals_per_epoch:
--- a/examples/colab-notebooks/colab-axolotl-example.ipynb
+++ b/examples/colab-notebooks/colab-axolotl-example.ipynb
@@ -112,9 +112,7 @@
    "early_stopping_patience:\n",
    "resume_from_checkpoint:\n",
    "logging_steps: 1\n",
-    "xformers_attention:\n",
+    "attention: sdpa\n",
    "flash_attention: false\n",
    "sdp_attention: true\n",
    "\n",
    "warmup_steps: 1\n",
    "max_steps: 25\n",
--- a/examples/dbrx/16bit-lora.yaml
+++ b/examples/dbrx/16bit-lora.yaml
@@ -52,7 +52,8 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 warmup_steps: 10
 evals_per_epoch:
--- a/examples/dbrx/8bit-lora.yaml
+++ b/examples/dbrx/8bit-lora.yaml
@@ -55,7 +55,8 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 warmup_steps: 10
 evals_per_epoch:
--- a/examples/dbrx/fft-ds-zero3.yaml
+++ b/examples/dbrx/fft-ds-zero3.yaml
@@ -39,7 +39,8 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 warmup_steps: 10
 evals_per_epoch:
--- a/examples/deepseek-v2/fft-fsdp-16b.yaml
+++ b/examples/deepseek-v2/fft-fsdp-16b.yaml
@@ -35,7 +35,8 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 warmup_steps: 100
 evals_per_epoch: 2
--- a/examples/deepseek-v2/qlora-fsdp-2_5.yaml
+++ b/examples/deepseek-v2/qlora-fsdp-2_5.yaml
@@ -59,7 +59,8 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 warmup_steps: 100
 evals_per_epoch: 2
--- a/examples/falcon/config-7b-lora.yml
+++ b/examples/falcon/config-7b-lora.yml
@@ -43,8 +43,7 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-xformers_attention: true
+attention: xformers
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 40
--- a/examples/falcon/config-7b-qlora.yml
+++ b/examples/falcon/config-7b-qlora.yml
@@ -73,8 +73,7 @@ early_stopping_patience: 3
 resume_from_checkpoint:
 auto_resume_from_checkpoints: true
 logging_steps: 1
-xformers_attention: true
+attention: xformers
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 10
--- a/examples/falcon/config-7b.yml
+++ b/examples/falcon/config-7b.yml
@@ -40,8 +40,7 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-xformers_attention: true
+attention: xformers
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 40
--- a/examples/gemma/qlora.yml
+++ b/examples/gemma/qlora.yml
@@ -47,7 +47,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 warmup_ratio: 0.1
 evals_per_epoch: 4
--- a/examples/gemma2/qlora.yml
+++ b/examples/gemma2/qlora.yml
@@ -53,7 +53,8 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 warmup_ratio: 0.1
 evals_per_epoch:
--- a/examples/gemma2/reward-model.yaml
+++ b/examples/gemma2/reward-model.yaml
@@ -43,7 +43,8 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 warmup_ratio: 0.1
 evals_per_epoch:
--- a/examples/gemma3/gemma-3-1b-qlora.yml
+++ b/examples/gemma3/gemma-3-1b-qlora.yml
@@ -57,7 +57,8 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 warmup_ratio: 0.1
 evals_per_epoch:
--- a/examples/gemma3/gemma-3-4b-qlora.yml
+++ b/examples/gemma3/gemma-3-4b-qlora.yml
@@ -51,8 +51,7 @@ gradient_checkpointing: true
 gradient_checkpointing_kwargs:
  use_reentrant: false
 logging_steps: 1
-flash_attention: true
+attention: flash
 eager_attention:
 warmup_ratio: 0.1
 evals_per_epoch: 1
--- a/examples/gemma3/gemma-3-4b-vision-qlora.yml
+++ b/examples/gemma3/gemma-3-4b-vision-qlora.yml
@@ -53,8 +53,7 @@ gradient_checkpointing: true
 gradient_checkpointing_kwargs:
  use_reentrant: false
 logging_steps: 1
-flash_attention: true
+attention: flash
 eager_attention:
 warmup_ratio: 0.1
 evals_per_epoch: 1
--- a/examples/gptj/qlora.yml
+++ b/examples/gptj/qlora.yml
@@ -36,8 +36,7 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-xformers_attention: true
+attention: xformers
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 10
--- a/examples/jamba/qlora.yaml
+++ b/examples/jamba/qlora.yaml
@@ -47,7 +47,8 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 warmup_steps: 10
 evals_per_epoch:
--- a/examples/jamba/qlora_deepspeed.yaml
+++ b/examples/jamba/qlora_deepspeed.yaml
@@ -46,7 +46,8 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 warmup_steps: 10
 evals_per_epoch:
--- a/examples/jamba/qlora_fsdp_large.yaml
+++ b/examples/jamba/qlora_fsdp_large.yaml
@@ -45,7 +45,8 @@ gradient_checkpointing: true
 gradient_checkpointing_kwargs:
  use_reentrant: true
 logging_steps: 1
-flash_attention: true
+attention: flash
 warmup_steps: 10
 evals_per_epoch: 1
--- a/examples/jeopardy-bot/config.yml
+++ b/examples/jeopardy-bot/config.yml
@@ -37,8 +37,7 @@ bf16: auto
 tf32: true
 resume_from_checkpoint:
 logging_steps: 5
-xformers_attention: true
+attention: xformers
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 20
--- a/examples/llama-2/fft_optimized.yml
+++ b/examples/llama-2/fft_optimized.yml
@@ -42,7 +42,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 flash_attn_cross_entropy: false
 flash_attn_rms_norm: true
 flash_attn_fuse_qkv: false
--- a/examples/llama-2/gptq-lora.yml
+++ b/examples/llama-2/gptq-lora.yml
@@ -53,9 +53,7 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention:
+attention: flash
 sdp_attention:
 flash_optimum:
 warmup_steps: 100
 evals_per_epoch: 4
 saves_per_epoch: 1
--- a/examples/llama-2/lisa.yml
+++ b/examples/llama-2/lisa.yml
@@ -46,7 +46,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 flash_attn_cross_entropy: false
 flash_attn_rms_norm: true
 flash_attn_fuse_qkv: false
--- a/examples/llama-2/loftq.yml
+++ b/examples/llama-2/loftq.yml
@@ -45,7 +45,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-2/lora.yml
+++ b/examples/llama-2/lora.yml
@@ -45,7 +45,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-2/qlora-fsdp.yml
+++ b/examples/llama-2/qlora-fsdp.yml
@@ -48,7 +48,8 @@ gradient_checkpointing_kwargs:
  use_reentrant: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-2/qlora.yml
+++ b/examples/llama-2/qlora.yml
@@ -46,7 +46,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-2/relora.yml
+++ b/examples/llama-2/relora.yml
@@ -48,7 +48,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-3-vision/lora-11b.yaml
+++ b/examples/llama-3-vision/lora-11b.yaml
@@ -50,8 +50,7 @@ tf32: true
 gradient_checkpointing: true
 logging_steps: 1
-flash_attention: true
+attention: flash
 eager_attention:
 warmup_ratio: 0.1
 evals_per_epoch: 1
--- a/examples/llama-3/fft-8b-liger-fsdp.yaml
+++ b/examples/llama-3/fft-8b-liger-fsdp.yaml
@@ -49,7 +49,8 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 warmup_steps: 100
 evals_per_epoch: 2
--- a/examples/llama-3/fft-8b.yaml
+++ b/examples/llama-3/fft-8b.yaml
@@ -34,7 +34,8 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 warmup_steps: 100
 evals_per_epoch: 2
--- a/examples/llama-3/instruct-dpo-lora-8b.yml
+++ b/examples/llama-3/instruct-dpo-lora-8b.yml
@@ -61,7 +61,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-3/instruct-lora-8b.yml
+++ b/examples/llama-3/instruct-lora-8b.yml
@@ -56,7 +56,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-3/lora-1b-deduplicate-dpo.yml
+++ b/examples/llama-3/lora-1b-deduplicate-dpo.yml
@@ -77,7 +77,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-3/lora-1b-deduplicate-sft.yml
+++ b/examples/llama-3/lora-1b-deduplicate-sft.yml
@@ -53,7 +53,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-3/lora-1b-kernels.yml
+++ b/examples/llama-3/lora-1b-kernels.yml
@@ -54,7 +54,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
--- a/examples/llama-3/lora-1b-ray.yml
+++ b/examples/llama-3/lora-1b-ray.yml
@@ -48,7 +48,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
--- a/examples/llama-3/lora-1b-sample-packing-sequentially.yml
+++ b/examples/llama-3/lora-1b-sample-packing-sequentially.yml
@@ -55,7 +55,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-3/lora-1b.yml
+++ b/examples/llama-3/lora-1b.yml
@@ -48,7 +48,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
--- a/examples/llama-3/lora-8b.yml
+++ b/examples/llama-3/lora-8b.yml
@@ -49,7 +49,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-3/qlora-1b-kto.yaml
+++ b/examples/llama-3/qlora-1b-kto.yaml
@@ -53,7 +53,8 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 warmup_steps: 20
 evals_per_epoch: 4
--- a/examples/llama-3/qlora-1b.yml
+++ b/examples/llama-3/qlora-1b.yml
@@ -51,7 +51,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
--- a/examples/llama-3/qlora-fsdp-405b.yaml
+++ b/examples/llama-3/qlora-fsdp-405b.yaml
@@ -39,7 +39,8 @@ gradient_checkpointing: true
 gradient_checkpointing_kwargs:
  use_reentrant: true
 logging_steps: 1
-flash_attention: true
+attention: flash
 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-3/qlora-fsdp-70b.yaml
+++ b/examples/llama-3/qlora-fsdp-70b.yaml
@@ -48,7 +48,8 @@ gradient_checkpointing_kwargs:
  use_reentrant: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-3/qlora.yml
+++ b/examples/llama-3/qlora.yml
@@ -46,7 +46,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-3/sparse-finetuning.yaml
+++ b/examples/llama-3/sparse-finetuning.yaml
@@ -0,0 +1,77 @@
 base_model: neuralmagic/Sparse-Llama-3.1-8B-2of4
 plugins:
  - axolotl.integrations.llm_compressor.LLMCompressorPlugin
 load_in_8bit: false
 load_in_4bit: false
 strict: false
 datasets:
  - path: tatsu-lab/alpaca
    type: alpaca
 dataset_prepared_path: last_run_prepared
 val_set_size: 0.05
 output_dir: ./outputs/out
 sequence_len: 4096
 sample_packing: true
 pad_to_sequence_len: true
 eval_sample_packing: false
 wandb_project:
 wandb_entity:
 wandb_watch:
 wandb_name:
 wandb_log_model:
 gradient_accumulation_steps: 8
 micro_batch_size: 1
 num_epochs: 1
 optimizer: paged_adamw_8bit
 lr_scheduler: cosine
 learning_rate: 2e-5
 train_on_inputs: false
 group_by_length: false
 bf16: auto
 fp16:
 tf32: false
 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
  use_reentrant: false
 early_stopping_patience:
 resume_from_checkpoint:
 logging_steps: 1
 xformers_attention:
 attention: flash
 warmup_steps: 100
 evals_per_epoch: 2
 eval_table_size:
 saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
 fsdp:
 fsdp_config:
 special_tokens:
  pad_token: <|end_of_text|>
 llmcompressor:
  recipe:
    finetuning_stage:
      finetuning_modifiers:
        ConstantPruningModifier:
          targets: [
            're:.*q_proj.weight',
            're:.*k_proj.weight',
            're:.*v_proj.weight',
            're:.*o_proj.weight',
            're:.*gate_proj.weight',
            're:.*up_proj.weight',
            're:.*down_proj.weight',
          ]
          start: 0
  save_compressed: true
--- a/examples/llava/lora-7b.yaml
+++ b/examples/llava/lora-7b.yaml
@@ -46,8 +46,7 @@ tf32: true
 gradient_checkpointing: true
 logging_steps: 1
-flash_attention: true
+attention: flash
 eager_attention:
 warmup_ratio: 0.1
 evals_per_epoch: 1
--- a/examples/mamba/config.yml
+++ b/examples/mamba/config.yml
@@ -39,7 +39,7 @@ tf32: true
 gradient_checkpointing: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention:
+attention: eager
 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/mistral/bigstral-ds-zero3.yaml
+++ b/examples/mistral/bigstral-ds-zero3.yaml
@@ -42,7 +42,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 save_total_limit: 1
 save_steps:
--- a/examples/mistral/config.yml
+++ b/examples/mistral/config.yml
@@ -36,7 +36,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/mistral/lora-mps.yml
+++ b/examples/mistral/lora-mps.yml
@@ -53,8 +53,7 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: false
+attention: sdpa
 sdp_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
--- a/examples/mistral/lora.yml
+++ b/examples/mistral/lora.yml
@@ -54,7 +54,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
--- a/examples/mistral/mistral-dpo-qlora.yml
+++ b/examples/mistral/mistral-dpo-qlora.yml
@@ -71,7 +71,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: false
+attention: eager
 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/mistral/mistral-qlora-fsdp.yml
+++ b/examples/mistral/mistral-qlora-fsdp.yml
@@ -51,7 +51,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
--- a/examples/mistral/mistral-qlora-orpo.yml
+++ b/examples/mistral/mistral-qlora-orpo.yml
@@ -59,7 +59,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
--- a/examples/mistral/mistral-small-3.1-24B-lora.yml
+++ b/examples/mistral/mistral-small-3.1-24B-lora.yml
@@ -48,9 +48,7 @@ tf32: true
 gradient_checkpointing: true
 logging_steps: 1
-flash_attention: false # PixtralVisionModel does not support Flash Attention 2.0 yet.
+attention: eager  # PixtralVisionModel does not support Flash Attention 2.0 yet.
 eager_attention:
 warmup_ratio: 0.1
 evals_per_epoch: 1
 saves_per_epoch: 1
--- a/examples/mistral/mixtral-8x22b-qlora-fsdp.yml
+++ b/examples/mistral/mixtral-8x22b-qlora-fsdp.yml
@@ -49,7 +49,8 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
--- a/examples/mistral/mixtral-qlora-fsdp.yml
+++ b/examples/mistral/mixtral-qlora-fsdp.yml
@@ -51,7 +51,8 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
--- a/examples/mistral/mixtral.yml
+++ b/examples/mistral/mixtral.yml
@@ -69,7 +69,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
--- a/examples/mistral/mixtral_22.yml
+++ b/examples/mistral/mixtral_22.yml
@@ -40,7 +40,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 save_total_limit: 1
 save_steps:
--- a/examples/mistral/qlora.yml
+++ b/examples/mistral/qlora.yml
@@ -54,7 +54,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
--- a/examples/mpt-7b/config.yml
+++ b/examples/mpt-7b/config.yml
@@ -39,7 +39,7 @@ bf16: auto
 tf32: true
 resume_from_checkpoint:
 logging_steps: 5
-flash_attention:
+attention: eager
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 20
--- a/examples/openllama-3b/config.yml
+++ b/examples/openllama-3b/config.yml
@@ -39,7 +39,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 20
--- a/examples/openllama-3b/lora.yml
+++ b/examples/openllama-3b/lora.yml
@@ -47,7 +47,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 20
--- a/examples/openllama-3b/qlora.yml
+++ b/examples/openllama-3b/qlora.yml
@@ -40,7 +40,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 20
--- a/examples/orpheus/README.md
+++ b/examples/orpheus/README.md
@@ -0,0 +1,341 @@
 # Finetuning LLMs to output audio
 In this example, we finetune `canopylabs/orpheus-3b-0.1-pretrained` (a LLaMA 3.2 3B model) to output audio.
 The `finetune.yml` with the current settings will run on any Nvidia GPU with 45GB of VRAM or more. If you reduce the batch size, it can easily run on GPUs with less than 24GB of VRAM.
 ## Dataset pre-processing for pre-training
 If you are adding another voice in English, please jump ahead to finetuning pre-processing.
 For this to work, we need to preprocess our dataset. Since we are expecting to output audio, we will need to add tokens to the tokenizer.
 The code below downloads the SNAC model, converts the audio into the correct tokens, and uploads the final dataset.
 ```python
 import torch
 from snac import SNAC
 from datasets import load_dataset
 from huggingface_hub import snapshot_download
 from datasets import load_dataset
 import random
 import torchaudio.transforms as T
 from transformers import AutoTokenizer
 import os
 # Source dataset to tokenise and the Hub repo that receives the processed copy.
 my_original_dataset_name = "<huggingface-id-of-dataset-that-we-want-to-preprocess>"
 name_to_push_dataset_to = "<huggingface-id-of-where-to-save-dataset>"
 dsn = my_original_dataset_name
 # Fetch the dataset repo snapshot before loading it.
 snapshot_download(
    repo_id=dsn,
    repo_type="dataset",
    revision="main",
    max_workers=64,
 )
 ds = load_dataset(dsn, split="train")
 # NOTE(review): the sampling rate is read from the first row only; this
 # assumes every row shares it -- confirm for your dataset.
 ds_sample_rate = ds[0]["audio"]["sampling_rate"]
 model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
 # Use whichever accelerator is present instead of assuming Apple "mps",
 # which raises on CUDA-only or CPU-only machines.
 if torch.cuda.is_available():
    device = "cuda"
 elif torch.backends.mps.is_available():
    device = "mps"
 else:
    device = "cpu"
 model = model.to(device)
 def tokenise_audio(waveform):
  """Turn one raw audio array into a flat list of SNAC-derived token ids.

  The array is cast to float32, resampled from ``ds_sample_rate`` to 24 kHz and
  encoded with ``model``. Each frame of the first code level yields seven ids:
  one from level 0, two from level 1 and four from level 2, every one shifted
  by 128266 plus a position-specific multiple of 4096.
  """
  # Feed the encoder on the device the model actually lives on. A hard-coded
  # "cuda" fails whenever the model was moved elsewhere (e.g. "mps" or CPU).
  # Assumes ``model`` is a torch.nn.Module with at least one parameter.
  device = next(model.parameters()).device
  waveform = torch.from_numpy(waveform).unsqueeze(0)
  waveform = waveform.to(dtype=torch.float32)
  resample_transform = T.Resample(orig_freq=ds_sample_rate, new_freq=24000)
  waveform = resample_transform(waveform)
  waveform = waveform.unsqueeze(0).to(device)
  #generate the codes from snac
  with torch.inference_mode():
    codes = model.encode(waveform)
  all_codes = []
  # Interleave the three code levels frame by frame (7 ids per frame).
  for i in range(codes[0].shape[1]):
    all_codes.append(codes[0][0][i].item()+128266)
    all_codes.append(codes[1][0][2*i].item()+128266+4096)
    all_codes.append(codes[2][0][4*i].item()+128266+(2*4096))
    all_codes.append(codes[2][0][(4*i)+1].item()+128266+(3*4096))
    all_codes.append(codes[1][0][(2*i)+1].item()+128266+(4*4096))
    all_codes.append(codes[2][0][(4*i)+2].item()+128266+(5*4096))
    all_codes.append(codes[2][0][(4*i)+3].item()+128266+(6*4096))
  return all_codes
 def add_codes(example):
    """Store the tokenised audio of ``example`` under ``codes_list``.

    ``codes_list`` is ``None`` when the row has no usable ``audio["array"]`` or
    when tokenisation raises; the error is printed and the row is kept.
    """
    tokens = None
    try:
        audio = example.get("audio")
        has_array = bool(audio) and "array" in audio
        if has_array:
            tokens = tokenise_audio(audio["array"])
    except Exception as e:
        # Best effort: report the failure and leave ``tokens`` as None.
        print(f"Skipping row due to error: {e}")
    example["codes_list"] = tokens
    return example
 # Tokenise every row's audio and drop the raw audio column.
 ds = ds.map(add_codes, remove_columns=["audio"])
 #@title Load Tokenizer
 # Special-token ids sit just past ``tokeniser_length``.
 tokeniser_length = 128256
 start_of_text = 128000
 end_of_text = 128009
 start_of_speech = tokeniser_length + 1
 end_of_speech = tokeniser_length + 2
 start_of_human = tokeniser_length + 3
 end_of_human = tokeniser_length + 4
 start_of_ai = tokeniser_length + 5
 end_of_ai =  tokeniser_length + 6
 pad_token = tokeniser_length + 7
 # 128256 + 10 == 128266, the base offset added to every code in tokenise_audio.
 audio_tokens_start = tokeniser_length + 10
 tokenizer_name = "canopylabs/orpheus-3b-0.1-pretrained"
 tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
 # Leave two cores free, but never go below one worker: os.cpu_count() may
 # return None, and a result of 0 or less is not a valid worker count.
 num_proc = max(1, (os.cpu_count() or 1) - 2)
 # Discard rows whose audio could not be tokenised or produced no codes.
 ds = ds.filter(lambda x: x["codes_list"] is not None)
 ds = ds.filter(lambda x: len(x["codes_list"]) > 0)
 #@title Create Input Ids
 def remove_duplicate_frames(example):
    """Drop 7-code frames whose first code repeats that of the last kept frame.

    ``example["codes_list"]`` is read as consecutive groups of 7 codes. The
    first group is always kept; each later group is kept only if its first
    code differs from the first code of the most recently kept group. The
    filtered list is written back to ``example["codes_list"]``.

    Raises:
        ValueError: if the list length is not a multiple of 7.
    """
    vals = example["codes_list"]
    if len(vals) % 7 != 0:
        raise ValueError("Input list length must be divisible by 7")
    result = vals[:7]
    for i in range(7, len(vals), 7):
        # result[-7] is the first code of the last frame that was kept.
        if vals[i] != result[-7]:
            result.extend(vals[i:i+7])
    example["codes_list"] = result
    return example
 ds = ds.map(remove_duplicate_frames, num_proc=num_proc)
 def create_input_ids(example):
    """Build ``input_ids``, ``labels`` and ``attention_mask`` for one row.

    The sequence is: start_of_human, the tokenised text followed by
    end_of_text, end_of_human, start_of_ai, start_of_speech, the audio codes,
    end_of_speech, end_of_ai. ``labels`` is the same sequence and the
    attention mask is all ones.
    """
    # Pass the text itself: the previous ``{example['text']}`` was a set
    # literal (an f-string missing its prefix and quotes), not a string.
    text_ids = tokenizer.encode(example["text"],  add_special_tokens=True)
    text_ids.append(end_of_text)
    example["text_tokens"] = text_ids
    input_ids = (
        [start_of_human]
        + example["text_tokens"]
        + [end_of_human]
        + [start_of_ai]
        + [start_of_speech]
        + example["codes_list"]
        + [end_of_speech]
        + [end_of_ai]
    )
    example["input_ids"] = input_ids
    example["labels"] = input_ids
    example["attention_mask"] = [1] * len(input_ids)
    return example
 # Build the model inputs for every row and drop the text / audio-code columns.
 ds = ds.map(create_input_ids, num_proc=num_proc, remove_columns=["text", "codes_list"])
 #@title Remove unnecessary columns
 # Keep only the three columns listed here; everything else still present
 # (e.g. the intermediate "text_tokens") is removed before uploading.
 columns_to_keep = ["input_ids", "labels", "attention_mask"]
 columns_to_remove = [col for col in ds.column_names if col not in columns_to_keep]
 ds = ds.remove_columns(columns_to_remove)
 # Upload the processed dataset to the repo named in name_to_push_dataset_to.
 ds.push_to_hub(name_to_push_dataset_to)
 ```
 ## Finetune pre-processing
 Use this code to add a new voice.
 ```python
 import torch
 from snac import SNAC
 from datasets import load_dataset
 from huggingface_hub import snapshot_download
 from datasets import load_dataset
 import random
 import torchaudio.transforms as T
 from transformers import AutoTokenizer
 import os
 # Source dataset to tokenise and the Hub repo that receives the processed copy.
 my_original_dataset_name = "<huggingface-id-of-dataset-that-we-want-to-preprocess>"
 name_to_push_dataset_to = "<huggingface-id-of-where-to-save-dataset>"
 dsn = my_original_dataset_name
 # Fetch the dataset repo snapshot before loading it.
 snapshot_download(
    repo_id=dsn,
    repo_type="dataset",
    revision="main",
    max_workers=64,
 )
 ds = load_dataset(dsn, split="train")
 # NOTE(review): the sampling rate is read from the first row only; this
 # assumes every row shares it -- confirm for your dataset.
 ds_sample_rate = ds[0]["audio"]["sampling_rate"]
 model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
 # Use whichever accelerator is present instead of assuming Apple "mps",
 # which raises on CUDA-only or CPU-only machines.
 if torch.cuda.is_available():
    device = "cuda"
 elif torch.backends.mps.is_available():
    device = "mps"
 else:
    device = "cpu"
 model = model.to(device)
 def tokenise_audio(waveform):
  """Turn one raw audio array into a flat list of SNAC-derived token ids.

  The array is cast to float32, resampled from ``ds_sample_rate`` to 24 kHz and
  encoded with ``model``. Each frame of the first code level yields seven ids:
  one from level 0, two from level 1 and four from level 2, every one shifted
  by 128266 plus a position-specific multiple of 4096.
  """
  # Feed the encoder on the device the model actually lives on. A hard-coded
  # "cuda" fails whenever the model was moved elsewhere (e.g. "mps" or CPU).
  # Assumes ``model`` is a torch.nn.Module with at least one parameter.
  device = next(model.parameters()).device
  waveform = torch.from_numpy(waveform).unsqueeze(0)
  waveform = waveform.to(dtype=torch.float32)
  resample_transform = T.Resample(orig_freq=ds_sample_rate, new_freq=24000)
  waveform = resample_transform(waveform)
  waveform = waveform.unsqueeze(0).to(device)
  #generate the codes from snac
  with torch.inference_mode():
    codes = model.encode(waveform)
  all_codes = []
  # Interleave the three code levels frame by frame (7 ids per frame).
  for i in range(codes[0].shape[1]):
    all_codes.append(codes[0][0][i].item()+128266)
    all_codes.append(codes[1][0][2*i].item()+128266+4096)
    all_codes.append(codes[2][0][4*i].item()+128266+(2*4096))
    all_codes.append(codes[2][0][(4*i)+1].item()+128266+(3*4096))
    all_codes.append(codes[1][0][(2*i)+1].item()+128266+(4*4096))
    all_codes.append(codes[2][0][(4*i)+2].item()+128266+(5*4096))
    all_codes.append(codes[2][0][(4*i)+3].item()+128266+(6*4096))
  return all_codes
 def add_codes(example):
    """Store the tokenised audio of ``example`` under ``codes_list``.

    ``codes_list`` is ``None`` when the row has no usable ``audio["array"]`` or
    when tokenisation raises; the error is printed and the row is kept.
    """
    tokens = None
    try:
        audio = example.get("audio")
        has_array = bool(audio) and "array" in audio
        if has_array:
            tokens = tokenise_audio(audio["array"])
    except Exception as e:
        # Best effort: report the failure and leave ``tokens`` as None.
        print(f"Skipping row due to error: {e}")
    example["codes_list"] = tokens
    return example
 # Tokenise every row's audio and drop the raw audio column.
 ds = ds.map(add_codes, remove_columns=["audio"])
 #@title Load Tokenizer
 # Special-token ids sit just past ``tokeniser_length``.
 tokeniser_length = 128256
 start_of_text = 128000
 end_of_text = 128009
 start_of_speech = tokeniser_length + 1
 end_of_speech = tokeniser_length + 2
 start_of_human = tokeniser_length + 3
 end_of_human = tokeniser_length + 4
 start_of_ai = tokeniser_length + 5
 end_of_ai =  tokeniser_length + 6
 pad_token = tokeniser_length + 7
 # 128256 + 10 == 128266, the base offset added to every code in tokenise_audio.
 audio_tokens_start = tokeniser_length + 10
 tokenizer_name = "canopylabs/orpheus-3b-0.1-pretrained"
 tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
 # Leave two cores free, but never go below one worker: os.cpu_count() may
 # return None, and a result of 0 or less is not a valid worker count.
 num_proc = max(1, (os.cpu_count() or 1) - 2)
 # Discard rows whose audio could not be tokenised or produced no codes.
 ds = ds.filter(lambda x: x["codes_list"] is not None)
 ds = ds.filter(lambda x: len(x["codes_list"]) > 0)
 #@title Create Input Ids
 def remove_duplicate_frames(example):
    """Drop 7-code frames whose first code repeats that of the last kept frame.

    ``example["codes_list"]`` is read as consecutive groups of 7 codes. The
    first group is always kept; each later group is kept only if its first
    code differs from the first code of the most recently kept group. The
    filtered list is written back to ``example["codes_list"]``.

    Raises:
        ValueError: if the list length is not a multiple of 7.
    """
    vals = example["codes_list"]
    if len(vals) % 7 != 0:
        raise ValueError("Input list length must be divisible by 7")
    result = vals[:7]
    for i in range(7, len(vals), 7):
        # result[-7] is the first code of the last frame that was kept.
        if vals[i] != result[-7]:
            result.extend(vals[i:i+7])
    example["codes_list"] = result
    return example
 ds = ds.map(remove_duplicate_frames, num_proc=num_proc)
 # Hint printed for the user; the quoted prompt mirrors the one that
 # create_input_ids actually builds (speaker id, then the text).
 tok_info = '''*** HERE you can modify the text prompt
 i.e. if you wanted a multispeaker model like canopylabs/orpheus-3b-0.1-ft, you can pass:
 f"{example['speaker_id']}: {example['text']}", as is passed.
 '''
 print(tok_info)
 def create_input_ids(example):
    """Build ``input_ids``, ``labels`` and ``attention_mask`` for one row.

    The prompt "<speaker_id>: <text>" is tokenised and terminated with
    end_of_text, then wrapped as: start_of_human, text tokens, end_of_human,
    start_of_ai, start_of_speech, audio codes, end_of_speech, end_of_ai.
    ``labels`` is the same sequence and the attention mask is all ones.
    """
    prompt = f"{example['speaker_id']}: {example['text']}"
    text_tokens = tokenizer.encode(prompt,  add_special_tokens=True)
    text_tokens.append(end_of_text)
    example["text_tokens"] = text_tokens

    # Human turn plus the markers that open the AI speech segment.
    head = [start_of_human] + example["text_tokens"] + [end_of_human, start_of_ai, start_of_speech]
    # Markers that close the speech segment and the AI turn.
    tail = [end_of_speech, end_of_ai]
    sequence = head + example["codes_list"] + tail

    example["input_ids"] = sequence
    example["labels"] = sequence
    example["attention_mask"] = [1] * len(sequence)
    return example
 # Build the model inputs for every row and drop the text / audio-code columns.
 ds = ds.map(create_input_ids, num_proc=num_proc, remove_columns=["text", "codes_list"])
 #@title Remove unnecessary columns
 # Keep only the three columns listed here; everything else still present
 # (e.g. the intermediate "text_tokens") is removed before uploading.
 columns_to_keep = ["input_ids", "labels", "attention_mask"]
 columns_to_remove = [col for col in ds.column_names if col not in columns_to_keep]
 ds = ds.remove_columns(columns_to_remove)
 # Upload the processed dataset to the repo named in name_to_push_dataset_to.
 ds.push_to_hub(name_to_push_dataset_to)
 ```
 ## Training
 After preprocessing is done, fill in the placeholders in `finetune.yml` and run `axolotl train finetune.yml`.
 ## Inference
 For inference, please refer to the original [orpheus github](https://github.com/canopyai/Orpheus-TTS/tree/main).
--- a/examples/orpheus/finetune.yml
+++ b/examples/orpheus/finetune.yml
@@ -0,0 +1,52 @@
 base_model: canopylabs/orpheus-3b-0.1-pretrained
 hub_model_id: <your-hub-model-id>
 plugins:
  - axolotl.integrations.liger.LigerPlugin
 liger_rope: true
 liger_rms_norm: true
 liger_glu_activation: true
 liger_fused_linear_cross_entropy: true
 datasets:
  - path: <your-hf-dataset-id>
    type:  # leave empty to load pre-tokenized
 dataset_prepared_path: last_run_prepared
 val_set_size: 0.01
 output_dir: ./outputs/out
 sequence_len: 8192
 sample_packing: true
 pad_to_sequence_len: true
 wandb_project:
 wandb_entity:
 wandb_watch:
 wandb_name:
 wandb_log_model:
 gradient_accumulation_steps: 8
 micro_batch_size: 4
 num_epochs: 3
 optimizer: adamw_torch_fused
 lr_scheduler: cosine
 learning_rate: 2e-5
 bf16: auto
 tf32: false
 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
 attention: flash
 warmup_steps: 20
 evals_per_epoch: 5
 saves_per_epoch: 5
 weight_decay: 0.05
 special_tokens:
  pad_token: <custom_token_7>
--- a/examples/phi/phi-ft.yml
+++ b/examples/phi/phi-ft.yml
@@ -48,7 +48,8 @@ gradient_checkpointing_kwargs:
  use_reentrant: True
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 warmup_steps: 100
 evals_per_epoch: 4
--- a/examples/phi/phi-qlora.yml
+++ b/examples/phi/phi-qlora.yml
@@ -51,7 +51,8 @@ gradient_checkpointing_kwargs:
  use_reentrant: True
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 warmup_steps: 100
 evals_per_epoch: 4
--- a/examples/phi/phi2-ft.yml
+++ b/examples/phi/phi2-ft.yml
@@ -48,7 +48,8 @@ gradient_checkpointing_kwargs:
  use_reentrant: True
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 warmup_steps: 100
 evals_per_epoch: 4
--- a/examples/phi/phi3-ft-fsdp.yml
+++ b/examples/phi/phi3-ft-fsdp.yml
@@ -49,7 +49,8 @@ gradient_checkpointing_kwargs:
  use_reentrant: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 warmup_steps: 100
 evals_per_epoch: 4
--- a/examples/phi/phi3-ft.yml
+++ b/examples/phi/phi3-ft.yml
@@ -44,7 +44,8 @@ gradient_checkpointing_kwargs:
  use_reentrant: True
 early_stopping_patience: 3
 logging_steps: 1
-flash_attention: true
+attention: flash
 eval_steps: 1000
 save_steps: 5000
--- a/examples/pixtral/lora-12b.yml
+++ b/examples/pixtral/lora-12b.yml
@@ -46,8 +46,7 @@ tf32: true
 gradient_checkpointing: true
 logging_steps: 1
-flash_attention: false # PixtralVisionModel does not support Flash Attention 2.0 yet
+attention: eager  # PixtralVisionModel does not support Flash Attention 2.0 yet
 eager_attention:
 warmup_ratio: 0.1
 evals_per_epoch: 1
--- a/examples/qwen/lora.yml
+++ b/examples/qwen/lora.yml
@@ -47,7 +47,7 @@ tf32: false
 gradient_checkpointing: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention:
+attention: eager
 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/qwen/qlora.yml
+++ b/examples/qwen/qlora.yml
@@ -47,7 +47,7 @@ tf32: false
 gradient_checkpointing: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention:
+attention: eager
 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/qwen/qwen2-moe-lora.yaml
+++ b/examples/qwen/qwen2-moe-lora.yaml
@@ -43,7 +43,8 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/qwen/qwen2-moe-qlora.yaml
+++ b/examples/qwen/qwen2-moe-qlora.yaml
@@ -46,7 +46,8 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/qwen2-vl/lora-7b.yaml
+++ b/examples/qwen2-vl/lora-7b.yaml
@@ -46,8 +46,7 @@ tf32: true
 gradient_checkpointing: true
 logging_steps: 1
-flash_attention: true
+attention: flash
 eager_attention:
 warmup_ratio: 0.1
 evals_per_epoch: 1
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
NanoCode012	ef883b6960	chore: refactor normalize_attn to use mapping and loop	2025-05-07 17:10:18 +07:00
NanoCode012	d0c4930dd5	fix: set replit mpt model to use eager attention	2025-05-07 17:10:18 +07:00
Wing Lian	6ee7cb30fa	fixes from PR feedback	2025-05-07 17:10:18 +07:00
Wing Lian	ba47adc24b	replace attention in the yaml config with an enum	2025-05-07 17:10:18 +07:00
Wing Lian	0d71b0aa5f	Configurable embeddings upcast (#2621 ) * fsdp embeddings should be float32 per comment * patch peft to not upcast everything * add tabs back to code check * fix import * add configurable option and fix check * add check for dtypes * move embeddings test to patch dir * fix test * fix comment and logic	2025-05-06 23:40:44 -04:00
Eric Meier	63aaccf85b	Fix cut_cross_entropy plugin install (#2642 ) [skip ci]	2025-05-06 22:56:00 -04:00
Wing Lian	ff0fe767c8	xformers attention with packing (#2619 ) * xformers attention with packing * wire up the patch * fix xformers + packing validation * fix warning * reorder the packing check * fix fp16 / bf16 reset when using fp16 with bf16 auto * fix seq lens calc to drop hanging sequences * handle xformers patch for inference too * fix batch size setter * fix xformers inference * add colab callback to fix inference post train * PR feedback	2025-05-06 22:49:22 -04:00
Wing Lian	8e4158cc0b	Multipack parallel bin packing (#2631 ) * improve readability of multipack sampler * parallel bin packing fix error with lambda and pickling make sure things are in float instead of np.float * annotations and comments update * support for configurable group and bin size for sample packing * fix missing map back to original indices	2025-05-06 20:08:08 -04:00
Wing Lian	cd84325253	allow plugins to return their own dataset (#2617 ) [skip ci] * allow plugins to return their own dataset * add post_trainer_create and wire up * add hook check * address PR feedback: * remove annotation causing circular import	2025-05-06 20:05:51 -04:00
NanoCode012	0b140fef83	feat(doc): add split_thinking docs (#2613 ) [skip ci] * feat(doc): add split_thinking docs * fix: link config.qmd to conversation.qmd for split_thinking example * update thinking => reasoning_content in messages format --------- Co-authored-by: Wing Lian <wing@axolotl.ai>	2025-05-06 20:05:32 -04:00
Wing Lian	e4cfebe995	bump liger dep to 0.5.9 (#2640 ) [skip ci] * bump liger dep to 0.5.9 * also upgrade vllm to post1, and datasets to 3.5.1	2025-05-06 20:05:19 -04:00
mhenrichsen	a6cac5dd32	Update lr_scheduler options in config.qmd to include additional scheduling strategies for improved training flexibility. (#2636 ) [skip ci]	2025-05-06 11:24:07 -04:00
Wing Lian	b71c0e3447	Print axolotl art if train is called outside of cli: (#2627 ) [skip ci]	2025-05-06 11:18:45 -04:00
Wing Lian	ddaebf8309	fix dpo eval override to call grandparent instead of the broken super (#2628 ) [skip ci]	2025-05-06 11:18:25 -04:00
Wing Lian	679743087a	make sure gc_steps is used for all trainers (#2638 )	2025-05-06 11:18:00 -04:00
Wing Lian	f720b6e72d	repop cache (#2639 ) * repop cache * pre-cache as a step * fix the name * add reason for pytest skipif * restore pytorch matrix * remove max-parallel now that we've optimized this a bit	2025-05-06 11:09:07 -04:00
mhenrichsen	a980618fd0	Adds example for training a TTS model on top of a LLM. (#2614 ) * Adds example for training a TTS model on top of a LLM. * Update examples/orpheus/finetune.yml Co-authored-by: NanoCode012 <nano@axolotl.ai> * Update examples/orpheus/finetune.yml Co-authored-by: NanoCode012 <nano@axolotl.ai> * Update README.md to clarify GPU requirements for finetuning Orpheus TTS model * Update finetune.yml to use the new base model canopylabs/orpheus-3b-0.1-pretrained * Update finetune.yml and README.md for consistency and clarity --------- Co-authored-by: NanoCode012 <nano@axolotl.ai>	2025-05-06 10:11:06 +02:00
Emmanuel Ferdman	54960d4de0	Fix logging deprecation warnings (#2623 ) Signed-off-by: Emmanuel Ferdman <emmanuelferdman@gmail.com>	2025-05-04 08:22:45 -04:00
Wing Lian	ed922796b7	include multipack support for qwen3 family (#2622 )	2025-05-03 12:02:39 -04:00
Wing Lian	3dd9c3bf3f	setup hf transfer too and fix auto bf16 when fp16 enabled (#2620 ) [skip ci]	2025-05-03 12:02:26 -04:00
Wing Lian	0ba7d362fa	qwen3 and qwen3_moe support for liger kernels (#2612 ) * qwen3 and qwen3_moe support for liger kernels * fix moe module path * fix: qwen3 liger input args and mlp * fix: qwen3 input args and output class --------- Co-authored-by: NanoCode012 <nano@axolotl.ai>	2025-05-02 09:29:55 -04:00
aitechguy	e4f73bc98e	remove keys to incorporate changes for the trl update (#2616 )	2025-05-02 08:47:42 -04:00
Wing Lian	bcb59c70e2	automatically set pad_to_sequence_len when use packing (#2607 ) * automatically set pad_to_sequence_len when use packing * update tests	2025-05-01 13:24:38 -04:00
NanoCode012	6a3e6f8c53	fix: run preview-docs only when md/qmd changes (#2606 ) * fix: run preview-docs only when md/qmd changes * feat: add quarto yaml based on PR feedback	2025-05-01 13:21:28 -04:00
Wing Lian	fee3c13bb5	Logging config for colab (#2611 ) * only configure logging on cli to play nicely with colab * allow reloading the config on the fly from a dict * make sure to use dict for yaml * reuse existing function for load * make cli args optional * mps fix and respect max_steps	2025-05-01 12:58:00 -04:00
Rahul Tuli	996fc124e5	Add: Sparse Finetuning Integration with llmcompressor (#2479 ) * Add: SFTPlugin with llmcompressor * Update: review comments! * Add: llmcompressor installable * pre commit hooks * Use: warning over warn * Revert: TODO's * Update llmcompressor version to latest * Apply suggestions from @markurtz Co-authored-by: Mark Kurtz <mark.j.kurtz@gmail.com> * Address review comments from @markurtz * Add: llmcompressor installable * Rename: sft.yaml to sparse-finetuning.yaml * Use: absolute import * Update model config * Move: LLMCompressorPlugin into its own submodule * Add: `llm_compressor` integration documentation * Rebase and updates! * Tests, Style, Updates * Add: .qmd file * Address Review Comments: * deleted redundant docs/llm_compressor.qmd * incorporated feedback in integration README.md * added llmcompressor integration to docs/custom_integrations.qmd Signed-off-by: Rahul Tuli <rtuli@redhat.com> * Add: line about further optimizations using llmcompressor Signed-off-by: Rahul Tuli <rtuli@redhat.com> * Apply patch from @winglian Signed-off-by: Rahul Tuli <rtuli@redhat.com> * Fix: Test Signed-off-by: Rahul Tuli <rtuli@redhat.com> * additional fixes for docker and saving compressed * split llmcompressor from vllm checks * Reset session between tests Signed-off-by: Rahul Tuli <rtuli@redhat.com> * move decorator to test method instead of class * make sure to reset the session after each test * move import of llmcompressor to reset session inside test --------- Signed-off-by: Rahul Tuli <rtuli@redhat.com> Co-authored-by: Mark Kurtz <mark.j.kurtz@gmail.com> Co-authored-by: Wing Lian <wing@axolotl.ai>	2025-05-01 12:25:16 -04:00
Wing Lian	e963990ad7	add missing __init__ for lr monkeypatch fix (#2609 )	2025-05-01 09:41:32 -04:00
Dhruv Mullick	c3f2b1c5c2	Add num_completions_to_print for trl and grpo (#2604 )	2025-04-30 21:00:30 -04:00
Wing Lian	6ba5c0ed2c	use latest hf-xet and don't install vllm for torch 2.7.0 (#2603 ) * use latest hf-xet and don't install vllm for torch 2.7.0 * fix runpod hub tests	2025-04-30 18:27:39 -04:00
Wing Lian	24ff5f53f8	additional args for grpo config/trainer (#2598 )	2025-04-30 13:11:12 -04:00
Wing Lian	5e949eaa07	replace zero_only with simpler if statement (#2592 )	2025-04-30 13:11:03 -04:00
Wing Lian	89ca14d9a0	ensure we pass axolotl extras to the Dockerfile so vllm is included in shipped images (#2599 )	2025-04-30 11:35:45 -04:00
Wing Lian	8446b4ad28	don't automatically enable lora kernels for RL training (#2600 )	2025-04-30 11:06:50 -04:00
Wing Lian	fc79606b6d	only import vllm serve cli if its being called (#2597 ) [skip ci]	2025-04-30 09:11:25 -04:00
Wing Lian	baeb00231b	Handle other reasoning trace dataset formats (#2591 ) * Handle other reasoning trace dataset formats * rename var to improve readability * chore: refactor with comments --------- Co-authored-by: NanoCode012 <nano@axolotl.ai>	2025-04-30 03:32:55 -04:00
Wing Lian	2413688b08	upload the deepspeed json to wandb (#2593 ) [skip ci]	2025-04-30 03:32:44 -04:00
NanoCode012	5bb1f3da56	feat: add qwen3 moe block for ds3 (#2596 ) [skip ci]	2025-04-30 03:32:23 -04:00
Wing Lian	a21b9cc472	patch to convert LR from tensor to float when using DS (#2595 ) [skip ci]	2025-04-30 03:31:57 -04:00
Aleksandr Dremov	41a1ec0c95	Plugins create_lr_scheduler support (#2584 ) * lr_scheduler support * fix * Update scheduler.py * Update scheduler.py * cfg handling * black * remove debug * remove adding the axolotl cfg to the scheduler mixin --------- Co-authored-by: Wing Lian <wing@axolotl.ai>	2025-04-29 17:08:30 -04:00
Dan Saunders	ecac731922	auto-enable lora kernels where possible (#2589 ) * auto-enable lora kernels where possible * test * revert change to example yaml * naming * remove print * slight logic change	2025-04-29 16:18:49 -04:00
NanoCode012	742fef4200	fix(doc): key used to point to url in multimodal doc (#2575 ) [skip ci]	2025-04-29 15:10:59 -04:00
Wing Lian	a39caf8824	bump vllm==0.8.5 for qwen3 support (#2583 ) [skip ci]	2025-04-29 15:10:40 -04:00
Wing Lian	07e4f2e25b	support for qwen3 with lora kernels (#2588 ) * support for qwen3 with lora kernels * fix patch * typo	2025-04-29 15:02:49 -04:00
Dan Saunders	c7d07de6b4	Fix eval + add smoke test (#2586 ) * fix evaluate CLI * add smoke test * fix naming * lint	2025-04-29 12:58:54 -04:00
Wing Lian	6565ae85d8	set config on the PluginManager for callback access (#2587 )	2025-04-29 12:05:44 -04:00
Wing Lian	80b4edb4a7	Post release fixes (#2581 ) * fix missing kwarg on child * make the runpod test shorter * update docs * rename runpod test json file * typing fixes and ordering of doc	2025-04-29 10:01:38 -04:00
Wing Lian	fedbcc0254	remove torch 2.4.1 CI as part of support deprecation (#2582 )	2025-04-29 08:28:32 -04:00
Wing Lian	8175896ada	add dev tag for v0.10.0.dev0 (#2580 )	2025-04-28 20:30:14 -04:00