bump datasets to 3.5.1

Update lr_scheduler options in config.qmd to include additional scheduling strategies for improved training flexibility. (#2636 ) [skip ci]
Print axolotl art if train is called outside of cli: (#2627 ) [skip ci]
2025-05-06 11:38:14 -04:00 · 2025-05-06 11:24:07 -04:00 · 2025-05-06 11:18:45 -04:00 · 2025-05-06 11:18:25 -04:00 · 2025-05-06 11:18:00 -04:00 · 2025-05-06 11:09:07 -04:00
36 changed files with 1111 additions and 108 deletions
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -30,7 +30,7 @@ jobs:
            cuda_version: 12.6.3
            python_version: "3.11"
            pytorch: 2.7.0
-            axolotl_extras: vllm
+            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
      - name: Checkout
--- a/.github/workflows/preview-docs.yml
+++ b/.github/workflows/preview-docs.yml
@@ -4,6 +4,12 @@ on:
  pull_request:
    types: [opened, synchronize, reopened]

+    # Run the workflow only when one of these files changes
+    paths:
+      - '**/*.md'      # any Markdown file
+      - '**/*.qmd'     # any Quarto file
+      - '_quarto.yaml'
+
 permissions:
  checks: write
  contents: write
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -44,12 +44,98 @@ jobs:
        env:
          SKIP: no-commit-to-branch

-  pytest:
-    name: PyTest
+  preload-cache:
+    name: Preload HF cache
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
-      max-parallel: 2
+      matrix:
+        python_version: ["3.11"]
+        pytorch_version: ["2.6.0"]
+    timeout-minutes: 20
+
+    env:
+      AXOLOTL_IS_CI_CACHE_PRELOAD: "1"
+
+    steps:
+      - name: Check out repository code
+        uses: actions/checkout@v4
+
+      - name: Restore HF cache
+        id: hf-cache-restore
+        uses: actions/cache/restore@v4
+        with:
+          path: |
+            /home/runner/.cache/huggingface/hub/datasets--*
+            /home/runner/.cache/huggingface/hub/models--*
+          key: ${{ runner.os }}-hf-hub-cache-v2
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python_version }}
+          cache: 'pip' # caching pip dependencies
+
+      - name: upgrade pip
+        run: |
+          pip3 install --upgrade pip
+          pip3 install --upgrade packaging==23.2 setuptools==75.8.0 wheel
+
+      - name: Install PyTorch
+        run: |
+          pip3 install torch==${{ matrix.pytorch_version }}
+
+      - name: Install dependencies
+        run: |
+          pip3 show torch
+          pip3 install --no-build-isolation -U -e .
+          python scripts/unsloth_install.py | sh
+          python scripts/cutcrossentropy_install.py | sh
+          pip3 install -r requirements-dev.txt -r requirements-tests.txt
+
+      - name: Make sure PyTorch version wasn't clobbered
+        run: |
+          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
+
+      - name: Ensure axolotl CLI was installed
+        run: |
+          axolotl --help
+
+      - name: Pre-Download dataset fixture
+        run: |
+          huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
+
+      - name: Run tests
+        run: |
+          pytest -v tests/conftest.py
+
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v5
+        with:
+          token: ${{ secrets.CODECOV_TOKEN }}
+          files: ./coverage.xml
+          flags: unittests,pytorch-${{ matrix.pytorch_version }}
+          fail_ci_if_error: false
+
+      - name: cleanup pip cache
+        run: |
+          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
+
+      - name: Save HF cache
+        id: hf-cache
+        uses: actions/cache/save@v4
+        with:
+          path: |
+            /home/runner/.cache/huggingface/hub/datasets--*
+            /home/runner/.cache/huggingface/hub/models--*
+          key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
+
+  pytest:
+    name: PyTest
+    runs-on: ubuntu-latest
+    needs: [preload-cache]
+    strategy:
+      fail-fast: false
      matrix:
        python_version: ["3.11"]
        pytorch_version: ["2.5.1", "2.6.0", "2.7.0"]
@@ -121,21 +207,12 @@ jobs:
        run: |
          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;

-      - name: Save HF cache
-        id: hf-cache
-        uses: actions/cache/save@v4
-        with:
-          path: |
-            /home/runner/.cache/huggingface/hub/datasets--*
-            /home/runner/.cache/huggingface/hub/models--*
-          key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
-
  pytest-sdist:
    name: PyTest from Source Dist
    runs-on: ubuntu-latest
+    needs: [preload-cache]
    strategy:
      fail-fast: false
-      max-parallel: 1
      matrix:
        python_version: ["3.11"]
        pytorch_version: ["2.5.1", "2.6.0", "2.7.0"]
@@ -199,15 +276,6 @@ jobs:
        run: |
          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;

-      - name: Save HF cache
-        id: hf-cache
-        uses: actions/cache/save@v4
-        with:
-          path: |
-            /home/runner/.cache/huggingface/hub/datasets--*
-            /home/runner/.cache/huggingface/hub/models--*
-          key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
-
  docker-e2e-tests-1st:
    if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' }}
    # this job needs to be run on self-hosted GPU runners...
--- a/.runpod/tests.json
+++ b/.runpod/tests.json
@@ -0,0 +1,90 @@
+{
+  "tests": [
+    {
+      "name": "quick_smoke_test_sft",
+      "input": {
+        "user_id": "user",
+        "model_id": "llama-test",
+        "run_id": "llama-test",
+        "credentials": {
+          "wandb_api_key": "",
+          "hf_token": ""
+        },
+        "args": {
+          "base_model": "HuggingFaceTB/SmolLM2-135M",
+          "model_type": "AutoModelForCausalLM",
+          "tokenizer_type": "AutoTokenizer",
+          "load_in_4bit": true,
+          "strict": false,
+          "datasets": [
+            {
+              "path": "mhenrichsen/alpaca_2k_test",
+              "type": "alpaca",
+              "split": "train[:10%]"
+            }
+          ],
+          "val_set_size": 0.02,
+          "output_dir": "./outputs/lora-out",
+          "sequence_len": 4096,
+          "sample_packing": true,
+          "eval_sample_packing": false,
+          "pad_to_sequence_len": true,
+          "adapter": "qlora",
+          "lora_r": 32,
+          "lora_alpha": 64,
+          "lora_dropout": 0.05,
+          "lora_target_linear": true,
+          "lora_modules_to_save": [
+            "embed_tokens",
+            "lm_head"
+          ],
+          "gradient_accumulation_steps": 2,
+          "micro_batch_size": 1,
+          "num_epochs": 1,
+          "optimizer": "adamw_torch_fused",
+          "lr_scheduler": "cosine",
+          "learning_rate": 0.0002,
+          "train_on_inputs": false,
+          "group_by_length": false,
+          "bf16": "auto",
+          "tf32": true,
+          "gradient_checkpointing": true,
+          "logging_steps": 1,
+          "flash_attention": true,
+          "warmup_steps": 1,
+          "evals_per_epoch": 1,
+          "eval_max_new_tokens": 128,
+          "saves_per_epoch": 1,
+          "weight_decay": 0.0,
+          "special_tokens": {
+            "pad_token": "<|endoftext|>"
+          },
+          "max_steps": 20
+        }
+      },
+      "timeout": 100000
+    }
+  ],
+  "config": {
+    "gpuTypeId": "NVIDIA GeForce RTX 4090",
+    "gpuCount": 1,
+    "containerDiskInGb": 200,
+    "env": [
+      {
+        "key": "TOKENIZER",
+        "value": ""
+      },
+      {
+        "key": "DISABLE_LOG_STATS",
+        "value": "true"
+      }
+    ],
+    "allowedCudaVersions": [
+      "12.8",
+      "12.7",
+      "12.6",
+      "12.5",
+      "12.4"
+    ]
+  }
+}
--- a/docs/config.qmd
+++ b/docs/config.qmd
@@ -547,7 +547,7 @@ gradient_checkpointing: false
 early_stopping_patience: 3

 # Specify a scheduler and kwargs to use with the optimizer
-lr_scheduler: # 'one_cycle' | 'rex' | 'log_sweep' | empty for cosine
+lr_scheduler: # 'one_cycle' | 'rex' | 'log_sweep' | 'linear' | 'cosine_with_restarts' | 'polynomial' | 'constant' | 'constant_with_warmup' | 'inverse_sqrt' | 'reduce_lr_on_plateau' | 'cosine_with_min_lr' | 'warmup_stable_decay' | empty for cosine
 lr_scheduler_kwargs:
 cosine_min_lr_ratio: # decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of peak lr
 cosine_constant_lr_ratio: # freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means start cosine_min_lr at 80% of training step (https://arxiv.org/pdf/2308.04014.pdf)
--- a/examples/orpheus/README.md
+++ b/examples/orpheus/README.md
@@ -0,0 +1,341 @@
+# Finetuning LLMs to output audio
+
+In this example, we finetune canopylabs/orpheus-3b-0.1-pretrained (a LLaMA 3.2 3b model) to output audio.
+
+The `finetune.yml` with the current settings will run on any Nvidia GPU with 45GB VRAM or more. If you lower the batch size, it can easily run on a GPU with 24GB or less.
+
+## Dataset pre-processing for pre-training
+If you are adding another voice in English, please jump ahead to finetuning pre-processing.
+
+For this to work, we need to preprocess our dataset. Since we are expecting to output audio, we will need to add tokens to the tokenizer.
+
+The following code downloads the SNAC model, adds the correct tokens, and uploads the final dataset.
+
+```python
+import torch
+from snac import SNAC
+from datasets import load_dataset
+from huggingface_hub import snapshot_download
+from datasets import load_dataset
+import random
+import torchaudio.transforms as T
+from transformers import AutoTokenizer
+import os
+
+my_original_dataset_name = "<huggingface-id-of-dataset-that-we-want-to-preprocess>"
+name_to_push_dataset_to = "<huggingface-id-of-where-to-save-dataset>"
+
+dsn = my_original_dataset_name
+
+snapshot_download(
+    repo_id=dsn,
+    repo_type="dataset",
+    revision="main",
+    max_workers=64,
+)
+
+
+ds = load_dataset(dsn, split="train")
+ds_sample_rate = ds[0]["audio"]["sampling_rate"]
+
+model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
+model = model.to("mps")
+
+def tokenise_audio(waveform):
+  waveform = torch.from_numpy(waveform).unsqueeze(0)
+  waveform = waveform.to(dtype=torch.float32)
+  resample_transform = T.Resample(orig_freq=ds_sample_rate, new_freq=24000)
+  waveform = resample_transform(waveform)
+
+  waveform = waveform.unsqueeze(0).to("cuda")
+
+  #generate the codes from snac
+  with torch.inference_mode():
+    codes = model.encode(waveform)
+
+  all_codes = []
+  for i in range(codes[0].shape[1]):
+    all_codes.append(codes[0][0][i].item()+128266)
+    all_codes.append(codes[1][0][2*i].item()+128266+4096)
+    all_codes.append(codes[2][0][4*i].item()+128266+(2*4096))
+    all_codes.append(codes[2][0][(4*i)+1].item()+128266+(3*4096))
+    all_codes.append(codes[1][0][(2*i)+1].item()+128266+(4*4096))
+    all_codes.append(codes[2][0][(4*i)+2].item()+128266+(5*4096))
+    all_codes.append(codes[2][0][(4*i)+3].item()+128266+(6*4096))
+
+
+  return all_codes
+
+def add_codes(example):
+    # Always initialize codes_list to None
+    codes_list = None
+
+    try:
+        answer_audio = example.get("audio")
+        # If there's a valid audio array, tokenise it
+        if answer_audio and "array" in answer_audio:
+            audio_array = answer_audio["array"]
+            codes_list = tokenise_audio(audio_array)
+    except Exception as e:
+        print(f"Skipping row due to error: {e}")
+        # Keep codes_list as None if we fail
+    example["codes_list"] = codes_list
+
+    return example
+
+ds = ds.map(add_codes, remove_columns=["audio"])
+
+#@title Load Tokenizer
+tokeniser_length = 128256
+start_of_text = 128000
+end_of_text = 128009
+
+start_of_speech = tokeniser_length + 1
+end_of_speech = tokeniser_length + 2
+
+start_of_human = tokeniser_length + 3
+end_of_human = tokeniser_length + 4
+
+start_of_ai = tokeniser_length + 5
+end_of_ai =  tokeniser_length + 6
+pad_token = tokeniser_length + 7
+
+audio_tokens_start = tokeniser_length + 10
+
+tokenizer_name = "canopylabs/orpheus-3b-0.1-pretrained"
+
+
+tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
+num_proc = os.cpu_count() - 2
+
+ds = ds.filter(lambda x: x["codes_list"] is not None)
+ds = ds.filter(lambda x: len(x["codes_list"]) > 0)
+
+#@title Create Input Ids
+def remove_duplicate_frames(example):
+    vals = example["codes_list"]
+    if len(vals) % 7 != 0:
+        raise ValueError("Input list length must be divisible by 7")
+
+    result = vals[:7]
+
+    removed_frames = 0
+
+    for i in range(7, len(vals), 7):
+        current_first = vals[i]
+        previous_first = result[-7]
+
+        if current_first != previous_first:
+            result.extend(vals[i:i+7])
+        else:
+            removed_frames += 1
+
+    example["codes_list"] = result
+
+    return example
+
+ds = ds.map(remove_duplicate_frames, num_proc=num_proc)
+
+
+def create_input_ids(example):
+    text_ids = tokenizer.encode({example['text']},  add_special_tokens=True)
+    text_ids.append(end_of_text)
+    example["text_tokens"] = text_ids
+    input_ids = (
+        [start_of_human]
+        + example["text_tokens"]
+        + [end_of_human]
+        + [start_of_ai]
+        + [start_of_speech]
+        + example["codes_list"]
+        + [end_of_speech]
+        + [end_of_ai]
+    )
+    example["input_ids"] = input_ids
+    example["labels"] = input_ids
+    example["attention_mask"] = [1] * len(input_ids)
+
+    return example
+
+ds = ds.map(create_input_ids, num_proc=num_proc, remove_columns=["text", "codes_list"])
+
+#@title Remove unnecessary columns
+columns_to_keep = ["input_ids", "labels", "attention_mask"]
+columns_to_remove = [col for col in ds.column_names if col not in columns_to_keep]
+
+ds = ds.remove_columns(columns_to_remove)
+
+ds.push_to_hub(name_to_push_dataset_to)
+```
+
+
+## Finetune pre-processing
+Use this code to add a new voice.
+
+```python
+import torch
+from snac import SNAC
+from datasets import load_dataset
+from huggingface_hub import snapshot_download
+from datasets import load_dataset
+import random
+import torchaudio.transforms as T
+from transformers import AutoTokenizer
+import os
+
+my_original_dataset_name = "<huggingface-id-of-dataset-that-we-want-to-preprocess>"
+name_to_push_dataset_to = "<huggingface-id-of-where-to-save-dataset>"
+
+dsn = my_original_dataset_name
+
+snapshot_download(
+    repo_id=dsn,
+    repo_type="dataset",
+    revision="main",
+    max_workers=64,
+)
+
+
+ds = load_dataset(dsn, split="train")
+ds_sample_rate = ds[0]["audio"]["sampling_rate"]
+
+model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
+model = model.to("mps")
+
+def tokenise_audio(waveform):
+  waveform = torch.from_numpy(waveform).unsqueeze(0)
+  waveform = waveform.to(dtype=torch.float32)
+  resample_transform = T.Resample(orig_freq=ds_sample_rate, new_freq=24000)
+  waveform = resample_transform(waveform)
+
+  waveform = waveform.unsqueeze(0).to("cuda")
+
+  #generate the codes from snac
+  with torch.inference_mode():
+    codes = model.encode(waveform)
+
+  all_codes = []
+  for i in range(codes[0].shape[1]):
+    all_codes.append(codes[0][0][i].item()+128266)
+    all_codes.append(codes[1][0][2*i].item()+128266+4096)
+    all_codes.append(codes[2][0][4*i].item()+128266+(2*4096))
+    all_codes.append(codes[2][0][(4*i)+1].item()+128266+(3*4096))
+    all_codes.append(codes[1][0][(2*i)+1].item()+128266+(4*4096))
+    all_codes.append(codes[2][0][(4*i)+2].item()+128266+(5*4096))
+    all_codes.append(codes[2][0][(4*i)+3].item()+128266+(6*4096))
+
+
+  return all_codes
+
+def add_codes(example):
+    # Always initialize codes_list to None
+    codes_list = None
+
+    try:
+        answer_audio = example.get("audio")
+        # If there's a valid audio array, tokenise it
+        if answer_audio and "array" in answer_audio:
+            audio_array = answer_audio["array"]
+            codes_list = tokenise_audio(audio_array)
+    except Exception as e:
+        print(f"Skipping row due to error: {e}")
+        # Keep codes_list as None if we fail
+    example["codes_list"] = codes_list
+
+    return example
+
+ds = ds.map(add_codes, remove_columns=["audio"])
+
+#@title Load Tokenizer
+tokeniser_length = 128256
+start_of_text = 128000
+end_of_text = 128009
+
+start_of_speech = tokeniser_length + 1
+end_of_speech = tokeniser_length + 2
+
+start_of_human = tokeniser_length + 3
+end_of_human = tokeniser_length + 4
+
+start_of_ai = tokeniser_length + 5
+end_of_ai =  tokeniser_length + 6
+pad_token = tokeniser_length + 7
+
+audio_tokens_start = tokeniser_length + 10
+
+tokenizer_name = "canopylabs/orpheus-3b-0.1-pretrained"
+
+
+tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
+num_proc = os.cpu_count() - 2
+
+ds = ds.filter(lambda x: x["codes_list"] is not None)
+ds = ds.filter(lambda x: len(x["codes_list"]) > 0)
+
+#@title Create Input Ids
+def remove_duplicate_frames(example):
+    vals = example["codes_list"]
+    if len(vals) % 7 != 0:
+        raise ValueError("Input list length must be divisible by 7")
+
+    result = vals[:7]
+
+    removed_frames = 0
+
+    for i in range(7, len(vals), 7):
+        current_first = vals[i]
+        previous_first = result[-7]
+
+        if current_first != previous_first:
+            result.extend(vals[i:i+7])
+        else:
+            removed_frames += 1
+
+    example["codes_list"] = result
+
+    return example
+
+ds = ds.map(remove_duplicate_frames, num_proc=num_proc)
+
+tok_info = '''*** HERE you can modify the text prompt
+i.e. if you wanted a multispeaker model like canopylabs/orpheus-3b-0.1-ft, you can pass:
+f"{example["source"]}:  {example["text"]}", as is passed.
+'''
+print(tok_info)
+
+def create_input_ids(example):
+    text_ids = tokenizer.encode(f"{example['speaker_id']}: {example['text']}",  add_special_tokens=True)
+    text_ids.append(end_of_text)
+    example["text_tokens"] = text_ids
+    input_ids = (
+        [start_of_human]
+        + example["text_tokens"]
+        + [end_of_human]
+        + [start_of_ai]
+        + [start_of_speech]
+        + example["codes_list"]
+        + [end_of_speech]
+        + [end_of_ai]
+    )
+    example["input_ids"] = input_ids
+    example["labels"] = input_ids
+    example["attention_mask"] = [1] * len(input_ids)
+
+    return example
+
+ds = ds.map(create_input_ids, num_proc=num_proc, remove_columns=["text", "codes_list"])
+
+#@title Remove unnecessary columns
+columns_to_keep = ["input_ids", "labels", "attention_mask"]
+columns_to_remove = [col for col in ds.column_names if col not in columns_to_keep]
+
+ds = ds.remove_columns(columns_to_remove)
+
+ds.push_to_hub(name_to_push_dataset_to)
+```
+
+## Training
+After preprocessing is done, fill out the blanks in finetune.yml and simply run `axolotl train finetune.yml`
+
+## Inference
+For inference, please refer to the original [orpheus github](https://github.com/canopyai/Orpheus-TTS/tree/main).
--- a/examples/orpheus/finetune.yml
+++ b/examples/orpheus/finetune.yml
@@ -0,0 +1,52 @@
+base_model: canopylabs/orpheus-3b-0.1-pretrained
+
+hub_model_id: <your-hub-model-id>
+
+plugins:
+  - axolotl.integrations.liger.LigerPlugin
+liger_rope: true
+liger_rms_norm: true
+liger_glu_activation: true
+liger_fused_linear_cross_entropy: true
+
+datasets:
+  - path: <your-hf-dataset-id>
+    type:  # leave empty to load pre-tokenized
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.01
+output_dir: ./outputs/out
+
+sequence_len: 8192
+sample_packing: true
+pad_to_sequence_len: true
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+gradient_accumulation_steps: 8
+micro_batch_size: 4
+num_epochs: 3
+optimizer: adamw_torch_fused
+lr_scheduler: cosine
+learning_rate: 2e-5
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+warmup_steps: 20
+evals_per_epoch: 5
+saves_per_epoch: 5
+weight_decay: 0.05
+
+special_tokens:
+  pad_token: <custom_token_7>
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,10 +15,10 @@ peft==0.15.2
 transformers==4.51.3
 tokenizers>=0.21.1
 accelerate==1.6.0
-datasets==3.5.0
+datasets==3.5.1
 deepspeed>=0.15.4
 trl==0.17.0
-hf_xet==1.0.0
+hf_xet==1.1.0
 hqq==0.2.5

 optimum==1.16.2
--- a/src/axolotl/cli/init.py
+++ b/src/axolotl/cli/init.py
@@ -2,4 +2,7 @@

 import os

+from axolotl.logging_config import configure_logging
+
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+configure_logging()
--- a/src/axolotl/cli/art.py
+++ b/src/axolotl/cli/art.py
@@ -16,8 +16,15 @@ AXOLOTL_LOGO = """
    @@@@  @@@@@@@@@@@@@@@@
 """

+HAS_PRINTED_LOGO = False
+

 def print_axolotl_text_art():
    """Prints axolotl ASCII art."""
+
+    global HAS_PRINTED_LOGO  # pylint: disable=global-statement
+    if HAS_PRINTED_LOGO:
+        return
    if is_main_process():
+        HAS_PRINTED_LOGO = True
        print(AXOLOTL_LOGO)
--- a/src/axolotl/cli/checks.py
+++ b/src/axolotl/cli/checks.py
@@ -8,9 +8,6 @@ from accelerate.commands.config import config_args
 from huggingface_hub import HfApi
 from huggingface_hub.utils import LocalTokenNotFoundError

-from axolotl.logging_config import configure_logging
-
-configure_logging()
 LOG = logging.getLogger(__name__)


--- a/src/axolotl/cli/config.py
+++ b/src/axolotl/cli/config.py
@@ -5,6 +5,7 @@ import logging
 import os
 import tempfile
 from pathlib import Path
+from tempfile import NamedTemporaryFile
 from typing import Union
 from urllib.parse import urlparse

@@ -158,7 +159,9 @@ def plugin_set_cfg(cfg: DictDefault):
        plugin_manager.cfg = cfg


-def load_cfg(config: Union[str, Path] = Path("examples/"), **kwargs) -> DictDefault:
+def load_cfg(
+    config: str | Path | DictDefault = Path("examples/"), **kwargs
+) -> DictDefault:
    """
    Loads the `axolotl` configuration stored at `config`, validates it, and performs
    various setup.
@@ -170,13 +173,24 @@ def load_cfg(config: Union[str, Path] = Path("examples/"), **kwargs) -> DictDefa
    Returns:
        `DictDefault` mapping configuration keys to values.
    """
-    config = check_remote_config(config)
-    if Path(config).is_dir():
-        config = choose_config(Path(config))
+    if isinstance(config, (str, Path)):
+        config = check_remote_config(config)
+        if Path(config).is_dir():
+            config = choose_config(Path(config))

-    # Load the config from the yaml file
-    with open(config, encoding="utf-8") as file:
-        cfg: DictDefault = DictDefault(yaml.safe_load(file))
+        # Load the config from the yaml file
+        with open(config, encoding="utf-8") as file:
+            cfg: DictDefault = DictDefault(yaml.safe_load(file))
+
+        cfg.axolotl_config_path = config
+    else:
+        cfg = config
+        with NamedTemporaryFile(
+            mode="w", delete=False, suffix=".yml", prefix="axolotl_config_"
+        ) as temp_file:
+            temp_file.write(yaml.dump(config.to_dict()))
+            temp_file.close()
+        cfg.axolotl_config_path = temp_file.name

    # If there are any options passed in the cli, if it is something that seems valid
    # from the yaml, then overwrite the value
@@ -190,8 +204,6 @@ def load_cfg(config: Union[str, Path] = Path("examples/"), **kwargs) -> DictDefa
            else:
                cfg[k] = kwargs[k]

-    cfg.axolotl_config_path = config
-
    try:
        device_props = torch.cuda.get_device_properties("cuda")
        gpu_version = "sm_" + str(device_props.major) + str(device_props.minor)
--- a/src/axolotl/cli/evaluate.py
+++ b/src/axolotl/cli/evaluate.py
@@ -15,7 +15,7 @@ from axolotl.cli.checks import check_accelerate_default_config, check_user_token
 from axolotl.cli.config import load_cfg
 from axolotl.common.datasets import load_datasets, load_preference_datasets
 from axolotl.evaluate import evaluate
-from axolotl.utils import set_pytorch_cuda_alloc_conf
+from axolotl.utils import patch_optimized_env
 from axolotl.utils.dict import DictDefault

 LOG = logging.getLogger(__name__)
@@ -32,7 +32,7 @@ def do_evaluate(cfg: DictDefault, cli_args: TrainerCliArgs) -> None:
        cli_args: CLI arguments.
    """
    # Enable expandable segments for cuda allocation to improve VRAM usage
-    set_pytorch_cuda_alloc_conf()
+    patch_optimized_env()

    # pylint: disable=duplicate-code
    print_axolotl_text_art()
--- a/src/axolotl/cli/main.py
+++ b/src/axolotl/cli/main.py
@@ -29,7 +29,7 @@ from axolotl.cli.utils import (
    filter_none_kwargs,
 )
 from axolotl.integrations.lm_eval.cli import lm_eval
-from axolotl.utils import set_pytorch_cuda_alloc_conf
+from axolotl.utils import patch_optimized_env
 from axolotl.utils.schemas.config import AxolotlInputConfig


@@ -55,6 +55,8 @@ def preprocess(config: str, cloud: Optional[str] = None, **kwargs) -> None:
        kwargs: Additional keyword arguments which correspond to CLI args or `axolotl`
            config options.
    """
+    patch_optimized_env()
+
    if cloud:
        from axolotl.cli.cloud import do_cli_preprocess

@@ -100,7 +102,7 @@ def train(
            config options.
    """
    # Enable expandable segments for cuda allocation to improve VRAM usage
-    set_pytorch_cuda_alloc_conf()
+    patch_optimized_env()

    if "use_ray" in kwargs and kwargs["use_ray"]:
        accelerate = False
--- a/src/axolotl/cli/train.py
+++ b/src/axolotl/cli/train.py
@@ -18,7 +18,7 @@ from axolotl.cli.config import load_cfg
 from axolotl.common.datasets import load_datasets, load_preference_datasets
 from axolotl.integrations.base import PluginManager
 from axolotl.train import train
-from axolotl.utils import set_pytorch_cuda_alloc_conf
+from axolotl.utils import patch_optimized_env
 from axolotl.utils.config import normalize_config, resolve_dtype
 from axolotl.utils.dict import DictDefault

@@ -36,7 +36,7 @@ def do_train(cfg: DictDefault, cli_args: TrainerCliArgs):
        cli_args: Training-specific CLI arguments.
    """
    # Enable expandable segments for cuda allocation to improve VRAM usage
-    set_pytorch_cuda_alloc_conf()
+    patch_optimized_env()

    print_axolotl_text_art()
    check_accelerate_default_config()
--- a/src/axolotl/cli/utils.py
+++ b/src/axolotl/cli/utils.py
@@ -20,11 +20,9 @@ from transformers import (
    ProcessorMixin,
 )

-from axolotl.logging_config import configure_logging
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.models import load_model, load_processor, load_tokenizer

-configure_logging()
 LOG = logging.getLogger(__name__)


--- a/src/axolotl/common/datasets.py
+++ b/src/axolotl/common/datasets.py
@@ -47,7 +47,8 @@ def sample_dataset(dataset: Dataset, num_samples: int) -> Dataset:
 def load_datasets(
    *,
    cfg: DictDefault,
-    cli_args: Union[PreprocessCliArgs, TrainerCliArgs],
+    cli_args: PreprocessCliArgs | TrainerCliArgs | None = None,
+    debug: bool = False,
 ) -> TrainDatasetMeta:
    """
    Loads one or more training or evaluation datasets, calling
@@ -56,6 +57,7 @@ def load_datasets(
    Args:
        cfg: Dictionary mapping `axolotl` config keys to values.
        cli_args: Command-specific CLI arguments.
+        debug: Whether to print out tokenization of sample

    Returns:
        Dataclass with fields for training and evaluation datasets and the computed
@@ -64,7 +66,8 @@ def load_datasets(
    tokenizer = load_tokenizer(cfg)
    processor = load_processor(cfg, tokenizer=tokenizer) if cfg.processor_type else None
    preprocess_iterable = (
-        hasattr(cli_args, "iterable")
+        cli_args
+        and hasattr(cli_args, "iterable")
        and cli_args.iterable is not None
        and cli_args.iterable
    )
@@ -76,20 +79,25 @@ def load_datasets(
        preprocess_iterable=preprocess_iterable,
    )

-    if (
-        cli_args.debug
-        or cfg.debug
-        or cli_args.debug_text_only
-        or int(cli_args.debug_num_examples) > 0
-    ):
+    if (  # pylint: disable=too-many-boolean-expressions
+        cli_args
+        and (
+            cli_args.debug
+            or cfg.debug
+            or cli_args.debug_text_only
+            or int(cli_args.debug_num_examples) > 0
+        )
+    ) or debug:
        LOG.info("check_dataset_labels...")

-        train_samples = sample_dataset(train_dataset, cli_args.debug_num_examples)
+        num_examples = cli_args.debug_num_examples if cli_args else 1
+        text_only = cli_args.debug_text_only if cli_args else False
+        train_samples = sample_dataset(train_dataset, num_examples)
        check_dataset_labels(
            train_samples,
            tokenizer,
-            num_examples=cli_args.debug_num_examples,
-            text_only=cli_args.debug_text_only,
+            num_examples=num_examples,
+            text_only=text_only,
        )

        LOG.info("printing prompters...")
--- a/src/axolotl/core/trainer_builder.py
+++ b/src/axolotl/core/trainer_builder.py
@@ -168,6 +168,9 @@ class TrainerBuilderBase(abc.ABC):
                )
            )

+        if self.cfg.gc_steps:
+            callbacks.append(GCCallback(gc_steps=self.cfg.gc_steps))
+
        if self.cfg.use_wandb:
            callbacks.append(
                SaveAxolotlConfigtoWandBCallback(self.cfg.axolotl_config_path)
@@ -249,9 +252,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
        if self.cfg.loss_watchdog_threshold is not None:
            callbacks.append(LossWatchDogCallback(self.cfg))

-        if self.cfg.gc_steps:
-            callbacks.append(GCCallback(gc_steps=self.cfg.gc_steps))
-
        return callbacks

    def get_post_trainer_create_callbacks(self, trainer):
@@ -488,7 +488,7 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):

        # these are all the "standard" kwargs that are def used
        training_arguments_kwargs["max_steps"] = (
-            total_num_steps if self.cfg.max_steps else -1
+            self.cfg.max_steps if self.cfg.max_steps else -1
        )
        training_arguments_kwargs["max_seq_length"] = self.cfg.sequence_len
        training_arguments_kwargs["per_device_train_batch_size"] = (
--- a/src/axolotl/core/trainers/dpo/trainer.py
+++ b/src/axolotl/core/trainers/dpo/trainer.py
@@ -177,12 +177,8 @@ class AxolotlDPOTrainer(RngLoaderMixin, SchedulerMixin, DPOTrainer):
            # dpo trainer may incorrectly prepend the bos_token_id to the dpo outputs
            if res["chosen_input_ids"][0] == processing_class.bos_token_id:
                res["chosen_input_ids"] = res["chosen_input_ids"][1:]
-                res["chosen_labels"] = res["chosen_labels"][1:]
-                res["chosen_attention_mask"] = res["chosen_attention_mask"][1:]
            if res["rejected_input_ids"][0] == processing_class.bos_token_id:
                res["rejected_input_ids"] = res["rejected_input_ids"][1:]
-                res["rejected_labels"] = res["rejected_labels"][1:]
-                res["rejected_attention_mask"] = res["rejected_attention_mask"][1:]

        return res

@@ -251,7 +247,9 @@ class AxolotlDPOTrainer(RngLoaderMixin, SchedulerMixin, DPOTrainer):
                )

        # Base evaluation
-        initial_output = super().evaluation_loop(
+        initial_output = super(  # pylint: disable=bad-super-call
+            DPOTrainer, self
+        ).evaluation_loop(
            dataloader,
            description,
            prediction_loss_only,
--- a/src/axolotl/core/trainers/grpo/init.py
+++ b/src/axolotl/core/trainers/grpo/init.py
@@ -63,6 +63,7 @@ class GRPOStrategy:

        grpo_args_kwargs["max_completion_length"] = trl.max_completion_length
        grpo_args_kwargs["log_completions"] = trl.log_completions
+        grpo_args_kwargs["num_completions_to_print"] = trl.num_completions_to_print

        if trl.reward_weights:
            grpo_args_kwargs["reward_weights"] = trl.reward_weights
--- a/src/axolotl/evaluate.py
+++ b/src/axolotl/evaluate.py
@@ -11,7 +11,6 @@ from accelerate.logging import get_logger
 from datasets import Dataset
 from transformers.trainer import Trainer

-from axolotl.logging_config import configure_logging
 from axolotl.train import (
    TrainDatasetMeta,
    setup_model_and_tokenizer,
@@ -24,7 +23,6 @@ project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
 src_dir = os.path.join(project_root, "src")
 sys.path.insert(0, src_dir)

-configure_logging()
 LOG = get_logger(__name__)


--- a/src/axolotl/integrations/liger/init.py
+++ b/src/axolotl/integrations/liger/init.py
@@ -151,6 +151,30 @@ class LigerPlugin(BasePlugin):
                rms_norm=cfg.liger_rms_norm,
                layer_norm=cfg.liger_layer_norm,
            )
+        elif cfg.model_config_type == "qwen3":
+            from axolotl.integrations.liger.models.qwen3 import (
+                apply_liger_kernel_to_qwen3,
+            )
+
+            apply_liger_kernel_to_qwen3(
+                cross_entropy=cfg.liger_cross_entropy,
+                fused_linear_cross_entropy=cfg.liger_fused_linear_cross_entropy,
+                glu_activation=cfg.liger_glu_activation,
+                rms_norm=cfg.liger_rms_norm,
+                layer_norm=cfg.liger_layer_norm,
+            )
+        elif cfg.model_config_type == "qwen3_moe":
+            from axolotl.integrations.liger.models.qwen3_moe import (
+                apply_liger_kernel_to_qwen3_moe,
+            )
+
+            apply_liger_kernel_to_qwen3_moe(
+                cross_entropy=cfg.liger_cross_entropy,
+                fused_linear_cross_entropy=cfg.liger_fused_linear_cross_entropy,
+                glu_activation=cfg.liger_glu_activation,
+                rms_norm=cfg.liger_rms_norm,
+                layer_norm=cfg.liger_layer_norm,
+            )
        else:
            logging.warning(
                f"Unsupported model config type: {cfg.model_config_type}. Liger not applied."
--- a/src/axolotl/integrations/liger/models/qwen3.py
+++ b/src/axolotl/integrations/liger/models/qwen3.py
@@ -0,0 +1,160 @@
+"""
+Liger FLCE for Qwen3. Based on transformers v4.51.3.
+"""
+
+import sys
+from typing import Optional, Tuple, Union
+
+import torch
+from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
+from transformers.cache_utils import Cache
+from transformers.modeling_outputs import CausalLMOutputWithPast
+
+
+def lce_forward(
+    self,
+    input_ids: Optional[torch.LongTensor] = None,
+    attention_mask: Optional[torch.Tensor] = None,
+    position_ids: Optional[torch.LongTensor] = None,
+    past_key_values: Optional[Cache] = None,
+    inputs_embeds: Optional[torch.FloatTensor] = None,
+    labels: Optional[torch.LongTensor] = None,
+    use_cache: Optional[bool] = None,
+    output_attentions: Optional[bool] = None,
+    output_hidden_states: Optional[bool] = None,
+    cache_position: Optional[torch.LongTensor] = None,
+    logits_to_keep: Union[int, torch.Tensor] = 0,
+    **kwargs,
+) -> Union[Tuple, CausalLMOutputWithPast]:
+    r"""
+    Args:
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        logits_to_keep (`int` or `torch.Tensor`, *optional*):
+            If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
+            `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
+            token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
+            If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
+            This is useful when using packed tensor format (single dimension for batch and sequence length).
+
+    Returns:
+    """
+
+    # pylint: disable=duplicate-code
+    output_attentions = (
+        output_attentions
+        if output_attentions is not None
+        else self.config.output_attentions
+    )
+    output_hidden_states = (
+        output_hidden_states
+        if output_hidden_states is not None
+        else self.config.output_hidden_states
+    )
+
+    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+    outputs = self.model(
+        input_ids=input_ids,
+        attention_mask=attention_mask,
+        position_ids=position_ids,
+        past_key_values=past_key_values,
+        inputs_embeds=inputs_embeds,
+        use_cache=use_cache,
+        output_attentions=output_attentions,
+        output_hidden_states=output_hidden_states,
+        cache_position=cache_position,
+        **kwargs,
+    )
+
+    hidden_states = outputs[0]
+
+    logits = None
+    loss = None
+    # if in training mode, don't materialize logits
+    if self.training and (labels is not None):
+        loss = LigerForCausalLMLoss(
+            hidden_states=hidden_states,
+            lm_head_weight=self.lm_head.weight,
+            labels=labels,
+            hidden_size=self.config.hidden_size,
+            **kwargs,
+        )
+
+    else:  # if in inference mode materialize logits
+        slice_indices = (
+            slice(-logits_to_keep, None)
+            if isinstance(logits_to_keep, int)
+            else logits_to_keep
+        )
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+        if labels is not None:
+            loss = self.loss_function(
+                logits=logits,
+                labels=labels,
+                vocab_size=self.config.vocab_size,
+                **kwargs,
+            )
+
+    return CausalLMOutputWithPast(
+        loss=loss,
+        logits=logits,
+        past_key_values=outputs.past_key_values,
+        hidden_states=outputs.hidden_states,
+        attentions=outputs.attentions,
+    )
+
+
+def apply_liger_kernel_to_qwen3(
+    cross_entropy: bool = False,
+    fused_linear_cross_entropy: bool = False,
+    rms_norm: bool = False,
+    glu_activation: bool = False,
+    layer_norm: bool = False,
+    **kwargs,  # pylint: disable=unused-argument
+) -> None:
+    # pylint: disable=duplicate-code
+    """
+    Apply Liger kernels to replace original implementation in HuggingFace Qwen3 models
+
+    Args:
+        cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
+        fused_linear_cross_entropy (bool):
+            Whether to apply Liger's fused linear cross entropy loss. Default is False.
+            `cross_entropy` and `fused_linear_cross_entropy` cannot both be True.
+            If `fused_linear_cross_entropy` is True, the logits will not be materialized, which is more memory efficient.
+        rms_norm (bool): Whether to apply Liger's RMSNorm. Default is False.
+        glu_activation (bool): Whether to apply Liger's SwiGLU MLP. Default is False.
+        layer_norm (bool): Whether to apply Liger's LayerNorm. Default is False.
+    """
+
+    import transformers.models.qwen3.modeling_qwen3  # noqa: F401  # pylint: disable=unused-import
+    from liger_kernel.transformers.functional import liger_cross_entropy
+    from liger_kernel.transformers.layer_norm import LigerLayerNorm
+    from liger_kernel.transformers.rms_norm import LigerRMSNorm
+    from liger_kernel.transformers.swiglu import LigerSwiGLUMLP
+
+    assert not (
+        cross_entropy and fused_linear_cross_entropy
+    ), "cross_entropy and fused_linear_cross_entropy cannot both be True."
+
+    modeling_qwen3 = sys.modules["transformers.models.qwen3.modeling_qwen3"]
+
+    if rms_norm:
+        modeling_qwen3.Qwen3RMSNorm = LigerRMSNorm
+
+    if glu_activation:
+        modeling_qwen3.Qwen3MLP = LigerSwiGLUMLP
+
+    if layer_norm:
+        modeling_qwen3.nn.LayerNorm = LigerLayerNorm
+
+    if cross_entropy:
+        from transformers.loss.loss_utils import nn
+
+        nn.functional.cross_entropy = liger_cross_entropy
+
+    if fused_linear_cross_entropy:
+        modeling_qwen3.Qwen3ForCausalLM.forward = lce_forward
--- a/src/axolotl/integrations/liger/models/qwen3_moe.py
+++ b/src/axolotl/integrations/liger/models/qwen3_moe.py
@@ -0,0 +1,191 @@
+"""
+Liger FLCE for Qwen3 MoE. Based on transformers v4.51.3.
+"""
+
+import sys
+from copy import deepcopy
+from typing import List, Optional, Union
+
+import torch
+from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
+from transformers.modeling_outputs import MoeCausalLMOutputWithPast
+from transformers.models.qwen3_moe.modeling_qwen3_moe import load_balancing_loss_func
+
+
+def lce_forward(
+    self,
+    input_ids: Optional[torch.LongTensor] = None,
+    attention_mask: Optional[torch.Tensor] = None,
+    position_ids: Optional[torch.LongTensor] = None,
+    past_key_values: Optional[List[torch.FloatTensor]] = None,
+    inputs_embeds: Optional[torch.FloatTensor] = None,
+    labels: Optional[torch.LongTensor] = None,
+    use_cache: Optional[bool] = None,
+    output_attentions: Optional[bool] = None,
+    output_hidden_states: Optional[bool] = None,
+    output_router_logits: Optional[bool] = None,
+    cache_position: Optional[torch.LongTensor] = None,
+    logits_to_keep: Union[int, torch.Tensor] = 0,
+    **kwargs,
+) -> MoeCausalLMOutputWithPast:
+    r"""
+    Args:
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        logits_to_keep (`int` or `torch.Tensor`, *optional*):
+            If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
+            `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
+            token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
+            If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
+            This is useful when using packed tensor format (single dimension for batch and sequence length).
+
+    Returns:
+    """
+
+    # pylint: disable=duplicate-code
+    output_attentions = (
+        output_attentions
+        if output_attentions is not None
+        else self.config.output_attentions
+    )
+    output_router_logits = (
+        output_router_logits
+        if output_router_logits is not None
+        else self.config.output_router_logits
+    )
+    output_hidden_states = (
+        output_hidden_states
+        if output_hidden_states is not None
+        else self.config.output_hidden_states
+    )
+
+    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+    outputs = self.model(
+        input_ids=input_ids,
+        attention_mask=attention_mask,
+        position_ids=position_ids,
+        past_key_values=past_key_values,
+        inputs_embeds=inputs_embeds,
+        use_cache=use_cache,
+        output_attentions=output_attentions,
+        output_hidden_states=output_hidden_states,
+        output_router_logits=output_router_logits,
+        cache_position=cache_position,
+        **kwargs,
+    )
+
+    hidden_states = outputs[0]
+
+    logits = None
+    loss = None
+    # if in training mode, don't materialize logits
+    if self.training and (labels is not None):
+        loss = LigerForCausalLMLoss(
+            hidden_states=hidden_states,
+            lm_head_weight=self.lm_head.weight,
+            labels=labels,
+            hidden_size=self.config.hidden_size,
+            **kwargs,
+        )
+
+    else:  # if in inference mode materialize logits
+        slice_indices = (
+            slice(-logits_to_keep, None)
+            if isinstance(logits_to_keep, int)
+            else logits_to_keep
+        )
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+        if labels is not None:
+            loss = self.loss_function(
+                logits=logits,
+                labels=labels,
+                vocab_size=self.config.vocab_size,
+                **kwargs,
+            )
+
+    aux_loss = None
+    if output_router_logits:
+        aux_loss = load_balancing_loss_func(
+            outputs.router_logits,
+            self.num_experts,
+            self.num_experts_per_tok,
+            attention_mask,
+        )
+        if labels is not None:
+            loss += self.router_aux_loss_coef * aux_loss.to(
+                loss.device
+            )  # make sure to reside in the same device
+
+    return MoeCausalLMOutputWithPast(
+        loss=loss,
+        aux_loss=aux_loss,
+        logits=logits,
+        past_key_values=outputs.past_key_values,
+        hidden_states=outputs.hidden_states,
+        attentions=outputs.attentions,
+    )
+
+
+def apply_liger_kernel_to_qwen3_moe(
+    cross_entropy: bool = False,
+    fused_linear_cross_entropy: bool = False,
+    rms_norm: bool = False,
+    glu_activation: bool = False,
+    layer_norm: bool = False,
+    **kwargs,  # pylint: disable=unused-argument
+) -> None:
+    # pylint: disable=duplicate-code
+    """
+    Apply Liger kernels to replace original implementation in HuggingFace Qwen3 MoE models
+
+    Args:
+        cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
+        fused_linear_cross_entropy (bool):
+            Whether to apply Liger's fused linear cross entropy loss. Default is False.
+            `cross_entropy` and `fused_linear_cross_entropy` cannot both be True.
+            If `fused_linear_cross_entropy` is True, the logits will not be materialized, which is more memory efficient.
+        rms_norm (bool): Whether to apply Liger's RMSNorm. Default is False.
+        glu_activation (bool): Whether to apply Liger's SwiGLU MLP. Default is False.
+        layer_norm (bool): Whether to apply Liger's LayerNorm. Default is False.
+    """
+
+    import transformers.models.qwen3_moe.modeling_qwen3_moe  # noqa: F401  # pylint: disable=unused-import
+    from liger_kernel.transformers.functional import liger_cross_entropy
+    from liger_kernel.transformers.layer_norm import LigerLayerNorm
+    from liger_kernel.transformers.rms_norm import LigerRMSNorm
+    from liger_kernel.transformers.swiglu import LigerSwiGLUMLP
+
+    assert not (
+        cross_entropy and fused_linear_cross_entropy
+    ), "cross_entropy and fused_linear_cross_entropy cannot both be True."
+
+    modeling_qwen3_moe = sys.modules["transformers.models.qwen3_moe.modeling_qwen3_moe"]
+
+    if rms_norm:
+        modeling_qwen3_moe.Qwen3MoeRMSNorm = LigerRMSNorm
+
+    if glu_activation:
+
+        def _liger_swiglu_mlp_wrapper(config, intermediate_size=None, **kwargs):
+            "Accepts intermediate_size to pass to LigerSwiGLUMLP"
+            # clone config to avoid modifying the original
+            config = deepcopy(config)
+            if intermediate_size:
+                setattr(config, "intermediate_size", intermediate_size)
+            return LigerSwiGLUMLP(config, **kwargs)
+
+        modeling_qwen3_moe.Qwen3MoeMLP = _liger_swiglu_mlp_wrapper
+
+    if layer_norm:
+        modeling_qwen3_moe.nn.LayerNorm = LigerLayerNorm
+
+    if cross_entropy:
+        from transformers.loss.loss_utils import nn
+
+        nn.functional.cross_entropy = liger_cross_entropy
+
+    if fused_linear_cross_entropy:
+        modeling_qwen3_moe.Qwen3MoeForCausalLM.forward = lce_forward
--- a/src/axolotl/monkeypatch/attention/ring_attn/patch.py
+++ b/src/axolotl/monkeypatch/attention/ring_attn/patch.py
@@ -12,10 +12,8 @@ import torch
 import torch.distributed as dist
 from accelerate.logging import get_logger

-from axolotl.logging_config import configure_logging
 from axolotl.monkeypatch.utils import get_cu_seqlens_from_pos_ids

-configure_logging()
 LOG = get_logger(__name__)


--- a/src/axolotl/monkeypatch/multipack.py
+++ b/src/axolotl/monkeypatch/multipack.py
@@ -18,6 +18,8 @@ SUPPORTED_MULTIPACK_MODEL_TYPES = [
    "mixtral",
    "qwen2",
    "qwen2_moe",
+    "qwen3",
+    "qwen3_moe",
    "falcon",
    "phi",
    "phi3",
--- a/src/axolotl/monkeypatch/trainer/init.py
+++ b/src/axolotl/monkeypatch/trainer/init.py
--- a/src/axolotl/train.py
+++ b/src/axolotl/train.py
@@ -21,6 +21,7 @@ from transformers import PreTrainedModel, PreTrainedTokenizer, ProcessorMixin
 from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
 from transformers.trainer import Trainer

+from axolotl.cli.art import print_axolotl_text_art
 from axolotl.common.datasets import TrainDatasetMeta
 from axolotl.contribs.lgpl import (  # pylint: disable = no-name-in-module
    fix_untrained_tokens,
@@ -30,7 +31,6 @@ from axolotl.core.trainers.mixins.sequence_parallel import (
    SequenceParallelContextManager,
 )
 from axolotl.integrations.base import PluginManager
-from axolotl.logging_config import configure_logging
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.distributed import cleanup_distributed
 from axolotl.utils.freeze import freeze_layers_except
@@ -42,7 +42,6 @@ try:
 except ImportError:
    BetterTransformer = None

-configure_logging()
 LOG = get_logger(__name__)


@@ -518,6 +517,8 @@ def train(
    Returns:
        Tuple of (model, tokenizer) after training
    """
+    print_axolotl_text_art()
+
    # Setup model, tokenizer, (causal or RLHF) trainer, etc.
    (
        trainer,
--- a/src/axolotl/utils/init.py
+++ b/src/axolotl/utils/init.py
@@ -43,3 +43,12 @@ def set_pytorch_cuda_alloc_conf():
            os.environ["PYTORCH_CUDA_ALLOC_CONF"] = (
                "expandable_segments:True,roundup_power2_divisions:16"
            )
+
+
+def patch_optimized_env():
+    """
+    Patch environment variables to improve VRAM usage and increase download speed
+    """
+    if os.getenv("HF_HUB_ENABLE_HF_TRANSFER") is None:
+        os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+    set_pytorch_cuda_alloc_conf()
--- a/src/axolotl/utils/config/init.py
+++ b/src/axolotl/utils/config/init.py
@@ -59,7 +59,7 @@ def choose_device(cfg):

 def resolve_dtype(cfg):
    if (
-        cfg.bf16 == "auto" and not cfg.use_ray
+        not cfg.fp16 and cfg.bf16 == "auto" and not cfg.use_ray
    ):  # if we use ray we want to defer this check to the worker node
        if is_torch_bf16_gpu_available():
            LOG.debug("bf16 support detected, enabling for this configuration.")
@@ -67,7 +67,7 @@ def resolve_dtype(cfg):
        else:
            LOG.debug("bf16 support not detected, disabling for this configuration.")
            cfg.bf16 = False
-            if cfg.fp16 is None:
+            if cfg.fp16 is None and not cfg.float16:
                cfg.fp16 = True

    if cfg.device == "mps":
--- a/src/axolotl/utils/samplers/multipack.py
+++ b/src/axolotl/utils/samplers/multipack.py
@@ -190,7 +190,7 @@ class MultipackBatchSampler(BatchSampler):
        self.len_across_ranks = None

        if self.sequential and not isinstance(sampler, SequentialSampler):
-            LOG.warn(
+            LOG.warning(
                "using sequential sample packing with non-sequential sampler, did you want to also enable curriculum_sampling?"
            )

--- a/src/axolotl/utils/schemas/config.py
+++ b/src/axolotl/utils/schemas/config.py
@@ -512,10 +512,17 @@ class AxolotlInputConfig(
    @model_validator(mode="before")
    @classmethod
    def hint_sample_packing_padding(cls, data):
-        if data.get("sample_packing") and not data.get("pad_to_sequence_len"):
-            LOG.warning(
-                "`pad_to_sequence_len: true` is recommended when using sample_packing"
-            )
+        if data.get("sample_packing"):
+            pad_to_sequence_len = data.get("pad_to_sequence_len")
+            if pad_to_sequence_len is False:
+                LOG.warning(
+                    "`pad_to_sequence_len: true` is recommended when using sample_packing"
+                )
+            elif pad_to_sequence_len is None:
+                LOG.info(
+                    "Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing"
+                )
+                data["pad_to_sequence_len"] = True
        return data

    @model_validator(mode="before")
--- a/src/axolotl/utils/schemas/trl.py
+++ b/src/axolotl/utils/schemas/trl.py
@@ -67,6 +67,12 @@ class TRLConfig(BaseModel):
        default=False,
        json_schema_extra={"description": "Whether to log completions"},
    )
+    num_completions_to_print: int | None = Field(
+        default=None,
+        json_schema_extra={
+            "description": "Number of completions to print. If `log_completions` is `True`, this will be the number of completions logged."
+        },
+    )
    sync_ref_model: bool | None = Field(
        default=False,
        json_schema_extra={
--- a/src/axolotl/utils/trainer.py
+++ b/src/axolotl/utils/trainer.py
@@ -597,6 +597,8 @@ def prepare_optim_env(cfg):
        os.environ["ACCELERATE_MIXED_PRECISION"] = "bf16"
    elif cfg.fp16:
        os.environ["ACCELERATE_MIXED_PRECISION"] = "fp16"
+    else:
+        os.environ["ACCELERATE_MIXED_PRECISION"] = "no"


 def prepare_opinionated_env(cfg):
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -4,6 +4,7 @@ shared pytest fixtures

 import functools
 import importlib
+import os
 import shutil
 import sys
 import tempfile
@@ -529,31 +530,32 @@ def dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff(


 # # pylint: disable=redefined-outer-name,unused-argument
-# def test_load_fixtures(
-#     download_smollm2_135m_model,
-#     download_llama_68m_random_model,
-#     download_qwen_2_5_half_billion_model,
-#     download_tatsu_lab_alpaca_dataset,
-#     download_mhenrichsen_alpaca_2k_dataset,
-#     download_mhenrichsen_alpaca_2k_w_revision_dataset,
-#     download_mlabonne_finetome_100k_dataset,
-#     download_argilla_distilabel_capybara_dpo_7k_binarized_dataset,
-#     download_argilla_ultrafeedback_binarized_preferences_cleaned_dataset,
-#     download_fozzie_alpaca_dpo_dataset,
-#     download_arcee_ai_distilabel_intel_orca_dpo_pairs_dataset,
-#     download_argilla_dpo_pairs_dataset,
-#     download_tiny_shakespeare_dataset,
-#     download_deepseek_model_fixture,
-#     download_huggyllama_model_fixture,
-#     download_llama_1b_model_fixture,
-#     download_llama3_8b_model_fixture,
-#     download_llama3_8b_instruct_model_fixture,
-#     download_phi_35_mini_model_fixture,
-#     download_phi_3_medium_model_fixture,
-#     download_mistral_7b_model_fixture,
-#     download_gemma_2b_model_fixture,
-#     download_gemma2_9b_model_fixture,
-#     download_mlx_mistral_7b_model_fixture,
-#     download_llama2_model_fixture,
-# ):
-#     pass
+@pytest.mark.skipif(
+    os.environ.get("AXOLOTL_IS_CI_CACHE_PRELOAD", "-1") != "1",
+    reason="Not running in CI cache preload",
+)
+def test_load_fixtures(
+    download_smollm2_135m_model,
+    download_qwen_2_5_half_billion_model,
+    download_tatsu_lab_alpaca_dataset,
+    download_mhenrichsen_alpaca_2k_dataset,
+    download_mhenrichsen_alpaca_2k_w_revision_dataset,
+    download_mlabonne_finetome_100k_dataset,
+    download_argilla_distilabel_capybara_dpo_7k_binarized_dataset,
+    download_arcee_ai_distilabel_intel_orca_dpo_pairs_dataset,
+    download_argilla_dpo_pairs_dataset,
+    download_tiny_shakespeare_dataset,
+    download_deepseek_model_fixture,
+    download_huggyllama_model_fixture,
+    download_llama_1b_model_fixture,
+    download_llama3_8b_model_fixture,
+    download_llama3_8b_instruct_model_fixture,
+    download_phi_35_mini_model_fixture,
+    download_phi_3_medium_model_fixture,
+    download_mistral_7b_model_fixture,
+    download_gemma_2b_model_fixture,
+    download_gemma2_9b_model_fixture,
+    download_mlx_mistral_7b_model_fixture,
+    download_llama2_model_fixture,
+):
+    pass
--- a/tests/patched/test_validation.py
+++ b/tests/patched/test_validation.py
@@ -648,7 +648,7 @@ class TestValidation(BaseValidation):
            DictDefault(
                {
                    "sample_packing": True,
-                    "pad_to_sequence_len": None,
+                    "pad_to_sequence_len": False,
                    "flash_attention": True,
                }
            )
@@ -662,6 +662,26 @@ class TestValidation(BaseValidation):
                for record in self._caplog.records
            )

+    def test_packing_autoset(self, minimal_cfg):
+        cfg = (
+            DictDefault(
+                {
+                    "sample_packing": True,
+                    "pad_to_sequence_len": None,
+                    "flash_attention": True,
+                }
+            )
+            | minimal_cfg
+        )
+        with self._caplog.at_level(logging.INFO):
+            cfg = validate_config(cfg)
+            assert any(
+                "Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing"
+                in record.message
+                for record in self._caplog.records
+            )
+            assert cfg.pad_to_sequence_len is True
+
    def test_merge_lora_no_bf16_fail(self, minimal_cfg):
        """
        This is assumed to be run on a CPU machine, so bf16 is not supported.
Author	SHA1	Message	Date
Wing Lian	d790371b64	bump peft to 3.5.1	2025-05-06 11:38:14 -04:00
mhenrichsen	a6cac5dd32	Update lr_scheduler options in config.qmd to include additional scheduling strategies for improved training flexibility. (#2636 ) [skip ci]	2025-05-06 11:24:07 -04:00
Wing Lian	b71c0e3447	Print axolotl art if train is called outside of cli: (#2627 ) [skip ci]	2025-05-06 11:18:45 -04:00
Wing Lian	ddaebf8309	fix dpo eval override to call grandparent instead of the broken super (#2628 ) [skip ci]	2025-05-06 11:18:25 -04:00
Wing Lian	679743087a	make sure gc_steps is used for all trainers (#2638 )	2025-05-06 11:18:00 -04:00
Wing Lian	f720b6e72d	repop cache (#2639 ) * repop cache * pre-cache as a step * fix the name * add reason for pytest skipif * restore pytorch matrix * remove max-parallel now that we've optimized this a bit	2025-05-06 11:09:07 -04:00
mhenrichsen	a980618fd0	Adds example for training a TTS model on top of a LLM. (#2614 ) * Adds example for training a TTS model on top of a LLM. * Update examples/orpheus/finetune.yml Co-authored-by: NanoCode012 <nano@axolotl.ai> * Update examples/orpheus/finetune.yml Co-authored-by: NanoCode012 <nano@axolotl.ai> * Update README.md to clarify GPU requirements for finetuning Orpheus TTS model * Update finetune.yml to use the new base model canopylabs/orpheus-3b-0.1-pretrained * Update finetune.yml and README.md for consistency and clarity --------- Co-authored-by: NanoCode012 <nano@axolotl.ai>	2025-05-06 10:11:06 +02:00
Emmanuel Ferdman	54960d4de0	Fix logging deprecation warnings (#2623 ) Signed-off-by: Emmanuel Ferdman <emmanuelferdman@gmail.com>	2025-05-04 08:22:45 -04:00
Wing Lian	ed922796b7	include multipack support for qwen3 family (#2622 )	2025-05-03 12:02:39 -04:00
Wing Lian	3dd9c3bf3f	setup hf transfer too and fix auto bf16 when fp16 enabled (#2620 ) [skip ci]	2025-05-03 12:02:26 -04:00
Wing Lian	0ba7d362fa	qwen3 and qwen3_moe support for liger kernels (#2612 ) * qwen3 and qwen3_moe support for liger kernels * fix moe module path * fix: qwen3 liger input args and mlp * fix: qwen3 input args and output class --------- Co-authored-by: NanoCode012 <nano@axolotl.ai>	2025-05-02 09:29:55 -04:00
aitechguy	e4f73bc98e	remove keys to incoporate changes for the trl update (#2616 )	2025-05-02 08:47:42 -04:00
Wing Lian	bcb59c70e2	automatically set pad_to_sequence_len when use packing (#2607 ) * automatically set pad_to_sequence_len when use packing * update tests	2025-05-01 13:24:38 -04:00
NanoCode012	6a3e6f8c53	fix: run preview-docs only when md/qmd changes (#2606 ) * fix: run preview-docs only when md/qmd changes * feat: add quarto yaml based on PR feedback	2025-05-01 13:21:28 -04:00
Wing Lian	fee3c13bb5	Logging config for colab (#2611 ) * only configure logging on cli to play nicely with colab * allow reloading the config on the fly from a dict * make sure to use dict for yaml * reuse existing function for load * make cli args optional * mps fix and respect max_steps	2025-05-01 12:58:00 -04:00
Rahul Tuli	996fc124e5	Add: Sparse Finetuning Integration with llmcompressor (#2479 ) * Add: SFTPlugin with llmcompressor * Update: review comments! * Add:llmcompressor instalable * pre commit hooks * Use: warning over warn * Revert: TODO's * Update llmcompressor version to latest * Apply suggestions from @markurtz Co-authored-by: Mark Kurtz <mark.j.kurtz@gmail.com> * Address review comments from @markurtz * Add: llcompressor installable * Rename: sft.yaml to sparse-finetuning.yaml * Use: absolute import * Update model config * Move: LLMCompressorPlugin into it's own submodule * Add: `llm_compressor` integration documentation * Rebase and updates! * Tests, Style, Updates * Add: .qmd file * Address Review Comments: * deleted redundant docs/llm_compressor.qmd * incorporated feedback in integration README.md * added llmcompressor integration to docs/custom_integrations.qmd Signed-off-by: Rahul Tuli <rtuli@redhat.com> * Add: line about further optimizations using llmcompressor Signed-off-by: Rahul Tuli <rtuli@redhat.com> * Apply patch from @winglian Signed-off-by: Rahul Tuli <rtuli@redhat.com> * Fix: Test Signed-off-by: Rahul Tuli <rtuli@redhat.com> * additional fixes for docker and saving compressed * split llmcompressor from vllm checks * Reset session between tests Signed-off-by: Rahul Tuli <rtuli@redhat.com> * move decorator to test method instead of class * make sure to reset the session after each test * move import of llmcompressor to reset session inside test --------- Signed-off-by: Rahul Tuli <rtuli@redhat.com> Co-authored-by: Mark Kurtz <mark.j.kurtz@gmail.com> Co-authored-by: Wing Lian <wing@axolotl.ai>	2025-05-01 12:25:16 -04:00
Wing Lian	e963990ad7	add missing __init__ for lr monkeypatch fix (#2609 )	2025-05-01 09:41:32 -04:00
Dhruv Mullick	c3f2b1c5c2	Add num_completions_to_print for trl and grpo (#2604 )	2025-04-30 21:00:30 -04:00
Wing Lian	6ba5c0ed2c	use latest hf-xet and don't install vllm for torch 2.7.0 (#2603 ) * use latest hf-xet and don't install vllm for torch 2.7.0 * fix runpod hub tests	2025-04-30 18:27:39 -04:00