fix(trl): remove access to invalid property

2025-05-02 15:41:53 +07:00
21 changed files with 71 additions and 941 deletions
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -44,98 +44,12 @@ jobs:
        env:
          SKIP: no-commit-to-branch

-  preload-cache:
-    name: Preload HF cache
-    runs-on: ubuntu-latest
-    strategy:
-      fail-fast: false
-      matrix:
-        python_version: ["3.11"]
-        pytorch_version: ["2.6.0"]
-    timeout-minutes: 20
-
-    env:
-      AXOLOTL_IS_CI_CACHE_PRELOAD: "1"
-
-    steps:
-      - name: Check out repository code
-        uses: actions/checkout@v4
-
-      - name: Restore HF cache
-        id: hf-cache-restore
-        uses: actions/cache/restore@v4
-        with:
-          path: |
-            /home/runner/.cache/huggingface/hub/datasets--*
-            /home/runner/.cache/huggingface/hub/models--*
-          key: ${{ runner.os }}-hf-hub-cache-v2
-
-      - name: Setup Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: ${{ matrix.python_version }}
-          cache: 'pip' # caching pip dependencies
-
-      - name: upgrade pip
-        run: |
-          pip3 install --upgrade pip
-          pip3 install --upgrade packaging==23.2 setuptools==75.8.0 wheel
-
-      - name: Install PyTorch
-        run: |
-          pip3 install torch==${{ matrix.pytorch_version }}
-
-      - name: Install dependencies
-        run: |
-          pip3 show torch
-          pip3 install --no-build-isolation -U -e .
-          python scripts/unsloth_install.py | sh
-          python scripts/cutcrossentropy_install.py | sh
-          pip3 install -r requirements-dev.txt -r requirements-tests.txt
-
-      - name: Make sure PyTorch version wasn't clobbered
-        run: |
-          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
-
-      - name: Ensure axolotl CLI was installed
-        run: |
-          axolotl --help
-
-      - name: Pre-Download dataset fixture
-        run: |
-          huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
-
-      - name: Run tests
-        run: |
-          pytest -v tests/conftest.py
-
-      - name: Upload coverage to Codecov
-        uses: codecov/codecov-action@v5
-        with:
-          token: ${{ secrets.CODECOV_TOKEN }}
-          files: ./coverage.xml
-          flags: unittests,pytorch-${{ matrix.pytorch_version }}
-          fail_ci_if_error: false
-
-      - name: cleanup pip cache
-        run: |
-          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
-
-      - name: Save HF cache
-        id: hf-cache
-        uses: actions/cache/save@v4
-        with:
-          path: |
-            /home/runner/.cache/huggingface/hub/datasets--*
-            /home/runner/.cache/huggingface/hub/models--*
-          key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
-
  pytest:
    name: PyTest
    runs-on: ubuntu-latest
-    needs: [preload-cache]
    strategy:
      fail-fast: false
+      max-parallel: 2
      matrix:
        python_version: ["3.11"]
        pytorch_version: ["2.5.1", "2.6.0", "2.7.0"]
@@ -207,12 +121,21 @@ jobs:
        run: |
          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;

+      - name: Save HF cache
+        id: hf-cache
+        uses: actions/cache/save@v4
+        with:
+          path: |
+            /home/runner/.cache/huggingface/hub/datasets--*
+            /home/runner/.cache/huggingface/hub/models--*
+          key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
+
  pytest-sdist:
    name: PyTest from Source Dist
    runs-on: ubuntu-latest
-    needs: [preload-cache]
    strategy:
      fail-fast: false
+      max-parallel: 1
      matrix:
        python_version: ["3.11"]
        pytorch_version: ["2.5.1", "2.6.0", "2.7.0"]
@@ -276,6 +199,15 @@ jobs:
        run: |
          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;

+      - name: Save HF cache
+        id: hf-cache
+        uses: actions/cache/save@v4
+        with:
+          path: |
+            /home/runner/.cache/huggingface/hub/datasets--*
+            /home/runner/.cache/huggingface/hub/models--*
+          key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
+
  docker-e2e-tests-1st:
    if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' }}
    # this job needs to be run on self-hosted GPU runners...
--- a/docs/config.qmd
+++ b/docs/config.qmd
@@ -547,7 +547,7 @@ gradient_checkpointing: false
 early_stopping_patience: 3

 # Specify a scheduler and kwargs to use with the optimizer
-lr_scheduler: # 'one_cycle' | 'rex' | 'log_sweep' | 'linear' | 'cosine_with_restarts' | 'polynomial' | 'constant' | 'constant_with_warmup' | 'inverse_sqrt' | 'reduce_lr_on_plateau' | 'cosine_with_min_lr' | 'warmup_stable_decay' | empty for cosine
+lr_scheduler: # 'one_cycle' | 'rex' | 'log_sweep' | empty for cosine
 lr_scheduler_kwargs:
 cosine_min_lr_ratio: # decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of peak lr
 cosine_constant_lr_ratio: # freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means start cosine_min_lr at 80% of training step (https://arxiv.org/pdf/2308.04014.pdf)
--- a/examples/orpheus/README.md
+++ b/examples/orpheus/README.md
@@ -1,341 +0,0 @@
-# Finetuning LLMs to output audio
-
-In this example, we finetune Orpcanopylabs/orpheus-tts-0.1-pretrained (a LLaMA 3.2 3b model) to output audio.
-
-The `finetune.yml` withe current settings will run on any Nvidia GPU with 45GB VRAM or more. If you adjust the batch size it can easily run on any GPU under 24GB.
-
-## Dataset pre-processing for pre-training
-If you are adding another voice in English, please jump ahead to finetuning pre-processing.
-
-For this to work, we need to preprocess our dataset. Since we are expecting to output audio, we will need to add tokens to the tokenizer.
-
-Using this code, it will download the SNAC model and add the correct tokens and upload the final dataset.
-
-```python
-import torch
-from snac import SNAC
-from datasets import load_dataset
-from huggingface_hub import snapshot_download
-from datasets import load_dataset
-import random
-import torchaudio.transforms as T
-from transformers import AutoTokenizer
-import os
-
-my_original_dataset_name = "<huggingface-id-of-dataset-that-we-want-to-preprocess>"
-name_to_push_dataset_to = "<huggingface-id-of-where-to-save-dataset>"
-
-dsn = my_original_dataset_name
-
-snapshot_download(
-    repo_id=dsn,
-    repo_type="dataset",
-    revision="main",
-    max_workers=64,
-)
-
-
-ds = load_dataset(dsn, split="train")
-ds_sample_rate = ds[0]["audio"]["sampling_rate"]
-
-model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
-model = model.to("mps")
-
-def tokenise_audio(waveform):
-  waveform = torch.from_numpy(waveform).unsqueeze(0)
-  waveform = waveform.to(dtype=torch.float32)
-  resample_transform = T.Resample(orig_freq=ds_sample_rate, new_freq=24000)
-  waveform = resample_transform(waveform)
-
-  waveform = waveform.unsqueeze(0).to("cuda")
-
-  #generate the codes from snac
-  with torch.inference_mode():
-    codes = model.encode(waveform)
-
-  all_codes = []
-  for i in range(codes[0].shape[1]):
-    all_codes.append(codes[0][0][i].item()+128266)
-    all_codes.append(codes[1][0][2*i].item()+128266+4096)
-    all_codes.append(codes[2][0][4*i].item()+128266+(2*4096))
-    all_codes.append(codes[2][0][(4*i)+1].item()+128266+(3*4096))
-    all_codes.append(codes[1][0][(2*i)+1].item()+128266+(4*4096))
-    all_codes.append(codes[2][0][(4*i)+2].item()+128266+(5*4096))
-    all_codes.append(codes[2][0][(4*i)+3].item()+128266+(6*4096))
-
-
-  return all_codes
-
-def add_codes(example):
-    # Always initialize codes_list to None
-    codes_list = None
-
-    try:
-        answer_audio = example.get("audio")
-        # If there's a valid audio array, tokenise it
-        if answer_audio and "array" in answer_audio:
-            audio_array = answer_audio["array"]
-            codes_list = tokenise_audio(audio_array)
-    except Exception as e:
-        print(f"Skipping row due to error: {e}")
-        # Keep codes_list as None if we fail
-    example["codes_list"] = codes_list
-
-    return example
-
-ds = ds.map(add_codes, remove_columns=["audio"])
-
-#@title Load Tokenizer
-tokeniser_length = 128256
-start_of_text = 128000
-end_of_text = 128009
-
-start_of_speech = tokeniser_length + 1
-end_of_speech = tokeniser_length + 2
-
-start_of_human = tokeniser_length + 3
-end_of_human = tokeniser_length + 4
-
-start_of_ai = tokeniser_length + 5
-end_of_ai =  tokeniser_length + 6
-pad_token = tokeniser_length + 7
-
-audio_tokens_start = tokeniser_length + 10
-
-tokenizer_name = "canopylabs/orpheus-3b-0.1-pretrained"
-
-
-tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
-num_proc = os.cpu_count() - 2
-
-ds = ds.filter(lambda x: x["codes_list"] is not None)
-ds = ds.filter(lambda x: len(x["codes_list"]) > 0)
-
-#@title Create Input Ids
-def remove_duplicate_frames(example):
-    vals = example["codes_list"]
-    if len(vals) % 7 != 0:
-        raise ValueError("Input list length must be divisible by 7")
-
-    result = vals[:7]
-
-    removed_frames = 0
-
-    for i in range(7, len(vals), 7):
-        current_first = vals[i]
-        previous_first = result[-7]
-
-        if current_first != previous_first:
-            result.extend(vals[i:i+7])
-        else:
-            removed_frames += 1
-
-    example["codes_list"] = result
-
-    return example
-
-ds = ds.map(remove_duplicate_frames, num_proc=num_proc)
-
-
-def create_input_ids(example):
-    text_ids = tokenizer.encode({example['text']},  add_special_tokens=True)
-    text_ids.append(end_of_text)
-    example["text_tokens"] = text_ids
-    input_ids = (
-        [start_of_human]
-        + example["text_tokens"]
-        + [end_of_human]
-        + [start_of_ai]
-        + [start_of_speech]
-        + example["codes_list"]
-        + [end_of_speech]
-        + [end_of_ai]
-    )
-    example["input_ids"] = input_ids
-    example["labels"] = input_ids
-    example["attention_mask"] = [1] * len(input_ids)
-
-    return example
-
-ds = ds.map(create_input_ids, num_proc=num_proc, remove_columns=["text", "codes_list"])
-
-#@title Remove unnecessary columns
-columns_to_keep = ["input_ids", "labels", "attention_mask"]
-columns_to_remove = [col for col in ds.column_names if col not in columns_to_keep]
-
-ds = ds.remove_columns(columns_to_remove)
-
-ds.push_to_hub(name_to_push_dataset_to)
-```
-
-
-## Finetune pre-processing
-Use this code to add a new voice.
-
-```python
-import torch
-from snac import SNAC
-from datasets import load_dataset
-from huggingface_hub import snapshot_download
-from datasets import load_dataset
-import random
-import torchaudio.transforms as T
-from transformers import AutoTokenizer
-import os
-
-my_original_dataset_name = "<huggingface-id-of-dataset-that-we-want-to-preprocess>"
-name_to_push_dataset_to = "<huggingface-id-of-where-to-save-dataset>"
-
-dsn = my_original_dataset_name
-
-snapshot_download(
-    repo_id=dsn,
-    repo_type="dataset",
-    revision="main",
-    max_workers=64,
-)
-
-
-ds = load_dataset(dsn, split="train")
-ds_sample_rate = ds[0]["audio"]["sampling_rate"]
-
-model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
-model = model.to("mps")
-
-def tokenise_audio(waveform):
-  waveform = torch.from_numpy(waveform).unsqueeze(0)
-  waveform = waveform.to(dtype=torch.float32)
-  resample_transform = T.Resample(orig_freq=ds_sample_rate, new_freq=24000)
-  waveform = resample_transform(waveform)
-
-  waveform = waveform.unsqueeze(0).to("cuda")
-
-  #generate the codes from snac
-  with torch.inference_mode():
-    codes = model.encode(waveform)
-
-  all_codes = []
-  for i in range(codes[0].shape[1]):
-    all_codes.append(codes[0][0][i].item()+128266)
-    all_codes.append(codes[1][0][2*i].item()+128266+4096)
-    all_codes.append(codes[2][0][4*i].item()+128266+(2*4096))
-    all_codes.append(codes[2][0][(4*i)+1].item()+128266+(3*4096))
-    all_codes.append(codes[1][0][(2*i)+1].item()+128266+(4*4096))
-    all_codes.append(codes[2][0][(4*i)+2].item()+128266+(5*4096))
-    all_codes.append(codes[2][0][(4*i)+3].item()+128266+(6*4096))
-
-
-  return all_codes
-
-def add_codes(example):
-    # Always initialize codes_list to None
-    codes_list = None
-
-    try:
-        answer_audio = example.get("audio")
-        # If there's a valid audio array, tokenise it
-        if answer_audio and "array" in answer_audio:
-            audio_array = answer_audio["array"]
-            codes_list = tokenise_audio(audio_array)
-    except Exception as e:
-        print(f"Skipping row due to error: {e}")
-        # Keep codes_list as None if we fail
-    example["codes_list"] = codes_list
-
-    return example
-
-ds = ds.map(add_codes, remove_columns=["audio"])
-
-#@title Load Tokenizer
-tokeniser_length = 128256
-start_of_text = 128000
-end_of_text = 128009
-
-start_of_speech = tokeniser_length + 1
-end_of_speech = tokeniser_length + 2
-
-start_of_human = tokeniser_length + 3
-end_of_human = tokeniser_length + 4
-
-start_of_ai = tokeniser_length + 5
-end_of_ai =  tokeniser_length + 6
-pad_token = tokeniser_length + 7
-
-audio_tokens_start = tokeniser_length + 10
-
-tokenizer_name = "canopylabs/orpheus-3b-0.1-pretrained"
-
-
-tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
-num_proc = os.cpu_count() - 2
-
-ds = ds.filter(lambda x: x["codes_list"] is not None)
-ds = ds.filter(lambda x: len(x["codes_list"]) > 0)
-
-#@title Create Input Ids
-def remove_duplicate_frames(example):
-    vals = example["codes_list"]
-    if len(vals) % 7 != 0:
-        raise ValueError("Input list length must be divisible by 7")
-
-    result = vals[:7]
-
-    removed_frames = 0
-
-    for i in range(7, len(vals), 7):
-        current_first = vals[i]
-        previous_first = result[-7]
-
-        if current_first != previous_first:
-            result.extend(vals[i:i+7])
-        else:
-            removed_frames += 1
-
-    example["codes_list"] = result
-
-    return example
-
-ds = ds.map(remove_duplicate_frames, num_proc=num_proc)
-
-tok_info = '''*** HERE you can modify the text prompt
-i.e. if you wanted a multispeaker model like canopylabs/orpheus-3b-0.1-ft, you can pass:
-f"{example["source"]}:  {example["text"]}", as is passed.
-'''
-print(tok_info)
-
-def create_input_ids(example):
-    text_ids = tokenizer.encode(f"{example['speaker_id']}: {example['text']}",  add_special_tokens=True)
-    text_ids.append(end_of_text)
-    example["text_tokens"] = text_ids
-    input_ids = (
-        [start_of_human]
-        + example["text_tokens"]
-        + [end_of_human]
-        + [start_of_ai]
-        + [start_of_speech]
-        + example["codes_list"]
-        + [end_of_speech]
-        + [end_of_ai]
-    )
-    example["input_ids"] = input_ids
-    example["labels"] = input_ids
-    example["attention_mask"] = [1] * len(input_ids)
-
-    return example
-
-ds = ds.map(create_input_ids, num_proc=num_proc, remove_columns=["text", "codes_list"])
-
-#@title Remove unnecessary columns
-columns_to_keep = ["input_ids", "labels", "attention_mask"]
-columns_to_remove = [col for col in ds.column_names if col not in columns_to_keep]
-
-ds = ds.remove_columns(columns_to_remove)
-
-ds.push_to_hub(name_to_push_dataset_to)
-```
-
-## Training
-After preprocessing is done, fill out the blanks in finetune.yml and simply run `axolotl train finetune.yml`
-
-## Inference
-For inference, please refer to the original [orpheus github](https://github.com/canopyai/Orpheus-TTS/tree/main).
--- a/examples/orpheus/finetune.yml
+++ b/examples/orpheus/finetune.yml
@@ -1,52 +0,0 @@
-base_model: canopylabs/orpheus-3b-0.1-pretrained
-
-hub_model_id: <your-hub-model-id>
-
-plugins:
-  - axolotl.integrations.liger.LigerPlugin
-liger_rope: true
-liger_rms_norm: true
-liger_glu_activation: true
-liger_fused_linear_cross_entropy: true
-
-datasets:
-  - path: <your-hf-dataset-id>
-    type:  # leave empty to load pre-tokenized
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.01
-output_dir: ./outputs/out
-
-sequence_len: 8192
-sample_packing: true
-pad_to_sequence_len: true
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 8
-micro_batch_size: 4
-num_epochs: 3
-optimizer: adamw_torch_fused
-lr_scheduler: cosine
-learning_rate: 2e-5
-
-bf16: auto
-tf32: false
-
-gradient_checkpointing: true
-gradient_checkpointing_kwargs:
-  use_reentrant: false
-resume_from_checkpoint:
-logging_steps: 1
-flash_attention: true
-
-warmup_steps: 20
-evals_per_epoch: 5
-saves_per_epoch: 5
-weight_decay: 0.05
-
-special_tokens:
-  pad_token: <custom_token_7>
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,7 +15,7 @@ peft==0.15.2
 transformers==4.51.3
 tokenizers>=0.21.1
 accelerate==1.6.0
-datasets==3.5.1
+datasets==3.5.0
 deepspeed>=0.15.4
 trl==0.17.0
 hf_xet==1.1.0
--- a/src/axolotl/cli/art.py
+++ b/src/axolotl/cli/art.py
@@ -16,15 +16,8 @@ AXOLOTL_LOGO = """
    @@@@  @@@@@@@@@@@@@@@@
 """

-HAS_PRINTED_LOGO = False
-

 def print_axolotl_text_art():
    """Prints axolotl ASCII art."""
-
-    global HAS_PRINTED_LOGO  # pylint: disable=global-statement
-    if HAS_PRINTED_LOGO:
-        return
    if is_main_process():
-        HAS_PRINTED_LOGO = True
        print(AXOLOTL_LOGO)
--- a/src/axolotl/cli/evaluate.py
+++ b/src/axolotl/cli/evaluate.py
@@ -15,7 +15,7 @@ from axolotl.cli.checks import check_accelerate_default_config, check_user_token
 from axolotl.cli.config import load_cfg
 from axolotl.common.datasets import load_datasets, load_preference_datasets
 from axolotl.evaluate import evaluate
-from axolotl.utils import patch_optimized_env
+from axolotl.utils import set_pytorch_cuda_alloc_conf
 from axolotl.utils.dict import DictDefault

 LOG = logging.getLogger(__name__)
@@ -32,7 +32,7 @@ def do_evaluate(cfg: DictDefault, cli_args: TrainerCliArgs) -> None:
        cli_args: CLI arguments.
    """
    # Enable expandable segments for cuda allocation to improve VRAM usage
-    patch_optimized_env()
+    set_pytorch_cuda_alloc_conf()

    # pylint: disable=duplicate-code
    print_axolotl_text_art()
--- a/src/axolotl/cli/main.py
+++ b/src/axolotl/cli/main.py
@@ -29,7 +29,7 @@ from axolotl.cli.utils import (
    filter_none_kwargs,
 )
 from axolotl.integrations.lm_eval.cli import lm_eval
-from axolotl.utils import patch_optimized_env
+from axolotl.utils import set_pytorch_cuda_alloc_conf
 from axolotl.utils.schemas.config import AxolotlInputConfig


@@ -55,8 +55,6 @@ def preprocess(config: str, cloud: Optional[str] = None, **kwargs) -> None:
        kwargs: Additional keyword arguments which correspond to CLI args or `axolotl`
            config options.
    """
-    patch_optimized_env()
-
    if cloud:
        from axolotl.cli.cloud import do_cli_preprocess

@@ -102,7 +100,7 @@ def train(
            config options.
    """
    # Enable expandable segments for cuda allocation to improve VRAM usage
-    patch_optimized_env()
+    set_pytorch_cuda_alloc_conf()

    if "use_ray" in kwargs and kwargs["use_ray"]:
        accelerate = False
--- a/src/axolotl/cli/train.py
+++ b/src/axolotl/cli/train.py
@@ -18,7 +18,7 @@ from axolotl.cli.config import load_cfg
 from axolotl.common.datasets import load_datasets, load_preference_datasets
 from axolotl.integrations.base import PluginManager
 from axolotl.train import train
-from axolotl.utils import patch_optimized_env
+from axolotl.utils import set_pytorch_cuda_alloc_conf
 from axolotl.utils.config import normalize_config, resolve_dtype
 from axolotl.utils.dict import DictDefault

@@ -36,7 +36,7 @@ def do_train(cfg: DictDefault, cli_args: TrainerCliArgs):
        cli_args: Training-specific CLI arguments.
    """
    # Enable expandable segments for cuda allocation to improve VRAM usage
-    patch_optimized_env()
+    set_pytorch_cuda_alloc_conf()

    print_axolotl_text_art()
    check_accelerate_default_config()
--- a/src/axolotl/common/datasets.py
+++ b/src/axolotl/common/datasets.py
@@ -48,7 +48,6 @@ def load_datasets(
    *,
    cfg: DictDefault,
    cli_args: PreprocessCliArgs | TrainerCliArgs | None = None,
-    debug: bool = False,
 ) -> TrainDatasetMeta:
    """
    Loads one or more training or evaluation datasets, calling
@@ -57,7 +56,6 @@ def load_datasets(
    Args:
        cfg: Dictionary mapping `axolotl` config keys to values.
        cli_args: Command-specific CLI arguments.
-        debug: Whether to print out tokenization of sample

    Returns:
        Dataclass with fields for training and evaluation datasets and the computed
@@ -79,25 +77,20 @@ def load_datasets(
        preprocess_iterable=preprocess_iterable,
    )

-    if (  # pylint: disable=too-many-boolean-expressions
-        cli_args
-        and (
-            cli_args.debug
-            or cfg.debug
-            or cli_args.debug_text_only
-            or int(cli_args.debug_num_examples) > 0
-        )
-    ) or debug:
+    if cli_args and (
+        cli_args.debug
+        or cfg.debug
+        or cli_args.debug_text_only
+        or int(cli_args.debug_num_examples) > 0
+    ):
        LOG.info("check_dataset_labels...")

-        num_examples = cli_args.debug_num_examples if cli_args else 1
-        text_only = cli_args.debug_text_only if cli_args else False
-        train_samples = sample_dataset(train_dataset, num_examples)
+        train_samples = sample_dataset(train_dataset, cli_args.debug_num_examples)
        check_dataset_labels(
            train_samples,
            tokenizer,
-            num_examples=num_examples,
-            text_only=text_only,
+            num_examples=cli_args.debug_num_examples,
+            text_only=cli_args.debug_text_only,
        )

        LOG.info("printing prompters...")
--- a/src/axolotl/core/trainer_builder.py
+++ b/src/axolotl/core/trainer_builder.py
@@ -168,9 +168,6 @@ class TrainerBuilderBase(abc.ABC):
                )
            )

-        if self.cfg.gc_steps:
-            callbacks.append(GCCallback(gc_steps=self.cfg.gc_steps))
-
        if self.cfg.use_wandb:
            callbacks.append(
                SaveAxolotlConfigtoWandBCallback(self.cfg.axolotl_config_path)
@@ -252,6 +249,9 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
        if self.cfg.loss_watchdog_threshold is not None:
            callbacks.append(LossWatchDogCallback(self.cfg))

+        if self.cfg.gc_steps:
+            callbacks.append(GCCallback(gc_steps=self.cfg.gc_steps))
+
        return callbacks

    def get_post_trainer_create_callbacks(self, trainer):
--- a/src/axolotl/core/trainers/dpo/trainer.py
+++ b/src/axolotl/core/trainers/dpo/trainer.py
@@ -247,9 +247,7 @@ class AxolotlDPOTrainer(RngLoaderMixin, SchedulerMixin, DPOTrainer):
                )

        # Base evaluation
-        initial_output = super(  # pylint: disable=bad-super-call
-            DPOTrainer, self
-        ).evaluation_loop(
+        initial_output = super().evaluation_loop(
            dataloader,
            description,
            prediction_loss_only,
--- a/src/axolotl/integrations/liger/init.py
+++ b/src/axolotl/integrations/liger/init.py
@@ -151,30 +151,6 @@ class LigerPlugin(BasePlugin):
                rms_norm=cfg.liger_rms_norm,
                layer_norm=cfg.liger_layer_norm,
            )
-        elif cfg.model_config_type == "qwen3":
-            from axolotl.integrations.liger.models.qwen3 import (
-                apply_liger_kernel_to_qwen3,
-            )
-
-            apply_liger_kernel_to_qwen3(
-                cross_entropy=cfg.liger_cross_entropy,
-                fused_linear_cross_entropy=cfg.liger_fused_linear_cross_entropy,
-                glu_activation=cfg.liger_glu_activation,
-                rms_norm=cfg.liger_rms_norm,
-                layer_norm=cfg.liger_layer_norm,
-            )
-        elif cfg.model_config_type == "qwen3_moe":
-            from axolotl.integrations.liger.models.qwen3_moe import (
-                apply_liger_kernel_to_qwen3_moe,
-            )
-
-            apply_liger_kernel_to_qwen3_moe(
-                cross_entropy=cfg.liger_cross_entropy,
-                fused_linear_cross_entropy=cfg.liger_fused_linear_cross_entropy,
-                glu_activation=cfg.liger_glu_activation,
-                rms_norm=cfg.liger_rms_norm,
-                layer_norm=cfg.liger_layer_norm,
-            )
        else:
            logging.warning(
                f"Unsupported model config type: {cfg.model_config_type}. Liger not applied."
--- a/src/axolotl/integrations/liger/models/qwen3.py
+++ b/src/axolotl/integrations/liger/models/qwen3.py
@@ -1,160 +0,0 @@
-"""
-Liger FLCE for Qwen3. Based on transformers v4.51.3.
-"""
-
-import sys
-from typing import Optional, Tuple, Union
-
-import torch
-from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
-from transformers.cache_utils import Cache
-from transformers.modeling_outputs import CausalLMOutputWithPast
-
-
-def lce_forward(
-    self,
-    input_ids: Optional[torch.LongTensor] = None,
-    attention_mask: Optional[torch.Tensor] = None,
-    position_ids: Optional[torch.LongTensor] = None,
-    past_key_values: Optional[Cache] = None,
-    inputs_embeds: Optional[torch.FloatTensor] = None,
-    labels: Optional[torch.LongTensor] = None,
-    use_cache: Optional[bool] = None,
-    output_attentions: Optional[bool] = None,
-    output_hidden_states: Optional[bool] = None,
-    cache_position: Optional[torch.LongTensor] = None,
-    logits_to_keep: Union[int, torch.Tensor] = 0,
-    **kwargs,
-) -> Union[Tuple, CausalLMOutputWithPast]:
-    r"""
-    Args:
-        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
-            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
-            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-
-        logits_to_keep (`int` or `torch.Tensor`, *optional*):
-            If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
-            `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
-            token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
-            If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
-            This is useful when using packed tensor format (single dimension for batch and sequence length).
-
-    Returns:
-    """
-
-    # pylint: disable=duplicate-code
-    output_attentions = (
-        output_attentions
-        if output_attentions is not None
-        else self.config.output_attentions
-    )
-    output_hidden_states = (
-        output_hidden_states
-        if output_hidden_states is not None
-        else self.config.output_hidden_states
-    )
-
-    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
-    outputs = self.model(
-        input_ids=input_ids,
-        attention_mask=attention_mask,
-        position_ids=position_ids,
-        past_key_values=past_key_values,
-        inputs_embeds=inputs_embeds,
-        use_cache=use_cache,
-        output_attentions=output_attentions,
-        output_hidden_states=output_hidden_states,
-        cache_position=cache_position,
-        **kwargs,
-    )
-
-    hidden_states = outputs[0]
-
-    logits = None
-    loss = None
-    # if in training mode, don't materialize logits
-    if self.training and (labels is not None):
-        loss = LigerForCausalLMLoss(
-            hidden_states=hidden_states,
-            lm_head_weight=self.lm_head.weight,
-            labels=labels,
-            hidden_size=self.config.hidden_size,
-            **kwargs,
-        )
-
-    else:  # if in inference mode materialize logits
-        slice_indices = (
-            slice(-logits_to_keep, None)
-            if isinstance(logits_to_keep, int)
-            else logits_to_keep
-        )
-        logits = self.lm_head(hidden_states[:, slice_indices, :])
-        if labels is not None:
-            loss = self.loss_function(
-                logits=logits,
-                labels=labels,
-                vocab_size=self.config.vocab_size,
-                **kwargs,
-            )
-
-    return CausalLMOutputWithPast(
-        loss=loss,
-        logits=logits,
-        past_key_values=outputs.past_key_values,
-        hidden_states=outputs.hidden_states,
-        attentions=outputs.attentions,
-    )
-
-
-def apply_liger_kernel_to_qwen3(
-    cross_entropy: bool = False,
-    fused_linear_cross_entropy: bool = False,
-    rms_norm: bool = False,
-    glu_activation: bool = False,
-    layer_norm: bool = False,
-    **kwargs,  # pylint: disable=unused-argument
-) -> None:
-    # pylint: disable=duplicate-code
-    """
-    Apply Liger kernels to replace original implementation in HuggingFace Llama models (2 and 3)
-
-    Args:
-        cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
-        fused_linear_cross_entropy (bool):
-            Whether to apply Liger's fused linear cross entropy loss. Default is False.
-            `cross_entropy` and `fused_linear_cross_entropy` cannot both be False.
-            If `fused_linear_cross_entropy` is True, the logits will not be materialized but more memory efficient.
-        rms_norm (bool): Whether to apply Liger's RMSNorm. Default is False.
-        glu_activation (bool): Whether to apply Liger's SwiGLU MLP. Default is False.
-        layer_norm (bool): Whether to apply Liger's LayerNorm. Default is False.
-    """
-
-    import transformers.models.qwen3.modeling_qwen3  # noqa: F401  # pylint: disable=unused-import
-    from liger_kernel.transformers.functional import liger_cross_entropy
-    from liger_kernel.transformers.layer_norm import LigerLayerNorm
-    from liger_kernel.transformers.rms_norm import LigerRMSNorm
-    from liger_kernel.transformers.swiglu import LigerSwiGLUMLP
-
-    assert not (
-        cross_entropy and fused_linear_cross_entropy
-    ), "cross_entropy and fused_linear_cross_entropy cannot both be True."
-
-    modeling_qwen3 = sys.modules["transformers.models.qwen3.modeling_qwen3"]
-
-    if rms_norm:
-        modeling_qwen3.Qwen3RMSNorm = LigerRMSNorm
-
-    if glu_activation:
-        modeling_qwen3.Qwen3MLP = LigerSwiGLUMLP
-
-    if layer_norm:
-        modeling_qwen3.nn.LayerNorm = LigerLayerNorm
-
-    if cross_entropy:
-        from transformers.loss.loss_utils import nn
-
-        nn.functional.cross_entropy = liger_cross_entropy
-
-    if fused_linear_cross_entropy:
-        modeling_qwen3.Qwen3ForCausalLM.forward = lce_forward
--- a/src/axolotl/integrations/liger/models/qwen3_moe.py
+++ b/src/axolotl/integrations/liger/models/qwen3_moe.py
@@ -1,191 +0,0 @@
-"""
-Liger FLCE for Qwen3 MoE. Based on transformers v4.51.3.
-"""
-
-import sys
-from copy import deepcopy
-from typing import List, Optional, Union
-
-import torch
-from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
-from transformers.modeling_outputs import MoeCausalLMOutputWithPast
-from transformers.models.qwen3_moe.modeling_qwen3_moe import load_balancing_loss_func
-
-
-def lce_forward(
-    self,
-    input_ids: Optional[torch.LongTensor] = None,
-    attention_mask: Optional[torch.Tensor] = None,
-    position_ids: Optional[torch.LongTensor] = None,
-    past_key_values: Optional[List[torch.FloatTensor]] = None,
-    inputs_embeds: Optional[torch.FloatTensor] = None,
-    labels: Optional[torch.LongTensor] = None,
-    use_cache: Optional[bool] = None,
-    output_attentions: Optional[bool] = None,
-    output_hidden_states: Optional[bool] = None,
-    output_router_logits: Optional[bool] = None,
-    cache_position: Optional[torch.LongTensor] = None,
-    logits_to_keep: Union[int, torch.Tensor] = 0,
-    **kwargs,
-) -> MoeCausalLMOutputWithPast:
-    r"""
-    Args:
-        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
-            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
-            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-
-        logits_to_keep (`int` or `torch.Tensor`, *optional*):
-            If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
-            `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
-            token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
-            If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
-            This is useful when using packed tensor format (single dimension for batch and sequence length).
-
-    Returns:
-    """
-
-    # pylint: disable=duplicate-code
-    output_attentions = (
-        output_attentions
-        if output_attentions is not None
-        else self.config.output_attentions
-    )
-    output_router_logits = (
-        output_router_logits
-        if output_router_logits is not None
-        else self.config.output_router_logits
-    )
-    output_hidden_states = (
-        output_hidden_states
-        if output_hidden_states is not None
-        else self.config.output_hidden_states
-    )
-
-    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
-    outputs = self.model(
-        input_ids=input_ids,
-        attention_mask=attention_mask,
-        position_ids=position_ids,
-        past_key_values=past_key_values,
-        inputs_embeds=inputs_embeds,
-        use_cache=use_cache,
-        output_attentions=output_attentions,
-        output_hidden_states=output_hidden_states,
-        output_router_logits=output_router_logits,
-        cache_position=cache_position,
-        **kwargs,
-    )
-
-    hidden_states = outputs[0]
-
-    logits = None
-    loss = None
-    # if in training mode, don't materialize logits
-    if self.training and (labels is not None):
-        loss = LigerForCausalLMLoss(
-            hidden_states=hidden_states,
-            lm_head_weight=self.lm_head.weight,
-            labels=labels,
-            hidden_size=self.config.hidden_size,
-            **kwargs,
-        )
-
-    else:  # if in inference mode materialize logits
-        slice_indices = (
-            slice(-logits_to_keep, None)
-            if isinstance(logits_to_keep, int)
-            else logits_to_keep
-        )
-        logits = self.lm_head(hidden_states[:, slice_indices, :])
-        if labels is not None:
-            loss = self.loss_function(
-                logits=logits,
-                labels=labels,
-                vocab_size=self.config.vocab_size,
-                **kwargs,
-            )
-
-    aux_loss = None
-    if output_router_logits:
-        aux_loss = load_balancing_loss_func(
-            outputs.router_logits,
-            self.num_experts,
-            self.num_experts_per_tok,
-            attention_mask,
-        )
-        if labels is not None:
-            loss += self.router_aux_loss_coef * aux_loss.to(
-                loss.device
-            )  # make sure to reside in the same device
-
-    return MoeCausalLMOutputWithPast(
-        loss=loss,
-        aux_loss=aux_loss,
-        logits=logits,
-        past_key_values=outputs.past_key_values,
-        hidden_states=outputs.hidden_states,
-        attentions=outputs.attentions,
-    )
-
-
-def apply_liger_kernel_to_qwen3_moe(
-    cross_entropy: bool = False,
-    fused_linear_cross_entropy: bool = False,
-    rms_norm: bool = False,
-    glu_activation: bool = False,
-    layer_norm: bool = False,
-    **kwargs,  # pylint: disable=unused-argument
-) -> None:
-    # pylint: disable=duplicate-code
-    """
-    Apply Liger kernels to replace original implementation in HuggingFace Llama models (2 and 3)
-
-    Args:
-        cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
-        fused_linear_cross_entropy (bool):
-            Whether to apply Liger's fused linear cross entropy loss. Default is False.
-            `cross_entropy` and `fused_linear_cross_entropy` cannot both be False.
-            If `fused_linear_cross_entropy` is True, the logits will not be materialized but more memory efficient.
-        rms_norm (bool): Whether to apply Liger's RMSNorm. Default is False.
-        glu_activation (bool): Whether to apply Liger's SwiGLU MLP. Default is False.
-        layer_norm (bool): Whether to apply Liger's LayerNorm. Default is False.
-    """
-
-    import transformers.models.qwen3_moe.modeling_qwen3_moe  # noqa: F401  # pylint: disable=unused-import
-    from liger_kernel.transformers.functional import liger_cross_entropy
-    from liger_kernel.transformers.layer_norm import LigerLayerNorm
-    from liger_kernel.transformers.rms_norm import LigerRMSNorm
-    from liger_kernel.transformers.swiglu import LigerSwiGLUMLP
-
-    assert not (
-        cross_entropy and fused_linear_cross_entropy
-    ), "cross_entropy and fused_linear_cross_entropy cannot both be True."
-
-    modeling_qwen3_moe = sys.modules["transformers.models.qwen3_moe.modeling_qwen3_moe"]
-
-    if rms_norm:
-        modeling_qwen3_moe.Qwen3MoeRMSNorm = LigerRMSNorm
-
-    if glu_activation:
-
-        def _liger_swiglu_mlp_wrapper(config, intermediate_size=None, **kwargs):
-            "Accepts intermediate_size to pass to LigerSwiGLUMLP"
-            # clone config to avoid modifying the original
-            config = deepcopy(config)
-            if intermediate_size:
-                setattr(config, "intermediate_size", intermediate_size)
-            return LigerSwiGLUMLP(config, **kwargs)
-
-        modeling_qwen3_moe.Qwen3MoeMLP = _liger_swiglu_mlp_wrapper
-
-    if layer_norm:
-        modeling_qwen3_moe.nn.LayerNorm = LigerLayerNorm
-
-    if cross_entropy:
-        from transformers.loss.loss_utils import nn
-
-        nn.functional.cross_entropy = liger_cross_entropy
-
-    if fused_linear_cross_entropy:
-        modeling_qwen3_moe.Qwen3MoeForCausalLM.forward = lce_forward
--- a/src/axolotl/monkeypatch/multipack.py
+++ b/src/axolotl/monkeypatch/multipack.py
@@ -18,8 +18,6 @@ SUPPORTED_MULTIPACK_MODEL_TYPES = [
    "mixtral",
    "qwen2",
    "qwen2_moe",
-    "qwen3",
-    "qwen3_moe",
    "falcon",
    "phi",
    "phi3",
--- a/src/axolotl/train.py
+++ b/src/axolotl/train.py
@@ -21,7 +21,6 @@ from transformers import PreTrainedModel, PreTrainedTokenizer, ProcessorMixin
 from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
 from transformers.trainer import Trainer

-from axolotl.cli.art import print_axolotl_text_art
 from axolotl.common.datasets import TrainDatasetMeta
 from axolotl.contribs.lgpl import (  # pylint: disable = no-name-in-module
    fix_untrained_tokens,
@@ -517,8 +516,6 @@ def train(
    Returns:
        Tuple of (model, tokenizer) after training
    """
-    print_axolotl_text_art()
-
    # Setup model, tokenizer, (causal or RLHF) trainer, etc.
    (
        trainer,
--- a/src/axolotl/utils/init.py
+++ b/src/axolotl/utils/init.py
@@ -43,12 +43,3 @@ def set_pytorch_cuda_alloc_conf():
            os.environ["PYTORCH_CUDA_ALLOC_CONF"] = (
                "expandable_segments:True,roundup_power2_divisions:16"
            )
-
-
-def patch_optimized_env():
-    """
-    Patch environment variables to improve VRAM usage and increase download speed
-    """
-    if os.getenv("HF_HUB_ENABLE_HF_TRANSFER") is None:
-        os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
-    set_pytorch_cuda_alloc_conf()
--- a/src/axolotl/utils/config/init.py
+++ b/src/axolotl/utils/config/init.py
@@ -59,7 +59,7 @@ def choose_device(cfg):

 def resolve_dtype(cfg):
    if (
-        not cfg.fp16 and cfg.bf16 == "auto" and not cfg.use_ray
+        cfg.bf16 == "auto" and not cfg.use_ray
    ):  # if we use ray we want to defer this check to the worker node
        if is_torch_bf16_gpu_available():
            LOG.debug("bf16 support detected, enabling for this configuration.")
--- a/src/axolotl/utils/samplers/multipack.py
+++ b/src/axolotl/utils/samplers/multipack.py
@@ -190,7 +190,7 @@ class MultipackBatchSampler(BatchSampler):
        self.len_across_ranks = None

        if self.sequential and not isinstance(sampler, SequentialSampler):
-            LOG.warning(
+            LOG.warn(
                "using sequential sample packing with non-sequential sampler, did you want to also enable curriculum_sampling?"
            )

--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -4,7 +4,6 @@ shared pytest fixtures

 import functools
 import importlib
-import os
 import shutil
 import sys
 import tempfile
@@ -530,32 +529,31 @@ def dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff(


 # # pylint: disable=redefined-outer-name,unused-argument
-@pytest.mark.skipif(
-    os.environ.get("AXOLOTL_IS_CI_CACHE_PRELOAD", "-1") != "1",
-    reason="Not running in CI cache preload",
-)
-def test_load_fixtures(
-    download_smollm2_135m_model,
-    download_qwen_2_5_half_billion_model,
-    download_tatsu_lab_alpaca_dataset,
-    download_mhenrichsen_alpaca_2k_dataset,
-    download_mhenrichsen_alpaca_2k_w_revision_dataset,
-    download_mlabonne_finetome_100k_dataset,
-    download_argilla_distilabel_capybara_dpo_7k_binarized_dataset,
-    download_arcee_ai_distilabel_intel_orca_dpo_pairs_dataset,
-    download_argilla_dpo_pairs_dataset,
-    download_tiny_shakespeare_dataset,
-    download_deepseek_model_fixture,
-    download_huggyllama_model_fixture,
-    download_llama_1b_model_fixture,
-    download_llama3_8b_model_fixture,
-    download_llama3_8b_instruct_model_fixture,
-    download_phi_35_mini_model_fixture,
-    download_phi_3_medium_model_fixture,
-    download_mistral_7b_model_fixture,
-    download_gemma_2b_model_fixture,
-    download_gemma2_9b_model_fixture,
-    download_mlx_mistral_7b_model_fixture,
-    download_llama2_model_fixture,
-):
-    pass
+# def test_load_fixtures(
+#     download_smollm2_135m_model,
+#     download_llama_68m_random_model,
+#     download_qwen_2_5_half_billion_model,
+#     download_tatsu_lab_alpaca_dataset,
+#     download_mhenrichsen_alpaca_2k_dataset,
+#     download_mhenrichsen_alpaca_2k_w_revision_dataset,
+#     download_mlabonne_finetome_100k_dataset,
+#     download_argilla_distilabel_capybara_dpo_7k_binarized_dataset,
+#     download_argilla_ultrafeedback_binarized_preferences_cleaned_dataset,
+#     download_fozzie_alpaca_dpo_dataset,
+#     download_arcee_ai_distilabel_intel_orca_dpo_pairs_dataset,
+#     download_argilla_dpo_pairs_dataset,
+#     download_tiny_shakespeare_dataset,
+#     download_deepseek_model_fixture,
+#     download_huggyllama_model_fixture,
+#     download_llama_1b_model_fixture,
+#     download_llama3_8b_model_fixture,
+#     download_llama3_8b_instruct_model_fixture,
+#     download_phi_35_mini_model_fixture,
+#     download_phi_3_medium_model_fixture,
+#     download_mistral_7b_model_fixture,
+#     download_gemma_2b_model_fixture,
+#     download_gemma2_9b_model_fixture,
+#     download_mlx_mistral_7b_model_fixture,
+#     download_llama2_model_fixture,
+# ):
+#     pass