add output for train loss in assertian err

swap llama tests for 7m param model
fix perplexity scores
2025-04-18 08:11:11 -07:00 · 2025-04-17 09:52:35 -07:00 · 2025-04-17 07:58:53 -07:00 · 2025-04-16 21:14:04 -07:00
75 changed files with 146 additions and 980 deletions
--- a/.github/workflows/base.yml
+++ b/.github/workflows/base.yml
@@ -46,18 +46,6 @@ jobs:
            python_version: "3.11"
            pytorch: 2.6.0
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
          - cuda: "126"
            cuda_version: 12.6.3
            cudnn_version: ""
            python_version: "3.11"
            pytorch: 2.7.0
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
          - cuda: "128"
            cuda_version: 12.6.3
            cudnn_version: ""
            python_version: "3.11"
            pytorch: 2.7.0
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
          - cuda: "128"
            cuda_version: 12.8.1
            cudnn_version: ""
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -31,11 +31,6 @@ jobs:
            pytorch: 2.6.0
            axolotl_extras: vllm
            is_latest: true
          - cuda: 126
            cuda_version: 12.6.3
            python_version: "3.11"
            pytorch: 2.7.0
            axolotl_extras: vllm
    runs-on: axolotl-gpu-runner
    steps:
      - name: Checkout
@@ -98,11 +93,6 @@ jobs:
            pytorch: 2.6.0
            axolotl_extras:
            is_latest: true
          - cuda: 126
            cuda_version: 12.6.3
            python_version: "3.11"
            pytorch: 2.7.0
            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
      - name: Checkout
@@ -148,7 +138,7 @@ jobs:
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
-            pytorch: 2.6.0
+            pytorch: 2.4.1
            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
--- a/.github/workflows/multi-gpu-e2e.yml
+++ b/.github/workflows/multi-gpu-e2e.yml
@@ -45,13 +45,6 @@ jobs:
            axolotl_extras: vllm
            num_gpus: 2
            nightly_build: "true"
          - cuda: 126
            cuda_version: 12.6.3
            python_version: "3.11"
            pytorch: 2.7.0
            axolotl_extras:
            num_gpus: 2
            nightly_build: "true"
    runs-on: [self-hosted, modal]
    timeout-minutes: 120
    steps:
@@ -74,7 +67,6 @@ jobs:
          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
          echo "NIGHTLY_BUILD=${{ matrix.nightly_build }}" >> $GITHUB_ENV
          echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
      - name: Run tests job on Modal
        run: |
          modal run cicd.multigpu
--- a/.github/workflows/tests-nightly.yml
+++ b/.github/workflows/tests-nightly.yml
@@ -147,7 +147,6 @@ jobs:
          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
          echo "NIGHTLY_BUILD=${{ matrix.nightly_build }}" >> $GITHUB_ENV
          echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
      - name: Run tests job on Modal
        run: |
          modal run cicd.e2e_tests
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -49,7 +49,7 @@ jobs:
      max-parallel: 2
      matrix:
        python_version: ["3.11"]
-        pytorch_version: ["2.4.1", "2.5.1", "2.6.0", "2.7.0"]
+        pytorch_version: ["2.4.1", "2.5.1", "2.6.0"]
    timeout-minutes: 20
    steps:
@@ -109,7 +109,6 @@ jobs:
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v5
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          files: ./coverage.xml
          flags: unittests,pytorch-${{ matrix.pytorch_version }}
          fail_ci_if_error: false
@@ -242,7 +241,6 @@ jobs:
          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
          echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
          echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
      - name: Run tests job on Modal
        run: |
          modal run cicd.e2e_tests
@@ -270,12 +268,6 @@ jobs:
            pytorch: 2.5.1
            num_gpus: 1
            axolotl_extras: vllm
          - cuda: 126
            cuda_version: 12.6.3
            python_version: "3.11"
            pytorch: 2.7.0
            num_gpus: 1
            axolotl_extras:
    steps:
      - name: Checkout
        uses: actions/checkout@v4
@@ -296,7 +288,6 @@ jobs:
          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
          echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
          echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
      - name: Run tests job on Modal
        run: |
          modal run cicd.e2e_tests
--- a/cicd/cicd.sh
+++ b/cicd/cicd.sh
@@ -9,7 +9,8 @@ pytest -v --durations=10 -n8 \
  --ignore=tests/patched/ \
  --ignore=tests/cli \
  /workspace/axolotl/tests/ \
-  --cov=axolotl
+  --cov=axolotl \
  --cov-report=xml:coverage.xml
 # Run lora kernels tests with coverage append
 pytest -v --durations=10 \
@@ -50,6 +51,11 @@ pytest -v --durations=10 \
  /workspace/axolotl/tests/e2e/ \
  --cov=axolotl \
  --cov-append \
-  --cov-report=xml:e2e-coverage.xml
+  --cov-report=xml:coverage.xml
-codecov upload-process -t $CODECOV_TOKEN -f e2e-coverage.xml -F e2e,pytorch-${PYTORCH_VERSION}
+# Upload coverage to Codecov
 if [ -f e2e-coverage.xml ]; then
  codecov -f e2e-coverage.xml -F e2e,pytorch-${PYTORCH_VERSION}
 else
  echo "Coverage file not found. Coverage report may have failed."
 fi
--- a/cicd/e2e_tests.py
+++ b/cicd/e2e_tests.py
@@ -28,7 +28,6 @@ df_args = {
    "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
    "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
    "NIGHTLY_BUILD": os.environ.get("NIGHTLY_BUILD", ""),
    "CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""),
    "HF_HOME": "/workspace/data/huggingface-cache/hub",
 }
--- a/cicd/multigpu.py
+++ b/cicd/multigpu.py
@@ -29,7 +29,6 @@ df_args = {
    "CUDA": os.environ.get("CUDA", "121"),
    "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
    "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
    "CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""),
    "HF_HOME": "/workspace/data/huggingface-cache/hub",
 }
--- a/cicd/multigpu.sh
+++ b/cicd/multigpu.sh
@@ -1,23 +1,25 @@
 #!/bin/bash
 set -e
 # only run one test at a time so as not to OOM the GPU
 pytest -v  --durations=10 -n2 /workspace/axolotl/tests/e2e/multigpu/ --ignore=/workspace/axolotl/tests/e2e/multigpu/solo/
 pytest -v  --durations=10 -n1 /workspace/axolotl/tests/e2e/multigpu/solo/
 # Only run two tests at a time to avoid OOM on GPU (with coverage collection)
 pytest -v -n2 \
-  --ignore=/workspace/axolotl/tests/e2e/multigpu/solo/ \
+  --ignore=/workspace/axolotl/tests/e2e/multigpu/solo/
  --ignore=/workspace/axolotl/tests/e2e/multigpu/patched/ \
  /workspace/axolotl/tests/e2e/multigpu/ \
  --cov=axolotl
 # Run solo tests with coverage append
 pytest -v --durations=10 -n1 \
  /workspace/axolotl/tests/e2e/multigpu/solo/ \
  --cov=axolotl \
-  --cov-append
+  --cov-report=xml:multigpu-coverage.xml
-pytest -v  --durations=10 -n1 /workspace/axolotl/tests/e2e/multigpu/patched/ \
+pytest -v  --durations=10 -n1 /workspace/axolotl/tests/e2e/multigpu/solo/ \
  --cov=axolotl \
  --cov-append \
  --cov-report=xml:multigpu-coverage.xml
 # Upload coverage to Codecov
-codecov upload-process -t $CODECOV_TOKEN -f multigpu-coverage.xml -F multigpu,docker-tests,pytorch-${PYTORCH_VERSION}
+if [ -f multigpu-coverage.xml ]; then
  codecov -f multigpu-coverage.xml -F multigpu,docker-tests,pytorch-${PYTORCH_VERSION}
 else
  echo "Coverage file not found. Coverage report may have failed."
 fi
--- a/codecov.yml
+++ b/codecov.yml
@@ -49,6 +49,3 @@ comment:
  require_changes: no
  require_base: no
  require_head: yes
 github_checks:
  annotations: false
--- a/docker/Dockerfile-base
+++ b/docker/Dockerfile-base
@@ -37,7 +37,3 @@ RUN git lfs install --skip-repo && \
    pip3 install awscli && \
    # The base image ships with `pydantic==1.8.2` which is not working
    pip3 install -U --no-cache-dir pydantic==1.10.10
 RUN if [ "$PYTORCH_VERSION" = "2.7.0" ] ; then \
        pip3 install flash-attn==2.7.4.post1; \
    fi
--- a/docs/cli.qmd
+++ b/docs/cli.qmd
@@ -199,17 +199,6 @@ output_dir: # Directory to save evaluation results
 See [LM Eval Harness](https://github.com/EleutherAI/lm-evaluation-harness) for more details.
 ### delinearize-llama4
 Delinearizes a Llama 4 linearized model into a regular HuggingFace Llama 4 model. This only works with the non-quantized linearized model.
 ```bash
 axolotl delinearize-llama4 --model path/to/model_dir --output path/to/output_dir
 ```
 This would be necessary to use with other frameworks. If you have an adapter, merge it with the non-quantized linearized model before delinearizing.
 ## Legacy CLI Usage
 While the new Click-based CLI is preferred, Axolotl still supports the legacy module-based CLI:
--- a/docs/custom_integrations.qmd
+++ b/docs/custom_integrations.qmd
@@ -49,8 +49,7 @@ sections = [
    ("Knowledge Distillation (KD)", "kd"),
    ("Liger Kernels", "liger"),
    ("Language Model Evaluation Harness (LM Eval)", "lm_eval"),
-    ("Spectrum", "spectrum"),
+    ("Spectrum", "spectrum")
    ("LLMCompressor", "llm_compressor")
 ]
 for section_name, folder_name in sections:
--- a/docs/installation.qmd
+++ b/docs/installation.qmd
@@ -19,12 +19,6 @@ This guide covers all the ways you can install and set up Axolotl for your envir
 ## Installation Methods {#sec-installation-methods}
 ::: {.callout-important}
 Please make sure to have Pytorch installed before installing Axolotl in your local environment.
 Follow the instructions at: [https://pytorch.org/get-started/locally/](https://pytorch.org/get-started/locally/)
 :::
 ### PyPI Installation (Recommended) {#sec-pypi}
 ```{.bash}
--- a/examples/glm4/qlora-32b.yaml
+++ b/examples/glm4/qlora-32b.yaml
@@ -1,62 +0,0 @@
 base_model: THUDM/GLM-4-32B-0414
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
 load_in_4bit: true
 datasets:
  - path: teknium/GPT4-LLM-Cleaned
    type: alpaca
 dataset_prepared_path: last_run_prepared
 val_set_size: 0
 output_dir: ./outputs/qlora-out
 adapter: qlora
 lora_model_dir:
 sequence_len: 2048
 sample_packing: true
 eval_sample_packing: true
 pad_to_sequence_len: true
 lora_r: 16
 lora_alpha: 32
 lora_dropout: 0.05
 lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj
 wandb_project:
 wandb_entity:
 wandb_watch:
 wandb_name:
 wandb_log_model:
 gradient_accumulation_steps: 2
 micro_batch_size: 2
 num_epochs: 1
 optimizer: adamw_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002
 bf16: auto
 tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
 warmup_steps: 10
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
--- a/examples/llama-3/sparse-finetuning.yaml
+++ b/examples/llama-3/sparse-finetuning.yaml
@@ -1,77 +0,0 @@
 base_model: neuralmagic/Sparse-Llama-3.1-8B-2of4
 plugins:
  - axolotl.integrations.llm_compressor.LLMCompressorPlugin
 load_in_8bit: false
 load_in_4bit: false
 strict: false
 datasets:
  - path: tatsu-lab/alpaca
    type: alpaca
 dataset_prepared_path: last_run_prepared
 val_set_size: 0.05
 output_dir: ./outputs/out
 sequence_len: 4096
 sample_packing: true
 pad_to_sequence_len: true
 eval_sample_packing: false
 wandb_project:
 wandb_entity:
 wandb_watch:
 wandb_name:
 wandb_log_model:
 gradient_accumulation_steps: 8
 micro_batch_size: 1
 num_epochs: 1
 optimizer: paged_adamw_8bit
 lr_scheduler: cosine
 learning_rate: 2e-5
 train_on_inputs: false
 group_by_length: false
 bf16: auto
 fp16:
 tf32: false
 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
  use_reentrant: false
 early_stopping_patience:
 resume_from_checkpoint:
 logging_steps: 1
 xformers_attention:
 flash_attention: true
 warmup_steps: 100
 evals_per_epoch: 2
 eval_table_size:
 saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
 fsdp:
 fsdp_config:
 special_tokens:
  pad_token: <|end_of_text|>
 llmcompressor:
  recipe:
    finetuning_stage:
      finetuning_modifiers:
        ConstantPruningModifier:
          targets: [
            're:.*q_proj.weight',
            're:.*k_proj.weight',
            're:.*v_proj.weight',
            're:.*o_proj.weight',
            're:.*gate_proj.weight',
            're:.*up_proj.weight',
            're:.*down_proj.weight',
          ]
          start: 0
  save_compressed: true
--- a/examples/llama-4/README.md
+++ b/examples/llama-4/README.md
@@ -26,11 +26,3 @@ Multi-GPU (4xH100) for Llama 4 Scout uses 62.8GB VRAM/GPU @ 4k contenxt length @
 ### Llama 4 Maverick 17Bx128Experts (400B)
 Coming Soon
 ## Delinearized Llama 4 Models
 We provide a script to delinearize Llama 4 linearized models into regular HuggingFace Llama 4 models.
 ```bash
 axolotl delinearize-llama4 --model path/to/model_dir --output path/to/output_dir
 ```
--- a/requirements-tests.txt
+++ b/requirements-tests.txt
@@ -1,5 +1,4 @@
 codecov
 codecov-cli
 pytest
 pytest-cov
 pytest-retry
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,7 +6,7 @@ triton>=3.0.0
 mamba-ssm==1.2.0.post1
 xformers>=0.0.23.post1
 autoawq==0.2.7.post3
-liger-kernel==0.5.8
+liger-kernel==0.5.6
 # END section
 packaging==23.2
@@ -19,7 +19,6 @@ datasets==3.5.0
 deepspeed>=0.15.4
 trl==0.16.1
 hf_xet==1.0.0
 hqq==0.2.5
 optimum==1.16.2
 hf_transfer
--- a/setup.py
+++ b/setup.py
@@ -51,7 +51,7 @@ def parse_requirements(extras_require_map):
            try:
                torch_version = version("torch")
            except PackageNotFoundError:
-                torch_version = "2.6.0"  # default to torch 2.6
+                torch_version = "2.5.1"
            _install_requires.append(f"torch=={torch_version}")
            version_match = re.match(r"^(\d+)\.(\d+)(?:\.(\d+))?", torch_version)
@@ -64,15 +64,9 @@ def parse_requirements(extras_require_map):
            else:
                raise ValueError("Invalid version format")
-            if (major, minor) >= (2, 7):
+            if (major, minor) >= (2, 6):
                _install_requires.pop(_install_requires.index(xformers_version))
-                # _install_requires.append("xformers==0.0.29.post3")  # xformers seems to be hard pinned to 2.6.0
+                _install_requires.append("xformers==0.0.29.post2")
                extras_require_map["vllm"] = ["vllm==0.8.3"]
            elif (major, minor) >= (2, 6):
                _install_requires.pop(_install_requires.index(xformers_version))
                _install_requires.append(
                    "xformers==0.0.29.post2"
                )  # vllm needs post2 w torch 2.6
                extras_require_map["vllm"] = ["vllm==0.8.3"]
            elif (major, minor) >= (2, 5):
                _install_requires.pop(_install_requires.index(xformers_version))
@@ -149,9 +143,6 @@ extras_require = {
    "vllm": [
        "vllm==0.7.2",
    ],
    "llmcompressor": [
        "llmcompressor==0.5.1",
    ],
 }
 install_requires, dependency_links, extras_require_build = parse_requirements(
--- a/src/axolotl/cli/args.py
+++ b/src/axolotl/cli/args.py
@@ -39,16 +39,16 @@ class TrainerCliArgs:
 class VllmServeCliArgs:
    """Dataclass with CLI arguments for `axolotl vllm-serve` command."""
-    tensor_parallel_size: Optional[int] = field(
+    tensor_parallel_size: int = field(
-        default=None,
+        default=1,
        metadata={"help": "Number of tensor parallel workers to use."},
    )
-    host: Optional[str] = field(
+    host: str = field(
-        default=None,  # nosec B104
+        default="0.0.0.0",  # nosec B104
        metadata={"help": "Host address to run the server on."},
    )
-    port: Optional[int] = field(
+    port: int = field(
-        default=None,
+        default=8000,
        metadata={"help": "Port to run the server on."},
    )
    gpu_memory_utilization: Optional[float] = field(
--- a/src/axolotl/core/trainer_builder.py
+++ b/src/axolotl/core/trainer_builder.py
@@ -1040,11 +1040,9 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
        if self.cfg.dataset_processes:
            training_args_kwargs["dataset_num_proc"] = self.cfg.dataset_processes
-        if self.cfg.trl and self.cfg.trl.beta is not None:
+        if (self.cfg.trl and self.cfg.trl.beta) or self.cfg.rl_beta:
-            training_args_kwargs["beta"] = self.cfg.trl.beta
+            training_args_kwargs["beta"] = self.cfg.trl.beta or self.cfg.rl_beta
-        elif self.cfg.rl_beta is not None:
+        if self.cfg.orpo_alpha:
            training_args_kwargs["beta"] = self.cfg.rl_beta
        elif self.cfg.orpo_alpha is not None:
            # trl does some odd mapping of alpha to beta to reuse the beta parameter ???
            training_args_kwargs["beta"] = self.cfg.orpo_alpha
--- a/src/axolotl/core/trainers/grpo/init.py
+++ b/src/axolotl/core/trainers/grpo/init.py
@@ -40,8 +40,8 @@ class GRPOStrategy:
        if trl.use_vllm:
            grpo_args_kwargs["use_vllm"] = trl.use_vllm
-            grpo_args_kwargs["vllm_server_host"] = trl.vllm_server_host or trl.vllm.host
+            grpo_args_kwargs["vllm_server_host"] = trl.vllm_server_host
-            grpo_args_kwargs["vllm_server_port"] = trl.vllm_server_port or trl.vllm.port
+            grpo_args_kwargs["vllm_server_port"] = trl.vllm_server_port
            if trl.vllm_server_timeout:
                grpo_args_kwargs["vllm_server_timeout"] = trl.vllm_server_timeout
            if trl.vllm_guided_decoding_regex:
--- a/src/axolotl/integrations/cut_cross_entropy/README.md
+++ b/src/axolotl/integrations/cut_cross_entropy/README.md
@@ -47,8 +47,6 @@ cut_cross_entropy: true
 - qwen2
 - cohere
 - cohere2
 - glm
 - glm4
 ## Citation
--- a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/glm4.py
+++ b/src/axolotl/integrations/cut_cross_entropy/monkeypatch/glm4.py
@@ -1,57 +0,0 @@
 """GLM 4 patch. GLM family inherits from Llama."""
 from types import MethodType
 import transformers
 from cut_cross_entropy.transformers.utils import (
    PatchOptions,
    TransformersModelT,
 )
 def patch_glm(
    maybe_model: TransformersModelT | str | transformers.PretrainedConfig,
    patch_options: PatchOptions,
 ) -> TransformersModelT | None:
    # Set the _PATCH_OPTS in the llama patch file
    import cut_cross_entropy.transformers.llama as llama_patch
    llama_patch._PATCH_OPTS = patch_options  # pylint: disable=protected-access
    from cut_cross_entropy.transformers.llama import cce_forward
    from transformers.models.glm import modeling_glm
    if isinstance(maybe_model, transformers.PreTrainedModel):
        assert isinstance(
            maybe_model, modeling_glm.GlmForCausalLM
        ), f"Expected a GlmForCausalLM model. Got {type(maybe_model)}."
        maybe_model.forward = MethodType(cce_forward, maybe_model)
        return maybe_model
    modeling_glm.GlmForCausalLM.forward = cce_forward
    return None
 def patch_glm4(
    maybe_model: TransformersModelT | str | transformers.PretrainedConfig,
    patch_options: PatchOptions,
 ) -> TransformersModelT | None:
    # Set the _PATCH_OPTS in the llama patch file
    import cut_cross_entropy.transformers.llama as llama_patch
    llama_patch._PATCH_OPTS = patch_options  # pylint: disable=protected-access
    from cut_cross_entropy.transformers.llama import cce_forward
    from transformers.models.glm4 import modeling_glm4
    if isinstance(maybe_model, transformers.PreTrainedModel):
        assert isinstance(
            maybe_model, modeling_glm4.Glm4ForCausalLM
        ), f"Expected a Glm4ForCausalLM model. Got {type(maybe_model)}."
        maybe_model.forward = MethodType(cce_forward, maybe_model)
        return maybe_model
    modeling_glm4.Glm4ForCausalLM.forward = cce_forward
    return None
--- a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/patch.py
+++ b/src/axolotl/integrations/cut_cross_entropy/monkeypatch/patch.py
@@ -20,10 +20,6 @@ from axolotl.integrations.cut_cross_entropy.monkeypatch.gemma3 import (
    patch_gemma3,
    patch_gemma3_text,
 )
 from axolotl.integrations.cut_cross_entropy.monkeypatch.glm4 import (
    patch_glm,
    patch_glm4,
 )
 from axolotl.integrations.cut_cross_entropy.monkeypatch.llama4 import (
    patch_llama4,
    patch_llama4_text,
@@ -49,8 +45,6 @@ CUT_CROSS_ENTROPY_MODEL_MAPPING = {
    "qwen2": patch_qwen2,
    "cohere": patch_cohere,
    "cohere2": patch_cohere2,
    "glm": patch_glm,
    "glm4": patch_glm4,
 }
--- a/src/axolotl/integrations/liger/README.md
+++ b/src/axolotl/integrations/liger/README.md
@@ -25,7 +25,7 @@ liger_fused_linear_cross_entropy: true
 - deepseek_v2
 - gemma
 - gemma2
- gemma3
+- gemma3 (partial support, no support for FLCE yet)
 - granite
 - jamba
 - llama
--- a/src/axolotl/integrations/liger/init.py
+++ b/src/axolotl/integrations/liger/init.py
@@ -21,6 +21,7 @@ It is designed to be performant, correct, and light-weight.
 import inspect
 import logging
 import sys
 from functools import partial
 from axolotl.integrations.base import BasePlugin
@@ -54,6 +55,7 @@ class LigerPlugin(BasePlugin):
            )
        from liger_kernel.transformers.cross_entropy import LigerCrossEntropyLoss
        from liger_kernel.transformers.functional import liger_cross_entropy
        from liger_kernel.transformers.geglu import LigerGEGLUMLP
        from liger_kernel.transformers.layer_norm import LigerLayerNorm
        from liger_kernel.transformers.monkey_patch import MODEL_TYPE_TO_APPLY_LIGER_FN
        from liger_kernel.transformers.rms_norm import LigerRMSNorm
@@ -139,6 +141,38 @@ class LigerPlugin(BasePlugin):
                modeling_mod.CrossEntropyLoss = LigerCrossEntropyLoss
            if cfg.liger_fused_linear_cross_entropy:
                modeling_mod.DeepseekV2ForCausalLM.forward = deepseekv2_lce_forward
        elif cfg.model_config_type in ["gemma3", "gemma3_text"]:
            from transformers.models.gemma3 import modeling_gemma3
            if cfg.liger_rope:
                modeling_gemma3.apply_rotary_pos_emb = liger_rotary_pos_emb
            if cfg.liger_rms_norm:
                def _liger_rms_norm_wrapper(dim, **kwargs):
                    "Convert 'dim' keyword to 'hidden_size' to pass to LigerRMSNorm"
                    return LigerRMSNorm(hidden_size=dim, **kwargs)
                modeling_gemma3.Gemma3RMSNorm = partial(
                    _liger_rms_norm_wrapper,
                    offset=1.0,
                    casting_mode="gemma",
                    init_fn="zeros",
                    in_place=False,
                )
            if cfg.liger_glu_activation:
                modeling_gemma3.Gemma3MLP = LigerGEGLUMLP
            if cfg.liger_layer_norm:
                modeling_gemma3.nn.LayerNorm = LigerLayerNorm
            if cfg.liger_cross_entropy:
                from transformers.loss.loss_utils import nn
                nn.functional.cross_entropy = liger_cross_entropy
            if cfg.liger_fused_linear_cross_entropy:
                raise NotImplementedError(
                    "Fused linear cross entropy is not yet supported for Gemma3."
                )
        elif cfg.model_config_type == "llama4":
            from axolotl.integrations.liger.models.llama4 import (
                apply_liger_kernel_to_llama4,
--- a/src/axolotl/integrations/llm_compressor/README.md
+++ b/src/axolotl/integrations/llm_compressor/README.md
@@ -1,108 +0,0 @@
 # LLMCompressor Integration
 Fine-tune sparsified models in Axolotl using Neural Magic's [LLMCompressor](https://github.com/vllm-project/llm-compressor).
 This integration enables fine-tuning of models sparsified using LLMCompressor within the Axolotl training framework. By combining LLMCompressor's model compression capabilities with Axolotl's distributed training pipelines, users can efficiently fine-tune sparse models at scale.
 It uses Axolotl’s plugin system to hook into the fine-tuning flows while maintaining sparsity throughout training.
 ---
 ## Requirements
 - Axolotl with `llmcompressor` extras:
  ```bash
  pip install "axolotl[llmcompressor]"
  ```
 - Requires `llmcompressor >= 0.5.1`
 This will install all necessary dependencies to fine-tune sparsified models using the integration.
 ---
 ## Usage
 To enable sparse fine-tuning with this integration, include the plugin in your Axolotl config:
 ```yaml
 plugins:
  - axolotl.integrations.llm_compressor.LLMCompressorPlugin
 llmcompressor:
  recipe:
    finetuning_stage:
      finetuning_modifiers:
        ConstantPruningModifier:
          targets: [
            're:.*q_proj.weight',
            're:.*k_proj.weight',
            're:.*v_proj.weight',
            're:.*o_proj.weight',
            're:.*gate_proj.weight',
            're:.*up_proj.weight',
            're:.*down_proj.weight',
          ]
          start: 0
  save_compressed: true
 # ... (other training arguments)
 ```
 This plugin **does not apply pruning or sparsification itself** — it is intended for **fine-tuning models that have already been sparsified**.
 Pre-sparsified checkpoints can be:
 - Generated using [LLMCompressor](https://github.com/vllm-project/llm-compressor)
 - Downloaded from [Neural Magic's Hugging Face page](https://huggingface.co/neuralmagic)
 - Any custom LLM with compatible sparsity patterns that you've created yourself
 To learn more about writing and customizing LLMCompressor recipes, refer to the official documentation:
 [https://github.com/vllm-project/llm-compressor/blob/main/README.md](https://github.com/vllm-project/llm-compressor/blob/main/README.md)
 ### Storage Optimization with save_compressed
 Setting `save_compressed: true` in your configuration enables saving models in a compressed format, which:
 - Reduces disk space usage by approximately 40%
 - Maintains compatibility with vLLM for accelerated inference
 - Maintains compatibility with llmcompressor for further optimization (example: quantization)
 This option is highly recommended when working with sparse models to maximize the benefits of model compression.
 ### Example Config
 See [`examples/llama-3/sparse-finetuning.yaml`](examples/llama-3/sparse-finetuning.yaml) for a complete example.
 ---
 ## Inference with vLLM
 After fine-tuning your sparse model, you can leverage vLLM for efficient inference.
 You can also use LLMCompressor to apply additional quantization to your fine-tuned
 sparse model before inference for even greater performance benefits.:
 ```python
 from vllm import LLM, SamplingParams
 prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
 ]
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 llm = LLM("path/to/your/sparse/model")
 outputs = llm.generate(prompts, sampling_params)
 for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
 ```
 For more details on vLLM's capabilities and advanced configuration options, see the [official vLLM documentation](https://docs.vllm.ai/).
 ## Learn More
 For details on available sparsity and quantization schemes, fine-tuning recipes, and usage examples, visit the official LLMCompressor repository:
 [https://github.com/vllm-project/llm-compressor](https://github.com/vllm-project/llm-compressor)
--- a/src/axolotl/integrations/llm_compressor/init.py
+++ b/src/axolotl/integrations/llm_compressor/init.py
@@ -1,5 +0,0 @@
 """Integration entry point for the LLMCompressor plugin."""
 from .plugin import LLMCompressorPlugin
 __all__ = ["LLMCompressorPlugin"]
--- a/src/axolotl/integrations/llm_compressor/args.py
+++ b/src/axolotl/integrations/llm_compressor/args.py
@@ -1,40 +0,0 @@
 """
 LLMCompressor and Sparse Finetuning config models.
 """
 from typing import Any
 from pydantic import BaseModel, Field
 from typing_extensions import Annotated
 class CompressionArgs(BaseModel):
    """Sparse Finetuning config for LLMCompressor."""
    # Typing for recipe is set to Any due to:
    # https://github.com/vllm-project/llm-compressor/issues/1319
    recipe: Annotated[
        Any,
        Field(
            description="The recipe containing the compression algorithms and hyperparameters to apply."
        ),
    ]
    save_compressed: Annotated[
        bool,
        Field(
            default=False,
            description="Whether to save the compressed model after training.",
        ),
    ]
 class LLMCompressorArgs(BaseModel):
    """LLMCompressor configuration BaseModel."""
    llmcompressor: Annotated[
        CompressionArgs,
        Field(
            description="Arguments enabling compression pathways through the LLM Compressor plugins"
        ),
    ]
--- a/src/axolotl/integrations/llm_compressor/plugin.py
+++ b/src/axolotl/integrations/llm_compressor/plugin.py
@@ -1,171 +0,0 @@
 """
 Sparse Finetuning plugin for Axolotl — enables handling of sparse neural networks
 by maintaining masks for zero weights during training.
 """
 import logging
 from functools import wraps
 from typing import Any, Callable, Concatenate, ParamSpec, TypeVar
 from llmcompressor import active_session, create_session
 from llmcompressor.core import callbacks as session_callbacks
 from llmcompressor.recipe import Recipe
 from torch.nn import Module
 from transformers.trainer import Trainer
 from transformers.trainer_callback import TrainerCallback, TrainerControl, TrainerState
 from transformers.training_args import TrainingArguments
 from axolotl.integrations.base import BasePlugin
 P = ParamSpec("P")  # Params for generic function signatures
 R = TypeVar("R")  # Return type for generic function signatures
 LOG = logging.getLogger("axolotl.integrations.llm_compressor")
 class LLMCompressorCallbackHandler(TrainerCallback):
    """
    Trainer callback for Sparse Finetuning.
    Maintains sparsity patterns during training by applying masks after optimization steps,
    ensuring zero-weight updates are canceled out.
    """
    def __init__(self, trainer: Trainer, recipe: Any):
        """
        Initialize the Sparse Finetuning callback handler.
        Args:
            trainer (Trainer): Huggingface Trainer instance.
            recipe (Recipe | dict): Sparse finetuning recipe to apply.
        """
        super().__init__()
        self.trainer = trainer
        self.recipe = (
            Recipe.model_validate(recipe) if not isinstance(recipe, Recipe) else recipe
        )
        self.original_compute_loss = trainer.compute_loss
        self.trainer.compute_loss = compute_loss_wrapper(self.trainer.compute_loss)
        create_session()
    def on_train_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ) -> None:
        """
        Called at the beginning of training. Initializes the compression session.
        Args:
            args (TrainingArguments): Training arguments.
            state (TrainerState): Trainer state.
            control (TrainerControl): Trainer control.
        """
        super().on_train_begin(args, state, control, **kwargs)
        self.trainer.accelerator.wait_for_everyone()
        active_session().initialize(
            model=self.trainer.model,
            optimizer=self.trainer.optimizer,
            start=state.epoch,
            recipe=self.recipe,
        )
        self.trainer.accelerator.wait_for_everyone()
    def on_step_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ) -> None:
        """
        Called at the beginning of a training step. Triggers batch_start callback.
        """
        super().on_step_begin(args, state, control, **kwargs)
        session_callbacks.batch_start()
    def on_step_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ) -> None:
        """
        Called at the end of a training step. Triggers optimizer and batch_end callbacks.
        """
        super().on_step_end(args, state, control, **kwargs)
        session_callbacks.optim_pre_step()
        session_callbacks.optim_post_step()
        session_callbacks.batch_end()
    def on_train_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ) -> None:
        """
        Called at the end of training. Finalizes the compression session.
        """
        super().on_train_end(args, state, control, **kwargs)
        active_session().finalize()
        self.trainer.compute_loss_func = self.original_compute_loss
 class LLMCompressorPlugin(BasePlugin):
    """
    Sparse Finetuning plugin for Axolotl integration.
    """
    def get_input_args(self) -> str:
        """
        Returns the path to the plugin's argument definition.
        Returns:
            str: Dotted path to the LLMCompressorArgs class.
        """
        return "axolotl.integrations.llm_compressor.args.LLMCompressorArgs"
    def add_callbacks_post_trainer(self, cfg: Any, trainer: Trainer) -> list:
        """
        Adds Sparse Finetuning callback to the Trainer instance.
        Args:
            cfg (Any): Configuration object containing the sparse recipe.
            trainer (Trainer): Huggingface Trainer instance.
        Returns:
            list: List containing the configured callback instances.
        """
        LOG.info("Adding Sparse Finetuning callback to the trainer")
        callback = LLMCompressorCallbackHandler(
            trainer=trainer,
            recipe=cfg.llmcompressor.recipe,
        )
        return [callback]
 def compute_loss_wrapper(
    compute_loss_func: Callable[Concatenate[Module, P], R],
 ) -> Callable[Concatenate[Module, P], R]:
    """
    Wraps the loss computation function to trigger the loss_calculated callback.
    Args:
        compute_loss_func (Callable): Original loss computation function.
    Returns:
        Callable: Wrapped function that also invokes the loss_calculated callback.
    """
    @wraps(compute_loss_func)
    def compute_and_notify(model: Module, *args: P.args, **kwargs: P.kwargs) -> R:
        loss = compute_loss_func(model, *args, **kwargs)
        if active_session().lifecycle.initialized_ and model.training:
            session_callbacks.loss_calculated(loss=loss)
        return loss
    return compute_and_notify
--- a/src/axolotl/integrations/llm_compressor/utils.py
+++ b/src/axolotl/integrations/llm_compressor/utils.py
@@ -1,40 +0,0 @@
 """Utilities for llmcompressor integration with axolotl."""
 from typing import Union
 from llmcompressor.transformers.sparsification.compressed_tensors_utils import (
    modify_save_pretrained,
 )
 from transformers import PreTrainedModel, Trainer
 def save_compressed_model(
    model: PreTrainedModel,
    output_dir: Union[str, bytes],
    trainer: Trainer,
    safe_serialization: bool = False,
    save_compressed: bool = False,
 ) -> None:
    """
    Synchronize processes, apply compression hooks, and save the model.
    Args:
        model (PreTrainedModel): The model to be saved.
        output_dir (str or bytes): Path where the model files will be written.
        trainer (Trainer): Hugging Face Trainer for process synchronization.
        safe_serialization (bool): Use safe serialization if True.
        save_compressed (bool): Write compressed tensors if True.
    """
    trainer.accelerator.wait_for_everyone()
    # Only the main process writes the files
    if not trainer.accelerator.is_main_process:
        return
    modify_save_pretrained(model)
    model.save_pretrained(
        output_dir,
        safe_serialization=safe_serialization,
        save_compressed=save_compressed,
        skip_sparsity_compression_stats=not save_compressed,
    )
--- a/src/axolotl/monkeypatch/multipack.py
+++ b/src/axolotl/monkeypatch/multipack.py
@@ -31,8 +31,6 @@ SUPPORTED_MULTIPACK_MODEL_TYPES = [
    "starcoder2",
    "deepseek_v2",
    "deepseek_v3",
    "glm",
    "glm4",
 ]
--- a/src/axolotl/monkeypatch/relora.py
+++ b/src/axolotl/monkeypatch/relora.py
@@ -272,7 +272,7 @@ class ReLoRAScheduler(LRScheduler):
        self.warmup_steps = warmup_steps
        self.anneal_steps = anneal_steps
        self.min_lr_scale = min_lr_scale
-        super().__init__(optimizer, inner_schedule.last_epoch)
+        super().__init__(optimizer, inner_schedule.last_epoch, inner_schedule.verbose)
    def get_lr(self) -> float:
        self.inner_schedule.last_epoch = self.last_epoch
--- a/src/axolotl/train.py
+++ b/src/axolotl/train.py
@@ -271,19 +271,6 @@ def save_trained_model(
                os.remove(os.path.join(cfg.output_dir, "model.safetensors"))
            except FileNotFoundError:
                pass
    elif hasattr(cfg, "llmcompressor") and cfg.llmcompressor:
        from axolotl.integrations.llm_compressor.utils import (
            save_compressed_model,
        )
        save_compressed_model(
            model=model,
            output_dir=cfg.output_dir,
            trainer=trainer,
            safe_serialization=safe_serialization,
            save_compressed=cfg.llmcompressor.save_compressed,
        )
    elif cfg.local_rank == 0:
        if cfg.flash_optimum and BetterTransformer:
            model = BetterTransformer.reverse(model)
@@ -292,7 +279,6 @@ def save_trained_model(
            trainer.model.save_pretrained(
                cfg.output_dir, safe_serialization=safe_serialization
            )
        model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization)
--- a/src/axolotl/utils/data/sft.py
+++ b/src/axolotl/utils/data/sft.py
@@ -3,7 +3,6 @@
 import functools
 import logging
 import os
 import tempfile
 from pathlib import Path
 from typing import List, Optional, Tuple, Union
@@ -118,27 +117,9 @@ def prepare_dataset(cfg, tokenizer, processor=None, preprocess_iterable=None):
            cfg.pretraining_dataset[0]["type"] or "pretrain",
        )
-        # when letting accelerator dispatch batches from the main process, we don't need to load the dataset from
+        iter_ds = load_dataset(
-        # other ranks, we just need to present a fake dataset
+            path, streaming=True, split=split, name=name, data_files=data_files
-        if (
+        )
            cfg.accelerator_config
            and cfg.accelerator_config.dispatch_batches
            and not is_local_main_process()
        ):
            with tempfile.NamedTemporaryFile(mode="w+", delete=False) as f:
                f.write("text\n")
                f.write("lorem ipsum dolor sit amet\n")
                # rewind the file pointer to the beginning so we can read it again
                f.seek(0)
                iter_ds = load_dataset(
                    "csv", data_files=f.name, split="train", streaming=True
                )
        else:
            if is_local_main_process():
                iter_ds = load_dataset(
                    path, streaming=True, split=split, name=name, data_files=data_files
                )
        if skip:
            LOG.info(f"Skipping {skip} samples from the dataset")
            iter_ds = iter_ds.skip(skip)
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -139,22 +139,6 @@ def check_model_config(cfg: DictDefault, model_config: PretrainedConfig):
        hasattr(model_config, "quantization_config")
        and model_config.quantization_config
    )
    # Detect compressed-tensors config
    is_compressed_tensors_config = (
        quant_config_exists
        and model_config.quantization_config.get("quant_method") == "compressed-tensors"
    )
    if is_compressed_tensors_config:
        if model_config.quantization_config.get("config_groups"):
            LOG.warning(
                "Found `config_groups` in a compressed-tensors config. "
                "QAT integration with llmcompressor is not tested."
            )
        # Skip further quant checks for compressed-tensors
        return
    quant_config_method_is_gptq = (
        quant_config_exists
        and "quant_method" in model_config.quantization_config
--- a/src/axolotl/utils/schedulers.py
+++ b/src/axolotl/utils/schedulers.py
@@ -40,7 +40,7 @@ class RexLR(LRScheduler):
        self.max_lr = max_lr
        self.total_steps = total_steps
        self.num_warmup_steps = num_warmup_steps
-        self.last_step = max(last_step - 1, 0)
+        self.last_step = last_step - 1
        # Ensure each parameter group has an "initial_lr" key to avoid issues when resuming.
        for group in optimizer.param_groups:
--- a/src/axolotl/utils/schemas/config.py
+++ b/src/axolotl/utils/schemas/config.py
@@ -660,7 +660,6 @@ class AxolotlInputConfig(
            data.get("val_set_size") == 0
            and (data.get("eval_steps") or data.get("eval_strategy"))
            and not data.get("test_datasets")
            and data.get("eval_strategy") != "no"
        ):
            raise ValueError(
                "eval_steps and eval_strategy are not supported with val_set_size == 0"
--- a/src/axolotl/utils/schemas/vllm.py
+++ b/src/axolotl/utils/schemas/vllm.py
@@ -36,11 +36,3 @@ class VllmConfig(BaseModel):
        default=None,
        json_schema_extra={"description": "Enable prefix caching for VLLM"},
    )
    host: str | None = Field(
        default="0.0.0.0",  # nosec B104
        json_schema_extra={"description": "Host for the vLLM server to start on"},
    )
    port: int | None = Field(
        default=8000,
        json_schema_extra={"description": "Port of the vLLM server to start on"},
    )
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -193,14 +193,6 @@ def download_tiny_shakespeare_dataset():
    snapshot_download_w_retry("winglian/tiny-shakespeare", repo_type="dataset")
@pytest.fixture(scope="session", autouse=True)
 def download_evolkit_kd_sample_dataset():
    # download the dataset
    snapshot_download_w_retry(
        "axolotl-ai-co/evolkit-logprobs-pipeline-75k-v2-sample", repo_type="dataset"
    )
@pytest.fixture(scope="session", autouse=True)
 def download_deepseek_model_fixture():
    snapshot_download_w_retry("axolotl-ai-co/DeepSeek-V3-11M", repo_type="model")
@@ -216,16 +208,6 @@ def download_huggyllama_model_fixture():
    )
@pytest.fixture(scope="session", autouse=True)
 def download_llama33_70b_model_fixture():
    # download the tokenizer only
    snapshot_download_w_retry(
        "axolotl-ai-co/Llama-3.3-70B-Instruct-tokenizer",
        repo_type="model",
        allow_patterns=["*token*", "config.json"],
    )
@pytest.fixture(scope="session", autouse=True)
 def download_llama_1b_model_fixture():
    # download the tokenizer only
@@ -333,14 +315,6 @@ def download_llama2_model_fixture():
    )
@pytest.fixture(scope="session", autouse=True)
 def download_llama32_1b_model_fixture():
    snapshot_download_w_retry(
        "osllmai-community/Llama-3.2-1B",
        repo_type="model",
    )
@pytest.fixture
@enable_hf_offline
 def tokenizer_huggyllama(
@@ -522,6 +496,12 @@ def dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff(
    return datasets.load_from_disk(ds_path)["train"]
@pytest.fixture(scope="session", autouse=True)
 def download_tiny_llama_7m_model():
    # download the model
    return snapshot_download_w_retry("axolotl-ai-internal/llama-7m", repo_type="model")
 # # pylint: disable=redefined-outer-name,unused-argument
 # def test_load_fixtures(
 #     download_smollm2_135m_model,
--- a/tests/e2e/integrations/test_cut_cross_entropy.py
+++ b/tests/e2e/integrations/test_cut_cross_entropy.py
@@ -8,7 +8,7 @@ from axolotl.cli.args import TrainerCliArgs
 from axolotl.common.datasets import load_datasets
 from axolotl.train import train
 from axolotl.utils import get_pytorch_version
-from axolotl.utils.config import normalize_config, prepare_plugins, validate_config
+from axolotl.utils.config import normalize_config, prepare_plugins
 from axolotl.utils.dict import DictDefault
 from ..utils import check_model_output_exists
@@ -56,7 +56,6 @@ class TestCutCrossEntropyIntegration:
    # pylint: disable=redefined-outer-name
    def test_llama_w_cce(self, min_cfg, temp_dir):
        cfg = DictDefault(min_cfg)
        cfg = validate_config(cfg)
        prepare_plugins(cfg)
        normalize_config(cfg)
        cli_args = TrainerCliArgs()
@@ -102,7 +101,6 @@ class TestCutCrossEntropyIntegration:
                "bf16": "auto",
            }
        )
        cfg = validate_config(cfg)
        prepare_plugins(cfg)
        normalize_config(cfg)
        cli_args = TrainerCliArgs()
@@ -131,7 +129,6 @@ class TestCutCrossEntropyIntegration:
                attention_type: True,
            }
        )
        cfg = validate_config(cfg)
        prepare_plugins(cfg)
        normalize_config(cfg)
        cli_args = TrainerCliArgs()
--- a/tests/e2e/integrations/test_kd.py
+++ b/tests/e2e/integrations/test_kd.py
@@ -90,7 +90,7 @@ class TestKnowledgeDistillation:
        train(cfg=cfg, dataset_meta=dataset_meta)
        assert (Path(temp_dir) / "model.safetensors").exists()
        check_tensorboard(
-            temp_dir + "/runs", "train/loss", 1.0, "Train Loss is too high"
+            temp_dir + "/runs", "train/loss", 1.0, "Train loss (%s) is too high"
        )
    @pytest.mark.parametrize(
@@ -121,5 +121,5 @@ class TestKnowledgeDistillation:
        train(cfg=cfg, dataset_meta=dataset_meta)
        assert (Path(temp_dir) / "adapter_model.safetensors").exists()
        check_tensorboard(
-            temp_dir + "/runs", "train/loss", 1.0, "Train Loss is too high"
+            temp_dir + "/runs", "train/loss", 1.0, "Train loss (%s) is too high"
        )
--- a/tests/e2e/integrations/test_liger.py
+++ b/tests/e2e/integrations/test_liger.py
@@ -5,7 +5,7 @@ Simple end-to-end test for Liger integration
 from axolotl.cli.args import TrainerCliArgs
 from axolotl.common.datasets import load_datasets
 from axolotl.train import train
-from axolotl.utils.config import normalize_config, prepare_plugins, validate_config
+from axolotl.utils.config import normalize_config, prepare_plugins
 from axolotl.utils.dict import DictDefault
 from tests.e2e.utils import check_model_output_exists, require_torch_2_4_1
@@ -54,7 +54,6 @@ class LigerIntegrationTestCase:
            }
        )
        # pylint: disable=duplicate-code
        cfg = validate_config(cfg)
        prepare_plugins(cfg)
        normalize_config(cfg)
        cli_args = TrainerCliArgs()
@@ -101,7 +100,6 @@ class LigerIntegrationTestCase:
            }
        )
        # pylint: disable=duplicate-code
        cfg = validate_config(cfg)
        prepare_plugins(cfg)
        normalize_config(cfg)
        cli_args = TrainerCliArgs()
--- a/tests/e2e/integrations/test_llm_compressor.py
+++ b/tests/e2e/integrations/test_llm_compressor.py
@@ -1,104 +0,0 @@
 """
 E2E smoke tests for LLMCompressorPlugin integration
 """
 from pathlib import Path
 import pytest
 from axolotl.cli.args import TrainerCliArgs
 from axolotl.common.datasets import load_datasets
 from axolotl.train import train
 from axolotl.utils.config import normalize_config, prepare_plugins, validate_config
 from axolotl.utils.dict import DictDefault
 from tests.e2e.utils import check_model_output_exists, require_torch_2_4_1
 MODELS = [
    "nm-testing/llama2.c-stories42M-pruned2.4-compressed",
    "nm-testing/llama2.c-stories42M-gsm8k-sparse-only-compressed",
 ]
@pytest.mark.parametrize(
    "base_model", MODELS, ids=["no-checkpoint-recipe", "with-checkpoint-recipe"]
 )
@pytest.mark.parametrize(
    "save_compressed", [True, False], ids=["save_compressed", "save_uncompressed"]
 )
 class TestLLMCompressorIntegration:
    """
    e2e tests for axolotl.integrations.llm_compressor.LLMCompressorPlugin
    """
    @require_torch_2_4_1
    def test_llmcompressor_plugin(
        self, temp_dir, base_model: str, save_compressed: bool
    ):
        # core cfg
        cfg = DictDefault(
            {
                "base_model": base_model,
                "plugins": ["axolotl.integrations.llm_compressor.LLMCompressorPlugin"],
                "sequence_len": 1024,
                "val_set_size": 0.05,
                "special_tokens": {"pad_token": "<|endoftext|>"},
                "datasets": [{"path": "mhenrichsen/alpaca_2k_test", "type": "alpaca"}],
                "num_epochs": 1,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 2,
                "output_dir": temp_dir,
                "learning_rate": 1e-5,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "save_safetensors": True,
                "bf16": "auto",
                "max_steps": 5,
                "llmcompressor": {
                    "recipe": {
                        "finetuning_stage": {
                            "finetuning_modifiers": {
                                "ConstantPruningModifier": {
                                    "targets": [
                                        "re:.*q_proj.weight",
                                        "re:.*k_proj.weight",
                                        "re:.*v_proj.weight",
                                        "re:.*o_proj.weight",
                                        "re:.*gate_proj.weight",
                                        "re:.*up_proj.weight",
                                        "re:.*down_proj.weight",
                                    ],
                                    "start": 0,
                                },
                            },
                        },
                    },
                    "save_compressed": save_compressed,
                },
            }
        )
        prepare_plugins(cfg)
        cfg = validate_config(cfg)
        normalize_config(cfg)
        cli_args = TrainerCliArgs()
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)
        _check_llmcompressor_model_outputs(temp_dir, save_compressed)
 def _check_llmcompressor_model_outputs(temp_dir, save_compressed):
    # recipe.yaml should exist
    assert (Path(temp_dir) / "recipe.yaml").exists()
    # sparsity config exists if save_compressed
    if save_compressed:
        from compressed_tensors import ModelCompressor
        from compressed_tensors.config import Sparse24BitMaskConfig
        compressor = ModelCompressor.from_pretrained(temp_dir)
        assert compressor is not None
        assert isinstance(compressor.sparsity_config, Sparse24BitMaskConfig)
--- a/tests/e2e/multigpu/patched/init.py
+++ b/tests/e2e/multigpu/patched/init.py
--- a/tests/e2e/multigpu/solo/init.py
+++ b/tests/e2e/multigpu/solo/init.py
@@ -1,2 +0,0 @@
 # Tests under this directory should get run "solo" on their own as they
 # seem to cause issues when run in the same batch as other tests.
--- a/tests/e2e/multigpu/solo/test_flex.py
+++ b/tests/e2e/multigpu/solo/test_flex.py
@@ -49,9 +49,8 @@ class TestPackedFlex:
                },
                "datasets": [
                    {
-                        "path": "tatsu-lab/alpaca",
+                        "path": "vicgalle/alpaca-gpt4",
                        "type": "alpaca",
                        "split": "train[:10%]",
                    },
                ],
                "num_epochs": 1,
@@ -90,5 +89,5 @@ class TestPackedFlex:
        )
        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high"
+            temp_dir + "/runs", "train/train_loss", 2.0, "Train loss (%s) is too high"
        )
--- a/tests/e2e/multigpu/test_gemma3.py
+++ b/tests/e2e/multigpu/test_gemma3.py
@@ -96,5 +96,5 @@ class TestMultiGPUGemma3:
        )
        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 1.8, "Train Loss is too high"
+            temp_dir + "/runs", "train/train_loss", 1.8, "Train loss (%s) is too high"
        )
--- a/tests/e2e/multigpu/test_llama.py
+++ b/tests/e2e/multigpu/test_llama.py
@@ -43,7 +43,7 @@ class TestMultiGPULlama:
        # pylint: disable=duplicate-code
        cfg = DictDefault(
            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
+                "base_model": "axolotl-ai-internal/llama-7m",
                "sequence_len": 2048,
                "adapter": "lora",
                "lora_r": 8,
@@ -94,7 +94,7 @@ class TestMultiGPULlama:
        )
        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high"
+            temp_dir + "/runs", "train/train_loss", 2.3, "Train loss (%s) is too high"
        )
    @pytest.mark.parametrize(
@@ -105,7 +105,7 @@ class TestMultiGPULlama:
        # pylint: disable=duplicate-code
        cfg = DictDefault(
            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
+                "base_model": "axolotl-ai-internal/llama-7m",
                "sequence_len": 2048,
                "sample_packing": True,
                "eval_sample_packing": False,
@@ -159,14 +159,14 @@ class TestMultiGPULlama:
        )
        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high"
+            temp_dir + "/runs", "train/train_loss", 2.3, "Train loss (%s) is too high"
        )
    def test_dpo_lora_ddp(self, temp_dir):
        # pylint: disable=duplicate-code
        cfg = DictDefault(
            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
+                "base_model": "axolotl-ai-internal/llama-7m",
                "sequence_len": 2048,
                "sample_packing": False,
                "eval_sample_packing": False,
@@ -244,7 +244,7 @@ class TestMultiGPULlama:
        # pylint: disable=duplicate-code
        cfg = DictDefault(
            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
+                "base_model": "axolotl-ai-internal/llama-7m",
                "sequence_len": 2048,
                "sample_packing": False,
                "eval_sample_packing": False,
@@ -326,7 +326,7 @@ class TestMultiGPULlama:
        # pylint: disable=duplicate-code
        cfg = DictDefault(
            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
+                "base_model": "axolotl-ai-internal/llama-7m",
                "sequence_len": 2048,
                "val_set_size": 0.01,
                "special_tokens": {
@@ -385,7 +385,7 @@ class TestMultiGPULlama:
        )
        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high"
+            temp_dir + "/runs", "train/train_loss", 2.3, "Train loss (%s) is too high"
        )
    @pytest.mark.parametrize(
@@ -396,7 +396,7 @@ class TestMultiGPULlama:
        # pylint: disable=duplicate-code
        cfg = DictDefault(
            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
+                "base_model": "axolotl-ai-internal/llama-7m",
                "sample_packing": True,
                "pad_to_sequence_len": True,
                "sequence_len": 1024,
@@ -457,7 +457,7 @@ class TestMultiGPULlama:
        )
        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high"
+            temp_dir + "/runs", "train/train_loss", 2.3, "Train loss (%s) is too high"
        )
    @require_torch_2_6_0
@@ -475,7 +475,7 @@ class TestMultiGPULlama:
        # pylint: disable=duplicate-code
        cfg = DictDefault(
            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
+                "base_model": "axolotl-ai-internal/llama-7m",
                "sample_packing": True,
                "pad_to_sequence_len": True,
                "sequence_len": 2048,
@@ -538,7 +538,7 @@ class TestMultiGPULlama:
        )
        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss is too high"
+            temp_dir + "/runs", "train/train_loss", 2.1, "Train loss (%s) is too high"
        )
    def test_fsdp_qlora_prequant_packed(self, temp_dir):
@@ -618,7 +618,7 @@ class TestMultiGPULlama:
        )
        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high"
+            temp_dir + "/runs", "train/train_loss", 2.3, "Train loss (%s) is too high"
        )
    @pytest.mark.parametrize(
@@ -654,7 +654,7 @@ class TestMultiGPULlama:
            adapter = {}
        cfg = DictDefault(
            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
+                "base_model": "axolotl-ai-internal/llama-7m",
                "sample_packing": True,
                "pad_to_sequence_len": True,
                "sequence_len": 1024,
@@ -702,7 +702,7 @@ class TestMultiGPULlama:
        )
        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high"
+            temp_dir + "/runs", "train/train_loss", 2.3, "Train loss (%s) is too high"
        )
    @pytest.mark.parametrize(
@@ -728,7 +728,7 @@ class TestMultiGPULlama:
            adapter = {}
        cfg = DictDefault(
            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
+                "base_model": "axolotl-ai-internal/llama-7m",
                "sample_packing": True,
                "pad_to_sequence_len": True,
                "sequence_len": 1024,
@@ -776,7 +776,7 @@ class TestMultiGPULlama:
        )
        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high"
+            temp_dir + "/runs", "train/train_loss", 2.3, "Train loss (%s) is too high"
        )
    @pytest.mark.parametrize(
@@ -802,7 +802,7 @@ class TestMultiGPULlama:
            adapter = {}
        cfg = DictDefault(
            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
+                "base_model": "axolotl-ai-internal/llama-7m",
                "sample_packing": True,
                "pad_to_sequence_len": True,
                "sequence_len": 1024,
@@ -850,7 +850,7 @@ class TestMultiGPULlama:
        )
        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high"
+            temp_dir + "/runs", "train/train_loss", 2.3, "Train loss (%s) is too high"
        )
    @pytest.mark.skip(
@@ -860,7 +860,7 @@ class TestMultiGPULlama:
        # pylint: disable=duplicate-code
        cfg = DictDefault(
            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
+                "base_model": "axolotl-ai-internal/llama-7m",
                "fix_untrained_tokens": True,
                "sequence_len": 512,
                "val_set_size": 0.0,
@@ -917,5 +917,5 @@ class TestMultiGPULlama:
        )
        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 4.0, "Train Loss is too high"
+            temp_dir + "/runs", "train/train_loss", 4.0, "Train loss (%s) is too high"
        )
--- a/tests/e2e/multigpu/test_ray.py
+++ b/tests/e2e/multigpu/test_ray.py
@@ -80,7 +80,7 @@ class TestMultiGPURay:
        )
        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high"
+            temp_dir + "/runs", "train/train_loss", 2.3, "Train loss (%s) is too high"
        )
    @require_torch_lt_2_6_0
@@ -138,5 +138,5 @@ class TestMultiGPURay:
        )
        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high"
+            temp_dir + "/runs", "train/train_loss", 2.3, "Train loss (%s) is too high"
        )
--- a/tests/e2e/multigpu/patched/test_sp.py
+++ b/tests/e2e/multigpu/patched/test_sp.py
@@ -10,7 +10,7 @@ from transformers.testing_utils import get_torch_dist_unique_port
 from axolotl.utils.dict import DictDefault
-from ...utils import check_tensorboard
+from ..utils import check_tensorboard
 os.environ["WANDB_DISABLED"] = "true"
@@ -93,7 +93,7 @@ class TestSequenceParallelism:
        )
        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.6, "Train Loss is too high"
+            temp_dir + "/runs", "train/train_loss", 2.6, "Train loss (%s) is too high"
        )
    @pytest.mark.parametrize(
--- a/tests/e2e/patched/test_4d_multipack_llama.py
+++ b/tests/e2e/patched/test_4d_multipack_llama.py
@@ -9,7 +9,7 @@ import unittest
 from axolotl.cli.args import TrainerCliArgs
 from axolotl.common.datasets import load_datasets
 from axolotl.train import train
-from axolotl.utils.config import normalize_config, validate_config
+from axolotl.utils.config import normalize_config
 from axolotl.utils.dict import DictDefault
 from ..utils import check_model_output_exists, with_temp_dir
@@ -60,7 +60,6 @@ class Test4dMultipackLlama(unittest.TestCase):
                "fp16": True,
            }
        )
        cfg = validate_config(cfg)
        normalize_config(cfg)
        cli_args = TrainerCliArgs()
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
@@ -105,7 +104,6 @@ class Test4dMultipackLlama(unittest.TestCase):
                "fp16": True,
            }
        )
        cfg = validate_config(cfg)
        normalize_config(cfg)
        cli_args = TrainerCliArgs()
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
--- a/tests/e2e/patched/test_fa_xentropy.py
+++ b/tests/e2e/patched/test_fa_xentropy.py
@@ -86,5 +86,5 @@ class TestFAXentropyLlama:
        check_model_output_exists(temp_dir, cfg)
        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 1.5, "Train Loss is too high"
+            temp_dir + "/runs", "train/train_loss", 1.5, "Train loss (%s) is too high"
        )
--- a/tests/e2e/patched/test_falcon_samplepack.py
+++ b/tests/e2e/patched/test_falcon_samplepack.py
@@ -9,7 +9,7 @@ import unittest
 from axolotl.cli.args import TrainerCliArgs
 from axolotl.common.datasets import load_datasets
 from axolotl.train import train
-from axolotl.utils.config import normalize_config, validate_config
+from axolotl.utils.config import normalize_config
 from axolotl.utils.dict import DictDefault
 from ..utils import check_model_output_exists, with_temp_dir
@@ -63,7 +63,6 @@ class TestFalconPatched(unittest.TestCase):
                "bf16": "auto",
            }
        )
        cfg = validate_config(cfg)
        normalize_config(cfg)
        cli_args = TrainerCliArgs()
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
@@ -104,7 +103,6 @@ class TestFalconPatched(unittest.TestCase):
                "bf16": "auto",
            }
        )
        cfg = validate_config(cfg)
        normalize_config(cfg)
        cli_args = TrainerCliArgs()
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
--- a/tests/e2e/patched/test_fused_llama.py
+++ b/tests/e2e/patched/test_fused_llama.py
@@ -12,7 +12,7 @@ from transformers.utils import is_torch_bf16_gpu_available
 from axolotl.cli.args import TrainerCliArgs
 from axolotl.common.datasets import load_datasets
 from axolotl.train import train
-from axolotl.utils.config import normalize_config, validate_config
+from axolotl.utils.config import normalize_config
 from axolotl.utils.dict import DictDefault
 from ..utils import check_model_output_exists, with_temp_dir
@@ -67,7 +67,6 @@ class TestFusedLlama(unittest.TestCase):
            cfg.bf16 = True
        else:
            cfg.fp16 = True
        cfg = validate_config(cfg)
        normalize_config(cfg)
        cli_args = TrainerCliArgs()
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
--- a/tests/e2e/patched/test_llama_s2_attention.py
+++ b/tests/e2e/patched/test_llama_s2_attention.py
@@ -11,7 +11,7 @@ import pytest
 from axolotl.cli.args import TrainerCliArgs
 from axolotl.common.datasets import load_datasets
 from axolotl.train import train
-from axolotl.utils.config import normalize_config, validate_config
+from axolotl.utils.config import normalize_config
 from axolotl.utils.dict import DictDefault
 from ..utils import check_model_output_exists, with_temp_dir
@@ -65,7 +65,6 @@ class TestLlamaShiftedSparseAttention(unittest.TestCase):
            }
        )
        cfg = validate_config(cfg)
        normalize_config(cfg)
        cli_args = TrainerCliArgs()
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
@@ -106,7 +105,6 @@ class TestLlamaShiftedSparseAttention(unittest.TestCase):
            }
        )
        cfg = validate_config(cfg)
        normalize_config(cfg)
        cli_args = TrainerCliArgs()
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
--- a/tests/e2e/patched/test_lora_llama_multipack.py
+++ b/tests/e2e/patched/test_lora_llama_multipack.py
@@ -12,7 +12,7 @@ from transformers.utils import is_auto_gptq_available, is_torch_bf16_gpu_availab
 from axolotl.cli.args import TrainerCliArgs
 from axolotl.common.datasets import load_datasets
 from axolotl.train import train
-from axolotl.utils.config import normalize_config, validate_config
+from axolotl.utils.config import normalize_config
 from axolotl.utils.dict import DictDefault
 from ..utils import check_model_output_exists, with_temp_dir
@@ -70,7 +70,6 @@ class TestLoraLlama(unittest.TestCase):
        else:
            cfg.fp16 = True
        cfg = validate_config(cfg)
        normalize_config(cfg)
        cli_args = TrainerCliArgs()
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
@@ -121,7 +120,6 @@ class TestLoraLlama(unittest.TestCase):
                "lr_scheduler": "cosine",
            }
        )
        cfg = validate_config(cfg)
        normalize_config(cfg)
        cli_args = TrainerCliArgs()
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
--- a/tests/e2e/patched/test_mistral_samplepack.py
+++ b/tests/e2e/patched/test_mistral_samplepack.py
@@ -9,7 +9,7 @@ import unittest
 from axolotl.cli.args import TrainerCliArgs
 from axolotl.common.datasets import load_datasets
 from axolotl.train import train
-from axolotl.utils.config import normalize_config, validate_config
+from axolotl.utils.config import normalize_config
 from axolotl.utils.dict import DictDefault
 from ..utils import check_model_output_exists, with_temp_dir
@@ -63,7 +63,6 @@ class TestMistral(unittest.TestCase):
                "bf16": "auto",
            }
        )
        cfg = validate_config(cfg)
        normalize_config(cfg)
        cli_args = TrainerCliArgs()
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
@@ -105,7 +104,6 @@ class TestMistral(unittest.TestCase):
                "bf16": "auto",
            }
        )
        cfg = validate_config(cfg)
        normalize_config(cfg)
        cli_args = TrainerCliArgs()
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
--- a/tests/e2e/patched/test_mixtral_samplepack.py
+++ b/tests/e2e/patched/test_mixtral_samplepack.py
@@ -9,7 +9,7 @@ import unittest
 from axolotl.cli.args import TrainerCliArgs
 from axolotl.common.datasets import load_datasets
 from axolotl.train import train
-from axolotl.utils.config import normalize_config, validate_config
+from axolotl.utils.config import normalize_config
 from axolotl.utils.dict import DictDefault
 from ..utils import check_model_output_exists, with_temp_dir
@@ -60,7 +60,6 @@ class TestMixtral(unittest.TestCase):
                "bf16": "auto",
            }
        )
        cfg = validate_config(cfg)
        normalize_config(cfg)
        cli_args = TrainerCliArgs()
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
--- a/tests/e2e/patched/test_model_patches.py
+++ b/tests/e2e/patched/test_model_patches.py
@@ -6,7 +6,7 @@ import unittest
 import transformers
-from axolotl.utils.config import normalize_config, validate_config
+from axolotl.utils.config import normalize_config
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.models import load_model, load_tokenizer
@@ -47,7 +47,6 @@ class TestModelPatches(unittest.TestCase):
                "eval_steps": 10,
            }
        )
        cfg = validate_config(cfg)
        normalize_config(cfg)
        tokenizer = load_tokenizer(cfg)
        load_model(cfg, tokenizer, inference=False)
@@ -80,7 +79,6 @@ class TestModelPatches(unittest.TestCase):
                "eval_steps": 10,
            }
        )
        cfg = validate_config(cfg)
        normalize_config(cfg)
        tokenizer = load_tokenizer(cfg)
        load_model(cfg, tokenizer, inference=False)
--- a/tests/e2e/patched/test_phi_multipack.py
+++ b/tests/e2e/patched/test_phi_multipack.py
@@ -9,7 +9,7 @@ import unittest
 from axolotl.cli.args import TrainerCliArgs
 from axolotl.common.datasets import load_datasets
 from axolotl.train import train
-from axolotl.utils.config import normalize_config, validate_config
+from axolotl.utils.config import normalize_config
 from axolotl.utils.dict import DictDefault
 from ..utils import check_model_output_exists, with_temp_dir
@@ -63,7 +63,6 @@ class TestPhiMultipack(unittest.TestCase):
            }
        )
        cfg = validate_config(cfg)
        normalize_config(cfg)
        cli_args = TrainerCliArgs()
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
@@ -83,7 +82,7 @@ class TestPhiMultipack(unittest.TestCase):
                "sample_packing": True,
                "flash_attention": True,
                "pad_to_sequence_len": True,
-                "load_in_4bit": True,
+                "load_in_8bit": False,
                "adapter": "qlora",
                "lora_r": 64,
                "lora_alpha": 32,
@@ -115,7 +114,6 @@ class TestPhiMultipack(unittest.TestCase):
            }
        )
        cfg = validate_config(cfg)
        normalize_config(cfg)
        cli_args = TrainerCliArgs()
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
--- a/tests/e2e/patched/test_resume.py
+++ b/tests/e2e/patched/test_resume.py
@@ -12,7 +12,7 @@ from transformers.utils import is_torch_bf16_gpu_available
 from axolotl.cli.args import TrainerCliArgs
 from axolotl.common.datasets import load_datasets
 from axolotl.train import train
-from axolotl.utils.config import normalize_config, validate_config
+from axolotl.utils.config import normalize_config
 from axolotl.utils.dict import DictDefault
 from ..utils import check_model_output_exists, most_recent_subdir
@@ -46,9 +46,8 @@ class TestResumeLlama:
                },
                "datasets": [
                    {
-                        "path": "tatsu-lab/alpaca",
+                        "path": "vicgalle/alpaca-gpt4",
                        "type": "alpaca",
                        "split": "train[:10%]",
                    },
                ],
                "num_epochs": 2,
@@ -68,7 +67,6 @@ class TestResumeLlama:
            cfg.bf16 = True
        else:
            cfg.fp16 = True
        cfg = validate_config(cfg)
        normalize_config(cfg)
        cli_args = TrainerCliArgs()
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
--- a/tests/e2e/patched/test_unsloth_qlora.py
+++ b/tests/e2e/patched/test_unsloth_qlora.py
@@ -10,7 +10,7 @@ import pytest
 from axolotl.cli.args import TrainerCliArgs
 from axolotl.common.datasets import load_datasets
 from axolotl.train import train
-from axolotl.utils.config import normalize_config, validate_config
+from axolotl.utils.config import normalize_config
 from axolotl.utils.dict import DictDefault
 from ..utils import check_model_output_exists, check_tensorboard
@@ -72,7 +72,6 @@ class TestUnslothQLoRA:
            }
        )
        cfg = validate_config(cfg)
        normalize_config(cfg)
        cli_args = TrainerCliArgs()
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
@@ -81,7 +80,7 @@ class TestUnslothQLoRA:
        check_model_output_exists(temp_dir, cfg)
        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high"
+            temp_dir + "/runs", "train/train_loss", 2.0, "Train loss (%s) is too high"
        )
    def test_unsloth_llama_qlora_unpacked(self, temp_dir):
@@ -123,7 +122,6 @@ class TestUnslothQLoRA:
            }
        )
        cfg = validate_config(cfg)
        normalize_config(cfg)
        cli_args = TrainerCliArgs()
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
@@ -132,7 +130,7 @@ class TestUnslothQLoRA:
        check_model_output_exists(temp_dir, cfg)
        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high"
+            temp_dir + "/runs", "train/train_loss", 2.0, "Train loss (%s) is too high"
        )
    @pytest.mark.parametrize(
@@ -179,7 +177,6 @@ class TestUnslothQLoRA:
            }
        )
        cfg = validate_config(cfg)
        normalize_config(cfg)
        cli_args = TrainerCliArgs()
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
@@ -188,5 +185,5 @@ class TestUnslothQLoRA:
        check_model_output_exists(temp_dir, cfg)
        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high"
+            temp_dir + "/runs", "train/train_loss", 2.0, "Train loss (%s) is too high"
        )
--- a/tests/e2e/solo/test_flex.py
+++ b/tests/e2e/solo/test_flex.py
@@ -41,9 +41,8 @@ class TestPackedFlex(unittest.TestCase):
                },
                "datasets": [
                    {
-                        "path": "tatsu-lab/alpaca",
+                        "path": "vicgalle/alpaca-gpt4",
                        "type": "alpaca",
                        "split": "train[:10%]",
                    },
                ],
                "num_epochs": 1,
@@ -70,5 +69,5 @@ class TestPackedFlex(unittest.TestCase):
        train(cfg=cfg, dataset_meta=dataset_meta)
        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high"
+            temp_dir + "/runs", "train/train_loss", 2.0, "Train loss (%s) is too high"
        )
--- a/tests/e2e/test_embeddings_lr.py
+++ b/tests/e2e/test_embeddings_lr.py
@@ -102,7 +102,6 @@ class TestEmbeddingsLrScale(unittest.TestCase):
                "use_tensorboard": True,
            }
        )
        cfg = validate_config(cfg)
        normalize_config(cfg)
        cli_args = TrainerCliArgs()
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
--- a/tests/e2e/test_llama_pretrain.py
+++ b/tests/e2e/test_llama_pretrain.py
@@ -84,5 +84,5 @@ class TestPretrainLlama:
            temp_dir + "/runs",
            "train/train_loss",
            loss_threshold,
-            "Train Loss is too high",
+            "Train Loss (%s) is too high",
        )
--- a/tests/e2e/test_llama_vision.py
+++ b/tests/e2e/test_llama_vision.py
@@ -109,7 +109,6 @@ class TestLlamaVision(unittest.TestCase):
                "bf16": True,
            }
        )
        cfg = validate_config(cfg)
        normalize_config(cfg)
        cli_args = TrainerCliArgs()
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
--- a/tests/e2e/test_packing_loss.py
+++ b/tests/e2e/test_packing_loss.py
@@ -40,9 +40,8 @@ class TestPackedLlama(unittest.TestCase):
                },
                "datasets": [
                    {
-                        "path": "tatsu-lab/alpaca",
+                        "path": "vicgalle/alpaca-gpt4",
                        "type": "alpaca",
                        "split": "train[:10%]",
                    },
                ],
                "num_epochs": 1,
@@ -69,5 +68,5 @@ class TestPackedLlama(unittest.TestCase):
        train(cfg=cfg, dataset_meta=dataset_meta)
        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high"
+            temp_dir + "/runs", "train/train_loss", 2.0, "Train loss (%s) is too high"
        )
--- a/tests/e2e/test_phi.py
+++ b/tests/e2e/test_phi.py
@@ -79,7 +79,7 @@ class TestPhi(unittest.TestCase):
                "tokenizer_type": "AutoTokenizer",
                "sequence_len": 2048,
                "sample_packing": False,
-                "load_in_4bit": True,
+                "load_in_8bit": False,
                "adapter": "qlora",
                "lora_r": 64,
                "lora_alpha": 32,
@@ -111,7 +111,6 @@ class TestPhi(unittest.TestCase):
                "bf16": "auto",
            }
        )
        cfg = validate_config(cfg)
        normalize_config(cfg)
        cli_args = TrainerCliArgs()
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
--- a/tests/e2e/test_process_reward_model_smollm2.py
+++ b/tests/e2e/test_process_reward_model_smollm2.py
@@ -9,7 +9,7 @@ import unittest
 from axolotl.cli.args import TrainerCliArgs
 from axolotl.common.datasets import load_datasets
 from axolotl.train import train
-from axolotl.utils.config import normalize_config, validate_config
+from axolotl.utils.config import normalize_config
 from axolotl.utils.dict import DictDefault
 from .utils import check_model_output_exists, check_tensorboard, with_temp_dir
@@ -57,7 +57,6 @@ class TestProcessRewardSmolLM2(unittest.TestCase):
                "seed": 42,
            }
        )
        cfg = validate_config(cfg)
        normalize_config(cfg)
        cli_args = TrainerCliArgs()
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
--- a/tests/e2e/test_reward_model_smollm2.py
+++ b/tests/e2e/test_reward_model_smollm2.py
@@ -73,6 +73,6 @@ class TestRewardModelLoraSmolLM2(unittest.TestCase):
        train(cfg=cfg, dataset_meta=dataset_meta)
        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss is too high"
+            temp_dir + "/runs", "train/train_loss", 2.5, "Train loss (%s) is too high"
        )
        check_model_output_exists(temp_dir, cfg)
--- a/tests/test_exact_deduplication.py
+++ b/tests/test_exact_deduplication.py
@@ -11,7 +11,7 @@ from unittest.mock import patch
 import pytest
 from datasets import Dataset
-from axolotl.utils.config import normalize_config, validate_config
+from axolotl.utils.config import normalize_config
 from axolotl.utils.data import prepare_dataset
 from axolotl.utils.data.rl import load_prepare_preference_datasets
 from axolotl.utils.data.utils import deduplicate_and_log_datasets
@@ -319,7 +319,6 @@ class TestDeduplicateNonRL(unittest.TestCase):
                "num_epochs": 1,
            }
        )
        self.cfg_1 = validate_config(self.cfg_1)
        normalize_config(self.cfg_1)
    @pytest.mark.skip(reason="TODO: fix hf hub offline to work with HF rate limits")
--- a/tests/test_perplexity.py
+++ b/tests/test_perplexity.py
@@ -8,7 +8,7 @@ from transformers.models.auto.tokenization_auto import AutoTokenizer
 from axolotl.utils.callbacks.perplexity import Perplexity
-MODEL_NAME = "HuggingFaceTB/SmolLM2-135M"
+MODEL_NAME = "axolotl-ai-internal/llama-7m"
@fixture()
@@ -36,7 +36,7 @@ One day, a little fish named Fin was swimming near the shore. He saw a big crab
 """
    result = metric.compute(model, [sample_text])
    ppl = result["score"]
-    assert round(ppl, 2) == 7.41
+    assert round(ppl, 2) == 75.14
 def test_perplexity_short(model, metric):
@@ -44,4 +44,4 @@ def test_perplexity_short(model, metric):
    sample_text = "Once upon a time, there was a little car named Beep. Beep loved to go fast and play in the sun."
    result = metric.compute(model, [sample_text])
    ppl = result["score"]
-    assert round(ppl, 2) == 10.33
+    assert round(ppl, 2) == 70.54
Author	SHA1	Message	Date
Wing Lian	a0670abc94	add output for train loss in assertian err	2025-04-18 08:11:11 -07:00
Wing Lian	08f287b57f	swap llama tests for 7m param model	2025-04-17 09:52:35 -07:00
Wing Lian	b4c7d9c29d	fix perplexity scores	2025-04-17 07:58:53 -07:00
Wing Lian	d2637fb01d	first pass at modifying tests to use llama-7m	2025-04-16 21:14:04 -07:00
		`@@ -1,2 +0,0 @@`
			`# Tests under this directory should get run "solo" on their own as they`
			`# seem to cause issues when run in the same batch as other tests.`