more parity across tests and docker images for packaging/setuptools

make sure packaging version is consistent
comment out license for validation for now
2025-03-21 08:56:01 -04:00 · 2025-03-21 08:27:17 -04:00 · 2025-03-21 08:20:28 -04:00 · 2025-03-21 08:12:07 -04:00 · 2025-03-21 07:25:09 -04:00 · 2025-03-21 07:19:12 -04:00
35 changed files with 341 additions and 70 deletions
--- a/.github/workflows/base.yml
+++ b/.github/workflows/base.yml
@@ -40,6 +40,12 @@ jobs:
            python_version: "3.11"
            pytorch: 2.6.0
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
+          - cuda: "128"
+            cuda_version: 12.8.1
+            cudnn_version: ""
+            python_version: "3.11"
+            pytorch: nightly
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
    steps:
      - name: Checkout
        uses: actions/checkout@v4
@@ -61,7 +67,7 @@ jobs:
        uses: docker/build-push-action@v4
        with:
          context: .
-          file: ./docker/Dockerfile-base
+          file: ${{ matrix.pytorch == 'nightly' && './docker/Dockerfile-base-nightly' || './docker/Dockerfile-base' }}
          push: ${{ github.event_name != 'pull_request' }}
          tags: ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
          labels: ${{ steps.metadata.outputs.labels }}
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -88,6 +88,11 @@ jobs:
            pytorch: 2.5.1
            axolotl_extras:
            is_latest: true
+          - cuda: 124
+            cuda_version: 12.4.1
+            python_version: "3.11"
+            pytorch: 2.6.0
+            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
      - name: Checkout
--- a/.github/workflows/nightlies.yml
+++ b/.github/workflows/nightlies.yml
@@ -80,6 +80,11 @@ jobs:
            python_version: "3.11"
            pytorch: 2.5.1
            axolotl_extras:
+          - cuda: 124
+            cuda_version: 12.4.1
+            python_version: "3.11"
+            pytorch: 2.6.0
+            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
      - name: Checkout
--- a/.github/workflows/pypi.yml
+++ b/.github/workflows/pypi.yml
@@ -40,7 +40,7 @@ jobs:

      - name: Install dependencies
        run: |
-          pip3 install wheel packaging
+          pip3 install wheel packaging==23.2
          pip3 install --no-build-isolation -e .
          pip3 install -r requirements-dev.txt -r requirements-tests.txt

--- a/.github/workflows/tests-nightly.yml
+++ b/.github/workflows/tests-nightly.yml
@@ -42,7 +42,7 @@ jobs:
      - name: upgrade pip
        run: |
          pip3 install --upgrade pip
-          pip3 install --upgrade packaging setuptools wheel
+          pip3 install --upgrade packaging==23.2 setuptools==75.8.0 wheel

      - name: Install PyTorch
        run: |
@@ -59,7 +59,7 @@ jobs:
      - name: Install dependencies
        run: |
          pip3 install --upgrade pip
-          pip3 install --upgrade packaging
+          pip3 install --upgrade packaging==23.2
          pip3 install --no-build-isolation -U -e .
          python scripts/unsloth_install.py | sh
          python scripts/cutcrossentropy_install.py | sh
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -74,7 +74,7 @@ jobs:
      - name: upgrade pip
        run: |
          pip3 install --upgrade pip
-          pip3 install --upgrade packaging setuptools wheel
+          pip3 install --upgrade packaging==23.2 setuptools==75.8.0 wheel

      - name: Install PyTorch
        run: |
@@ -147,7 +147,7 @@ jobs:
      - name: upgrade pip
        run: |
          pip3 install --upgrade pip
-          pip3 install --upgrade packaging setuptools setuptools_scm build wheel
+          pip3 install --upgrade packaging==23.2 setuptools==75.8.0 setuptools_scm build wheel

      - name: Install PyTorch
        run: |
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -22,8 +22,8 @@ repos:
    rev: 6.1.0
    hooks:
    - id: flake8
-   repo: https://github.com/PyCQA/pylint
-    rev: v3.3.0
+-   repo: https://github.com/pylint-dev/pylint
+    rev: c8c96d20cde3552a79858c7456bb1483bf83d633
    hooks:
    - id: pylint
 -   repo: https://github.com/pre-commit/mirrors-mypy
--- a/README.md
+++ b/README.md
@@ -55,6 +55,7 @@ Features:
 ### Installation

 ```bash
+pip3 install -U packaging==23.2 setuptools==75.8.0 wheel ninja
 pip3 install --no-build-isolation axolotl[flash-attn,deepspeed]

 # Download example axolotl configs, deepspeed configs
--- a/_quarto.yml
+++ b/_quarto.yml
@@ -32,8 +32,9 @@ website:
          contents:
            - docs/getting-started.qmd
            - docs/installation.qmd
-            - docs/cli.qmd
            - docs/inference.qmd
+            - docs/cli.qmd
+            - docs/config.qmd

        - section: "Dataset Formats"
          contents: docs/dataset-formats/*
@@ -74,10 +75,6 @@ website:
            - docs/debugging.qmd
            - docs/nccl.qmd

-        - section: "Reference"
-          contents:
-            - docs/config.qmd
-
 format:
  html:
    theme: darkly
--- a/cicd/Dockerfile.jinja
+++ b/cicd/Dockerfile.jinja
@@ -31,6 +31,7 @@ RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
        sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
    fi

+RUN pip install packaging==23.2 setuptools==75.8.0
 RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
        pip install --no-build-isolation -e .[deepspeed,flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
    else \
--- a/docker/Dockerfile-base
+++ b/docker/Dockerfile-base
@@ -28,7 +28,7 @@ ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"

 WORKDIR /workspace

-RUN python3 -m pip install --upgrade pip && pip3 install packaging && \
+RUN python3 -m pip install --upgrade pip && pip3 install -U packaging==23.2 setuptools==75.8.0 wheel && \
    python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} --extra-index-url https://download.pytorch.org/whl/cu$CUDA && \
    python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \
    python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main"
--- a/docker/Dockerfile-base-nightly
+++ b/docker/Dockerfile-base-nightly
@@ -0,0 +1,39 @@
+ARG CUDA_VERSION="12.8.1"
+ARG CUDNN_VERSION="8"
+ARG UBUNTU_VERSION="22.04"
+ARG MAX_JOBS=4
+
+FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION AS base-builder
+
+ENV PATH="/root/miniconda3/bin:${PATH}"
+
+ARG PYTHON_VERSION="3.11"
+ARG PYTORCH_VERSION="nightly"
+ARG CUDA="128"
+ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
+
+ENV PYTHON_VERSION=$PYTHON_VERSION
+ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
+
+RUN apt-get update \
+    && apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev pkg-config && rm -rf /var/lib/apt/lists/* \
+    && wget \
+    https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
+    && mkdir /root/.conda \
+    && bash Miniconda3-latest-Linux-x86_64.sh -b \
+    && rm -f Miniconda3-latest-Linux-x86_64.sh \
+    && conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}"
+
+ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
+
+WORKDIR /workspace
+
+RUN python3 -m pip install --upgrade pip && pip3 install packaging && \
+    python3 -m pip install --no-cache-dir -U torch --extra-index-url https://download.pytorch.org/whl/nightly/cu$CUDA && \
+    python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \
+    python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main"
+
+RUN git lfs install --skip-repo && \
+    pip3 install awscli && \
+    # The base image ships with `pydantic==1.8.2` which is not working
+    pip3 install -U --no-cache-dir pydantic==1.10.10
--- a/docs/config.qmd
+++ b/docs/config.qmd
@@ -1,5 +1,5 @@
 ---
-title: Config options
+title: Config Reference
 description: A complete list of all configuration options.
 ---

@@ -30,6 +30,8 @@ tokenizer_legacy:
 # Resize the model embeddings when new tokens are added to multiples of 32
 # This is reported to improve training speed on some models
 resize_token_embeddings_to_32x:
+# Optional[bool] Whether to shrink the embeddings to len(tokenizer). By default, we won't shrink.
+shrink_embeddings:

 # (Internal use only)
 # Used to identify which the model is based on
@@ -205,10 +207,46 @@ test_datasets:
    data_files:
      - /workspace/data/eval.jsonl

-# use RL training: 'dpo', 'ipo', 'kto'
+# use RL training: 'dpo', 'ipo', 'kto', 'simpo', 'orpo', 'grpo'
 rl:
-# whether to perform weighting if doing DPO training. Boolean.
-dpo_use_weighting:
+rl_beta:  # Optional[float]. The beta parameter for the RL training.
+
+# dpo
+dpo_use_weighting:  # Optional[bool]. Whether to perform weighting.
+rpo_alpha: # Optional[float]. Weighting of NLL term in loss from RPO paper.
+
+# orpo
+orpo_alpha: 0.1  # Parameter controlling the relative ratio loss weight in the ORPO loss. Passed to `beta` in `ORPOConfig` due to trl mapping.
+
+# kto
+kto_desirable_weight: # Optional[float]. Factor for desirable loss term in KTO loss.
+kto_undesirable_weight: # Optional[float]. Factor for undesirable loss term in KTO loss.
+
+# simpo
+cpo_alpha: 1.0  # Weight of the BC regularizer
+simpo_gamma: 0.5  # Target reward margin for the SimPO loss
+
+# grpo
+trl:
+  use_vllm: # Optional[bool]. Whether to use VLLM for RL training.
+  vllm_device: # Optional[str]. Device to use for VLLM.
+  vllm_gpu_memory_utilization: # Optional[float]. GPU memory utilization for VLLM.
+  vllm_max_model_len: # Optional[int]. Maximum length of the model for VLLM.
+  vllm_dtype: # Optional[str]. Data type for VLLM.
+
+  beta: # Optional[float]. Beta parameter for the RL training. Same as `rl_beta`; set only one of them, or make sure both match.
+  max_completion_length: # Optional[int]. Maximum length of the completion for RL training.
+
+  reward_funcs: # Optional[list[str]]. List of reward functions to load. Paths must be importable from current dir.
+  reward_weights: # Optional[list[float]]. List of reward weights for the reward functions.
+
+  num_generations: # Optional[int]. Number of generations to sample.
+  log_completions: # Optional[bool]. Whether to log completions.
+
+  sync_ref_model: # Optional[bool]. Whether to sync the reference model.
+  ref_model_mixup_alpha: # Optional[float]. Mixup alpha for the reference model.
+  ref_model_sync_steps: # Optional[int]. Sync steps for the reference model.
+

 # reward modelling: `True` or `False`
 reward_model:
@@ -232,7 +270,7 @@ default_system_message: You are a helpful assistant. Please give a long and deta
 # subsequent training attempts load faster, relative path
 dataset_prepared_path: data/last_run_prepared
 # Push prepared dataset to hub
-push_dataset_to_hub: # repo path
+push_dataset_to_hub: # Optional[str] repo_org/repo_name
 # The maximum number of processes to use while preprocessing your input dataset. This defaults to `os.cpu_count()`
 # if not set.
 dataset_processes: # defaults to os.cpu_count() if not set
--- a/docs/faq.qmd
+++ b/docs/faq.qmd
@@ -27,6 +27,16 @@ description: Frequently asked questions

 > A: This is usually an issue with the GPU. This can be resolved through setting the os environment variable `CUDA_VISIBLE_DEVICES=0`. If you are on runpod, this is usually a pod issue. Starting a new pod should take care of it.

+**Q: Received a `torch.Size` mismatch error between the checkpoint and the model when merging or loading adapters.**
+
+> A: This is likely due to vocab size mismatch. By default, Axolotl expands the model's embeddings if the tokenizer has more tokens than the model. Please use the `axolotl merge-lora` command to merge the adapters instead of using your own scripts.
+
+> On the other hand, if the model has more tokens than the tokenizer, Axolotl does not shrink the model's embeddings unless `shrink_embeddings: true` is set in the config.
+
+**Q: How to call Axolotl via custom python scripts?**
+
+> A: Axolotl is just Python, so please see `src/axolotl/cli/main.py` for how each command is called.
+
 ### Chat templates

 **Q: `jinja2.exceptions.UndefinedError: 'dict object' has no attribute 'content' / 'role' / ____`**
--- a/docs/getting-started.qmd
+++ b/docs/getting-started.qmd
@@ -36,7 +36,9 @@ The YAML configuration file controls everything about your training. Here's what

 ```yaml
 base_model: NousResearch/Llama-3.2-1B
-# hub_model_id: username/custom_model_name
+
+load_in_8bit: true
+adapter: lora

 datasets:
  - path: teknium/GPT4-LLM-Cleaned
@@ -44,11 +46,15 @@ datasets:
 dataset_prepared_path: last_run_prepared
 val_set_size: 0.1
 output_dir: ./outputs/lora-out
-
-adapter: lora
-lora_model_dir:
 ```

+::: {.callout-tip}
+`load_in_8bit: true` and `adapter: lora` enable LoRA adapter finetuning.
+
+- To perform Full finetuning, remove these two lines.
+- To perform QLoRA finetuning, replace with `load_in_4bit: true` and `adapter: qlora`.
+:::
+
 See our [Config options](config.qmd) for more details.

 ### Training {#sec-training}
@@ -56,7 +62,7 @@ See our [Config options](config.qmd) for more details.
 When you run `axolotl train`, Axolotl:

 1. Downloads the base model
-2. (If specified) applies LoRA adapter layers
+2. (If specified) applies QLoRA/LoRA adapter layers
 3. Loads and processes the dataset
 4. Runs the training loop
 5. Saves the trained model and / or LoRA weights
@@ -69,6 +75,8 @@ Let's modify the example for your own data:

 ```yaml
 base_model: NousResearch/Nous-Hermes-llama-1b-v1
+
+load_in_8bit: true
 adapter: lora

 # Training settings
@@ -104,8 +112,6 @@ format):
 {"instruction": "Classify this text", "input": "Not good at all", "output": "negative"}
 ```

-Please consult the supported [Dataset Formats](dataset-formats/) for more details.
-
 3. Run the training:

 ```bash
--- a/docs/inference.qmd
+++ b/docs/inference.qmd
@@ -1,5 +1,5 @@
 ---
-title: "Inference"
+title: "Inference and Merging"
 format:
  html:
    toc: true
@@ -9,10 +9,14 @@ execute:
  enabled: false
 ---

-This guide covers how to use your trained models for inference, including model loading, interactive testing, and common troubleshooting steps.
+This guide covers how to use your trained models for inference, including model loading, interactive testing, merging adapters, and common troubleshooting steps.

 ## Quick Start {#sec-quickstart}

+::: {.callout-tip}
+Use the same config for inference/merging that you used for training.
+:::
+
 ### Basic Inference {#sec-basic}

 ::: {.panel-tabset}
--- a/docs/installation.qmd
+++ b/docs/installation.qmd
@@ -22,6 +22,7 @@ This guide covers all the ways you can install and set up Axolotl for your envir
 ### PyPI Installation (Recommended) {#sec-pypi}

 ```{.bash}
+pip3 install -U packaging setuptools wheel ninja
 pip3 install --no-build-isolation axolotl[flash-attn,deepspeed]
 ```

@@ -37,7 +38,7 @@ For the latest features between releases:
 ```{.bash}
 git clone https://github.com/axolotl-ai-cloud/axolotl.git
 cd axolotl
-pip3 install packaging ninja
+pip3 install -U packaging setuptools wheel ninja
 pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
 ```

@@ -107,7 +108,7 @@ We recommend using WSL2 (Windows Subsystem for Linux) or Docker.
 2. Install PyTorch: https://pytorch.org/get-started/locally/
 3. Install Axolotl:
   ```{.bash}
-   pip3 install packaging
+   pip3 install -U packaging setuptools wheel ninja
   pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
   ```
 4. (Optional) Login to Hugging Face:
--- a/docs/lora_optims.qmd
+++ b/docs/lora_optims.qmd
@@ -66,6 +66,10 @@ logic to be compatible with more of them.

 </details>

+::: {.callout-tip}
+Check out our [LoRA optimizations blog](https://axolotlai.substack.com/p/accelerating-lora-fine-tuning-with).
+:::
+
 ## Usage

 These optimizations can be enabled in your Axolotl config YAML file. The
--- a/docs/reward_modelling.qmd
+++ b/docs/reward_modelling.qmd
@@ -41,6 +41,10 @@ Bradley-Terry chat templates expect single-turn conversations in the following f

 ### Process Reward Models (PRM)

+::: {.callout-tip}
+Check out our [PRM blog](https://axolotlai.substack.com/p/process-reward-models).
+:::
+
 Process reward models are trained using data which contains preference annotations for each step in a series of interactions. Typically, PRMs are trained to provide reward signals over each step of a reasoning trace and are used for downstream reinforcement learning.
 ```yaml
 base_model: Qwen/Qwen2.5-3B
--- a/docs/rlhf.qmd
+++ b/docs/rlhf.qmd
@@ -298,7 +298,7 @@ The input format is a simple JSON input with customizable fields based on the ab

 ### IPO

-As IPO is just DPO with a different loss function, all supported options for DPO works here.
+As IPO is just DPO with a different loss function, all supported dataset formats for [DPO](#dpo) are also supported for IPO.

 ```yaml
 rl: ipo
@@ -344,8 +344,9 @@ ORPO supports the following types with the following dataset format:

 ```yaml
 rl: kto
-rl_beta: 0.5
-kto_desirable_weight: 0.2
+rl_beta: 0.1  # default
+kto_desirable_weight: 1.0  # default
+kto_undesirable_weight: 1.0  # default

 remove_unused_columns: false

@@ -497,6 +498,10 @@ The input format is a simple JSON input with customizable fields based on the ab

 ### GRPO

+::: {.callout-tip}
+Check out our [GRPO cookbook](https://github.com/axolotl-ai-cloud/axolotl-cookbook/tree/main/grpo#training-an-r1-style-large-language-model-using-grpo).
+:::
+
 GRPO uses custom reward functions and transformations. Please have them ready locally.

 For ex, to load OpenAI's GSM8K and use a random reward for completions:
@@ -540,6 +545,19 @@ To see other examples of custom reward functions, please see [TRL GRPO Docs](htt

 To see description of the configs, please see [TRLConfig](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/utils/config/models/input/v0_4_1/trl.py).

+### SimPO
+
+SimPO uses [CPOTrainer](https://huggingface.co/docs/trl/main/en/cpo_trainer) but with an alternative loss function.
+
+```yaml
+rl: simpo
+rl_beta: 0.1  # default in CPOTrainer
+cpo_alpha: 1.0  # default in CPOTrainer
+simpo_gamma: 0.5  # default in CPOTrainer
+```
+
+This method uses the same dataset format as [DPO](#dpo).
+
 ### Using local dataset files

 ```yaml
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,5 +1,5 @@
 [build-system]
-requires = ["setuptools>=64", "wheel", "setuptools_scm>=8"]
+requires = ["setuptools>=64", "wheel", "setuptools_scm>=8", "packaging==23.2"]
 build-backend = "setuptools.build_meta"

 [project]
@@ -8,6 +8,7 @@ dynamic = ["version", "dependencies", "optional-dependencies"]
 description = "LLM Trainer"
 readme = "README.md"
 requires-python = ">=3.10"
+# license = "Apache-2.0"

 [project.scripts]
 axolotl = "axolotl.cli.main:main"
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/

 # START section of dependencies that don't install on Darwin/MacOS
-bitsandbytes==0.45.2
+bitsandbytes==0.45.3
 triton>=3.0.0
 mamba-ssm==1.2.0.post1
 flash-attn==2.7.4.post1
@@ -12,12 +12,12 @@ liger-kernel==0.5.3

 packaging==23.2

-peft==0.14.0
+peft==0.15.0
 transformers==4.49.0
-tokenizers>=0.21.0
-accelerate==1.3.0
-datasets==3.2.0
-deepspeed==0.16.1
+tokenizers>=0.21.1
+accelerate==1.5.2
+datasets==3.4.1
+deepspeed==0.16.4
 trl==0.15.1

 optimum==1.16.2
@@ -62,5 +62,5 @@ antlr4-python3-runtime==4.13.2
 torchao==0.7.0
 schedulefree==1.3.0

-axolotl-contribs-lgpl @ git+https://github.com/axolotl-ai-cloud/axolotl-contribs-lgpl.git@import-issues-v2
+axolotl-contribs-lgpl==0.0.6
 axolotl-contribs-mit==0.0.3
--- a/scripts/cutcrossentropy_install.py
+++ b/scripts/cutcrossentropy_install.py
@@ -17,12 +17,12 @@ if v < V("2.4.0"):

 cce_spec = importlib.util.find_spec("cut_cross_entropy")

-UNINSTALL_PREFIX = ""
+uninstall_prefix = ""
 if cce_spec:
    if not importlib.util.find_spec("cut_cross_entropy.transformers"):
-        UNINSTALL_PREFIX = "pip uninstall -y cut-cross-entropy && "
+        uninstall_prefix = "pip uninstall -y cut-cross-entropy && "

 print(
-    UNINSTALL_PREFIX
+    uninstall_prefix
    + 'pip install "cut-cross-entropy[transformers] @ git+https://github.com/apple/ml-cross-entropy.git@24fbe4b5dab9a6c250a014573613c1890190536c"'
 )
--- a/setup.py
+++ b/setup.py
@@ -128,7 +128,7 @@ setup(
            "flash-attn==2.7.4.post1",
        ],
        "deepspeed": [
-            "deepspeed==0.16.1",
+            "deepspeed==0.16.4",
            "deepspeed-kernels",
        ],
        "mamba-ssm": [
--- a/src/axolotl/cli/train.py
+++ b/src/axolotl/cli/train.py
@@ -1,6 +1,7 @@
 """CLI to run training on a model."""

 import logging
+import os
 from pathlib import Path
 from typing import Union

@@ -34,7 +35,8 @@ def do_train(cfg: DictDefault, cli_args: TrainerCliArgs) -> None:
    """
    print_axolotl_text_art()
    check_accelerate_default_config()
-    check_user_token()
+    if int(os.getenv("LOCAL_RANK", "0")) == 0:
+        check_user_token()

    if cfg.rl:
        dataset_meta = load_preference_datasets(cfg=cfg, cli_args=cli_args)
--- a/src/axolotl/core/datasets/chat.py
+++ b/src/axolotl/core/datasets/chat.py
@@ -43,7 +43,7 @@ class TokenizedChatDataset(Dataset):
        process_or_cpu_count: int = (
            process_count or os.cpu_count()  # type: ignore[assignment]
        )
-        num_proc = min(64, process_or_cpu_count)
+        num_proc = min(32, process_or_cpu_count)
        features = data.features.keys()
        tokenized_data = data.map(
            map_fn,
--- a/src/axolotl/train.py
+++ b/src/axolotl/train.py
@@ -7,7 +7,7 @@ import signal
 import sys
 import weakref
 from pathlib import Path
-from typing import Any
+from typing import Any, Dict

 import torch
 import transformers.modelcard
@@ -20,7 +20,7 @@ from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
 from transformers.trainer import Trainer

 from axolotl.common.datasets import TrainDatasetMeta
-from axolotl.contribs.lgpl.unsloth import (  # pylint: disable = no-name-in-module
+from axolotl.contribs.lgpl import (  # pylint: disable = no-name-in-module
    fix_untrained_tokens,
 )
 from axolotl.core.trainer_builder import HFCausalTrainerBuilder, HFRLTrainerBuilder
@@ -382,21 +382,23 @@ def handle_untrained_tokens_fix(
    if not cfg.fix_untrained_tokens:
        return

+    is_ds_zero3: bool = False
+    if os.environ.get("ACCELERATE_DEEPSPEED_ZERO_STAGE") == "3":
+        is_ds_zero3 = True
+
    # Check if the `token_ids_to_fix` kwarg exists in the fix_untrained_tokens args
    sig = inspect.signature(fix_untrained_tokens)

+    fix_kwargs: Dict[str, Any] = {}
    # If the function has the `token_ids_to_fix` arg, and fix_untrained_tokens is a list
    if "token_ids_to_fix" in sig.parameters and isinstance(
        cfg.fix_untrained_tokens, list
    ):
-        fix_untrained_tokens(
-            model,
-            tokenizer,
-            train_dataset,
-            token_ids_to_fix=cfg.fix_untrained_tokens,
-        )
-    else:
-        fix_untrained_tokens(model, tokenizer, train_dataset)
+        fix_kwargs["token_ids_to_fix"] = cfg.fix_untrained_tokens
+    if "is_ds_zero3" in sig.parameters:
+        fix_kwargs["is_ds_zero3"] = is_ds_zero3
+
+    fix_untrained_tokens(model, tokenizer, train_dataset, **fix_kwargs)

    if cfg.local_rank == 0:
        model.save_pretrained(
--- a/src/axolotl/utils/config/models/input/v0_4_1/init.py
+++ b/src/axolotl/utils/config/models/input/v0_4_1/init.py
@@ -1,4 +1,5 @@
 """Module with Pydantic models for configuration."""
+
 # pylint: disable=too-many-lines

 import logging
@@ -506,7 +507,7 @@ class HyperparametersConfig(BaseModel):
    weight_decay: Optional[float] = 0.0
    optimizer: Optional[
        Union[OptimizerNames, CustomSupportedOptimizers]
-    ] = OptimizerNames.ADAMW_HF
+    ] = OptimizerNames.ADAMW_TORCH_FUSED
    optim_args: Optional[Union[str, Dict[str, Any]]] = Field(
        default=None,
        json_schema_extra={"description": "Optional arguments to supply to optimizer."},
@@ -728,7 +729,7 @@ class AxolotlInputConfig(
        default=None,
        json_schema_extra={"description": "streaming dataset to use for pretraining"},
    )
-    dataset_processes: Optional[int] = Field(default=os.cpu_count())
+    dataset_processes: Optional[int] = Field(default=min(32, os.cpu_count()))  # type: ignore[type-var]
    dataset_exact_deduplication: Optional[bool] = None
    dataset_keep_in_memory: Optional[bool] = None
    dataloader_pin_memory: Optional[bool] = None
@@ -1827,6 +1828,14 @@ class AxolotlConfigWCapabilities(AxolotlInputConfig):
                data["torch_compile"] = False
        return data

+    @model_validator(mode="before")
+    @classmethod
+    def check_beta_and_trl_beta_match(cls, data):
+        if data.get("beta") and data.get("trl", {}).get("beta"):
+            if data["beta"] != data["trl"]["beta"]:
+                raise ValueError("beta and trl.beta must match or one must be removed")
+        return data
+

 def handle_legacy_message_fields_logic(data: dict) -> dict:
    """
--- a/src/axolotl/utils/data/sft.py
+++ b/src/axolotl/utils/data/sft.py
@@ -2,6 +2,7 @@

 import functools
 import logging
+import os
 from pathlib import Path
 from typing import List, Optional, Tuple, Union

@@ -344,6 +345,7 @@ def load_tokenized_prepared_datasets(
                )
                ds_from_iter.save_to_disk(str(prepared_ds_path))
            else:
+                os.makedirs(prepared_ds_path, exist_ok=True)
                dataset.save_to_disk(str(prepared_ds_path))
            if cfg.push_dataset_to_hub:
                LOG.info(
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -24,7 +24,6 @@ from peft import (
    PeftModelForCausalLM,
    prepare_model_for_kbit_training,
 )
-from peft.tuners.lora import QuantLinear
 from torch import nn
 from transformers import (  # noqa: F401
    AddedToken,
@@ -1360,7 +1359,7 @@ def load_llama_adapter(model, cfg):


 def find_all_linear_names(model):
-    cls = (bnb.nn.Linear4bit, bnb.nn.Linear8bitLt, torch.nn.Linear, QuantLinear)
+    cls = (bnb.nn.Linear4bit, bnb.nn.Linear8bitLt, torch.nn.Linear)
    lora_module_names = set()
    for name, module in model.named_modules():
        if (
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -108,6 +108,12 @@ def download_arcee_ai_distilabel_intel_orca_dpo_pairs_dataset():
    )


+@pytest.fixture(scope="session", autouse=True)
+def download_tiny_shakespeare_dataset():
+    # download the dataset
+    snapshot_download_w_retry("Trelis/tiny-shakespeare", repo_type="dataset")
+
+
@pytest.fixture
 def temp_dir():
    # Create a temporary directory
--- a/tests/e2e/multigpu/test_llama.py
+++ b/tests/e2e/multigpu/test_llama.py
@@ -750,3 +750,66 @@ class TestMultiGPULlama:
        check_tensorboard(
            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high"
        )
+
+    def test_fix_untrained_tokens(self, temp_dir):
+        # pylint: disable=duplicate-code
+        cfg = DictDefault(
+            {
+                "base_model": "HuggingFaceTB/SmolLM2-135M",
+                "fix_untrained_tokens": True,
+                "sequence_len": 512,
+                "val_set_size": 0.0,
+                "special_tokens": {
+                    "pad_token": "<|endoftext|>",
+                    "bos_token": "<|custom_im_start|>",
+                    "eos_token": "<|custom_im_end|>",
+                },
+                "datasets": [
+                    {
+                        "chat_template": "jinja",
+                        "chat_template_jinja": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|custom_im_start|>' + message['role'] + '\n' + message['content'] + '<|custom_im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|custom_im_start|>assistant\n' }}{% endif %}",
+                        "path": "mlabonne/FineTome-100k",
+                        "type": "chat_template",
+                        "split": "train[:10%]",
+                        "field_messages": "conversations",
+                        "message_field_role": "from",
+                        "message_field_content": "value",
+                    },
+                ],
+                "num_epochs": 1,
+                "max_steps": 5,
+                "micro_batch_size": 1,
+                "gradient_accumulation_steps": 1,
+                "output_dir": temp_dir,
+                "learning_rate": 0.00001,
+                "optimizer": "adamw_torch_fused",
+                "lr_scheduler": "cosine",
+                "flash_attention": True,
+                "sample_packing": True,
+                "bf16": True,
+                "save_safetensors": True,
+                "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero3_bf16.json"),
+                "use_tensorboard": True,
+            }
+        )
+
+        # write cfg to yaml file
+        Path(temp_dir).mkdir(parents=True, exist_ok=True)
+        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
+            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))
+
+        execute_subprocess_async(
+            [
+                "axolotl",
+                "train",
+                str(Path(temp_dir) / "config.yaml"),
+                "--num-processes",
+                "2",
+                "--main-process-port",
+                f"{get_torch_dist_unique_port()}",
+            ]
+        )
+
+        check_tensorboard(
+            temp_dir + "/runs", "train/train_loss", 4.0, "Train Loss is too high"
+        )
--- a/tests/e2e/solo/test_relora_llama.py
+++ b/tests/e2e/solo/test_relora_llama.py
@@ -40,8 +40,8 @@ class TestReLoraLlama(unittest.TestCase):
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_modules": ["q_proj", "v_proj"],
-                "relora_steps": 100,
-                "relora_warmup_steps": 20,
+                "relora_steps": 50,
+                "relora_warmup_steps": 10,
                "relora_anneal_steps": 10,
                "relora_prune_ratio": 0.9,
                "relora_cpu_offload": True,
@@ -60,9 +60,9 @@ class TestReLoraLlama(unittest.TestCase):
                        "message_field_content": "value",
                    },
                ],
-                "warmup_steps": 20,
+                "warmup_steps": 10,
                "num_epochs": 2,
-                "max_steps": 205,  # at least 2x relora_steps
+                "max_steps": 105,  # at least 2x relora_steps
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
--- a/tests/e2e/test_llama.py
+++ b/tests/e2e/test_llama.py
@@ -66,6 +66,54 @@ class TestLlama:
        check_model_output_exists(temp_dir, cfg)

    def test_fix_untrained_tokens(self, temp_dir):
+        # pylint: disable=duplicate-code
+        cfg = DictDefault(
+            {
+                "base_model": "HuggingFaceTB/SmolLM2-135M",
+                "fix_untrained_tokens": True,
+                "sequence_len": 512,
+                "val_set_size": 0.0,
+                "special_tokens": {
+                    "pad_token": "<|endoftext|>",
+                    "bos_token": "<|custom_im_start|>",
+                    "eos_token": "<|custom_im_end|>",
+                },
+                "datasets": [
+                    {
+                        "chat_template": "jinja",
+                        "chat_template_jinja": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|custom_im_start|>' + message['role'] + '\n' + message['content'] + '<|custom_im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|custom_im_start|>assistant\n' }}{% endif %}",
+                        "path": "mlabonne/FineTome-100k",
+                        "type": "chat_template",
+                        "split": "train[:10%]",
+                        "field_messages": "conversations",
+                        "message_field_role": "from",
+                        "message_field_content": "value",
+                    },
+                ],
+                "num_epochs": 1,
+                "max_steps": 5,
+                "micro_batch_size": 1,
+                "gradient_accumulation_steps": 1,
+                "output_dir": temp_dir,
+                "learning_rate": 0.00001,
+                "optimizer": "adamw_8bit",
+                "lr_scheduler": "cosine",
+                "flash_attention": True,
+                "sample_packing": True,
+                "bf16": True,
+                "save_safetensors": True,
+            }
+        )
+
+        cfg = validate_config(cfg)
+        normalize_config(cfg)
+        cli_args = TrainerCliArgs()
+        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
+
+        train(cfg=cfg, dataset_meta=dataset_meta)
+        check_model_output_exists(temp_dir, cfg)
+
+    def test_fix_untrained_tokens_already_trained(self, temp_dir):
        # pylint: disable=duplicate-code
        cfg = DictDefault(
            {
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -7,13 +7,13 @@ import tempfile
 import unittest
 from pathlib import Path

+from conftest import snapshot_download_w_retry
 from constants import (
    ALPACA_MESSAGES_CONFIG_OG,
    ALPACA_MESSAGES_CONFIG_REVISION,
    SPECIAL_TOKENS,
 )
 from datasets import Dataset
-from huggingface_hub import snapshot_download
 from transformers import AutoTokenizer

 from axolotl.utils.data import load_tokenized_prepared_datasets
@@ -69,7 +69,7 @@ class TestDatasetPreparation(unittest.TestCase):
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_ds_path = Path(tmp_dir) / "mhenrichsen/alpaca_2k_test"
            tmp_ds_path.mkdir(parents=True, exist_ok=True)
-            snapshot_download(
+            snapshot_download_w_retry(
                repo_id="mhenrichsen/alpaca_2k_test",
                repo_type="dataset",
                local_dir=tmp_ds_path,
@@ -81,7 +81,7 @@ class TestDatasetPreparation(unittest.TestCase):
            # how to load it.
            cfg = DictDefault(
                {
-                    "tokenizer_config": "huggyllama/llama-7b",
+                    "tokenizer_config": "HuggingFaceTB/SmolLM2-135M",
                    "sequence_len": 1024,
                    "datasets": [
                        {
@@ -339,7 +339,7 @@ class TestDatasetPreparation(unittest.TestCase):
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_ds_path = Path(tmp_dir) / "mhenrichsen/alpaca_2k_test"
            tmp_ds_path.mkdir(parents=True, exist_ok=True)
-            snapshot_download(
+            snapshot_download_w_retry(
                repo_id="mhenrichsen/alpaca_2k_test",
                repo_type="dataset",
                local_dir=tmp_ds_path,
@@ -381,7 +381,7 @@ class TestDatasetPreparation(unittest.TestCase):
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_ds_path = Path(tmp_dir) / "mhenrichsen/alpaca_2k_test"
            tmp_ds_path.mkdir(parents=True, exist_ok=True)
-            snapshot_download(
+            snapshot_download_w_retry(
                repo_id="mhenrichsen/alpaca_2k_test",
                repo_type="dataset",
                local_dir=tmp_ds_path,
Author	SHA1	Message	Date
Wing Lian	31799bdcc0	more parity across tests and docker images for packaging/setuptools	2025-03-21 08:56:01 -04:00
Wing Lian	25455ac25f	make sure packaging version is consistent	2025-03-21 08:27:17 -04:00
Wing Lian	edea25bd58	comment out license for validation for now	2025-03-21 08:20:28 -04:00
Wing Lian	42e32223c9	try rolling back packaging and setuptools versions	2025-03-21 08:12:07 -04:00
Wing Lian	6e0fed0ce7	use license instead of license-file	2025-03-21 07:25:09 -04:00
Wing Lian	5ece44b4a8	try with reversion of packaging/setuptools/wheel install	2025-03-21 07:19:12 -04:00
Wing Lian	e7532c9b0c	make sure ninja is installed	2025-03-21 06:57:06 -04:00
Wing Lian	2518a9b2a2	multiline fix	2025-03-20 20:51:16 -04:00
Wing Lian	faeae323cb	install deepspeed by itself	2025-03-20 20:04:39 -04:00
Wing Lian	bb683644c3	deepspeed binary fixes hopefully	2025-03-20 19:52:07 -04:00
Wing Lian	7009a48398	bump deepspeed and set no binary	2025-03-20 14:01:01 -04:00
Wing Lian	ee529e2354	use nightly	2025-03-20 11:24:30 -04:00
Wing Lian	b2976e64ec	add 12.8.1 cuda to the base matrix	2025-03-20 11:24:30 -04:00
Wing Lian	38df5a36ea	bump HF versions except for trl (#2427 )	2025-03-20 10:22:05 -04:00
Wing Lian	4d92a68a96	use default torch fused adamw optimizer as default as adamw_hf is deprecated (#2425 ) * use default torch fused adamw optimizer as default as adamw_hf is deprecated * make sure to have latest packaging installed * bump packagingin requirements.txt too	2025-03-19 23:58:33 -04:00
SicariusSicariiStuff	85147ec430	Update README.md (#2360 ) * Update README.md wheel is needed * feat: add ninja, setuptools, packing to installation steps * fix: add missing instruction --------- Co-authored-by: NanoCode012 <nano@axolotl.ai>	2025-03-17 08:39:17 -04:00
NanoCode012	51cd409488	Feat: minor docs improvements for RLHF and faq on embeddings (#2401 ) [skip ci] * feat: add doc on shrink_embeddings and custom calling * chore: rename inference doc * fix: clarify same config is used for all cli * chore: rearrange order inference qmd * feat: add simpo to doc * fix: update defaults * feat: add rl configs to doc * fix: ensure beta consistent with trl.beta * fix: clarify about lora/fft * chore: rename title * chore: fix language * feat: move config reference higher * Update docs/getting-started.qmd Co-authored-by: salman <salman.mohammadi@outlook.com> * Update docs/rlhf.qmd Co-authored-by: salman <salman.mohammadi@outlook.com> --------- Co-authored-by: salman <salman.mohammadi@outlook.com>	2025-03-17 08:39:04 -04:00
NanoCode012	7235123d44	chore(docs): add cookbook/blog link to docs (#2410 ) [skip ci]	2025-03-17 08:38:19 -04:00
Wing Lian	4f5eb42a73	remove reference to deprecated import (#2407 )	2025-03-15 08:49:41 -04:00
Wing Lian	fbe54be6b8	only validate hf user token on rank 0 (#2408 )	2025-03-13 23:29:06 -04:00
Wing Lian	04f6324833	build cloud images with torch 2.6.0 (#2413 ) * build cloud images with torch 2.6.0 * nightlies too	2025-03-13 23:28:51 -04:00
Wing Lian	f0072f3b9d	use max of 32 dataset processes if not explicit (#2403 ) * use max of 32 dataset processes if not explicit * change alternate min val for consistency	2025-03-11 12:02:58 -04:00
Wing Lian	59899b9817	pass additional info for fix untrained tokens when using distributed + offloading (#2388 ) * pass additional info for fix untrained tokens when using distributed + offloading * use latest version of vendored lib * use v0.0.5 of contribs lgpl * fix for no bad tokens and add tests * use release * add multigpu test too * make sure the multigpu zero3 test actually uses zero3	2025-03-11 12:02:43 -04:00