diff --git a/.bandit b/.bandit
index 82e88e814..b81428751 100644
--- a/.bandit
+++ b/.bandit
@@ -1,3 +1,3 @@
[bandit]
exclude = tests
-skips = B101,B615
+skips = B101,B615,B102,B110
diff --git a/.coderabbit.yaml b/.coderabbit.yaml
index 95c044f02..821d6bd5b 100644
--- a/.coderabbit.yaml
+++ b/.coderabbit.yaml
@@ -12,5 +12,6 @@ reviews:
auto_review:
enabled: true
drafts: false
+ auto_incremental_review: false
chat:
auto_reply: true
diff --git a/.flake8 b/.flake8
deleted file mode 100644
index fd69af775..000000000
--- a/.flake8
+++ /dev/null
@@ -1,5 +0,0 @@
-[flake8]
-max-line-length = 88
-
-select = C,E,F,W,B,B950
-extend-ignore = E203, E501, W503
diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md
index 8f67908e8..fcfd96891 100644
--- a/.github/CONTRIBUTING.md
+++ b/.github/CONTRIBUTING.md
@@ -57,6 +57,13 @@ We welcome ideas for improvements and new features. To suggest an enhancement, o
5. Push your branch to your fork on GitHub.
6. Open a new pull request against the `main` branch of the axolotl repository. Include a clear and concise description of your changes, referencing any related issues.
+#### Skipping CI Checks
+
+You can skip certain CI checks by including specific keywords in your commit messages:
+
+- `[skip ci]` - Skips all CI checks for that commit (the brackets are required)
+- `[skip-e2e]` - Skips only the end-to-end tests while still running the other CI checks. You may also include this token in the title or body of your PR to disable end-to-end tests for the entire PR.
+
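+For example, a hypothetical docs-only commit that skips the e2e suite:
+
+```bash
+git commit -m "docs: fix typo in README [skip-e2e]"
+```
+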
## Style Guidelines
### Code Style
diff --git a/.github/workflows/base.yml b/.github/workflows/base.yml
index 160ed7df9..87d6772dd 100644
--- a/.github/workflows/base.yml
+++ b/.github/workflows/base.yml
@@ -25,20 +25,6 @@ jobs:
fail-fast: false
matrix:
include:
- - cuda: "124"
- cuda_version: 12.4.1
- cudnn_version: ""
- python_version: "3.11"
- pytorch: 2.6.0
- torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
- dockerfile: "Dockerfile-base"
- - cuda: "126"
- cuda_version: 12.6.3
- cudnn_version: ""
- python_version: "3.11"
- pytorch: 2.6.0
- torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
- dockerfile: "Dockerfile-base"
- cuda: "126"
cuda_version: 12.6.3
cudnn_version: ""
@@ -67,6 +53,20 @@ jobs:
pytorch: 2.8.0
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
dockerfile: "Dockerfile-base"
+ - cuda: "128"
+ cuda_version: 12.8.1
+ cudnn_version: ""
+ python_version: "3.11"
+ pytorch: 2.9.0
+ torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
+ dockerfile: "Dockerfile-base"
+ - cuda: "130"
+ cuda_version: 13.0.0
+ cudnn_version: ""
+ python_version: "3.11"
+ pytorch: 2.9.0
+ torch_cuda_arch_list: "9.0+PTX"
+ dockerfile: "Dockerfile-base"
# - cuda: "128"
# cuda_version: 12.8.1
# cudnn_version: ""
@@ -122,13 +122,6 @@ jobs:
fail-fast: false
matrix:
include:
- - cuda: "126"
- cuda_version: 12.6.3
- cudnn_version: ""
- python_version: "3.11"
- pytorch: 2.6.0
- torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
- dockerfile: "Dockerfile-uv-base"
- cuda: "126"
cuda_version: 12.6.3
cudnn_version: ""
@@ -150,6 +143,20 @@ jobs:
pytorch: 2.8.0
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
dockerfile: "Dockerfile-uv-base"
+ - cuda: "128"
+ cuda_version: 12.8.1
+ cudnn_version: ""
+ python_version: "3.11"
+ pytorch: 2.9.0
+ torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
+ dockerfile: "Dockerfile-uv-base"
+ - cuda: "130"
+ cuda_version: 13.0.0
+ cudnn_version: ""
+ python_version: "3.11"
+ pytorch: 2.9.0
+ torch_cuda_arch_list: "9.0+PTX"
+ dockerfile: "Dockerfile-uv-base"
steps:
- name: Checkout
uses: actions/checkout@v4
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 3daf39e43..4040ccdc9 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -15,11 +15,6 @@ jobs:
fail-fast: false
matrix:
include:
- - cuda: 126
- cuda_version: 12.6.3
- python_version: "3.11"
- pytorch: 2.6.0
- axolotl_extras:
- cuda: 126
cuda_version: 12.6.3
python_version: "3.11"
@@ -36,6 +31,11 @@ jobs:
python_version: "3.11"
pytorch: 2.7.1
axolotl_extras:
+ - cuda: 128
+ cuda_version: 12.8.1
+ python_version: "3.11"
+ pytorch: 2.8.0
+ axolotl_extras:
runs-on: axolotl-gpu-runner
steps:
- name: Checkout
@@ -83,11 +83,6 @@ jobs:
strategy:
matrix:
include:
- - cuda: 126
- cuda_version: 12.6.3
- python_version: "3.11"
- pytorch: 2.6.0
- axolotl_extras:
- cuda: 126
cuda_version: 12.6.3
python_version: "3.11"
@@ -110,6 +105,11 @@ jobs:
python_version: "3.11"
pytorch: 2.7.1
axolotl_extras:
+ - cuda: 128
+ cuda_version: 12.8.1
+ python_version: "3.11"
+ pytorch: 2.8.0
+ axolotl_extras:
runs-on: axolotl-gpu-runner
steps:
- name: Checkout
@@ -152,11 +152,6 @@ jobs:
strategy:
matrix:
include:
- - cuda: 126
- cuda_version: 12.6.3
- python_version: "3.11"
- pytorch: 2.6.0
- axolotl_extras:
- cuda: 126
cuda_version: 12.6.3
python_version: "3.11"
@@ -169,6 +164,12 @@ jobs:
pytorch: 2.7.1
axolotl_extras: vllm
is_latest: true
+ - cuda: 128
+ cuda_version: 12.8.1
+ python_version: "3.11"
+ pytorch: 2.8.0
+ axolotl_extras:
+ is_latest:
runs-on: axolotl-gpu-runner
steps:
- name: Checkout
diff --git a/.github/workflows/multi-gpu-e2e.yml b/.github/workflows/multi-gpu-e2e.yml
index 308526151..1682beb31 100644
--- a/.github/workflows/multi-gpu-e2e.yml
+++ b/.github/workflows/multi-gpu-e2e.yml
@@ -26,20 +26,6 @@ jobs:
fail-fast: false
matrix:
include:
- - cuda: 126
- cuda_version: 12.6.3
- python_version: "3.11"
- pytorch: 2.6.0
- axolotl_extras:
- num_gpus: 2
- nightly_build: "true"
- - cuda: 126
- cuda_version: 12.6.3
- python_version: "3.11"
- pytorch: 2.7.0
- axolotl_extras:
- num_gpus: 2
- nightly_build: "true"
- cuda: 126
cuda_version: 12.6.3
python_version: "3.11"
@@ -47,6 +33,20 @@ jobs:
axolotl_extras: vllm
num_gpus: 2
nightly_build: "true"
+ - cuda: 128
+ cuda_version: 12.8.1
+ python_version: "3.11"
+ pytorch: 2.8.0
+ axolotl_extras: fbgemm-gpu
+ num_gpus: 2
+ nightly_build: "true"
+ - cuda: 128
+ cuda_version: 12.8.1
+ python_version: "3.11"
+ pytorch: 2.9.0
+ axolotl_extras: fbgemm-gpu
+ num_gpus: 2
+ nightly_build: "true"
runs-on: [self-hosted, modal]
timeout-minutes: 120
steps:
diff --git a/.github/workflows/nightlies.yml b/.github/workflows/nightlies.yml
index 49bce470b..18b036a0d 100644
--- a/.github/workflows/nightlies.yml
+++ b/.github/workflows/nightlies.yml
@@ -12,16 +12,16 @@ jobs:
fail-fast: false
matrix:
include:
- - cuda: 126
- cuda_version: 12.6.3
- python_version: "3.11"
- pytorch: 2.6.0
- axolotl_extras:
- cuda: 126
cuda_version: 12.6.3
python_version: "3.11"
pytorch: 2.7.1
axolotl_extras:
+ - cuda: 128
+ cuda_version: 12.8.1
+ python_version: "3.11"
+ pytorch: 2.8.0
+ axolotl_extras:
runs-on: axolotl-gpu-runner
steps:
- name: Checkout
@@ -65,16 +65,16 @@ jobs:
strategy:
matrix:
include:
- - cuda: 126
- cuda_version: 12.6.3
- python_version: "3.11"
- pytorch: 2.6.0
- axolotl_extras:
- cuda: 126
cuda_version: 12.6.3
python_version: "3.11"
pytorch: 2.7.1
axolotl_extras:
+ - cuda: 128
+ cuda_version: 12.8.1
+ python_version: "3.11"
+ pytorch: 2.8.0
+ axolotl_extras:
runs-on: axolotl-gpu-runner
steps:
- name: Checkout
diff --git a/.github/workflows/precommit-autoupdate.yml b/.github/workflows/precommit-autoupdate.yml
index 10330f955..4c2e59b6b 100644
--- a/.github/workflows/precommit-autoupdate.yml
+++ b/.github/workflows/precommit-autoupdate.yml
@@ -2,7 +2,7 @@ name: Pre-commit auto-update
on:
schedule:
- - cron: '0 0 * * 0' # Run weekly
+ - cron: '0 0 1 * *' # Run monthly
workflow_dispatch: # Manual kickoff
jobs:
diff --git a/.github/workflows/tests-nightly.yml b/.github/workflows/tests-nightly.yml
index fc6c2b396..35cb707eb 100644
--- a/.github/workflows/tests-nightly.yml
+++ b/.github/workflows/tests-nightly.yml
@@ -26,7 +26,7 @@ jobs:
max-parallel: 2
matrix:
python_version: ["3.11"]
- pytorch_version: ["2.6.0", "2.7.0"]
+ pytorch_version: ["2.7.1", "2.8.0"]
timeout-minutes: 20
steps:
@@ -102,14 +102,14 @@ jobs:
- cuda: 126
cuda_version: 12.6.3
python_version: "3.11"
- pytorch: 2.6.0
+ pytorch: 2.7.1
num_gpus: 1
axolotl_extras:
nightly_build: "true"
- - cuda: 126
- cuda_version: 12.6.3
+ - cuda: 128
+ cuda_version: 12.8.1
python_version: "3.11"
- pytorch: 2.7.1
+ pytorch: 2.8.0
num_gpus: 1
axolotl_extras:
nightly_build: "true"
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 912b3f1d6..7ad9d1ab4 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -55,7 +55,7 @@ jobs:
fail-fast: false
matrix:
python_version: ["3.11"]
- pytorch_version: ["2.6.0", "2.7.0", "2.7.1"]
+ pytorch_version: ["2.7.1", "2.8.0", "2.9.0"]
timeout-minutes: 20
steps:
@@ -81,12 +81,12 @@ jobs:
- name: Install PyTorch
run: |
- pip3 install torch==${{ matrix.pytorch_version }} torchvision
+ pip3 install --no-cache-dir torch==${{ matrix.pytorch_version }} torchvision
- name: Install dependencies
run: |
pip3 show torch
- pip3 install --no-build-isolation -U -e .
+ pip3 install --no-cache-dir --no-build-isolation -U -e .
python scripts/unsloth_install.py | sh
python scripts/cutcrossentropy_install.py | sh
pip3 install -r requirements-dev.txt -r requirements-tests.txt
@@ -130,7 +130,7 @@ jobs:
fail-fast: false
matrix:
python_version: ["3.11"]
- pytorch_version: ["2.6.0", "2.7.0", "2.7.1"]
+ pytorch_version: ["2.7.1", "2.8.0", "2.9.0"]
timeout-minutes: 20
steps:
@@ -152,17 +152,17 @@ jobs:
- name: upgrade pip
run: |
pip3 install --upgrade pip
- pip3 install --upgrade packaging==23.2 setuptools==75.8.0 setuptools_scm build wheel
+ pip3 install --upgrade packaging==23.2 setuptools==75.8.0 setuptools_scm build wheel psutil
- name: Install PyTorch
run: |
- pip3 install torch==${{ matrix.pytorch_version }} torchvision
+ pip3 install --no-cache-dir torch==${{ matrix.pytorch_version }} torchvision
- name: Install dependencies
run: |
pip3 show torch
python -m build --no-isolation --sdist
- pip3 install --no-build-isolation dist/axolotl*.tar.gz
+ pip3 install --no-cache-dir --no-build-isolation dist/axolotl*.tar.gz
python scripts/unsloth_install.py | sh
python scripts/cutcrossentropy_install.py | sh
pip3 install -r requirements-dev.txt -r requirements-tests.txt
@@ -188,28 +188,53 @@ jobs:
run: |
find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
+ gate-skip-e2e:
+ needs: [pre-commit, pytest, pytest-sdist]
+ runs-on: ubuntu-latest
+ outputs:
+ skip: ${{ steps.compute.outputs.skip }}
+ steps:
+ - uses: actions/github-script@v7
+ id: compute
+ with:
+ script: |
+ const token = /\[skip-e2e\]/i;
+ let msg = '';
+ if (context.eventName === 'push') {
+ msg = context.payload.head_commit?.message || '';
+ } else if (context.eventName === 'pull_request') {
+ const { owner, repo } = context.repo;
+ const prNumber = context.payload.pull_request.number;
+ const commits = await github.paginate(
+ github.rest.pulls.listCommits,
+ { owner, repo, pull_number: prNumber, per_page: 100 }
+ );
+ msg = commits.at(-1)?.commit?.message || '';
+ }
+ const title = context.payload.pull_request?.title || '';
+ const body = context.payload.pull_request?.body || '';
+ const skip = token.test(msg) || token.test(title) || token.test(body);
+ core.setOutput('skip', String(skip));
+
docker-e2e-tests-1st:
# Run this job first as a gate for running the remainder of the test matrix
- if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' && !github.event.pull_request.draft }}
+ if: >
+ github.repository_owner == 'axolotl-ai-cloud' &&
+ (github.event_name != 'pull_request' || !github.event.pull_request.draft) &&
+ needs.gate-skip-e2e.outputs.skip != 'true'
# this job needs to be run on self-hosted GPU runners...
runs-on: [self-hosted, modal]
timeout-minutes: 120
- needs: [pre-commit, pytest, pytest-sdist]
+ needs: [pre-commit, pytest, pytest-sdist, gate-skip-e2e]
strategy:
fail-fast: false
matrix:
include:
- - cuda: 126
- cuda_version: 12.6.3
+ - cuda: 128
+ cuda_version: 12.8.1
python_version: "3.11"
- pytorch: 2.7.1
- num_gpus: 1
- axolotl_extras:
- - cuda: 126
- cuda_version: 12.6.3
- python_version: "3.11"
- pytorch: 2.6.0
+ pytorch: 2.8.0
num_gpus: 1
axolotl_extras:
dockerfile: "Dockerfile-uv.jinja"
@@ -240,13 +265,16 @@ jobs:
modal run cicd.e2e_tests
docker-e2e-tests:
- if: ${{ github.repository_owner == 'axolotl-ai-cloud' && !github.event.pull_request.draft }}
+ if: >
+ github.repository_owner == 'axolotl-ai-cloud' &&
+ (github.event_name != 'pull_request' || !github.event.pull_request.draft) &&
+ needs.gate-skip-e2e.outputs.skip != 'true'
# this job needs to be run on self-hosted GPU runners...
runs-on: [self-hosted, modal]
timeout-minutes: 120
# Only run the remainder of the matrix if the first e2e check passed;
# this is to save on wasted compute costs for known failures that get caught in the first run
- needs: [pre-commit, pytest, docker-e2e-tests-1st]
+ needs: [pre-commit, pytest, gate-skip-e2e, docker-e2e-tests-1st]
strategy:
fail-fast: false
@@ -255,13 +283,26 @@ jobs:
- cuda: 126
cuda_version: 12.6.3
python_version: "3.11"
- pytorch: 2.6.0
+ pytorch: 2.7.1
num_gpus: 1
axolotl_extras:
+# - cuda: 128
+# cuda_version: 12.8.1
+# python_version: "3.11"
+# pytorch: 2.7.1
+# num_gpus: 1
+# axolotl_extras:
- cuda: 128
cuda_version: 12.8.1
python_version: "3.11"
- pytorch: 2.7.1
+ pytorch: 2.8.0
+ num_gpus: 1
+ gpu_type: "B200"
+ axolotl_extras: fbgemm-gpu
+ - cuda: 128
+ cuda_version: 12.8.1
+ python_version: "3.11"
+ pytorch: 2.9.0
num_gpus: 1
axolotl_extras:
steps:
@@ -284,6 +325,7 @@ jobs:
echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
+ echo "GPU_TYPE=${{ matrix.gpu_type || 'L40S'}}" >> $GITHUB_ENV
echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
- name: Run tests job on Modal
@@ -300,10 +342,10 @@ jobs:
fail-fast: false
matrix:
include:
- - cuda: 124
- cuda_version: 12.4.1
+ - cuda: 126
+ cuda_version: 12.6.3
python_version: "3.11"
- pytorch: 2.6.0
+ pytorch: 2.7.1
num_gpus: 1
axolotl_extras:
steps:
diff --git a/.gitignore b/.gitignore
index 40084b408..b75becc7c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -190,3 +190,6 @@ out/
# vim
*.swp
+
+# scm auto-versioning
+src/axolotl/_version.py
diff --git a/.isort.cfg b/.isort.cfg
deleted file mode 100644
index bf9afe319..000000000
--- a/.isort.cfg
+++ /dev/null
@@ -1,4 +0,0 @@
-[settings]
-profile=black
-known_third_party=wandb,comet_ml
-known_local_folder=src,tests
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 4c9268529..86d8927d2 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -10,24 +10,14 @@ repos:
- id: trailing-whitespace
- id: no-commit-to-branch
args: ['--branch', 'main']
-- repo: https://github.com/psf/black
- rev: 25.1.0
+- repo: https://github.com/astral-sh/ruff-pre-commit
+ rev: v0.14.3
hooks:
- - id: black
-- repo: https://github.com/pycqa/isort
- rev: 6.0.1
- hooks:
- - id: isort
-- repo: https://github.com/PyCQA/flake8
- rev: 7.3.0
- hooks:
- - id: flake8
-- repo: https://github.com/pylint-dev/pylint
- rev: v3.3.8
- hooks:
- - id: pylint
+ - id: ruff
+ args: [--fix]
+ - id: ruff-format
- repo: https://github.com/pre-commit/mirrors-mypy
- rev: v1.17.1
+ rev: v1.18.2
hooks:
- id: mypy
additional_dependencies:
diff --git a/.pylintrc b/.pylintrc
deleted file mode 100644
index 208dd32b6..000000000
--- a/.pylintrc
+++ /dev/null
@@ -1,15 +0,0 @@
-[MASTER]
-init-hook="from pylint.config import find_default_config_files; import sys; sys.path.append(next(find_default_config_files()).parent.as_posix())"
-
-[TYPECHECK]
-
-# List of members which are set dynamically and missed by Pylint inference
-# system, and so shouldn't trigger E1101 when accessed.
-generated-members=numpy.*, torch.*
-
-
-[pylint.messages_control]
-disable=missing-function-docstring, line-too-long, import-error,
- too-many-arguments, too-many-locals, too-many-statements, too-many-branches, too-few-public-methods,
- too-many-instance-attributes, fixme, import-outside-toplevel, logging-fstring-interpolation,
- too-many-positional-arguments, possibly-used-before-assignment
diff --git a/CITATION.cff b/CITATION.cff
index e6ecc7cb8..7bbfeec64 100644
--- a/CITATION.cff
+++ b/CITATION.cff
@@ -1,6 +1,6 @@
cff-version: 1.2.0
type: software
-title: "Axolotl: Post-Training for AI Models"
+title: "Axolotl: Open Source LLM Post-Training"
message: "If you use this software, please cite it as below."
authors:
- name: "Axolotl maintainers and contributors"
diff --git a/README.md b/README.md
index 117eb9b12..6313a73ca 100644
--- a/README.md
+++ b/README.md
@@ -5,6 +5,9 @@
+ A Free and Open Source LLM Fine-tuning Framework
+
"
- ],
- "text/plain": [
- " "
+ ],
+ "text/plain": [
+ "
@@ -17,6 +20,7 @@
+
@@ -49,26 +53,31 @@
## ✨ Overview
-Axolotl is a tool designed to streamline post-training for various AI models.
+Axolotl is a free and open-source tool designed to streamline post-training and fine-tuning for the latest large language models (LLMs).
Features:
-- **Multiple Model Support**: Train various models like LLaMA, Mistral, Mixtral, Pythia, and more. We are compatible with HuggingFace transformers causal language models.
-- **Training Methods**: Full fine-tuning, LoRA, QLoRA, GPTQ, QAT, Preference Tuning (DPO, IPO, KTO, ORPO), RL (GRPO), Multimodal, and Reward Modelling (RM) / Process Reward Modelling (PRM).
-- **Easy Configuration**: Re-use a single YAML file between dataset preprocess, training, evaluation, quantization, and inference.
+- **Multiple Model Support**: Train various models like GPT-OSS, LLaMA, Mistral, Mixtral, Pythia, and many more models available on the Hugging Face Hub.
+- **Multimodal Training**: Fine-tune vision-language models (VLMs) including LLaMA-Vision, Qwen2-VL, Pixtral, LLaVA, SmolVLM2, and audio models like Voxtral with image, video, and audio support.
+- **Training Methods**: Full fine-tuning, LoRA, QLoRA, GPTQ, QAT, Preference Tuning (DPO, IPO, KTO, ORPO), RL (GRPO), and Reward Modelling (RM) / Process Reward Modelling (PRM).
+- **Easy Configuration**: Re-use a single YAML configuration file across the full fine-tuning pipeline: dataset preprocessing, training, evaluation, quantization, and inference.
- **Performance Optimizations**: [Multipacking](https://docs.axolotl.ai/docs/multipack.html), [Flash Attention](https://github.com/Dao-AILab/flash-attention), [Xformers](https://github.com/facebookresearch/xformers), [Flex Attention](https://pytorch.org/blog/flexattention/), [Liger Kernel](https://github.com/linkedin/Liger-Kernel), [Cut Cross Entropy](https://github.com/apple/ml-cross-entropy/tree/main), [Sequence Parallelism (SP)](https://docs.axolotl.ai/docs/sequence_parallelism.html), [LoRA optimizations](https://docs.axolotl.ai/docs/lora_optims.html), [Multi-GPU training (FSDP1, FSDP2, DeepSpeed)](https://docs.axolotl.ai/docs/multi-gpu.html), [Multi-node training (Torchrun, Ray)](https://docs.axolotl.ai/docs/multi-node.html), and many more!
- **Flexible Dataset Handling**: Load from local, HuggingFace, and cloud (S3, Azure, GCP, OCI) datasets.
- **Cloud Ready**: We ship [Docker images](https://hub.docker.com/u/axolotlai) and also [PyPI packages](https://pypi.org/project/axolotl/) for use on cloud platforms and local hardware.
-## 🚀 Quick Start
+## 🚀 Quick Start - LLM Fine-tuning in Minutes
**Requirements**:
- NVIDIA GPU (Ampere or newer for `bf16` and Flash Attention) or AMD GPU
- Python 3.11
-- PyTorch ≥2.6.0
+- PyTorch ≥2.7.1
+
+### Google Colab
+
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/axolotl-ai-cloud/axolotl/blob/main/examples/colab-notebooks/colab-axolotl-example.ipynb#scrollTo=msOCO4NRmRLa)
### Installation
@@ -155,7 +164,7 @@ If you use Axolotl in your research or projects, please cite it as follows:
```bibtex
@software{axolotl,
- title = {Axolotl: Post-Training for AI Models},
+ title = {Axolotl: Open Source LLM Post-Training},
author = {{Axolotl maintainers and contributors}},
url = {https://github.com/axolotl-ai-cloud/axolotl},
license = {Apache-2.0},
diff --git a/TODO.md b/TODO.md
deleted file mode 100644
index 2002bbbaf..000000000
--- a/TODO.md
+++ /dev/null
@@ -1,10 +0,0 @@
-# todo list
-
-- [] Validation of parameters for combinations that won't work
-
-
-
-## things that are known not to work
-
-- FSDP offload and gradient_checkpointing - https://github.com/pytorch/pytorch/issues/82203
-- adamw_bnb_8bit doesn't play well with FSDP offload
diff --git a/_quarto.yml b/_quarto.yml
index 934d393cb..fad3f6786 100644
--- a/_quarto.yml
+++ b/_quarto.yml
@@ -153,7 +153,7 @@ quartodoc:
- utils.distributed
- utils.dict
- utils.optimizers.adopt
- - utils.data.pretraining
+ - utils.data.streaming
- utils.data.sft
- utils.quantization
- title: Schemas
@@ -267,11 +267,13 @@ website:
- docs/dataset_loading.qmd
- docs/qat.qmd
- docs/quantize.qmd
+ - docs/optimizations.qmd
- section: "Core Concepts"
contents:
- docs/batch_vs_grad.qmd
- docs/dataset_preprocessing.qmd
+ - docs/streaming.qmd
- docs/multipack.qmd
- docs/mixed_precision.qmd
- docs/optimizers.qmd
diff --git a/cicd/Dockerfile-uv.jinja b/cicd/Dockerfile-uv.jinja
index 860386187..6a4d8a7d3 100644
--- a/cicd/Dockerfile-uv.jinja
+++ b/cicd/Dockerfile-uv.jinja
@@ -32,6 +32,7 @@ RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
fi
RUN uv pip install packaging==23.2 setuptools==75.8.0
+RUN uv pip install torchvision
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
uv pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
else \
diff --git a/cicd/Dockerfile.jinja b/cicd/Dockerfile.jinja
index 94c9a67e3..81ed5453e 100644
--- a/cicd/Dockerfile.jinja
+++ b/cicd/Dockerfile.jinja
@@ -1,6 +1,6 @@
FROM axolotlai/axolotl-base:{{ BASE_TAG }}
-ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
+ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
ENV AXOLOTL_EXTRAS="{{ AXOLOTL_EXTRAS }}"
ENV AXOLOTL_ARGS="{{ AXOLOTL_ARGS }}"
ENV CUDA="{{ CUDA }}"
@@ -9,7 +9,7 @@ ENV GITHUB_REF="{{ GITHUB_REF }}"
ENV GITHUB_SHA="{{ GITHUB_SHA }}"
ENV NIGHTLY_BUILD="{{ NIGHTLY_BUILD }}"
ENV HF_HOME="{{ HF_HOME }}"
-ENV AXOLOTL_DATASET_PROCESSES="8"
+ENV AXOLOTL_DATASET_NUM_PROC="8"
RUN apt-get update && \
apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev ibverbs-providers ibverbs-utils infiniband-diags librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm
@@ -32,7 +32,7 @@ RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
fi
-RUN pip install packaging==23.2 setuptools==75.8.0
+RUN pip install packaging==23.2 setuptools==75.8.0 psutil
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
else \
diff --git a/cicd/multigpu.py b/cicd/multigpu.py
index 2c067f143..5bd8d3c04 100644
--- a/cicd/multigpu.py
+++ b/cicd/multigpu.py
@@ -2,8 +2,6 @@
modal application to run axolotl gpu tests in Modal
"""
-# pylint: disable=duplicate-code
-
import os
import pathlib
import tempfile
@@ -63,7 +61,7 @@ def run_cmd(cmd: str, run_folder: str):
# Propagate errors from subprocess.
if exit_code := subprocess.call(cmd.split(), cwd=run_folder): # nosec
- exit(exit_code) # pylint: disable=consider-using-sys-exit
+ exit(exit_code)
@app.function(
diff --git a/cicd/single_gpu.py b/cicd/single_gpu.py
index eb34e1748..cd73f60b8 100644
--- a/cicd/single_gpu.py
+++ b/cicd/single_gpu.py
@@ -1,7 +1,5 @@
"""Modal app to run axolotl GPU tests"""
-# pylint: disable=duplicate-code
-
import os
import pathlib
import tempfile
@@ -59,15 +57,21 @@ VOLUME_CONFIG = {
}
N_GPUS = int(os.environ.get("N_GPUS", 1))
-GPU_CONFIG = f"L40S:{N_GPUS}"
+GPU_TYPE = os.environ.get("GPU_TYPE", "L40S")
+GPU_CONFIG = f"{GPU_TYPE}:{N_GPUS}"
def run_cmd(cmd: str, run_folder: str):
import subprocess # nosec
sp_env = os.environ.copy()
- sp_env["AXOLOTL_DATASET_PROCESSES"] = "8"
+ sp_env["AXOLOTL_DATASET_NUM_PROC"] = "8"
# Propagate errors from subprocess.
- if exit_code := subprocess.call(cmd.split(), cwd=run_folder, env=sp_env): # nosec
- exit(exit_code) # pylint: disable=consider-using-sys-exit
+    try:
+        exit_code = subprocess.call(cmd.split(), cwd=run_folder, env=sp_env)  # nosec
+        if exit_code:
+            print(f"Command '{cmd}' failed with exit code {exit_code}")
+        return exit_code
+    except Exception as e:
+        print(f"Command '{cmd}' failed with exception {e}")
+        return 1
diff --git a/codecov.yml b/codecov.yml
index 28921f9be..fa3ad3073 100644
--- a/codecov.yml
+++ b/codecov.yml
@@ -12,7 +12,7 @@ coverage:
default:
# basic
target: auto
- threshold: 0%
+ threshold: 1%
base: auto
# advanced
branches: null
@@ -27,7 +27,7 @@ coverage:
default:
# basic
target: auto
- threshold: 0%
+ threshold: 1%
base: auto
# advanced
branches: null
diff --git a/devtools/dev_chat_template.yml b/devtools/dev_chat_template.yml
index 27dc9be1a..32d5e56a0 100644
--- a/devtools/dev_chat_template.yml
+++ b/devtools/dev_chat_template.yml
@@ -13,7 +13,7 @@ datasets:
val_set_size: 0
output_dir: temp_debug/axolotl_outputs/model
dataset_prepared_path: temp_debug/axolotl_outputs/data
-dataset_processes: 1
+dataset_num_proc: 1
sequence_len: 4096
sample_packing: false
diff --git a/docker/Dockerfile-base b/docker/Dockerfile-base
index 0434a583f..25eae4fde 100644
--- a/docker/Dockerfile-base
+++ b/docker/Dockerfile-base
@@ -35,18 +35,24 @@ ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
WORKDIR /workspace
-RUN python3 -m pip install --upgrade pip && pip3 install -U packaging==23.2 setuptools==75.8.0 wheel && \
+RUN python3 -m pip install --upgrade pip && pip3 install -U packaging==23.2 setuptools==75.8.0 wheel psutil && \
python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} torchvision --extra-index-url https://download.pytorch.org/whl/cu$CUDA && \
- python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \
- python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" && \
python3 -m pip cache purge
+RUN if [ "$CUDA" != "130" ] ; then \
+ CAUSAL_CONV1D_FORCE_CXX11_ABI=TRUE CAUSAL_CONV1D_FORCE_BUILD=TRUE python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@v1.5.4"; \
+ python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main"; \
+ python3 -m pip cache purge; \
+ fi
+
RUN git lfs install --skip-repo && \
pip3 install awscli && \
# The base image ships with `pydantic==1.8.2` which is not working
pip3 install -U --no-cache-dir pydantic==1.10.10 && \
pip3 cache purge
-RUN if [ "$PYTORCH_VERSION" = "2.6.0" ] && [ "$CUDA" = "124" ] ; then \
- FLASH_ATTENTION_FORCE_BUILD="TRUE" pip3 install --no-build-isolation flash-attn==2.8.0.post2; \
+RUN if [ "$PYTORCH_VERSION" = "2.9.0" ] && [ "$CUDA" = "128" ] ; then \
+ wget https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.4.17/flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \
+ pip3 install --no-cache-dir flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \
+ rm flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \
fi
diff --git a/docker/Dockerfile-uv-base b/docker/Dockerfile-uv-base
index 4b08e55f8..2ca272c6e 100644
--- a/docker/Dockerfile-uv-base
+++ b/docker/Dockerfile-uv-base
@@ -30,7 +30,13 @@ RUN uv venv --no-project --relocatable axolotl-venv
ENV PATH="/workspace/axolotl-venv/bin:${PATH}"
RUN uv pip install packaging setuptools wheel psutil \
- && uv pip install torch==${PYTORCH_VERSION} \
+ && uv pip install torch==${PYTORCH_VERSION} torchvision \
&& uv pip install --no-build-isolation "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" \
&& uv pip install "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" \
&& uv pip install awscli pydantic
+
+RUN if [ "$PYTORCH_VERSION" = "2.9.0" ] && [ "$CUDA" = "128" ] ; then \
+ wget https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.4.17/flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \
+ uv pip install --no-cache-dir flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \
+ rm flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \
+ fi
diff --git a/docs/dataset-formats/conversation.qmd b/docs/dataset-formats/conversation.qmd
index d53c68598..870a2b67d 100644
--- a/docs/dataset-formats/conversation.qmd
+++ b/docs/dataset-formats/conversation.qmd
@@ -212,6 +212,14 @@ Instead of passing `tools` via the system prompt, an alternative method would be
Tools need to follow [JSON schema](https://json-schema.org/learn/getting-started-step-by-step).
:::
+::: {.callout-warning}
+If you have tool arguments that share a name but have different dtypes (like `"time": string` and `"time": number`), please save `arguments` as a JSON string to prevent `datasets` from hitting casting issues.
+
+```
+"arguments": "{\"...\": \"...\"}"
+```
+:::
+
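+For example, a hypothetical assistant turn with `arguments` pre-serialized as a JSON string (the tool name and fields here are invented for illustration):
+
+```json
+{
+  "role": "assistant",
+  "tool_calls": [
+    {
+      "type": "function",
+      "function": {
+        "name": "get_time",
+        "arguments": "{\"timezone\": \"UTC\", \"time\": \"18:00\"}"
+      }
+    }
+  ]
+}
+```
+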
Example config for Llama4:
```yaml
chat_template: llama4
diff --git a/docs/dataset-formats/index.qmd b/docs/dataset-formats/index.qmd
index a0113db07..715e3ef20 100644
--- a/docs/dataset-formats/index.qmd
+++ b/docs/dataset-formats/index.qmd
@@ -61,7 +61,7 @@ While we recommend `.jsonl`, you can also use the other formats (`csv`, `parquet
### Pre-training without streaming
-On the rare case that the dataset is small and can be loaded entirely into memory, another approach to running pre-training is to use the `completion` format. This would mean that the entire dataset is pre-tokenized instead of on-demand in streaming.
+If the dataset is small and can be loaded entirely into memory, another approach to pre-training is to use the `completion` format. This means the entire dataset is pre-tokenized up front rather than on demand while streaming.
One benefit of this is that the tokenization can be performed separately on a CPU-only machine, and then transferred to a GPU machine for training to save costs.
diff --git a/docs/debugging.qmd b/docs/debugging.qmd
index bf3c6fe7e..04b4faa64 100644
--- a/docs/debugging.qmd
+++ b/docs/debugging.qmd
@@ -29,7 +29,7 @@ While debugging it's helpful to simplify your test scenario as much as possible.
1. **Make sure you are using the latest version of axolotl**: This project changes often and bugs get fixed fast. Check your git branch and make sure you have pulled the latest changes from `main`.
1. **Eliminate concurrency**: Restrict the number of processes to 1 for both training and data preprocessing:
- Set `CUDA_VISIBLE_DEVICES` to a single GPU, ex: `export CUDA_VISIBLE_DEVICES=0`.
- - Set `dataset_processes: 1` in your axolotl config or run the training command with `--dataset_processes=1`.
+ - Set `dataset_num_proc: 1` in your axolotl config or run the training command with `--dataset_num_proc=1`.
2. **Use a small dataset**: Construct or use a small dataset from HF Hub. When using a small dataset, you will often have to make sure `sample_packing: False` and `eval_sample_packing: False` to avoid errors. If you are in a pinch and don't have time to construct a small dataset but want to use from the HF Hub, you can shard the data (this will still tokenize the entire dataset, but will only use a fraction of the data for training. For example, to shard the dataset into 20 pieces, add the following to your axolotl config):
```yaml
@@ -101,7 +101,7 @@ For example, to mimic the command `cd devtools && CUDA_VISIBLE_DEVICES=0 acceler
"-m", "axolotl.cli.train", "dev_chat_template.yml",
// The flags below simplify debugging by overriding the axolotl config
// with the debugging tips above. Modify as needed.
- "--dataset_processes=1", // limits data preprocessing to one process
+ "--dataset_num_proc=1", // limits data preprocessing to one process
"--max_steps=1", // limits training to just one step
"--batch_size=1", // minimizes batch size
"--micro_batch_size=1", // minimizes batch size
diff --git a/docs/faq.qmd b/docs/faq.qmd
index 08d439af7..92b432f2d 100644
--- a/docs/faq.qmd
+++ b/docs/faq.qmd
@@ -63,6 +63,14 @@ description: Frequently asked questions
> A: There seems to be a wheel issue with FA2 2.8.0 on CUDA 12.4. Try CUDA 12.6 instead or downgrade to FA2 2.7.4. Please refer to the upstream issue: https://github.com/Dao-AILab/flash-attention/issues/1717.
+**Q: Can we mix text and text+image datasets for VLM training?**
+
+> A: Yes, you can for newer VLM architectures. The ones that do not work are LLaVA / Pixtral-style architectures. If you notice one not working, please let us know!
+
+**Q: Why is `memory/max_*` different from `nvidia-smi`?**
+
+> A: We use `torch` APIs to retrieve this information. You can see https://docs.pytorch.org/docs/stable/notes/cuda.html#cuda-memory-management for more information.
+
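+For example, the peak allocated memory that torch reports can be inspected directly (a minimal sketch):
+
+```python
+import torch
+
+# Peak memory allocated by tensors, in GiB; this is what the torch allocator tracks,
+# which is smaller than the total reserved memory that nvidia-smi shows.
+print(torch.cuda.max_memory_allocated() / 1024**3)
+```
+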
### Chat templates
**Q: `jinja2.exceptions.UndefinedError: 'dict object' has no attribute 'content' / 'role' / ____`**
@@ -140,3 +148,7 @@ description: Frequently asked questions
**Q: `ValueError("Backward pass should have cleared tracker of all tensors")`
> A: This may happen due to edge cases in using the modern OffloadActivations context manager for CUDA streams. If you encounter this error, you may have success using the naive implementation with `offload_activations: legacy` in your YAML.
+
+**Q: `Error parsing tool_calls arguments as JSON.`**
+
+> A: This means a tool call's `arguments` string could not be parsed into a dict. Please check your dataset and the error message for more details.
diff --git a/docs/fsdp_qlora.qmd b/docs/fsdp_qlora.qmd
index 2f1b0358f..01f57e627 100644
--- a/docs/fsdp_qlora.qmd
+++ b/docs/fsdp_qlora.qmd
@@ -1,5 +1,5 @@
---
-title: "FDSP + QLoRA"
+title: "FSDP + QLoRA"
description: Use FSDP with QLoRA to fine-tune large LLMs on consumer GPUs.
format:
html:
@@ -23,6 +23,12 @@ To enable `QLoRA` with `FSDP`, you need to perform the following steps:
2. Enable FSDP in your axolotl config, as [described here](multi-gpu.qmd#sec-fsdp).
3. Use one of the supported model types: `llama`, `mistral` or `mixtral`.
+## Enabling Swap for FSDP2
+
+If available memory is insufficient even after FSDP's CPU offloading, you can enable the use of swap memory by setting `cpu_offload_pin_memory: false` alongside `offload_params: true` in your FSDP config.
+
+This disables memory pinning, allowing FSDP to fall back to disk swap space. Disabling memory pinning itself incurs a performance overhead, and actually having to use swap adds more, but it may enable training larger models that would otherwise hit OOM errors on resource-constrained systems.
+
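+A minimal sketch of the relevant options, assuming the `fsdp_version` / `fsdp_config` keys described in the [multi-gpu docs](multi-gpu.qmd):
+
+```yaml
+fsdp_version: 2
+fsdp_config:
+  offload_params: true
+  cpu_offload_pin_memory: false  # allows swap as a fallback, at a performance cost
+```
+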
## Example Config
[examples/llama-2/qlora-fsdp.yml](../examples/llama-2/qlora-fsdp.yml) contains an example of how to enable QLoRA + FSDP in axolotl.
diff --git a/docs/installation.qmd b/docs/installation.qmd
index 763539278..265ff238c 100644
--- a/docs/installation.qmd
+++ b/docs/installation.qmd
@@ -134,7 +134,7 @@ For providers supporting Docker:
### Google Colab {#sec-colab}
-Use our [example notebook](../examples/colab-notebooks/colab-axolotl-example.ipynb).
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/axolotl-ai-cloud/axolotl/blob/main/examples/colab-notebooks/colab-axolotl-example.ipynb#scrollTo=msOCO4NRmRLa)
## Platform-Specific Instructions {#sec-platform-specific}
diff --git a/docs/lora_optims.qmd b/docs/lora_optims.qmd
index 7cdf53975..40893387b 100644
--- a/docs/lora_optims.qmd
+++ b/docs/lora_optims.qmd
@@ -5,10 +5,11 @@ description: "Custom autograd functions and Triton kernels in Axolotl for optimi
Inspired by [Unsloth](https://github.com/unslothai/unsloth), we've implemented two
optimizations for LoRA and QLoRA fine-tuning, supporting both single GPU and multi-GPU
-(in the DDP and DeepSpeed settings) training. These include (1) SwiGLU and GEGLU activation function
-Triton kernels, and (2) LoRA MLP and attention custom autograd functions. Our goal was
-to leverage operator fusion and tensor re-use in order to improve speed and reduce
-memory usage during the forward and backward passes of these calculations.
+(including the DDP, DeepSpeed, and FSDP2 settings) training. These include (1) SwiGLU
+and GEGLU activation function Triton kernels, and (2) LoRA MLP and attention custom
+autograd functions. Our goal was to leverage operator fusion and tensor re-use in order
+to improve speed and reduce memory usage during the forward and backward passes of
+these calculations.
We currently support several common model architectures, including (but not limited to):
@@ -131,6 +132,5 @@ computation path.
## Future Work
- Support for additional model architectures
-- Support for the FSDP setting
- Support for dropout and bias
- Additional operator fusions
diff --git a/docs/lr_groups.qmd b/docs/lr_groups.qmd
index 52059016c..ce5350722 100644
--- a/docs/lr_groups.qmd
+++ b/docs/lr_groups.qmd
@@ -27,3 +27,9 @@ learning_rate: 2e-5
In this example, we have a default learning rate of 2e-5 across the entire model, but we have a separate learning rate
of 1e-6 for all the self attention `o_proj` modules across all layers, and a learning are of 1e-5 to the 3rd layer's
self attention `q_proj` module.
+
+::: {.callout-note}
+
+We currently only support varying `lr`. If you're interested in adding support for other parameters (e.g. `weight_decay`), we welcome PRs. See https://github.com/axolotl-ai-cloud/axolotl/blob/613bcf90e58f3ab81d3827e7fc572319908db9fb/src/axolotl/core/trainers/mixins/optimizer.py#L17
+
+:::
diff --git a/docs/multi-gpu.qmd b/docs/multi-gpu.qmd
index 71676bc84..57a941b04 100644
--- a/docs/multi-gpu.qmd
+++ b/docs/multi-gpu.qmd
@@ -63,15 +63,6 @@ Start from Stage 1 -> Stage 2 -> Stage 3.
:::
-::: {.callout-tip}
-
-Using ZeRO Stage 3 with Single-GPU training
-
-ZeRO Stage 3 can be used for training on a single GPU by manually setting the environment variables:
-`WORLD_SIZE=1 LOCAL_RANK=0 MASTER_ADDR=0.0.0.0 MASTER_PORT=29500`
-
-:::
-
## Fully Sharded Data Parallel (FSDP) {#sec-fsdp}
::: {.callout-note}
@@ -97,6 +88,7 @@ fsdp_sync_module_states | **REMOVED**
fsdp_cpu_ram_efficient_loading | cpu_ram_efficient_loading
fsdp_state_dict_type | state_dict_type
fsdp_use_orig_params | **REMOVED**
+fsdp_activation_checkpointing | activation_checkpointing
For more details, please see the migration guide in the [torchtitan repo](https://github.com/pytorch/torchtitan/blob/main/docs/fsdp.md). In Axolotl,
if you were using the following FSDP1 config:
diff --git a/docs/multimodal.qmd b/docs/multimodal.qmd
index dbb365f73..1c4e28ea7 100644
--- a/docs/multimodal.qmd
+++ b/docs/multimodal.qmd
@@ -13,10 +13,14 @@ format:
- [Pixtral](#sec-pixtral)
- [Llava-1.5](#sec-llava-15)
- [Mistral-Small-3.1](#sec-mistral-small-31)
+- [Magistral-Small-2509](#sec-magistral-small-2509)
+- [Voxtral](#sec-voxtral)
- [Gemma-3](#sec-gemma-3)
- [Gemma-3n](#sec-gemma-3n)
- [Qwen2-VL](#sec-qwen2-vl)
- [Qwen2.5-VL](#sec-qwen25-vl)
+- [SmolVLM2](#sec-smolvlm2)
+- [LFM2-VL](#sec-lfm2-vl)
## Usage
@@ -31,14 +35,13 @@ skip_prepare_dataset: true
remove_unused_columns: false # leave columns in place as they are needed to handle image embeddings during training
sample_packing: false # not yet supported with multimodal
-chat_template: # see in next section
+chat_template: # optional; see the per-model sections below
# example dataset
datasets:
- path: HuggingFaceH4/llava-instruct-mix-vsft
type: chat_template
split: train[:1%]
- field_messages: messages
# (optional) if doing lora, only finetune the Language model,
# leave the vision model and vision tower frozen
@@ -53,10 +56,14 @@ image_resize_algorithm: bilinear
Please see [examples](https://github.com/axolotl-ai/axolotl/tree/main/examples) folder for full configs.
-::: {.callout-warning}
+::: {.callout-tip}
Some of our chat_templates have been extended to support broader dataset types. This should not break any existing configs.
:::
+::: {.callout-note}
+As of now, we do not truncate or drop samples based on `sequence_len`, as each architecture processes non-text tokens differently. We are looking for help on this.
+:::
+
### Mllama {#sec-mllama}
```yaml
@@ -91,10 +98,32 @@ chat_template: llava
### Mistral-Small-3.1 {#sec-mistral-small-31}
+::: {.callout-tip}
+Please make sure to install the vision lib via `pip install 'mistral-common[opencv]==1.8.5'`
+:::
+
```yaml
base_model: mistralai/Mistral-Small-3.1-24B-Instruct-2503
+```
-chat_template: mistral_v7_tekken
+### Magistral-Small-2509 {#sec-magistral-small-2509}
+
+::: {.callout-tip}
+Please make sure to install the vision lib via `pip install 'mistral-common[opencv]==1.8.5'`
+:::
+
+```yaml
+base_model: mistralai/Magistral-Small-2509
+```
+
+### Voxtral {#sec-voxtral}
+
+::: {.callout-tip}
+Please make sure to install the audio libs via `pip3 install librosa==0.11.0 'mistral_common[audio]==1.8.3'`
+:::
+
+```yaml
+base_model: mistralai/Voxtral-Mini-3B-2507
```
### Gemma-3 {#sec-gemma-3}
@@ -143,6 +172,34 @@ base_model: Qwen/Qwen2.5-VL-7B-Instruct
chat_template: qwen2_vl # same as qwen2-vl
```
+### Qwen3-VL {#sec-qwen3-vl}
+
+```yaml
+base_model: Qwen/Qwen3-VL-4B-Instruct
+
+chat_template: qwen2_vl # same as qwen2-vl
+```
+
+### SmolVLM2 {#sec-smolvlm2}
+
+::: {.callout-tip}
+Please make sure to install `num2words` via `pip3 install num2words==0.5.14`
+:::
+
+```yaml
+base_model: HuggingFaceTB/SmolVLM2-500M-Video-Instruct
+```
+
+### LFM2-VL {#sec-lfm2-vl}
+
+::: {.callout-warning}
+Please uninstall `causal-conv1d` via `pip3 uninstall -y causal-conv1d`
+:::
+
+```yaml
+base_model: LiquidAI/LFM2-VL-450M
+```
+
## Dataset Format
For multi-modal datasets, we adopt an extended `chat_template` format similar to OpenAI's Message format.
@@ -181,6 +238,20 @@ You may need to install `librosa` via `pip3 install librosa==0.11.0`.
:::
+### Video
+
+::: {.callout-warning}
+
+This is not well tested at the moment. We welcome contributors!
+
+:::
+
+For video loading, you can use the following keys within `content` alongside `"type": "video"`:
+
+- `"path": "/path/to/video.mp4"`
+- `"url": "https://example.com/video.mp4"`
+- `"video": np.ndarray | list[PIL.Image.Image] | torch.Tensor` (or list of the aforementioned)
+
### Example
Here is an example of a multi-modal dataset:
diff --git a/docs/optimizations.qmd b/docs/optimizations.qmd
new file mode 100644
index 000000000..967ec2d34
--- /dev/null
+++ b/docs/optimizations.qmd
@@ -0,0 +1,133 @@
+---
+title: Optimizations Guide
+description: A guide to the performance and memory optimizations available in Axolotl.
+---
+
+Axolotl includes numerous optimizations to speed up training, reduce memory usage, and handle large models.
+
+This guide provides a high-level overview and directs you to the detailed documentation for each feature.
+
+## Speed Optimizations
+
+These optimizations focus on increasing training throughput and reducing total training time.
+
+### Sample Packing
+
+Improves GPU utilization by combining multiple short sequences into a single packed sequence for training. This requires enabling one of the [attention](#attention-implementations) implementations below.
+
+- **Config:** `sample_packing: true`
+- **Learn more:** [Sample Packing](multipack.qmd)
+
+### Attention Implementations
+
+Using an optimized attention implementation is critical for training speed.
+
+- **[Flash Attention 2](https://github.com/Dao-AILab/flash-attention)**: `flash_attention: true`. **(Recommended)** The industry standard for fast attention on modern GPUs. Requires Ampere or higher. For AMD, check [AMD Support](https://github.com/Dao-AILab/flash-attention?tab=readme-ov-file#amd-rocm-support).
+- **[Flex Attention](https://pytorch.org/blog/flexattention/)**: `flex_attention: true`.
+- **[SDP Attention](https://docs.pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)**: `sdp_attention: true`. PyTorch's native implementation.
+- **[Xformers](https://github.com/facebookresearch/xformers)**: `xformers_attention: true`. Works with FP16.
+
+*Note: You should only enable one attention backend.*
+
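+For example, a minimal config pairing one attention backend with sample packing:
+
+```yaml
+flash_attention: true
+sample_packing: true
+```
+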
+### LoRA Optimizations
+
+Leverages optimized kernels to accelerate LoRA training and reduce memory usage.
+
+- **Learn more:** [LoRA Optimizations Documentation](lora_optims.qmd)
+
+## Memory Optimizations
+
+These techniques help you fit larger models or use bigger batch sizes on your existing hardware.
+
+### Parameter Efficient Finetuning (LoRA & QLoRA)
+
+Drastically reduces memory by training a small set of "adapter" parameters instead of the full model. This is the most common and effective memory-saving technique.
+
+- **Examples:** Find configs with `lora` or `qlora` in the [examples directory](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/llama-3).
+- **Config Reference:** See `adapter`, `load_in_4bit`, and `load_in_8bit` in the [Configuration Reference](config-reference.qmd).
+
+### Gradient Checkpointing & Activation Offloading
+
+These techniques save VRAM by changing how activations are handled.
+
+- **Gradient Checkpointing:** re-computes activations during the backward pass, trading compute time for VRAM.
+- **Activation Offloading:** moves activations to CPU RAM or disk, trading I/O overhead for VRAM.
+- **Learn more:** [Gradient Checkpointing and Offloading Docs](gradient_checkpointing.qmd)
+
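+For example, gradient checkpointing is a single flag (see the linked docs for the offloading options):
+
+```yaml
+gradient_checkpointing: true
+```
+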
+### Cut Cross Entropy (CCE)
+
+Reduces VRAM usage by using an optimized cross-entropy loss calculation.
+
+- **Learn more:** [Custom Integrations - CCE](custom_integrations.qmd#cut-cross-entropy)
+
+### Liger Kernels
+
+Provides efficient Triton kernels to improve training speed and reduce memory usage.
+
+- **Learn more:** [Custom Integrations - Liger Kernels](custom_integrations.qmd#liger-kernels)
+
+## Long Context Models
+
+Techniques to train models on sequences longer than their original context window.
+
+### RoPE Scaling
+
+Extends a model's context window by interpolating its Rotary Position Embeddings.
+
+- **Config:** Pass the `rope_scaling` config under `overrides_of_model_config:`. To learn how to set RoPE, check the respective model's config.
+
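+A sketch for a hypothetical Llama-style model (the exact fields come from the model's Hugging Face config, not from Axolotl):
+
+```yaml
+overrides_of_model_config:
+  rope_scaling:
+    type: linear
+    factor: 2.0  # hypothetical: doubles the original context window
+```
+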
+### Sequence Parallelism
+
+Splits long sequences across multiple GPUs, enabling training with sequence lengths that would not fit on a single device.
+
+- **Learn more:** [Sequence Parallelism Documentation](sequence_parallelism.qmd)
+
+### Arctic Long Sequence Training (ALST)
+
+ALST is a recipe that combines several techniques to train long-context models efficiently. It typically involves:
+
+- TiledMLP to reduce memory usage in MLP layers.
+- Tiled loss functions (like [CCE](#cut-cross-entropy-cce) or [Liger](#liger-kernels)).
+- Activation Offloading to CPU.
+
+- **Example:** [ALST Example Configuration](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/alst)
+
+## Large Models (Distributed Training)
+
+To train models that don't fit on a single GPU, you'll need to use a distributed training strategy like FSDP or DeepSpeed. These frameworks shard the model weights, gradients, and optimizer states across multiple GPUs and nodes.
+
+- **Learn more:** [Multi-GPU Guide](multi-gpu.qmd)
+- **Learn more:** [Multi-Node Guide](multi-node.qmd)
+
+### N-D Parallelism (Beta)
+
+For advanced scaling, Axolotl allows you to compose different parallelism techniques (e.g., Data, Tensor, Sequence Parallelism). This is a powerful approach for training extremely large models by overcoming multiple bottlenecks at once.
+
+- **Learn more:** [N-D Parallelism Guide](nd_parallelism.qmd)
+
+
+## Quantization
+
+Techniques to reduce the precision of model weights for memory savings.
+
+### 4-bit Training (QLoRA)
+
+The recommended approach for quantization-based training. It loads the base model in 4-bit using `bitsandbytes` and then trains QLoRA adapters. See [Parameter Efficient Finetuning](#parameter-efficient-finetuning-lora-qlora) for details.
+
+### FP8 Training
+
+Enables training with 8-bit floating point precision on supported hardware (e.g., NVIDIA Hopper series GPUs) for significant speed and memory gains.
+
+- **Example:** [Llama 3 FP8 FSDP Example](https://github.com/axolotl-ai-cloud/axolotl/blob/main/examples/llama-3/3b-fp8-fsdp2.yaml)
+
+### Quantization Aware Training (QAT)
+
+Simulates quantization effects during training, helping the model adapt and potentially improving the final accuracy of the quantized model.
+
+- **Learn more:** [QAT Documentation](qat.qmd)
+
+### GPTQ
+
+Allows you to finetune LoRA adapters on top of a model that has already been quantized using the GPTQ method.
+
+- **Example:** [GPTQ LoRA Example](https://github.com/axolotl-ai-cloud/axolotl/blob/main/examples/llama-2/gptq-lora.yml)
diff --git a/docs/qat.qmd b/docs/qat.qmd
index e0d000a79..91fe5180c 100644
--- a/docs/qat.qmd
+++ b/docs/qat.qmd
@@ -23,10 +23,18 @@ To enable QAT in axolotl, add the following to your configuration file:
```yaml
qat:
- activation_dtype: # Optional[str] = "int8". Fake quantization layout to use for activation quantization. Valid options are "int4" and "int8"
- weight_dtype: # Optional[str] = "int8". Fake quantization layout to use for weight quantization. Valid options are "int4" and "int8"
+ activation_dtype: # Optional[str] = "int8". Fake quantization layout to use for activation quantization. Valid options are "int4", "int8", "float8"
+ weight_dtype: # Optional[str] = "int8". Fake quantization layout to use for weight quantization. Valid options are "int4", "fp8", and "nvfp4".
group_size: # Optional[int] = 32. The number of elements in each group for per-group fake quantization
fake_quant_after_n_steps: # Optional[int] = None. The number of steps to apply fake quantization after
```
+We support the following quantization schemas:
+
+- `Int4WeightOnly` (requires the `fbgemm-gpu` extra when installing Axolotl)
+- `Int8DynamicActivationInt4Weight`
+- `Float8DynamicActivationFloat8Weight`
+- `Float8DynamicActivationInt4Weight`
+- `NVFP4`
+
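+For example, one plausible mapping of the dtype options onto the `Float8DynamicActivationFloat8Weight` schema (a sketch; verify against your torchao version):
+
+```yaml
+qat:
+  activation_dtype: float8
+  weight_dtype: fp8
+```
+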
Once you have finished training, you must quantize your model by using the same quantization configuration which you used to train the model with. You can use the [`quantize`](./quantize.qmd) command to do this.
diff --git a/docs/quantize.qmd b/docs/quantize.qmd
index 113fcafbe..9c3de1ef1 100644
--- a/docs/quantize.qmd
+++ b/docs/quantize.qmd
@@ -22,8 +22,8 @@ Quantization is configured using the `quantization` key in your configuration fi
```yaml
base_model: # The path to the model to quantize.
quantization:
- weight_dtype: # Optional[str] = "int8". Fake quantization layout to use for weight quantization. Valid options are uintX for X in [1, 2, 3, 4, 5, 6, 7], or int4, or int8
- activation_dtype: # Optional[str] = "int8". Fake quantization layout to use for activation quantization. Valid options are "int4" and "int8"
+ activation_dtype: # Optional[str] = "int8". Fake quantization layout to use for activation quantization. Valid options are "int4", "int8", "float8"
+ weight_dtype: # Optional[str] = "int8". Fake quantization layout to use for weight quantization. Valid options are "int4", "fp8", and "nvfp4".
group_size: # Optional[int] = 32. The number of elements in each group for per-group fake quantization
quantize_embedding: # Optional[bool] = False. Whether to quantize the embedding layer.
@@ -39,9 +39,8 @@ you used to train the model:
# qat.yml
qat:
activation_dtype: int8
- weight_dtype: int8
+ weight_dtype: int4
group_size: 256
- quantize_embedding: true
output_dir: # The path to the output directory used during training where the final checkpoint has been saved.
```
@@ -51,3 +50,11 @@ axolotl quantize qat.yml
```
This ensures that an identical quantization configuration is used to quantize the model as was used to train it.
+
+
+::: {.callout-note}
+
+If you have configured pushing to hub with `hub_model_id`, your model hub name will have the quantization schema appended to it,
+e.g. `axolotl-ai-cloud/qat-nvfp4-llama3B` will become `axolotl-ai-cloud/qat-nvfp4-llama3B-nvfp4w`
+
+:::
diff --git a/docs/reward_modelling.qmd b/docs/reward_modelling.qmd
index 386dc1f57..b5cf3010d 100644
--- a/docs/reward_modelling.qmd
+++ b/docs/reward_modelling.qmd
@@ -11,6 +11,7 @@ We support the reward modelling techniques supported by `trl`.
### (Outcome) Reward Models
Outcome reward models are trained using data which contains preference annotations for an entire interaction between the user and model (e.g. rather than per-turn or per-step).
+For improved training stability, you can use the `center_rewards_coefficient` parameter to encourage mean-zero reward outputs ([see TRL docs](https://huggingface.co/docs/trl/v0.10.1/en/reward_trainer#centering-rewards)).
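+
+A sketch of the option (`0.01` is the starting value suggested in the TRL docs; treat the exact placement in your Axolotl config as an assumption):
+
+```yaml
+center_rewards_coefficient: 0.01
+```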
```yaml
base_model: google/gemma-2-2b
diff --git a/docs/rlhf.qmd b/docs/rlhf.qmd
index 4a67b7559..1eea42036 100644
--- a/docs/rlhf.qmd
+++ b/docs/rlhf.qmd
@@ -219,6 +219,21 @@ DPO supports the following types with the following dataset format:
}
```
+#### chat_template.argilla_chat
+
+```json
+{
+ "chosen": [
+ {"role": "user", "content": "..."},
+ {"role": "assistant", "content": "..."}
+ ],
+ "rejected": [
+ {"role": "user", "content": "..."},
+ {"role": "assistant", "content": "..."}
+ ]
+}
+```
+
#### chat_template.default
```yaml
@@ -582,6 +597,116 @@ To see other examples of custom reward functions, please see [TRL GRPO Docs](htt
To see all configs, please see [TRLConfig](https://github.com/axolotl-ai-cloud/axolotl/blob/v0.9.2/src/axolotl/utils/schemas/trl.py).
+#### OpenEnv Rollout Functions
+
+GRPO supports custom rollout functions for OpenEnv-style environments, enabling interactive tasks like web browsing, code execution, or tool use. This allows you to implement custom generation logic that interacts with external environments.
+
+For example, to implement a simple math-solving environment with step-by-step verification:
+
+```python
+# math_env.py
+import re
+
+def math_solver_rollout(model, processing_class, prompts, generation_config=None):
+    """
+    Custom rollout function that generates step-by-step math solutions.
+
+    Args:
+        model: The language model
+        processing_class: The tokenizer/processing class
+        prompts: List of prompt dicts (with 'messages' key for chat format)
+        generation_config: Optional generation configuration
+
+    Returns:
+        List of completion strings
+    """
+    completions = []
+
+    for prompt in prompts:
+        # Apply chat template to the prompt messages
+        messages = prompt.get("messages", [])
+        formatted_prompt = processing_class.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+
+        # Generate a step-by-step solution
+        full_response = ""
+        for _ in range(5):  # Max 5 reasoning steps
+            current_input = formatted_prompt + full_response + "\nNext step:"
+            inputs = processing_class(current_input, return_tensors="pt").to(model.device)
+
+            outputs = model.generate(
+                **inputs,
+                max_new_tokens=100,
+                generation_config=generation_config,
+            )
+            step_text = processing_class.decode(
+                outputs[0][inputs.input_ids.shape[1]:],
+                skip_special_tokens=True,
+            )
+
+            # Stop once the solution is complete
+            if "FINAL ANSWER:" in step_text:
+                full_response += step_text
+                break
+            full_response += step_text + "\n"
+
+        completions.append(full_response)
+
+    return completions
+
+def math_reward(prompts, completions, answers, **kwargs):
+    """Reward function that checks mathematical correctness"""
+    rewards = []
+    for completion, correct_answer in zip(completions, answers):
+        # Extract the predicted answer
+        match = re.search(r"FINAL ANSWER:\s*(.+)", completion)
+        predicted = match.group(1).strip() if match else ""
+
+        # Compare with the correct answer
+        reward = 1.0 if predicted == str(correct_answer) else 0.0
+        rewards.append(reward)
+
+    return rewards
+
+def math_transform(cfg, *args, **kwargs):
+    """Transform dataset to GRPO format with an answer field"""
+    def transform_fn(example, processing_class=None):
+        return {
+            "prompt": [{"role": "user", "content": example["question"]}],
+            "answer": str(example["answer"]),
+        }
+    return transform_fn, {"remove_columns": ["question"]}
+```
+
+```yaml
+rl: grpo
+
+trl:
+  beta: 0.001
+  max_completion_length: 512
+  num_generations: 4
+  rollout_func: "math_env.math_solver_rollout"  # Custom rollout function
+  reward_funcs: ["math_env.math_reward"]
+  reward_weights: [1.0]
+
+datasets:
+  - path: openai/gsm8k
+    name: main
+    type: math_env.math_transform
+```
+
+The `rollout_func` parameter accepts a fully qualified name (e.g., `module_name.function_name`) that points to a callable function in your local directory. The function receives:
+
+- `model`: The language model
+- `processing_class`: The tokenizer/processing class
+- `prompts`: List of prompt dictionaries
+- `generation_config` (optional): Generation configuration
+
+And should return a list of completion strings.
+
+For more OpenEnv examples, see [TRL OpenEnv Documentation](https://huggingface.co/docs/trl/main/en/openenv).
+
#### GRPO with DAPO/Dr. GRPO loss
The DAPO paper and subsequently Dr. GRPO paper proposed an alternative loss function for GRPO to remediate the penalty in longer responses.
diff --git a/docs/scripts/generate_config_docs.py b/docs/scripts/generate_config_docs.py
index e22da7d05..6efa2038b 100644
--- a/docs/scripts/generate_config_docs.py
+++ b/docs/scripts/generate_config_docs.py
@@ -47,7 +47,6 @@ class QuartoGenerator:
"""Check if a type is a Pydantic BaseModel."""
return inspect.isclass(type_obj) and issubclass(type_obj, BaseModel)
- # pylint: disable=too-many-return-statements
def _extract_nested_type(self, field_type) -> Any:
"""Extract the actual type from complex type annotations."""
# Handle Annotated types (Python 3.9+)
@@ -124,7 +123,6 @@ class QuartoGenerator:
return field_type
- # pylint: disable=too-many-return-statements
def _extract_all_pydantic_models_from_type(
self, field_type
) -> list[type[BaseModel]]:
@@ -318,7 +316,6 @@ class QuartoGenerator:
return all_groups
- # pylint: disable=too-many-return-statements
def _extract_field_groups_from_source(
self, model_class: type[BaseModel]
) -> list[dict]:
@@ -503,7 +500,7 @@ class QuartoGenerator:
nested_schema = nested_model.model_json_schema()
nested_properties = nested_schema.get("properties", {})
nested_required = nested_schema.get("required", [])
- except Exception: # pylint: disable=broad-exception-caught
+ except Exception:
# Fallback: use model fields directly
nested_properties = {}
nested_required = []
@@ -607,7 +604,7 @@ class QuartoGenerator:
schema = model_class.model_json_schema()
properties = schema.get("properties", {})
required = schema.get("required", [])
- except Exception as e: # pylint: disable=broad-exception-caught
+ except Exception as e:
print(
f"Warning: Could not generate JSON schema ({e}). Using model fields instead."
)
diff --git a/docs/streaming.qmd b/docs/streaming.qmd
new file mode 100644
index 000000000..2a233a4fc
--- /dev/null
+++ b/docs/streaming.qmd
@@ -0,0 +1,120 @@
+---
+title: Streaming Datasets
+description: How to use streaming mode for large-scale datasets and memory-efficient training
+order: 10
+---
+
+Streaming enables memory-efficient training with large datasets by loading data
+incrementally rather than loading the entire dataset into memory at once.
+
+Use streaming when:
+
+- Your dataset is too large to fit in memory (e.g. when you're doing pretraining with massive text corpora)
+- You want to start training immediately without preprocessing the entire dataset
+
+Streaming works with both remote and locally stored datasets!
+
+::: {.callout-note}
+Streaming currently only supports a single dataset. Multi-dataset support will be added soon.
+:::
+
+
+## Configuration
+
+### Basic Streaming
+
+Enable streaming mode by setting the `streaming` flag:
+
+```yaml
+streaming: true
+```
+
+### Pretraining with Streaming
+
+For pretraining tasks, streaming is automatically enabled when using `pretraining_dataset`:
+
+```yaml
+pretraining_dataset:
+ - path: HuggingFaceFW/fineweb-edu
+ type: pretrain
+ text_column: text
+ split: train
+
+# Optionally, enable sample packing
+streaming_multipack_buffer_size: 10000
+sample_packing: true
+```
+
+### SFT with Streaming
+
+For supervised fine-tuning with streaming:
+
+```yaml
+streaming: true
+datasets:
+ - path: tatsu-lab/alpaca
+ type: alpaca
+ split: train
+
+# Optionally, enable sample packing
+streaming_multipack_buffer_size: 10000
+sample_packing: true
+```
+
+## Configuration Options
+
+### `streaming_multipack_buffer_size`
+
+Controls the buffer size for multipack streaming (default: 10,000). This determines how
+many samples are buffered before packing. Larger buffers can improve packing efficiency
+but use more memory.
+
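+As a rough sketch of the trade-off (a simplified first-fit packer, not Axolotl's actual multipack implementation), a larger buffer gives the packer more candidate samples to combine:
+
+```python
+def pack_buffered(samples, buffer_size=10_000, seq_len=4096):
+    """Pack a stream of tokenized samples using a bounded buffer (illustration)."""
+    buffer = []
+    for sample in samples:  # `samples` yields lists of token ids
+        buffer.append(sample)
+        if len(buffer) == buffer_size:
+            yield from _pack(buffer, seq_len)
+            buffer = []
+    if buffer:  # flush the final partial buffer
+        yield from _pack(buffer, seq_len)
+
+def _pack(buffer, seq_len):
+    """First-fit decreasing: more buffered samples means fewer wasted tokens."""
+    bins = []
+    for s in sorted(buffer, key=len, reverse=True):
+        target = next((b for b in bins if sum(map(len, b)) + len(s) <= seq_len), None)
+        if target is not None:
+            target.append(s)
+        else:
+            bins.append([s])
+    return bins
+```
+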
+### `shuffle_merged_datasets`
+
+When enabled, shuffles the streaming dataset using the buffer. This requires additional
+memory for the shuffle buffer.
+
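+This is the standard shuffle-buffer pattern for streaming data, as exposed by 🤗 Datasets (shown purely to illustrate the memory/randomness trade-off):
+
+```python
+from datasets import load_dataset
+
+ds = load_dataset("tatsu-lab/alpaca", split="train", streaming=True)
+# A larger buffer shuffles more thoroughly but holds more samples in memory
+ds = ds.shuffle(seed=42, buffer_size=10_000)
+```
+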
+## Sample Packing with Streaming
+
+Sample packing is supported for streaming datasets. When enabled, multiple samples are
+packed into a single sequence to maximize GPU utilization:
+
+```yaml
+sample_packing: true
+streaming_multipack_buffer_size: 10000
+
+# For SFT: attention is automatically isolated between packed samples
+# For pretraining: control with pretrain_multipack_attn
+pretrain_multipack_attn: true # prevent cross-attention between packed samples
+```
+
+For more information, see our [documentation](multipack.qmd) on multipacking.
+
+## Important Considerations
+
+### Memory Usage
+
+While streaming reduces memory usage compared to loading entire datasets, some memory
+is still consumed:
+
+- Sample packing buffers multiple samples; tune this with `streaming_multipack_buffer_size`
+- Shuffling requires additional memory for the shuffle buffer
+
+### Performance
+
+- Streaming may have slightly higher latency compared to preprocessed datasets, as samples are processed on-the-fly
+- Network speed and disk read speed are important when streaming from remote sources or a local dataset, respectively
+- Consider using `axolotl preprocess` for smaller or more frequently used datasets (see the example below)
+
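+For example, to tokenize and cache a dataset ahead of training (assuming your config is `config.yaml`):
+
+```bash
+axolotl preprocess config.yaml
+```
+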
+### Evaluation Datasets
+
+Evaluation datasets are not streamed to ensure consistent evaluation metrics. They're
+loaded normally even when training uses streaming.
+
+## Examples
+
+See the `examples/streaming/` directory for complete configuration examples:
+
+- `pretrain.yaml`: Pretraining with streaming dataset
+- `sft.yaml`: Supervised fine-tuning with streaming
diff --git a/examples/LiquidAI/README.md b/examples/LiquidAI/README.md
new file mode 100644
index 000000000..8a18d9eb1
--- /dev/null
+++ b/examples/LiquidAI/README.md
@@ -0,0 +1,67 @@
+# Finetune Liquid Foundation Models 2 (LFM2) with Axolotl
+
+[Liquid Foundation Models 2 (LFM2)](https://huggingface.co/collections/LiquidAI/lfm2-686d721927015b2ad73eaa38) are a family of small, open-weight models from [Liquid AI](https://www.liquid.ai/) focused on quality, speed, and memory efficiency. Liquid AI released text-only [LFM2](https://huggingface.co/collections/LiquidAI/lfm2-686d721927015b2ad73eaa38) and text+vision [LFM2-VL](https://huggingface.co/collections/LiquidAI/lfm2-vl-68963bbc84a610f7638d5ffa) models.
+
+LFM2 features a new hybrid Liquid architecture with multiplicative gates, short-range convolutions, and grouped query attention, enabling fast training and inference.
+
+This guide shows how to fine-tune both the LFM2 and LFM2-VL models with Axolotl.
+
+Thanks to the team at LiquidAI for giving us early access to prepare for these releases.
+
+## Getting Started
+
+1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).
+
+ Here is an example of how to install from pip:
+ ```bash
+ # Ensure you have a compatible version of PyTorch installed
+ pip3 install packaging setuptools wheel ninja
+ pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
+ ```
+
+2. Run one of the finetuning examples below.
+
+ **LFM2**
+ ```bash
+ # FFT SFT (1x48GB @ 25GiB)
+ axolotl train examples/LiquidAI/lfm2-350m-fft.yaml
+ ```
+
+ **LFM2-VL**
+ ```bash
+ # LoRA SFT (1x48GB @ 2.7GiB)
+ axolotl train examples/LiquidAI/lfm2-vl-lora.yaml
+ ```
+
+ **LFM2-MoE**
+ ```bash
+ pip install git+https://github.com/huggingface/transformers.git@0c9a72e4576fe4c84077f066e585129c97bfd4e6
+
+ # LoRA SFT (1x48GB @ 16.2GiB)
+ axolotl train examples/LiquidAI/lfm2-8b-a1b-lora.yaml
+ ```
+
+### TIPS
+
+- **Installation Error**: If you encounter `ImportError: ... undefined symbol ...` or `ModuleNotFoundError: No module named 'causal_conv1d_cuda'`, the `causal-conv1d` package may have been installed incorrectly. Try uninstalling it:
+ ```bash
+ pip uninstall -y causal-conv1d
+ ```
+
+- **Dataset Loading**: Read more on how to load your own dataset in our [documentation](https://docs.axolotl.ai/docs/dataset_loading.html).
+- **Dataset Formats**:
+ - For LFM2 models, the dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template); a minimal example is shown below.
+ - For LFM2-VL models, Axolotl follows the multi-content Messages format. See our [Multimodal docs](https://docs.axolotl.ai/docs/multimodal.html#dataset-format) for details.
+
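+As a sketch, one line of an OpenAI-style JSONL dataset for LFM2 might look like this (the content values are illustrative):
+
+```json
+{"messages": [{"role": "user", "content": "What is the capital of France?"}, {"role": "assistant", "content": "The capital of France is Paris."}]}
+```
+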
+## Optimization Guides
+
+- [Optimizations Guide](https://docs.axolotl.ai/docs/optimizations.html)
+
+## Related Resources
+
+- [LFM2 Blog](https://www.liquid.ai/blog/liquid-foundation-models-v2-our-second-series-of-generative-ai-models)
+- [LFM2-VL Blog](https://www.liquid.ai/blog/lfm2-vl-efficient-vision-language-models)
+- [LFM2-MoE Blog](https://www.liquid.ai/blog/lfm2-8b-a1b-an-efficient-on-device-mixture-of-experts)
+- [Axolotl Docs](https://docs.axolotl.ai)
+- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
+- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
diff --git a/examples/lfm2/lfm2-350m-fft.yaml b/examples/LiquidAI/lfm2-350m-fft.yaml
similarity index 92%
rename from examples/lfm2/lfm2-350m-fft.yaml
rename to examples/LiquidAI/lfm2-350m-fft.yaml
index 16a0a028e..145b56dd1 100644
--- a/examples/lfm2/lfm2-350m-fft.yaml
+++ b/examples/LiquidAI/lfm2-350m-fft.yaml
@@ -1,8 +1,8 @@
base_model: LiquidAI/LFM2-350M
-chunked_cross_entropy: true
+plugins:
+ - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
-chat_template: tokenizer_default
eot_tokens:
- "<|im_end|>"
datasets:
diff --git a/examples/LiquidAI/lfm2-8b-a1b-lora.yaml b/examples/LiquidAI/lfm2-8b-a1b-lora.yaml
new file mode 100644
index 000000000..73cbfcce7
--- /dev/null
+++ b/examples/LiquidAI/lfm2-8b-a1b-lora.yaml
@@ -0,0 +1,59 @@
+base_model: LiquidAI/LFM2-8B-A1B
+
+plugins:
+ - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
+
+load_in_8bit: true
+
+eot_tokens:
+ - "<|im_end|>"
+datasets:
+ - path: mlabonne/FineTome-100k
+ type: chat_template
+ split: train[:20%]
+ field_messages: conversations
+ message_field_role: from
+ message_field_content: value
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.05
+output_dir: ./outputs/out
+
+sequence_len: 4096
+sample_packing: true
+
+adapter: lora
+lora_model_dir:
+
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_modules: 'model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+gradient_accumulation_steps: 2
+micro_batch_size: 4
+num_epochs: 1
+optimizer: adamw_torch_fused
+lr_scheduler: cosine
+learning_rate: 5e-5
+
+bf16: true
+tf32: true
+
+gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+warmup_ratio: 0.1
+evals_per_epoch: 2
+saves_per_epoch: 1
+
+weight_decay: 0.0
+
+# save_first_step: true # uncomment this to validate checkpoint saving works with your config
diff --git a/examples/LiquidAI/lfm2-vl-lora.yaml b/examples/LiquidAI/lfm2-vl-lora.yaml
new file mode 100644
index 000000000..313da8274
--- /dev/null
+++ b/examples/LiquidAI/lfm2-vl-lora.yaml
@@ -0,0 +1,61 @@
+base_model: LiquidAI/LFM2-VL-450M
+trust_remote_code: true
+model_type: AutoModelForImageTextToText
+processor_type: AutoProcessor
+
+plugins:
+ - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
+
+# these 3 lines are needed for now to handle vision chat templates w images
+skip_prepare_dataset: true
+remove_unused_columns: false
+sample_packing: false
+
+datasets:
+ - path: HuggingFaceH4/llava-instruct-mix-vsft
+ type: chat_template
+ split: train[:1%]
+
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.0
+output_dir: ./outputs/out
+
+adapter: lora
+lora_model_dir:
+
+sequence_len: 8192
+pad_to_sequence_len: false
+
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+gradient_accumulation_steps: 4
+micro_batch_size: 1
+num_epochs: 1
+optimizer: adamw_bnb_8bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+bf16: true
+fp16:
+tf32: true
+
+gradient_checkpointing: true
+logging_steps: 1
+flash_attention: true
+eager_attention:
+
+warmup_ratio: 0.1
+evals_per_epoch: 1
+saves_per_epoch: 1
+weight_decay: 0.0
+
+# save_first_step: true # uncomment this to validate checkpoint saving works with your config
diff --git a/examples/alst/README.md b/examples/alst/README.md
index 7f194d299..6d201f826 100644
--- a/examples/alst/README.md
+++ b/examples/alst/README.md
@@ -7,3 +7,24 @@ techniques. It is a combination of:
- Activation Offloading: Offload activations to CPU RAM to reduce memory usage
For more information, you can check out the ALST paper [here](https://www.arxiv.org/abs/2506.13996).
+
+## Usage
+
+```yaml
+tiled_mlp: true
+
+# See Sequence Parallelism docs
+# https://docs.axolotl.ai/docs/sequence_parallelism.html
+context_parallel_size: 2 # e.g., split each sequence across 2 GPUs
+
+plugins:
+  # Pick one. See the Cut Cross Entropy docs:
+  # https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
+  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
+
+  # ...or the Liger Kernel docs:
+  # https://docs.axolotl.ai/docs/custom_integrations.html#liger-kernels
+  # - axolotl.integrations.liger.LigerPlugin
+```
diff --git a/examples/apertus/README.md b/examples/apertus/README.md
new file mode 100644
index 000000000..774286333
--- /dev/null
+++ b/examples/apertus/README.md
@@ -0,0 +1,110 @@
+# Finetune Swiss-AI's Apertus with Axolotl
+
+[Apertus](https://huggingface.co/collections/swiss-ai/apertus-llm-68b699e65415c231ace3b059) is a family of open-source models trained by Swiss AI.
+
+This guide shows how to fine-tune Apertus with Axolotl on multi-turn conversations with proper masking.
+
+## Getting started
+
+1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). You need to install from main (Apertus support is only available in nightly builds), or use our latest [Docker images](https://docs.axolotl.ai/docs/docker.html).
+
+ Here is an example of how to install from main for pip:
+
+```bash
+# Ensure you have PyTorch installed (2.6.0 minimum)
+git clone https://github.com/axolotl-ai-cloud/axolotl.git
+cd axolotl
+
+pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
+pip3 install --no-build-isolation -e '.[flash-attn]'
+
+# Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
+python scripts/cutcrossentropy_install.py | sh
+```
+
+2. (Optional, highly recommended) Install the XIELU CUDA extension
+
+```bash
+## Recommended for reduced VRAM and faster speeds
+
+# Point to CUDA toolkit directory
+# For those using our Docker image, use the below path.
+export CUDA_HOME=/usr/local/cuda
+
+pip3 install git+https://github.com/nickjbrowning/XIELU@59d6031 --no-build-isolation --no-deps
+```
+
+For any installation errors, see [XIELU Installation Issues](#xielu-installation-issues).
+
+3. Run the finetuning example:
+
+```bash
+axolotl train examples/apertus/apertus-8b-qlora.yaml
+```
+
+This config uses about 8.7 GiB VRAM.
+
+Let us know how it goes. Happy finetuning! 🚀
+
+### Tips
+
+- For inference, the official Apertus team recommends `top_p=0.9` and `temperature=0.8` (see the snippet after this list).
+- You can instead use full-parameter fine-tuning by removing `adapter: qlora` and `load_in_4bit: true` from the config.
+- Read more on how to load your own dataset in our [documentation](https://docs.axolotl.ai/docs/dataset_loading.html).
+- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
+
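+As a minimal inference sketch with 🤗 Transformers using those recommended sampling settings (model ID taken from the config above):
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model_id = "swiss-ai/Apertus-8B-Instruct-2509"
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
+
+messages = [{"role": "user", "content": "Tell me about the Alps."}]
+input_ids = tokenizer.apply_chat_template(
+    messages, add_generation_prompt=True, return_tensors="pt"
+).to(model.device)
+
+# Official Apertus recommendation: top_p=0.9, temperature=0.8
+output_ids = model.generate(
+    input_ids, do_sample=True, top_p=0.9, temperature=0.8, max_new_tokens=256
+)
+print(tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True))
+```
+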
+### XIELU Installation Issues
+
+#### `ModuleNotFoundError: No module named 'torch'`
+
+Please check the following, one by one:
+- You are running in the correct environment
+- The environment has PyTorch installed
+- The CUDA toolkit is at `CUDA_HOME`
+If those don't help, try the solutions below:
+
+1. Pass the Python executable to CMake via an environment variable and try installing again:
+
+ ```bash
+ Python_EXECUTABLE=$(which python) pip3 install git+https://github.com/nickjbrowning/XIELU@59d6031 --no-build-isolation --no-deps
+ ```
+
+2. Clone the repo and manually hardcode the Python path (replace the example path in the diff below with the output of `which python`):
+
+ ```bash
+ git clone https://github.com/nickjbrowning/XIELU
+ cd XIELU
+ git checkout 59d6031
+
+ nano CMakeLists.txt # or vi, depending on your preference
+ ```
+
+ ```diff
+ execute_process(
+ - COMMAND ${Python_EXECUTABLE} -c "import torch.utils; print(torch.utils.cmake_prefix_path)"
+ + COMMAND /root/miniconda3/envs/py3.11/bin/python -c "import torch.utils; print(torch.utils.cmake_prefix_path)"
+ RESULT_VARIABLE TORCH_CMAKE_PATH_RESULT
+ OUTPUT_VARIABLE TORCH_CMAKE_PATH_OUTPUT
+ ERROR_VARIABLE TORCH_CMAKE_PATH_ERROR
+ )
+ ```
+
+ ```bash
+ pip3 install . --no-build-isolation --no-deps
+ ```
+
+## Optimization Guides
+
+- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
+- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
+- [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html)
+
+## Related Resources
+
+- [Apertus Tech Report](https://github.com/swiss-ai/apertus-tech-report/blob/main/Apertus_Tech_Report.pdf)
+- [Axolotl Docs](https://docs.axolotl.ai)
+- [Axolotl Website](https://axolotl.ai)
+- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
+- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
diff --git a/examples/apertus/apertus-8b-qlora.yaml b/examples/apertus/apertus-8b-qlora.yaml
new file mode 100644
index 000000000..521b282da
--- /dev/null
+++ b/examples/apertus/apertus-8b-qlora.yaml
@@ -0,0 +1,64 @@
+base_model: swiss-ai/Apertus-8B-Instruct-2509
+
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
+
+plugins:
+ - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
+
+load_in_8bit: false
+load_in_4bit: true
+
+datasets:
+ - path: fozziethebeat/alpaca_messages_2k_test
+ type: chat_template
+
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.1
+output_dir: ./outputs/lora-out
+
+adapter: qlora
+lora_model_dir:
+
+sequence_len: 2048
+sample_packing: true
+
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_linear: true
+lora_target_modules:
+ - gate_proj
+ - down_proj
+ - up_proj
+ - q_proj
+ - v_proj
+ - k_proj
+ - o_proj
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+num_epochs: 1
+optimizer: adamw_bnb_8bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+warmup_ratio: 0.1
+evals_per_epoch: 1
+saves_per_epoch: 1
+
+# save_first_step: true # uncomment this to validate checkpoint saving works with your config
diff --git a/examples/arcee/README.md b/examples/arcee/README.md
index 217893306..23f63663e 100644
--- a/examples/arcee/README.md
+++ b/examples/arcee/README.md
@@ -19,6 +19,9 @@ cd axolotl
pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
pip3 install --no-build-isolation -e '.[flash-attn]'
+
+# Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
+python scripts/cutcrossentropy_install.py | sh
```
2. Run the finetuning example:
diff --git a/examples/archived/deepcoder/deepcoder-14B-preview-lora.yml b/examples/archived/deepcoder/deepcoder-14B-preview-lora.yml
index 2202091d5..3223ec19a 100644
--- a/examples/archived/deepcoder/deepcoder-14B-preview-lora.yml
+++ b/examples/archived/deepcoder/deepcoder-14B-preview-lora.yml
@@ -9,10 +9,6 @@ strict: false
datasets:
- path: fozziethebeat/alpaca_messages_2k_test
type: chat_template
- field_messages: messages
- message_property_mappings:
- role: role
- content: content
dataset_prepared_path:
val_set_size: 0.05
diff --git a/examples/cloud/baseten.yaml b/examples/cloud/baseten.yaml
new file mode 100644
index 000000000..23c4b52d6
--- /dev/null
+++ b/examples/cloud/baseten.yaml
@@ -0,0 +1,10 @@
+provider: baseten
+project_name:
+
+secrets:
+ - HF_TOKEN
+ - WANDB_API_KEY
+
+gpu: h100
+gpu_count: 8
+node_count: 1
diff --git a/examples/colab-notebooks/colab-axolotl-example.ipynb b/examples/colab-notebooks/colab-axolotl-example.ipynb
index 69881997e..cea1aeda0 100644
--- a/examples/colab-notebooks/colab-axolotl-example.ipynb
+++ b/examples/colab-notebooks/colab-axolotl-example.ipynb
@@ -1,9934 +1,9944 @@
{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "OPLSwmgdrB7g"
- },
- "source": [
- "# Fine-Tune Qwen3 14B with Axolotl\n",
- "\n",
- "[](https://github.com/axolotl-ai-cloud/axolotl)\n",
- "\n",
- "Axolotl is the most performant LLM post-training framework available, delivering faster training with efficient, consistent and stable performance. Train your workload and ship your product 30% faster; saving you both time and money.\n",
- "\n",
- "- ⭐ us on [GitHub](https://github.com/axolotl-ai-cloud/axolotl)\n",
- "- 📜 Read the [Docs](http://docs.axolotl.ai/)\n",
- "- 💬 Chat with us on [Discord](https://discord.gg/mnpEYgRUmD)\n",
- "- 📰 Get updates on [X/Twitter](https://x.com/axolotl_ai)\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "rVjKD7CbxIP3"
- },
- "source": [
- "# Installation\n",
- "\n",
- "Axolotl is easy to install from [pip](https://pypi.org/project/axolotl/), or use our [pre-built Docker images](http://docs.axolotl.ai/docs/docker.html) for a hassle free dependency experience. See our [docs](http://docs.axolotl.ai/docs/installation.html) for more information."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "msOCO4NRmRLa"
- },
- "outputs": [],
- "source": [
- "%%capture\n",
- "# This step can take ~5-10 minutes to install dependencies\n",
- "!pip install --no-build-isolation axolotl[flash-attn]>=0.9.1\n",
- "!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@0ee9ee8\""
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "N0OW0YeksDLr"
- },
- "source": [
- "## Demo: Talk Like a Pirate\n",
- "\n",
- "In this demo, we are training the model ***to respond like a pirate***. This was chosen as a way to easily show how to train a model to respond in a certain style of your choosing (without being prompted) and is quite easy to validate within the scope of a Colab."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "8Du2fANTsNCK"
- },
- "source": [
- "### Upload your own dataset or use a Huggingface dataset\n",
- "\n",
- "You can choose to use your own JSONL file from your own [Google Drive](https://drive.google.com/drive/home); for example downloading the [Pirate-Ultrachat JSONL](https://huggingface.co/datasets/winglian/pirate-ultrachat-10k/blob/main/train.jsonl) to your Google Drive. JSONL datasets should be formatted similar to the [OpenAI dataset format](https://cookbook.openai.com/examples/chat_finetuning_data_prep).\n",
- "\n",
- "You can also simply use the [`winglian/pirate-ultrachat-10k`](https://huggingface.co/datasets/winglian/pirate-ultrachat-10k) dataset directly.\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "fGEEjyQ-r_IV"
- },
- "outputs": [],
- "source": [
- "# Default to HF dataset location\n",
- "dataset_id = \"winglian/pirate-ultrachat-10k\"\n",
- "uploaded = {}"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "c5MyYqk7vIsG"
- },
- "outputs": [],
- "source": [
- "import os\n",
- "# Optionally, upload your own JSONL to your Google Drive\n",
- "GOOGLE_DRIVE_PATH = \"\" # ex: \"MyDrive/Colab\\ Notebooks/train.jsonl\"\n",
- "\n",
- "# \"Select All\" permissions, or you may get the error:\n",
- "# \"MessageError: Error: credential propagation was unsuccessful\"\n",
- "if GOOGLE_DRIVE_PATH:\n",
- " from google.colab import drive\n",
- " # Mount your Google Drive\n",
- " GOOGLE_DRIVE_MNT = \"/content/drive/\"\n",
- " drive.mount(GOOGLE_DRIVE_MNT, force_remount=True)\n",
- " tmp_path = os.path.join(GOOGLE_DRIVE_MNT, GOOGLE_DRIVE_PATH.lstrip(\"/\"))\n",
- " # make sure file exists\n",
- " if not os.path.isfile(tmp_path):\n",
- " raise ValueError(f\"File {tmp_path} does not exist\")\n",
- " dataset_id = tmp_path\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "U6pTk3A9xj1W"
- },
- "source": [
- "# Configure for Supervised Fine-Tuning (SFT)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 151,
- "referenced_widgets": [
- "388f618924274d21a066f098f4f1e744",
- "7c95f85a2b1f47a1bd846d110c47bb3c",
- "083f9cda8d754c168beee10d2f8955a2",
- "62e1a65582f446a78612eaa804e08a7d",
- "487a177d020f4605834878b2fdc7afa3",
- "7fd44cf9ca6e4726bfd7ac21846d6a14",
- "366a343b62fa47d8985a3bd464d99f9e",
- "a0a11e929edd4189b79723d618522c33",
- "e87ea87fcff247b5bbcc331ba79a8dc2",
- "5e18768f7ad6434ba8b8b8a2e853e204",
- "bb33aec33a6447078c31bfd728942994"
- ]
- },
- "id": "fdRioqytmTtX",
- "outputId": "f0acdcec-4b41-4a3f-ffed-c2d2d929158e"
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[2025-05-08 13:40:27,488] [INFO] [root.register:348] [PID:174] Attempting to load plugin: axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin\n",
- "[2025-05-08 13:40:27,493] [INFO] [root.register:351] [PID:174] Plugin loaded successfully: axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin\n",
- "[2025-05-08 13:40:27,959] [INFO] [axolotl.utils.schemas.config.check_eval_packing:721] [PID:174] [RANK:0] explicitly setting `eval_sample_packing` to match `sample_packing`\u001b[39m\n",
- "[2025-05-08 13:40:27,960] [INFO] [axolotl.utils.schemas.config.hint_sample_packing_padding:514] [PID:174] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing\u001b[39m\n",
- "[2025-05-08 13:40:27,961] [INFO] [axolotl.utils.schemas.config.check_bf16:1251] [PID:174] [RANK:0] bf16 support detected, but not enabled for this configuration.\u001b[39m\n"
- ]
- },
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "388f618924274d21a066f098f4f1e744",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "config.json: 0%| | 0.00/728 [00:00, ?B/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[2025-05-08 13:40:28,590] [INFO] [axolotl.normalize_config:237] [PID:174] [RANK:0] cuda memory usage baseline: 0.000GB (+0.002GB cache, +0.359GB misc)\u001b[39m\n"
- ]
- }
- ],
- "source": [
- "from axolotl.utils.dict import DictDefault\n",
- "from axolotl.cli.config import load_cfg\n",
- "\n",
- "# Axolotl provides full control and transparency over model and training configuration\n",
- "config = DictDefault(\n",
- " base_model = \"Qwen/Qwen3-14B\", # Use the instruct tuned model, but we're aligning it to be a pirate\n",
- " load_in_4bit = True, # set to True for qLoRA\n",
- " adapter = \"qlora\",\n",
- " lora_r = 32,\n",
- " lora_alpha = 64,\n",
- " lora_target_modules = [\n",
- " \"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\", # train self_attn linear modules\n",
- " \"gate_proj\", \"down_proj\", \"up_proj\", # train MLP linear modules\n",
- " ],\n",
- " lora_qkv_kernel = True, # optimized triton kernels for LoRA\n",
- " lora_o_kernel = True,\n",
- " lora_mlp_kernel = True,\n",
- " embeddings_skip_upcast = True, # keep embeddings in fp16 so the model fits in 15GB VRAM\n",
- " xformers_attention = True, # use xformers on Colab w/ T4 for memory efficient attention, flash_attention only on Ampere or above\n",
- " plugins = [\n",
- " # more efficient training using Apple's Cut Cross Entropy; https://github.com/apple/ml-cross-entropy\n",
- " \"axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin\",\n",
- " ],\n",
- " sample_packing = True, # 2-6x increase in tokens per micro-batch\n",
- " # when using packing, use a slightly higher learning rate to account for fewer steps\n",
- " # alternatively, reduce the micro_batch_size + gradient_accumulation_steps to achieve closer to the same number of steps/epoch\n",
- " learning_rate = 0.00019,\n",
- " sequence_len = 4096, # larger sequence length improves packing efficiency for more tokens/sec\n",
- " micro_batch_size = 1,\n",
- " gradient_accumulation_steps = 1,\n",
- " gradient_checkpointing = True, # tradeoff reduced VRAM for increased time\n",
- " gradient_checkpointing_kwargs = {\n",
- " \"use_reentrant\": False,\n",
- " },\n",
- " optimizer = \"paged_adamw_8bit\",\n",
- " lr_scheduler = \"cosine\",\n",
- " warmup_steps = 5,\n",
- " fp16 = True, # use float16 + automatic mixed precision, bfloat16 not supported on Colab w/ T4\n",
- " bf16 = False,\n",
- " max_grad_norm = 0.1, # gradient clipping\n",
- " num_epochs = 1,\n",
- " saves_per_epoch = 2, # how many checkpoints to save over one epoch\n",
- " logging_steps = 1,\n",
- " output_dir = \"./outputs/qwen-sft-pirate-rrr\",\n",
- " chat_template = \"qwen3\",\n",
- " datasets = [\n",
- " {\n",
- " \"path\": dataset_id, # Huggingface Dataset id or path to train.jsonl\n",
- " \"type\": \"chat_template\",\n",
- " \"split\": \"train\",\n",
- " \"eot_tokens\": [\"<|im_end|>\"],\n",
- " }\n",
- " ],\n",
- " dataloader_prefetch_factor = 8, # dataloader optimizations\n",
- " dataloader_num_workers = 2,\n",
- " dataloader_pin_memory = True,\n",
- " )\n",
- "\n",
- "# validates the configuration\n",
- "cfg = load_cfg(config)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "715UpvnSoBIS"
- },
- "outputs": [],
- "source": [
- "from axolotl.utils import patch_optimized_env\n",
- "# speedup downloads from HF 🤗 and set \"PYTORCH_CUDA_ALLOC_CONF\" env to save memory\n",
- "patch_optimized_env()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "Vc6MC-hwyH-n"
- },
- "source": [
- "# Datasets\n",
- "\n",
- "Axolotl has a robust suite of loaders and transforms to parse most open datasets of any format into the appropriate chat template for your model. Axolotl will mask input tokens from the user's prompt so that the train loss is only calculated against the model's response. For more information, [see our documentation](http://docs.axolotl.ai/docs/dataset-formats/conversation.html) on dataset preparation.\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 1000,
- "referenced_widgets": [
- "b82aa8c57f7c422a9a9c90f333ed2a99",
- "c0991cf63ee6458b96e9a75e7a88b61a",
- "71c8af139cd248b1b51101fd46a93f35",
- "1d5117195d4b49eb8f1a73b18419f7ce",
- "3c21e4a511b4441192c03b7f1d0976e9",
- "ed28e2e0410d4e0b855467e798e53d66",
- "d93f134f802b4b69b575bdaf07dbd27c",
- "d0e9dce55cec4c1ca619a0ccf209d924",
- "4c727d40ef0443449afc31724ee79f0c",
- "0dea5caa27384f5689e3cab51f558727",
- "a6f48410b9964fefba0c3009a77dc838",
- "95caff42f08a4c2aa14c867b8f37f231",
- "de7c37ee83e24f0c889e84d07279c2ec",
- "9d4897eefb5f48259ffb2d23e332f752",
- "253017b0d0534e54ab44e181f6d7c82d",
- "27beaf06e41b472abdb544a43c720c5a",
- "34cf3df51fbc41cabfdbba153c007f0e",
- "ac764024cf1c4e08ba7749afd2cd20ac",
- "30a81da86f8043eca301e86a8651201a",
- "e8b7a81040904c1e89e58978223b1737",
- "1c6f1f10667545aaab958016ba7e2c94",
- "e6e969610738449887259063967f82b0",
- "a138859f19b74fc0928dc236ab5359db",
- "9b42e08b3c9548818488268768a118b1",
- "12b56912736849fea2ad8124456fdc5c",
- "879c8ab5873847a8833bd74123be90a4",
- "20352e5f58d24bb8b1f3940efd14fe4a",
- "d955dcaa0e944e719f3a06139dd54a03",
- "d3de2662c7964f1ba96e58da382af720",
- "97e36007e1304e1583fd81bfb13f0edd",
- "c65dc74c7d6f4bab8f7dd28455161dd8",
- "ef223e8504b64e3592589880326aaf41",
- "598da69727bd4fb8b1caf465ac736d7a",
- "5f86cd894de94c3280fadc1e2fd0ee13",
- "a20927bf5f2c41f58c1e31ac858ab36c",
- "0a46ad75c198463d843fb35e813642cb",
- "09007681cf8d42aeb8c1d2f6a74e470a",
- "ebc80d1a55fa47f4a5ea2756588569ec",
- "1811cda0644e4190a9469d1774435d82",
- "35c811d2ae8e43f3b5cecbdd3cfa857f",
- "b8e39e4dddc3497fbc29ae45c66da759",
- "63b4e563e85c4f03b1b72beda9577bcc",
- "b195f160ca20442fadd8b5aed0ee41af",
- "ca65e32eb52f48c09a84b33cb18f22cd",
- "7cd0b85ebd204b7aba908417811ce4e0",
- "7baeab52d6694c32b1efd1ea1a0a7782",
- "519a7b154022443db6703f04a9142bae",
- "d4183e9715f34d249942b8271cca3bdf",
- "da2347ac94764a3fa2743343cf0d3cd2",
- "93a44a11aa4846fa8efc6c1413ef1627",
- "a55060adc3564407ac81ad7297d34aaa",
- "d02274afd47b462291c745f261209d42",
- "0f417447a7bd4a33acca96fa37aec877",
- "63580b6fb30642479fe3000915bf551a",
- "8f726dbfb45d4528afa33e36a6313267",
- "03b093d592ba4386aa61f7b8483da660",
- "b8766a88716948cf968f4563531a76d9",
- "6f3a28b912714c6e931003549664bfa3",
- "16d1283741404b7bb319094c992fce01",
- "2a5bb0e818ab47be8cf6465988328503",
- "2b3a2659b12244bd8548320320016dbf",
- "0cd7efffbb3c4c4b972e63749f61ab97",
- "5ca240f31e6b44e3882c5eb37cd5a309",
- "5eb06edeb58e4930b1affef2a59eae81",
- "a4e5789584564049b83df7c6c54a3e08",
- "ff3a94b146a948b6907f5d80c7157f99",
- "258b7c635c1045329d4669e48c46ccd5",
- "6f68ed9889f54ad2ae8a3b95ac263a83",
- "80366349d81e4dcc892db6cd56e384f3",
- "c73055099c084dca996159e23e162d0b",
- "977f799afaac4a55b2dc1cffa7d5b63b",
- "41f3b32c2f6b4034ae7a3b9124e28bc7",
- "a10d0a76010f4e508c65a9b69ebc5156",
- "f8ef805b776145c3bfa9ba8d90972058",
- "cc587493c33c4f118d1b1170f85be24c",
- "e40d1c1ac9494b3bade9858324e7ffdf",
- "d65b6b060d9845779299491ac5599c31",
- "0f6907ebbc6242c8bde059cef1e1bd29",
- "5bdfd87fc6cd4f9dabef7cfee29c8060",
- "64f54d4a744a4627a07c3c0120276f3b",
- "65b75b9b8bc143cf997796af68ff6668",
- "d6fe74e4255444368f8f90a62157d869",
- "4d468f96ec924681ad65eb671674b93e",
- "ad7599de524549c48bf2d3124ad4b299",
- "0546d04aae644dde846c58a4afb598a6",
- "897b77a56c09479bb11d7f2a30997e55",
- "81c3db71ac704280ad030072655f1537",
- "042e091f75694c47aee761e760e76773",
- "ef0a3c7a6f14460fb4da096928ae249e",
- "07fb3a2c8315494e97b447e672dfae06",
- "ec030fc3c346426f9abc3a89892258d3",
- "e3fb3fc6afe04b3c9b7ac61809ce78fa",
- "c3be9109d63c485d9c0ef4f9bc0f9218",
- "12815f401eba44658caa7b2e490137a8",
- "30e02aa2d0d241979369e598287f2639",
- "dfd2a2649b8341ef913207526708aff1",
- "4f1977d7e4824ef1a14b65f0f42bba10",
- "c6164e05a1914ae48083db9ad7f4ef7c",
- "813621384dc748b0ad06775e22761c0b",
- "dc892a596f6942d7973c616c38f0eebb",
- "c84cc07789be48aebb322c23d355289e",
- "bed8726b8069434687c75452e21f19e5",
- "16a188a0b06d45f980dcf3933509fe0a",
- "60c1a0d765c14a1d888317e6a507e4ea",
- "0077aedc3d174560bce924ee89e9c006",
- "00321cce58884f6f9b3855a21fcd9187",
- "fa864b41586f4a7aa56aeafd1d84eb75",
- "3225603166b54e7aab766b9964a2f660",
- "349eee9f56d64f0cba6fc24ff2c50c9b",
- "7e5d3774060e4589aa65982da5ea4ef4",
- "7c2485c6cdfe463da6fdb35982a1070d",
- "ad1236893754446881e153adc9d5c962",
- "daee63fd167e4441a32324b51b00ad2b",
- "fe41858c6bd04c58840112b67c19a336",
- "d262c82138024169b9f3aa034ca756fa",
- "62e302ebdad64aada0ffe64ae1c873f3",
- "bd1b0dfed6d34d16af33a4a58330f5ec",
- "d07c8b97d3314f1c852e44bdd40f61ed",
- "ebb69a2c3d0a4299a484698287b3087c",
- "e5a82df528bb4e408797a3b6c2758f4a",
- "f113ebd8c1c34806bea4dd7ed3035173"
- ]
- },
- "id": "KQQhgK8FoDfF",
- "outputId": "f69441d8-95f9-4885-c306-6c8709090ff6"
- },
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "b82aa8c57f7c422a9a9c90f333ed2a99",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "tokenizer_config.json: 0%| | 0.00/9.68k [00:00, ?B/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "95caff42f08a4c2aa14c867b8f37f231",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "vocab.json: 0%| | 0.00/2.78M [00:00, ?B/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "a138859f19b74fc0928dc236ab5359db",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "merges.txt: 0%| | 0.00/1.67M [00:00, ?B/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "5f86cd894de94c3280fadc1e2fd0ee13",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "tokenizer.json: 0%| | 0.00/11.4M [00:00, ?B/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[2025-05-08 13:41:00,844] [DEBUG] [axolotl.utils.models.load_tokenizer:441] [PID:174] [RANK:0] EOS: 151645 / <|im_end|>\u001b[39m\n",
- "[2025-05-08 13:41:00,845] [DEBUG] [axolotl.utils.models.load_tokenizer:442] [PID:174] [RANK:0] BOS: None / None\u001b[39m\n",
- "[2025-05-08 13:41:00,846] [DEBUG] [axolotl.utils.models.load_tokenizer:443] [PID:174] [RANK:0] PAD: 151643 / <|endoftext|>\u001b[39m\n",
- "[2025-05-08 13:41:00,847] [DEBUG] [axolotl.utils.models.load_tokenizer:444] [PID:174] [RANK:0] UNK: None / None\u001b[39m\n",
- "[2025-05-08 13:41:00,869] [INFO] [axolotl.utils.data.sft.load_tokenized_prepared_datasets:271] [PID:174] [RANK:0] Unable to find prepared dataset in last_run_prepared/97037817611d38b3a9c681753c3c4c95\u001b[39m\n",
- "[2025-05-08 13:41:00,870] [INFO] [axolotl.utils.data.sft.load_tokenized_prepared_datasets:272] [PID:174] [RANK:0] Loading raw datasets...\u001b[39m\n",
- "\u001b[33m[2025-05-08 13:41:00,870] [WARNING] [axolotl.utils.data.sft.load_tokenized_prepared_datasets:274] [PID:174] [RANK:0] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset.\u001b[39m\n",
- "[2025-05-08 13:41:00,871] [INFO] [axolotl.utils.data.sft.load_tokenized_prepared_datasets:281] [PID:174] [RANK:0] No seed provided, using default seed of 42\u001b[39m\n"
- ]
- },
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "7cd0b85ebd204b7aba908417811ce4e0",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "train.jsonl: 0%| | 0.00/27.3M [00:00, ?B/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "03b093d592ba4386aa61f7b8483da660",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "Generating train split: 0 examples [00:00, ? examples/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[2025-05-08 13:41:04,196] [INFO] [axolotl.utils.data.sft.get_dataset_wrapper:484] [PID:174] [RANK:0] Loading dataset with base_type: chat_template and prompt_style: None\u001b[39m\n",
- "[2025-05-08 13:41:04,233] [INFO] [axolotl.__call__:761] [PID:174] [RANK:0] Using chat template:\n",
- "---\n",
- "{%- if tools %}\n",
- " {{- '<|im_start|>system\\n' }}\n",
- " {%- if messages[0].role == 'system' %}\n",
- " {{- messages[0].content + '\\n\\n' }}\n",
- " {%- endif %}\n",
- " {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags: ...\" }}\n",
- "... [remainder of the chat-template printout and an HTML training-progress table (steps 1-25 with training loss) elided] ...\n",
- " sys.exit(main())\n",
- " ^^^^^^\n",
- " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/commands/huggingface_cli.py\", line 57, in main\n",
- " service.run()\n",
- " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/commands/upload.py\", line 207, in run\n",
- " print(self._upload())\n",
- " ^^^^^^^^^^^^^^\n",
- " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/commands/upload.py\", line 302, in _upload\n",
- " return self.api.upload_folder(\n",
- " ^^^^^^^^^^^^^^^^^^^^^^^\n",
- " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_validators.py\", line 114, in _inner_fn\n",
- " return fn(*args, **kwargs)\n",
- " ^^^^^^^^^^^^^^^^^^^\n",
- " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/hf_api.py\", line 1633, in _inner\n",
- " return fn(self, *args, **kwargs)\n",
- " ^^^^^^^^^^^^^^^^^^^^^^^^^\n",
- " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/hf_api.py\", line 4942, in upload_folder\n",
- " commit_info = self.create_commit(\n",
- " ^^^^^^^^^^^^^^^^^^^\n",
- " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_validators.py\", line 114, in _inner_fn\n",
- " return fn(*args, **kwargs)\n",
- " ^^^^^^^^^^^^^^^^^^^\n",
- " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/hf_api.py\", line 1633, in _inner\n",
- " return fn(self, *args, **kwargs)\n",
- " ^^^^^^^^^^^^^^^^^^^^^^^^^\n",
- " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/hf_api.py\", line 4202, in create_commit\n",
- " self.preupload_lfs_files(\n",
- " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/hf_api.py\", line 4483, in preupload_lfs_files\n",
- " _upload_xet_files(**upload_kwargs, create_pr=create_pr) # type: ignore [arg-type]\n",
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
- " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_validators.py\", line 114, in _inner_fn\n",
- " return fn(*args, **kwargs)\n",
- " ^^^^^^^^^^^^^^^^^^^\n",
- " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/_commit_api.py\", line 592, in _upload_xet_files\n",
- " with progress_cm as progress:\n",
- " File \"/usr/local/lib/python3.11/dist-packages/tqdm/std.py\", line 1138, in __exit__\n",
- " def __exit__(self, exc_type, exc_value, traceback):\n",
- "\n",
- "KeyboardInterrupt\n",
- "^C\n"
- ]
- }
- ],
- "source": [
- "from huggingface_hub import notebook_login\n",
- "# remove the partial epoch checkpoints\n",
- "!rm -rf \"./outputs/qwen-sft-pirate-rrr/checkpoint-*\"\n",
- "\n",
- "# HF Notebook login widget\n",
- "notebook_login()\n",
- "\n",
- "# upload the LoRA adapter for your model to HF, remember to update the username/model-name below\n",
- "!huggingface-cli upload --repo-type=model winglian/pirate-qwen-14B \"./outputs/qwen-sft-pirate-rrr\""
- ]
- }
- ],
- "metadata": {
- "accelerator": "GPU",
- "colab": {
- "gpuType": "T4",
- "provenance": []
- },
- "kernelspec": {
- "display_name": "Python 3",
- "name": "python3"
- },
- "language_info": {
- "name": "python"
- },
- "widgets": {
- "application/vnd.jupyter.widget-state+json": {
- ... (several hundred lines of Jupyter widget-state metadata for the progress bars and the Hugging Face login form elided) ...
- }
- }
- },
- "nbformat": 4,
- "nbformat_minor": 0
-}
](https://github.com/axolotl-ai-cloud/axolotl)\n",
+ "\n",
+ "Axolotl is the most performant LLM post-training framework available, delivering faster training with efficient, consistent and stable performance. Train your workload and ship your product 30% faster; saving you both time and money.\n",
+ "\n",
+ "- ⭐ us on [GitHub](https://github.com/axolotl-ai-cloud/axolotl)\n",
+ "- 📜 Read the [Docs](http://docs.axolotl.ai/)\n",
+ "- 💬 Chat with us on [Discord](https://discord.gg/mnpEYgRUmD)\n",
+ "- 📰 Get updates on [X/Twitter](https://x.com/axolotl_ai)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "rVjKD7CbxIP3"
+ },
+ "source": [
+ "# Installation\n",
+ "\n",
+ "Axolotl is easy to install from [pip](https://pypi.org/project/axolotl/), or use our [pre-built Docker images](http://docs.axolotl.ai/docs/docker.html) for a hassle free dependency experience. See our [docs](http://docs.axolotl.ai/docs/installation.html) for more information."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "msOCO4NRmRLa"
+ },
+ "outputs": [],
+ "source": [
+ "%%capture\n",
+ "# This step can take ~5-10 minutes to install dependencies\n",
+ "!pip install --no-build-isolation axolotl[flash-attn]>=0.9.1\n",
+ "!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@8a1a0ec\""
+ ]
+ },
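A quick way to confirm the install succeeded before moving on is a standard-library version lookup; this is an optional sketch, not part of the original notebook:

```python
# Optional sanity check for the pip install above; prints whatever
# version the ">=0.9.1" constraint resolved to.
from importlib.metadata import version

print(version("axolotl"))
```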
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "N0OW0YeksDLr"
+ },
+ "source": [
+ "## Demo: Talk Like a Pirate\n",
+ "\n",
+ "In this demo, we are training the model ***to respond like a pirate***. This was chosen as a way to easily show how to train a model to respond in a certain style of your choosing (without being prompted) and is quite easy to validate within the scope of a Colab."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "8Du2fANTsNCK"
+ },
+ "source": [
+ "### Upload your own dataset or use a Huggingface dataset\n",
+ "\n",
+ "You can choose to use your own JSONL file from your own [Google Drive](https://drive.google.com/drive/home); for example downloading the [Pirate-Ultrachat JSONL](https://huggingface.co/datasets/winglian/pirate-ultrachat-10k/blob/main/train.jsonl) to your Google Drive. JSONL datasets should be formatted similar to the [OpenAI dataset format](https://cookbook.openai.com/examples/chat_finetuning_data_prep).\n",
+ "\n",
+ "You can also simply use the [`winglian/pirate-ultrachat-10k`](https://huggingface.co/datasets/winglian/pirate-ultrachat-10k) dataset directly.\n"
+ ]
+ },
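For reference, here is a minimal sketch of what one line of such a JSONL file can look like; the messages themselves are invented for illustration:

```python
import json

# One JSON object per line, each holding an OpenAI-style "messages" list.
# The content below is illustrative, not a row from pirate-ultrachat-10k.
record = {
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "How do I boil an egg?"},
        {"role": "assistant", "content": "Arr, drop it in bubblin' water fer ten minutes, matey!"},
    ]
}

with open("train.jsonl", "a", encoding="utf-8") as fout:
    fout.write(json.dumps(record) + "\n")
```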
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "fGEEjyQ-r_IV"
+ },
+ "outputs": [],
+ "source": [
+ "# Default to HF dataset location\n",
+ "dataset_id = \"winglian/pirate-ultrachat-10k\"\n",
+ "uploaded = {}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "c5MyYqk7vIsG"
+ },
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "\n",
+ "# Optionally, upload your own JSONL to your Google Drive\n",
+ "GOOGLE_DRIVE_PATH = \"\" # ex: \"MyDrive/Colab\\ Notebooks/train.jsonl\"\n",
+ "\n",
+ "# \"Select All\" permissions, or you may get the error:\n",
+ "# \"MessageError: Error: credential propagation was unsuccessful\"\n",
+ "if GOOGLE_DRIVE_PATH:\n",
+ " from google.colab import drive\n",
+ "\n",
+ " # Mount your Google Drive\n",
+ " GOOGLE_DRIVE_MNT = \"/content/drive/\"\n",
+ " drive.mount(GOOGLE_DRIVE_MNT, force_remount=True)\n",
+ " tmp_path = os.path.join(GOOGLE_DRIVE_MNT, GOOGLE_DRIVE_PATH.lstrip(\"/\"))\n",
+ " # make sure file exists\n",
+ " if not os.path.isfile(tmp_path):\n",
+ " raise ValueError(f\"File {tmp_path} does not exist\")\n",
+ " dataset_id = tmp_path"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "U6pTk3A9xj1W"
+ },
+ "source": [
+ "# Configure for Supervised Fine-Tuning (SFT)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 151,
+ "referenced_widgets": [
+ "388f618924274d21a066f098f4f1e744",
+ "7c95f85a2b1f47a1bd846d110c47bb3c",
+ "083f9cda8d754c168beee10d2f8955a2",
+ "62e1a65582f446a78612eaa804e08a7d",
+ "487a177d020f4605834878b2fdc7afa3",
+ "7fd44cf9ca6e4726bfd7ac21846d6a14",
+ "366a343b62fa47d8985a3bd464d99f9e",
+ "a0a11e929edd4189b79723d618522c33",
+ "e87ea87fcff247b5bbcc331ba79a8dc2",
+ "5e18768f7ad6434ba8b8b8a2e853e204",
+ "bb33aec33a6447078c31bfd728942994"
+ ]
+ },
+ "id": "fdRioqytmTtX",
+ "outputId": "f0acdcec-4b41-4a3f-ffed-c2d2d929158e"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2025-05-08 13:40:27,488] [INFO] [root.register:348] [PID:174] Attempting to load plugin: axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin\n",
+ "[2025-05-08 13:40:27,493] [INFO] [root.register:351] [PID:174] Plugin loaded successfully: axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin\n",
+ "[2025-05-08 13:40:27,959] [INFO] [axolotl.utils.schemas.config.check_eval_packing:721] [PID:174] [RANK:0] explicitly setting `eval_sample_packing` to match `sample_packing`\u001b[39m\n",
+ "[2025-05-08 13:40:27,960] [INFO] [axolotl.utils.schemas.config.hint_sample_packing_padding:514] [PID:174] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing\u001b[39m\n",
+ "[2025-05-08 13:40:27,961] [INFO] [axolotl.utils.schemas.config.check_bf16:1251] [PID:174] [RANK:0] bf16 support detected, but not enabled for this configuration.\u001b[39m\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "388f618924274d21a066f098f4f1e744",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "config.json: 0%| | 0.00/728 [00:00, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2025-05-08 13:40:28,590] [INFO] [axolotl.normalize_config:237] [PID:174] [RANK:0] cuda memory usage baseline: 0.000GB (+0.002GB cache, +0.359GB misc)\u001b[39m\n"
+ ]
+ }
+ ],
+ "source": [
+ "from axolotl.cli.config import load_cfg\n",
+ "from axolotl.utils.dict import DictDefault\n",
+ "\n",
+ "# Axolotl provides full control and transparency over model and training configuration\n",
+ "config = DictDefault(\n",
+ " base_model=\"Qwen/Qwen3-14B\", # Use the instruct tuned model, but we're aligning it to be a pirate\n",
+ " load_in_4bit=True, # set to True for qLoRA\n",
+ " adapter=\"qlora\",\n",
+ " lora_r=32,\n",
+ " lora_alpha=64,\n",
+ " lora_target_modules=[\n",
+ " \"q_proj\",\n",
+ " \"k_proj\",\n",
+ " \"v_proj\",\n",
+ " \"o_proj\", # train self_attn linear modules\n",
+ " \"gate_proj\",\n",
+ " \"down_proj\",\n",
+ " \"up_proj\", # train MLP linear modules\n",
+ " ],\n",
+ " lora_qkv_kernel=True, # optimized triton kernels for LoRA\n",
+ " lora_o_kernel=True,\n",
+ " lora_mlp_kernel=True,\n",
+ " embeddings_skip_upcast=True, # keep embeddings in fp16 so the model fits in 15GB VRAM\n",
+ " xformers_attention=True, # use xformers on Colab w/ T4 for memory efficient attention, flash_attention only on Ampere or above\n",
+ " plugins=[\n",
+ " # more efficient training using Apple's Cut Cross Entropy; https://github.com/apple/ml-cross-entropy\n",
+ " \"axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin\",\n",
+ " ],\n",
+ " sample_packing=True, # 2-6x increase in tokens per micro-batch\n",
+ " # when using packing, use a slightly higher learning rate to account for fewer steps\n",
+ " # alternatively, reduce the micro_batch_size + gradient_accumulation_steps to achieve closer to the same number of steps/epoch\n",
+ " learning_rate=0.00019,\n",
+ " sequence_len=4096, # larger sequence length improves packing efficiency for more tokens/sec\n",
+ " micro_batch_size=1,\n",
+ " gradient_accumulation_steps=1,\n",
+ " gradient_checkpointing=True, # tradeoff reduced VRAM for increased time\n",
+ " gradient_checkpointing_kwargs={\n",
+ " \"use_reentrant\": False,\n",
+ " },\n",
+ " optimizer=\"paged_adamw_8bit\",\n",
+ " lr_scheduler=\"cosine\",\n",
+ " warmup_steps=5,\n",
+ " fp16=True, # use float16 + automatic mixed precision, bfloat16 not supported on Colab w/ T4\n",
+ " bf16=False,\n",
+ " max_grad_norm=0.1, # gradient clipping\n",
+ " num_epochs=1,\n",
+ " saves_per_epoch=2, # how many checkpoints to save over one epoch\n",
+ " logging_steps=1,\n",
+ " output_dir=\"./outputs/qwen-sft-pirate-rrr\",\n",
+ " chat_template=\"qwen3\",\n",
+ " datasets=[\n",
+ " {\n",
+ " \"path\": dataset_id, # Huggingface Dataset id or path to train.jsonl\n",
+ " \"type\": \"chat_template\",\n",
+ " \"split\": \"train\",\n",
+ " \"eot_tokens\": [\"<|im_end|>\"],\n",
+ " }\n",
+ " ],\n",
+ " dataloader_prefetch_factor=8, # dataloader optimizations\n",
+ " dataloader_num_workers=2,\n",
+ " dataloader_pin_memory=True,\n",
+ ")\n",
+ "\n",
+ "# validates the configuration\n",
+ "cfg = load_cfg(config)"
+ ]
+ },
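The learning-rate comment in the config deserves a number: packing several short conversations into each 4096-token sequence shrinks the optimizer steps per epoch roughly by the packing factor. A back-of-the-envelope sketch, with all values assumed for illustration:

```python
# Rough arithmetic behind the packing/learning-rate comment above.
num_examples = 10_000           # dataset rows (assumed)
avg_tokens_per_example = 600    # assumed mean conversation length
sequence_len = 4096
micro_batch_size = 1
gradient_accumulation_steps = 1

steps_unpacked = num_examples // (micro_batch_size * gradient_accumulation_steps)
packing_factor = sequence_len // avg_tokens_per_example   # ~6 conversations/sequence
steps_packed = steps_unpacked // packing_factor

print(steps_unpacked, steps_packed)   # 10000 vs 1666 optimizer steps per epoch
```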
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "715UpvnSoBIS"
+ },
+ "outputs": [],
+ "source": [
+ "from axolotl.utils import set_pytorch_cuda_alloc_conf\n",
+ "\n",
+ "# Set \"PYTORCH_CUDA_ALLOC_CONF\" env to save memory\n",
+ "set_pytorch_cuda_alloc_conf()"
+ ]
+ },
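For the curious, the manual equivalent of this helper presumably looks something like the following; the exact value `set_pytorch_cuda_alloc_conf()` picks may differ:

```python
import os

# Assumed manual equivalent: expandable segments let the CUDA caching
# allocator grow allocations in place instead of fragmenting, which helps
# on small GPUs like the Colab T4. Axolotl's helper may choose differently.
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
```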
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "Vc6MC-hwyH-n"
+ },
+ "source": [
+ "# Datasets\n",
+ "\n",
+ "Axolotl has a robust suite of loaders and transforms to parse most open datasets of any format into the appropriate chat template for your model. Axolotl will mask input tokens from the user's prompt so that the train loss is only calculated against the model's response. For more information, [see our documentation](http://docs.axolotl.ai/docs/dataset-formats/conversation.html) on dataset preparation.\n"
+ ]
+ },
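The input masking described above boils down to setting the ignore index on prompt tokens. A minimal sketch of the idea (token ids invented), not Axolotl's internal code:

```python
# Illustrative only: prompt tokens receive the ignore index (-100), so the
# cross-entropy loss is computed over the assistant's response tokens alone.
IGNORE_INDEX = -100

prompt_ids = [101, 2023, 2003]      # hypothetical user-turn token ids
response_ids = [264, 55347, 17]     # hypothetical assistant-response token ids

input_ids = prompt_ids + response_ids
labels = [IGNORE_INDEX] * len(prompt_ids) + list(response_ids)

assert len(input_ids) == len(labels)
```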
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 1000,
+ "referenced_widgets": [
+ "b82aa8c57f7c422a9a9c90f333ed2a99",
+ "c0991cf63ee6458b96e9a75e7a88b61a",
+ "71c8af139cd248b1b51101fd46a93f35",
+ "1d5117195d4b49eb8f1a73b18419f7ce",
+ "3c21e4a511b4441192c03b7f1d0976e9",
+ "ed28e2e0410d4e0b855467e798e53d66",
+ "d93f134f802b4b69b575bdaf07dbd27c",
+ "d0e9dce55cec4c1ca619a0ccf209d924",
+ "4c727d40ef0443449afc31724ee79f0c",
+ "0dea5caa27384f5689e3cab51f558727",
+ "a6f48410b9964fefba0c3009a77dc838",
+ "95caff42f08a4c2aa14c867b8f37f231",
+ "de7c37ee83e24f0c889e84d07279c2ec",
+ "9d4897eefb5f48259ffb2d23e332f752",
+ "253017b0d0534e54ab44e181f6d7c82d",
+ "27beaf06e41b472abdb544a43c720c5a",
+ "34cf3df51fbc41cabfdbba153c007f0e",
+ "ac764024cf1c4e08ba7749afd2cd20ac",
+ "30a81da86f8043eca301e86a8651201a",
+ "e8b7a81040904c1e89e58978223b1737",
+ "1c6f1f10667545aaab958016ba7e2c94",
+ "e6e969610738449887259063967f82b0",
+ "a138859f19b74fc0928dc236ab5359db",
+ "9b42e08b3c9548818488268768a118b1",
+ "12b56912736849fea2ad8124456fdc5c",
+ "879c8ab5873847a8833bd74123be90a4",
+ "20352e5f58d24bb8b1f3940efd14fe4a",
+ "d955dcaa0e944e719f3a06139dd54a03",
+ "d3de2662c7964f1ba96e58da382af720",
+ "97e36007e1304e1583fd81bfb13f0edd",
+ "c65dc74c7d6f4bab8f7dd28455161dd8",
+ "ef223e8504b64e3592589880326aaf41",
+ "598da69727bd4fb8b1caf465ac736d7a",
+ "5f86cd894de94c3280fadc1e2fd0ee13",
+ "a20927bf5f2c41f58c1e31ac858ab36c",
+ "0a46ad75c198463d843fb35e813642cb",
+ "09007681cf8d42aeb8c1d2f6a74e470a",
+ "ebc80d1a55fa47f4a5ea2756588569ec",
+ "1811cda0644e4190a9469d1774435d82",
+ "35c811d2ae8e43f3b5cecbdd3cfa857f",
+ "b8e39e4dddc3497fbc29ae45c66da759",
+ "63b4e563e85c4f03b1b72beda9577bcc",
+ "b195f160ca20442fadd8b5aed0ee41af",
+ "ca65e32eb52f48c09a84b33cb18f22cd",
+ "7cd0b85ebd204b7aba908417811ce4e0",
+ "7baeab52d6694c32b1efd1ea1a0a7782",
+ "519a7b154022443db6703f04a9142bae",
+ "d4183e9715f34d249942b8271cca3bdf",
+ "da2347ac94764a3fa2743343cf0d3cd2",
+ "93a44a11aa4846fa8efc6c1413ef1627",
+ "a55060adc3564407ac81ad7297d34aaa",
+ "d02274afd47b462291c745f261209d42",
+ "0f417447a7bd4a33acca96fa37aec877",
+ "63580b6fb30642479fe3000915bf551a",
+ "8f726dbfb45d4528afa33e36a6313267",
+ "03b093d592ba4386aa61f7b8483da660",
+ "b8766a88716948cf968f4563531a76d9",
+ "6f3a28b912714c6e931003549664bfa3",
+ "16d1283741404b7bb319094c992fce01",
+ "2a5bb0e818ab47be8cf6465988328503",
+ "2b3a2659b12244bd8548320320016dbf",
+ "0cd7efffbb3c4c4b972e63749f61ab97",
+ "5ca240f31e6b44e3882c5eb37cd5a309",
+ "5eb06edeb58e4930b1affef2a59eae81",
+ "a4e5789584564049b83df7c6c54a3e08",
+ "ff3a94b146a948b6907f5d80c7157f99",
+ "258b7c635c1045329d4669e48c46ccd5",
+ "6f68ed9889f54ad2ae8a3b95ac263a83",
+ "80366349d81e4dcc892db6cd56e384f3",
+ "c73055099c084dca996159e23e162d0b",
+ "977f799afaac4a55b2dc1cffa7d5b63b",
+ "41f3b32c2f6b4034ae7a3b9124e28bc7",
+ "a10d0a76010f4e508c65a9b69ebc5156",
+ "f8ef805b776145c3bfa9ba8d90972058",
+ "cc587493c33c4f118d1b1170f85be24c",
+ "e40d1c1ac9494b3bade9858324e7ffdf",
+ "d65b6b060d9845779299491ac5599c31",
+ "0f6907ebbc6242c8bde059cef1e1bd29",
+ "5bdfd87fc6cd4f9dabef7cfee29c8060",
+ "64f54d4a744a4627a07c3c0120276f3b",
+ "65b75b9b8bc143cf997796af68ff6668",
+ "d6fe74e4255444368f8f90a62157d869",
+ "4d468f96ec924681ad65eb671674b93e",
+ "ad7599de524549c48bf2d3124ad4b299",
+ "0546d04aae644dde846c58a4afb598a6",
+ "897b77a56c09479bb11d7f2a30997e55",
+ "81c3db71ac704280ad030072655f1537",
+ "042e091f75694c47aee761e760e76773",
+ "ef0a3c7a6f14460fb4da096928ae249e",
+ "07fb3a2c8315494e97b447e672dfae06",
+ "ec030fc3c346426f9abc3a89892258d3",
+ "e3fb3fc6afe04b3c9b7ac61809ce78fa",
+ "c3be9109d63c485d9c0ef4f9bc0f9218",
+ "12815f401eba44658caa7b2e490137a8",
+ "30e02aa2d0d241979369e598287f2639",
+ "dfd2a2649b8341ef913207526708aff1",
+ "4f1977d7e4824ef1a14b65f0f42bba10",
+ "c6164e05a1914ae48083db9ad7f4ef7c",
+ "813621384dc748b0ad06775e22761c0b",
+ "dc892a596f6942d7973c616c38f0eebb",
+ "c84cc07789be48aebb322c23d355289e",
+ "bed8726b8069434687c75452e21f19e5",
+ "16a188a0b06d45f980dcf3933509fe0a",
+ "60c1a0d765c14a1d888317e6a507e4ea",
+ "0077aedc3d174560bce924ee89e9c006",
+ "00321cce58884f6f9b3855a21fcd9187",
+ "fa864b41586f4a7aa56aeafd1d84eb75",
+ "3225603166b54e7aab766b9964a2f660",
+ "349eee9f56d64f0cba6fc24ff2c50c9b",
+ "7e5d3774060e4589aa65982da5ea4ef4",
+ "7c2485c6cdfe463da6fdb35982a1070d",
+ "ad1236893754446881e153adc9d5c962",
+ "daee63fd167e4441a32324b51b00ad2b",
+ "fe41858c6bd04c58840112b67c19a336",
+ "d262c82138024169b9f3aa034ca756fa",
+ "62e302ebdad64aada0ffe64ae1c873f3",
+ "bd1b0dfed6d34d16af33a4a58330f5ec",
+ "d07c8b97d3314f1c852e44bdd40f61ed",
+ "ebb69a2c3d0a4299a484698287b3087c",
+ "e5a82df528bb4e408797a3b6c2758f4a",
+ "f113ebd8c1c34806bea4dd7ed3035173"
+ ]
+ },
+ "id": "KQQhgK8FoDfF",
+ "outputId": "f69441d8-95f9-4885-c306-6c8709090ff6"
+ },
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "b82aa8c57f7c422a9a9c90f333ed2a99",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "tokenizer_config.json: 0%| | 0.00/9.68k [00:00, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "95caff42f08a4c2aa14c867b8f37f231",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "vocab.json: 0%| | 0.00/2.78M [00:00, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "a138859f19b74fc0928dc236ab5359db",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "merges.txt: 0%| | 0.00/1.67M [00:00, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "5f86cd894de94c3280fadc1e2fd0ee13",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "tokenizer.json: 0%| | 0.00/11.4M [00:00, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2025-05-08 13:41:00,844] [DEBUG] [axolotl.utils.models.load_tokenizer:441] [PID:174] [RANK:0] EOS: 151645 / <|im_end|>\u001b[39m\n",
+ "[2025-05-08 13:41:00,845] [DEBUG] [axolotl.utils.models.load_tokenizer:442] [PID:174] [RANK:0] BOS: None / None\u001b[39m\n",
+ "[2025-05-08 13:41:00,846] [DEBUG] [axolotl.utils.models.load_tokenizer:443] [PID:174] [RANK:0] PAD: 151643 / <|endoftext|>\u001b[39m\n",
+ "[2025-05-08 13:41:00,847] [DEBUG] [axolotl.utils.models.load_tokenizer:444] [PID:174] [RANK:0] UNK: None / None\u001b[39m\n",
+ "[2025-05-08 13:41:00,869] [INFO] [axolotl.utils.data.sft.load_tokenized_prepared_datasets:271] [PID:174] [RANK:0] Unable to find prepared dataset in last_run_prepared/97037817611d38b3a9c681753c3c4c95\u001b[39m\n",
+ "[2025-05-08 13:41:00,870] [INFO] [axolotl.utils.data.sft.load_tokenized_prepared_datasets:272] [PID:174] [RANK:0] Loading raw datasets...\u001b[39m\n",
+ "\u001b[33m[2025-05-08 13:41:00,870] [WARNING] [axolotl.utils.data.sft.load_tokenized_prepared_datasets:274] [PID:174] [RANK:0] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset.\u001b[39m\n",
+ "[2025-05-08 13:41:00,871] [INFO] [axolotl.utils.data.sft.load_tokenized_prepared_datasets:281] [PID:174] [RANK:0] No seed provided, using default seed of 42\u001b[39m\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "7cd0b85ebd204b7aba908417811ce4e0",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "train.jsonl: 0%| | 0.00/27.3M [00:00, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "03b093d592ba4386aa61f7b8483da660",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Generating train split: 0 examples [00:00, ? examples/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2025-05-08 13:41:04,196] [INFO] [axolotl.utils.data.sft.get_dataset_wrapper:484] [PID:174] [RANK:0] Loading dataset with base_type: chat_template and prompt_style: None\u001b[39m\n",
+ "[2025-05-08 13:41:04,233] [INFO] [axolotl.__call__:761] [PID:174] [RANK:0] Using chat template:\n",
+ "---\n",
+ "{%- if tools %}\n",
+ " {{- '<|im_start|>system\\n' }}\n",
+ " {%- if messages[0].role == 'system' %}\n",
+ " {{- messages[0].content + '\\n\\n' }}\n",
+ " {%- endif %}\n",
+ " {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within \n",
+ " \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Step \n",
+ " Training Loss \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 1.092300 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " 1.554200 \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " 1.041400 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " 1.733800 \n",
+ " \n",
+ " \n",
+ " 5 \n",
+ " 1.430000 \n",
+ " \n",
+ " \n",
+ " 6 \n",
+ " 1.258500 \n",
+ " \n",
+ " \n",
+ " 7 \n",
+ " 1.343600 \n",
+ " \n",
+ " \n",
+ " 8 \n",
+ " 1.101700 \n",
+ " \n",
+ " \n",
+ " 9 \n",
+ " 1.086500 \n",
+ " \n",
+ " \n",
+ " 10 \n",
+ " 0.813200 \n",
+ " \n",
+ " \n",
+ " 11 \n",
+ " 0.689600 \n",
+ " \n",
+ " \n",
+ " 12 \n",
+ " 0.826700 \n",
+ " \n",
+ " \n",
+ " 13 \n",
+ " 1.541800 \n",
+ " \n",
+ " \n",
+ " 14 \n",
+ " 0.948000 \n",
+ " \n",
+ " \n",
+ " 15 \n",
+ " 1.357000 \n",
+ " \n",
+ " \n",
+ " 16 \n",
+ " 1.085800 \n",
+ " \n",
+ " \n",
+ " 17 \n",
+ " 1.516800 \n",
+ " \n",
+ " \n",
+ " 18 \n",
+ " 1.146800 \n",
+ " \n",
+ " \n",
+ " 19 \n",
+ " 0.834800 \n",
+ " \n",
+ " \n",
+ " 20 \n",
+ " 0.968000 \n",
+ " \n",
+ " \n",
+ " 21 \n",
+ " 1.388800 \n",
+ " \n",
+ " \n",
+ " 22 \n",
+ " 1.511500 \n",
+ " \n",
+ " \n",
+ " 23 \n",
+ " 1.338500 \n",
+ " \n",
+ " \n",
+ " 24 \n",
+ " 1.206600 \n",
+ " \n",
+ " \n",
+ " \n",
+ "25 \n",
+ " 1.504600 \n",
+ " \n",
+ " sys.exit(main())\n",
+ " ^^^^^^\n",
+ " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/commands/huggingface_cli.py\", line 57, in main\n",
+ " service.run()\n",
+ " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/commands/upload.py\", line 207, in run\n",
+ " print(self._upload())\n",
+ " ^^^^^^^^^^^^^^\n",
+ " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/commands/upload.py\", line 302, in _upload\n",
+ " return self.api.upload_folder(\n",
+ " ^^^^^^^^^^^^^^^^^^^^^^^\n",
+ " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_validators.py\", line 114, in _inner_fn\n",
+ " return fn(*args, **kwargs)\n",
+ " ^^^^^^^^^^^^^^^^^^^\n",
+ " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/hf_api.py\", line 1633, in _inner\n",
+ " return fn(self, *args, **kwargs)\n",
+ " ^^^^^^^^^^^^^^^^^^^^^^^^^\n",
+ " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/hf_api.py\", line 4942, in upload_folder\n",
+ " commit_info = self.create_commit(\n",
+ " ^^^^^^^^^^^^^^^^^^^\n",
+ " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_validators.py\", line 114, in _inner_fn\n",
+ " return fn(*args, **kwargs)\n",
+ " ^^^^^^^^^^^^^^^^^^^\n",
+ " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/hf_api.py\", line 1633, in _inner\n",
+ " return fn(self, *args, **kwargs)\n",
+ " ^^^^^^^^^^^^^^^^^^^^^^^^^\n",
+ " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/hf_api.py\", line 4202, in create_commit\n",
+ " self.preupload_lfs_files(\n",
+ " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/hf_api.py\", line 4483, in preupload_lfs_files\n",
+ " _upload_xet_files(**upload_kwargs, create_pr=create_pr) # type: ignore [arg-type]\n",
+ " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
+ " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_validators.py\", line 114, in _inner_fn\n",
+ " return fn(*args, **kwargs)\n",
+ " ^^^^^^^^^^^^^^^^^^^\n",
+ " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/_commit_api.py\", line 592, in _upload_xet_files\n",
+ " with progress_cm as progress:\n",
+ " File \"/usr/local/lib/python3.11/dist-packages/tqdm/std.py\", line 1138, in __exit__\n",
+ " def __exit__(self, exc_type, exc_value, traceback):\n",
+ "\n",
+ "KeyboardInterrupt\n",
+ "^C\n"
+ ]
+ }
+ ],
+ "source": [
+ "from huggingface_hub import notebook_login\n",
+ "\n",
+ "# remove the partial epoch checkpoints\n",
+ "!rm -rf \"./outputs/qwen-sft-pirate-rrr/checkpoint-*\"\n",
+ "\n",
+ "# HF Notebook login widget\n",
+ "notebook_login()\n",
+ "\n",
+ "# upload the LoRA adapter for your model to HF, remember to update the username/model-name below\n",
+ "!huggingface-cli upload --repo-type=model winglian/pirate-qwen-14B \"./outputs/qwen-sft-pirate-rrr\""
+ ]
+ }
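The CLI call above can also be done in-process via `huggingface_hub`; a sketch using the same example repo name:

```python
from huggingface_hub import HfApi

# Programmatic equivalent of the `huggingface-cli upload` call above;
# swap in your own username/model-name for the repo_id.
api = HfApi()
api.upload_folder(
    folder_path="./outputs/qwen-sft-pirate-rrr",
    repo_id="winglian/pirate-qwen-14B",
    repo_type="model",
)
```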
+ ],
+ "metadata": {
+ "accelerator": "GPU",
+ "colab": {
+ "gpuType": "T4",
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python"
+ },
+ "widgets": {
+ "application/vnd.jupyter.widget-state+json": {
+ "00321cce58884f6f9b3855a21fcd9187": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "1.5.0",
+ "model_name": "DescriptionStyleModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "DescriptionStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "StyleView",
+ "description_width": ""
+ }
+ },
+ "004d9177a6a14118a5930dc3cc13147b": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "1.5.0",
+ "model_name": "HBoxModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "HBoxModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "1.5.0",
+ "_view_name": "HBoxView",
+ "box_style": "",
+ "children": [
+ "IPY_MODEL_a80410b919e442c49aea15acc1ce1a72",
+ "IPY_MODEL_c6e00f5224364822bc4239b176686919",
+ "IPY_MODEL_ec11d1e5ae7b42c883d9b1f38a65356e"
+ ],
+ "layout": "IPY_MODEL_734185351eb543fa9a00a881dcbb9fe7"
+ }
+ },
+ "0077aedc3d174560bce924ee89e9c006": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "1.2.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "1.2.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "overflow_x": null,
+ "overflow_y": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "03a3c744d716431488163b4358b80f92": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "1.2.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "1.2.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "overflow_x": null,
+ "overflow_y": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "03b093d592ba4386aa61f7b8483da660": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "1.5.0",
+ "model_name": "HBoxModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "HBoxModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "1.5.0",
+ "_view_name": "HBoxView",
+ "box_style": "",
+ "children": [
+ "IPY_MODEL_b8766a88716948cf968f4563531a76d9",
+ "IPY_MODEL_6f3a28b912714c6e931003549664bfa3",
+ "IPY_MODEL_16d1283741404b7bb319094c992fce01"
+ ],
+ "layout": "IPY_MODEL_2a5bb0e818ab47be8cf6465988328503"
+ }
+ },
+ "042e091f75694c47aee761e760e76773": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "1.5.0",
+ "model_name": "DescriptionStyleModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "DescriptionStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "StyleView",
+ "description_width": ""
+ }
+ },
+ "0546d04aae644dde846c58a4afb598a6": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "1.2.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "1.2.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "overflow_x": null,
+ "overflow_y": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "054c8dffadba48c6b895a6cc62448ecc": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "1.5.0",
+ "model_name": "ProgressStyleModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "ProgressStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "StyleView",
+ "bar_color": null,
+ "description_width": ""
+ }
+ },
+ "07fb3a2c8315494e97b447e672dfae06": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "1.5.0",
+ "model_name": "HTMLModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "HTMLModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "1.5.0",
+ "_view_name": "HTMLView",
+ "description": "",
+ "description_tooltip": null,
+ "layout": "IPY_MODEL_12815f401eba44658caa7b2e490137a8",
+ "placeholder": "",
+ "style": "IPY_MODEL_30e02aa2d0d241979369e598287f2639",
+ "value": "Drop Samples with Zero Trainable Tokens (num_proc=2): 100%"
+ }
+ },
+ "083f9cda8d754c168beee10d2f8955a2": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "1.5.0",
+ "model_name": "FloatProgressModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "FloatProgressModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "1.5.0",
+ "_view_name": "ProgressView",
+ "bar_style": "success",
+ "description": "",
+ "description_tooltip": null,
+ "layout": "IPY_MODEL_a0a11e929edd4189b79723d618522c33",
+ "max": 728,
+ "min": 0,
+ "orientation": "horizontal",
+ "style": "IPY_MODEL_e87ea87fcff247b5bbcc331ba79a8dc2",
+ "value": 728
+ }
+ },
+ "09007681cf8d42aeb8c1d2f6a74e470a": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "1.5.0",
+ "model_name": "HTMLModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "HTMLModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "1.5.0",
+ "_view_name": "HTMLView",
+ "description": "",
+ "description_tooltip": null,
+ "layout": "IPY_MODEL_b195f160ca20442fadd8b5aed0ee41af",
+ "placeholder": "",
+ "style": "IPY_MODEL_ca65e32eb52f48c09a84b33cb18f22cd",
+ "value": " 11.4M/11.4M [00:00<00:00, 21.8MB/s]"
+ }
+ },
+ "0a46ad75c198463d843fb35e813642cb": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "1.5.0",
+ "model_name": "FloatProgressModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "FloatProgressModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "1.5.0",
+ "_view_name": "ProgressView",
+ "bar_style": "success",
+ "description": "",
+ "description_tooltip": null,
+ "layout": "IPY_MODEL_b8e39e4dddc3497fbc29ae45c66da759",
+ "max": 11422654,
+ "min": 0,
+ "orientation": "horizontal",
+ "style": "IPY_MODEL_63b4e563e85c4f03b1b72beda9577bcc",
+ "value": 11422654
+ }
+ },
+ "0aa8ab56b85f4171a79c3bc210594025": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "1.5.0",
+ "model_name": "ProgressStyleModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "ProgressStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "StyleView",
+ "bar_color": null,
+ "description_width": ""
+ }
+ },
+ "0b4c9753a7cb4354b8e5f187e6e1ad7c": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "1.5.0",
+ "model_name": "DescriptionStyleModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "DescriptionStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "StyleView",
+ "description_width": ""
+ }
+ },
+ "0cd7efffbb3c4c4b972e63749f61ab97": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "1.5.0",
+ "model_name": "DescriptionStyleModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "DescriptionStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "StyleView",
+ "description_width": ""
+ }
+ },
+ "0dea5caa27384f5689e3cab51f558727": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "1.2.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "1.2.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "overflow_x": null,
+ "overflow_y": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "0e067d8db8ed48308a718d5f57683fd1": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "1.5.0",
+ "model_name": "HTMLModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "HTMLModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "1.5.0",
+ "_view_name": "HTMLView",
+ "description": "",
+ "description_tooltip": null,
+ "layout": "IPY_MODEL_b1bea589efa14258a9982071b87938bf",
+ "placeholder": "",
+ "style": "IPY_MODEL_590eef89881545aa8bbef9a8bbe7fb00",
+ "value": "\nPro Tip: If you don't already have one, you can create a dedicated\n'notebooks' token with 'write' access, that you can then easily reuse for all\nnotebooks.
Copy a token from your Hugging Face\ntokens page and paste it below.
Immediately click login after copying\nyour token or it might be stored in plain text in this notebook file. Generated:\n(no output)
"
+
+    def _style_for(i: int, tid: int) -> str:
+        # Masked positions: green when the filled token matches the original,
+        # red when it differs, "normal" when there is no original to compare.
+        if i in masked_positions:
+            if i < len(orig_ids) and tid == orig_ids[i]:
+                return "green"
+            if i < len(orig_ids):
+                return "red"
+            return "normal"
+        # Unmasked positions: dim unchanged tokens so the filled spans stand out.
+        same = i < len(orig_ids) and tid == orig_ids[i]
+        return "dim" if same else "normal"
+
+ # Group contiguous spans by style to reduce HTML size
+ spans: list[tuple[str, int, int]] = []
+ if generated_ids:
+ cur = _style_for(0, generated_ids[0])
+ start = 0
+ for i in range(1, len(generated_ids)):
+ s = _style_for(i, generated_ids[i])
+ if s != cur:
+ spans.append((cur, start, i))
+ cur, start = s, i
+ spans.append((cur, start, len(generated_ids)))
+
+    html_parts = []
+    for style_name, a, b in spans:
+        txt = tokenizer.decode(generated_ids[a:b], skip_special_tokens=False)
+        if style_name == "green":
+            html_parts.append(f"<span style='color: green;'>{txt}</span>")
+        elif style_name == "red":
+            html_parts.append(f"<span style='color: red;'>{txt}</span>")
+        elif style_name == "dim":
+            html_parts.append(f"<span style='opacity: 0.5;'>{txt}</span>")
+        else:
+            html_parts.append(txt)
+
+    legend = (
+        "<div style='white-space: pre-wrap;'>Generated:\n"
+        + "".join(html_parts)
+        + "</div>"
+    )
+    return legend
+
+
+def launch_diffusion_gradio_ui(
+ *,
+ model,
+ tokenizer,
+ cfg: DictDefault,
+ prompter_module=None,
+ chat_template_str: str | None = None,
+):
+ """Build and launch a simple Gradio UI for diffusion inference."""
+ with gr.Blocks(
+ title=cfg.get("gradio_title", "Axolotl Diffusion Interface")
+ ) as demo:
+ gr.Markdown(
+ """
+ ## Axolotl Diffusion Inference
+ - Mode "Random" masks tokens at a target ratio and fills them.
+ - Mode "Completion" appends N masked tokens at the end and fills them.
+ """
+ )
+
+ with gr.Row():
+ mode = gr.Radio(
+ choices=["random", "completion"],
+ value="random",
+ label="Mode",
+ )
+ mask_ratio = gr.Slider(
+ minimum=0.0,
+ maximum=1.0,
+ step=0.05,
+ value=0.4,
+ label="Mask ratio (random mode)",
+ interactive=True,
+ )
+ completion_tokens = gr.Number(
+ value=64,
+ precision=0,
+ label="Completion tokens (completion mode)",
+ interactive=True,
+ visible=False,
+ )
+
+ instruction = gr.Textbox(label="Instruction", lines=6)
+ run_btn = gr.Button("Generate")
+
+ masked_preview = gr.Textbox(label="Masked preview", lines=6)
+ html_out = gr.HTML(label="Generated")
+
+ def _toggle_controls(selected_mode: str):
+ return (
+ gr.update(visible=(selected_mode == "random")),
+ gr.update(visible=(selected_mode == "completion")),
+ )
+
+ mode.change(
+ _toggle_controls,
+ inputs=[mode],
+ outputs=[mask_ratio, completion_tokens],
+ )
+
+ def _gen(instruction_text: str, selected_mode: str, mratio: float, ctoks: int):
+ if not instruction_text:
+ return "", "Generated:\n(no output)
"
+
+ if prompter_module:
+ prompt: str = next(
+ prompter_module().build_prompt(
+ instruction=instruction_text.strip("\n")
+ )
+ )
+ else:
+ prompt = instruction_text.strip()
+
+ info = run_diffusion(
+ model=model,
+ tokenizer=tokenizer,
+ cfg=cfg,
+ prompt=prompt,
+ chat_template_str=chat_template_str,
+ mode=selected_mode,
+ target_mask_ratio=mratio if selected_mode == "random" else None,
+ completion_tokens=int(ctoks) if selected_mode == "completion" else 0,
+ )
+
+ masked_text = info.get("masked_text")
+ mask_ratio_val = info.get("mask_ratio")
+ generated_ids = info.get("generated_ids")
+ masked_positions = info.get("masked_positions") or set()
+ orig_ids = info.get("orig_ids") or []
+
+ preview = (
+ f"Masked ({mask_ratio_val:.1%}):\n{masked_text}"
+ if masked_text is not None and mask_ratio_val is not None
+ else ""
+ )
+ html = render_html(
+ generated_ids=generated_ids,
+ orig_ids=orig_ids,
+ masked_positions=masked_positions,
+ tokenizer=tokenizer,
+ )
+ return preview, html
+
+ run_btn.click(
+ _gen,
+ inputs=[instruction, mode, mask_ratio, completion_tokens],
+ outputs=[masked_preview, html_out],
+ )
+
+ demo.queue().launch(
+ show_api=False,
+ share=cfg.get("gradio_share", True),
+ server_name=cfg.get("gradio_server_name", "127.0.0.1"),
+ server_port=cfg.get("gradio_server_port", None),
+ )
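A hypothetical invocation of the helper above; `model` and `tokenizer` are assumed to be loaded already, and the `gradio_*` config keys are the ones the launcher reads:

```python
# Hypothetical usage sketch; `model` and `tokenizer` are assumed to be a
# loaded diffusion-capable LM and its tokenizer, and `cfg` a validated
# Axolotl config exposing the gradio_* keys used by launch_diffusion_gradio_ui.
from axolotl.utils.dict import DictDefault

cfg = DictDefault(gradio_title="Axolotl Diffusion Demo", gradio_share=False)
launch_diffusion_gradio_ui(model=model, tokenizer=tokenizer, cfg=cfg)
```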
diff --git a/src/axolotl/cli/utils/sweeps.py b/src/axolotl/cli/utils/sweeps.py
index d21664964..2a0aa1367 100644
--- a/src/axolotl/cli/utils/sweeps.py
+++ b/src/axolotl/cli/utils/sweeps.py
@@ -3,11 +3,12 @@
import random
from copy import deepcopy
from itertools import product
+from typing import Any
def generate_sweep_configs(
base_config: dict[str, list], sweeps_config: dict[str, list]
-) -> list[dict[str, list]]:
+) -> list[dict[str, Any]]:
"""
Recursively generates all possible configurations by applying sweeps to the base config.
@@ -48,7 +49,10 @@ def generate_sweep_configs(
new_config = {}
# new_config = deepcopy(base_config)
# Combine regular parameters with paired parameters
- full_combo = {**dict(zip(param_names, reg_combo)), **paired_set}
+ full_combo = {
+ **dict(zip(param_names, reg_combo, strict=False)),
+ **paired_set,
+ }
for param_name, param_value in full_combo.items():
new_config[param_name] = param_value
print(new_config)
@@ -57,7 +61,7 @@ def generate_sweep_configs(
# If no paired values, just use regular combinations
# new_config = deepcopy(base_config)
new_config = {}
- for param_name, param_value in zip(param_names, reg_combo):
+ for param_name, param_value in zip(param_names, reg_combo, strict=False):
new_config[param_name] = param_value
print(new_config)
all_combinations.append(new_config)
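The `zip(..., strict=False)` change above does not alter the expansion semantics. For reference, a standalone sketch of the Cartesian-product behavior the function implements (sweep values invented):

```python
# Standalone sketch of the sweep expansion: every key maps to a list of
# candidate values, and each element of the Cartesian product becomes one
# training configuration.
from itertools import product

sweeps_config = {
    "learning_rate": [1e-4, 2e-4],
    "lora_r": [16, 32],
}

param_names = list(sweeps_config.keys())
configs = [
    dict(zip(param_names, combo, strict=False))
    for combo in product(*sweeps_config.values())
]
print(len(configs))  # 4 configs: 2 learning rates x 2 lora_r values
```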
diff --git a/src/axolotl/cli/utils/train.py b/src/axolotl/cli/utils/train.py
index f1ac857b3..6ce7d8df3 100644
--- a/src/axolotl/cli/utils/train.py
+++ b/src/axolotl/cli/utils/train.py
@@ -4,6 +4,7 @@ import os
import subprocess # nosec
import sys
import tempfile
+from pathlib import Path
from typing import Any, Iterator, Literal
import yaml
@@ -67,14 +68,12 @@ def build_command(base_cmd: list[str], options: dict[str, Any]) -> list[str]:
def generate_config_files(config: str, sweep: str | None) -> Iterator[tuple[str, bool]]:
"""
- Generate list of configuration files to process.
+    Generate the list of configuration files to process. Yields a tuple of the
+    configuration file name and a boolean indicating whether it belongs to a group of configurations (i.e., a sweep).
Args:
config: Base configuration file
sweep: Sweep configuration file
-
- Yields:
- Tuple of configuration file name and whether this is a group of configurations
"""
if not sweep:
@@ -90,8 +89,12 @@ def generate_config_files(config: str, sweep: str | None) -> Iterator[tuple[str,
# Generate all possible configurations
permutations = generate_sweep_configs(base_config, sweep_config)
is_group = len(permutations) > 1
- for permutation in permutations:
- # pylint: disable=consider-using-with
+ base_output_dir = base_config.get("output_dir", "./model-out")
+ for idx, permutation in enumerate(permutations, start=1):
+ permutation_dir = Path(permutation.get("output_dir", base_output_dir))
+ permutation_id = f"sweep{idx:04d}"
+ permutation["output_dir"] = str(permutation_dir / permutation_id)
+
temp_file = tempfile.NamedTemporaryFile(
mode="w",
suffix=".yaml",
diff --git a/src/axolotl/cli/vllm_serve.py b/src/axolotl/cli/vllm_serve.py
index cf687bea2..ea454fc96 100644
--- a/src/axolotl/cli/vllm_serve.py
+++ b/src/axolotl/cli/vllm_serve.py
@@ -39,7 +39,7 @@ def do_vllm_serve(
model = cfg.base_model
serve_module = cli_args.get("serve_module", "trl.scripts.vllm_serve")
- vllm_serve_main = getattr(__import__(serve_module, fromlist=["main"]), "main")
+ vllm_serve_main = __import__(serve_module, fromlist=["main"]).main
tensor_parallel_size = 1
data_parallel_size = 1
@@ -68,7 +68,6 @@ def do_vllm_serve(
cli_args.get("enable_reasoning") or cfg.vllm.enable_reasoning or False
)
- # pylint: disable=unexpected-keyword-arg
vllm_script_args = AxolotlScriptArguments(
model=model,
tensor_parallel_size=tensor_parallel_size,
diff --git a/src/axolotl/common/architectures.py b/src/axolotl/common/architectures.py
index 616b4159b..231829cab 100644
--- a/src/axolotl/common/architectures.py
+++ b/src/axolotl/common/architectures.py
@@ -12,8 +12,10 @@ MOE_ARCH_BLOCK = {
"mixtral": "MixtralSparseMoeBlock",
"qwen2_moe": "Qwen2MoeSparseMoeBlock",
"qwen3_moe": "Qwen3MoeSparseMoeBlock",
+ "qwen3_vl_moe": "Qwen3VLMoeTextSparseMoeBlock",
"deepseek_v2": "DeepseekV2MoE",
- "gpt_oss": "GptOssExperts",
- "deepseek_v3": "DeepseekV3MoE",
"glm4_moe": "Glm4MoeMoE",
+ "deepseek_v3": "DeepseekV3MoE",
+ "gpt_oss": "GptOssDecoderLayer",
+ "lfm2_moe": "Lfm2MoeSparseMoeBlock",
}
diff --git a/src/axolotl/common/datasets.py b/src/axolotl/common/datasets.py
index 761317dfb..8d7758e66 100644
--- a/src/axolotl/common/datasets.py
+++ b/src/axolotl/common/datasets.py
@@ -6,7 +6,7 @@ from dataclasses import dataclass
from datasets import Dataset
-import axolotl.monkeypatch.data.batch_dataset_fetcher # pylint: disable=unused-import # noqa: F401
+import axolotl.monkeypatch.data.batch_dataset_fetcher # noqa: F401
from axolotl.cli.args import PreprocessCliArgs, TrainerCliArgs
from axolotl.loaders import load_processor, load_tokenizer
from axolotl.utils.data import prepare_datasets, prepare_preference_datasets
@@ -55,13 +55,11 @@ def load_datasets(
"""
tokenizer = load_tokenizer(cfg)
processor = load_processor(cfg, tokenizer=tokenizer) if cfg.processor_type else None
- preprocess_iterable = getattr(cli_args, "iterable", False)
train_dataset, eval_dataset, total_num_steps, prompters = prepare_datasets(
cfg,
tokenizer,
processor=processor,
- preprocess_iterable=preprocess_iterable,
)
if (
diff --git a/src/axolotl/convert.py b/src/axolotl/convert.py
index d1bdb34db..9e09b37dc 100644
--- a/src/axolotl/convert.py
+++ b/src/axolotl/convert.py
@@ -67,9 +67,7 @@ class JsonToJsonlConverter:
self.json_parser = json_parser
self.jsonl_serializer = jsonl_serializer
- def convert(
- self, input_file_path, output_file_path
- ): # pylint: disable=unused-argument
+ def convert(self, input_file_path, output_file_path):
content = self.file_reader.read(input_file_path)
data = self.json_parser.parse(content)
# data = [r for r in data if r["conversations"]] # vicuna cleaned has rows with empty conversations
diff --git a/src/axolotl/core/attention/flex_block_mask.py b/src/axolotl/core/attention/flex_block_mask.py
index fb9820f35..37149983c 100644
--- a/src/axolotl/core/attention/flex_block_mask.py
+++ b/src/axolotl/core/attention/flex_block_mask.py
@@ -84,9 +84,7 @@ def create_causal_mask(
batch_size, dtype = input_embeds.shape[0], input_embeds.dtype
if attention_mask is not None:
- def causal_doc_mask_mod(
- batch_idx, head_idx, q_idx, kv_idx
- ): # pylint: disable=unused-argument
+ def causal_doc_mask_mod(batch_idx, head_idx, q_idx, kv_idx):
"""
Defines the logic of a block causal mask by combining both a standard causal mask
and a block diagonal document mask.
@@ -103,9 +101,7 @@ def create_causal_mask(
mask_factory_function = causal_doc_mask_mod
else:
mask_factory_function = causal_mask_function
- mask_interface = ALL_MASK_ATTENTION_FUNCTIONS[
- config._attn_implementation # pylint: disable=protected-access
- ]
+ mask_interface = ALL_MASK_ATTENTION_FUNCTIONS[config._attn_implementation]
# Do not allow skip if we are compiling (this is to match BC)
allow_is_causal_skip = (
diff --git a/src/axolotl/core/builders/base.py b/src/axolotl/core/builders/base.py
index e1f649715..2c949f8e7 100644
--- a/src/axolotl/core/builders/base.py
+++ b/src/axolotl/core/builders/base.py
@@ -24,14 +24,16 @@ from pathlib import Path
from typing import Any
import torch
-from transformers import (
- TrainerCallback,
-)
+from transformers import TrainerCallback
from transformers.trainer_pt_utils import AcceleratorConfig
from axolotl.integrations.base import PluginManager
from axolotl.monkeypatch.trainer.lr import patch_trainer_get_lr
-from axolotl.utils import is_comet_available, is_mlflow_available
+from axolotl.utils import (
+ is_comet_available,
+ is_mlflow_available,
+ is_opentelemetry_available,
+)
from axolotl.utils.callbacks import (
GCCallback,
SaveAxolotlConfigtoWandBCallback,
@@ -44,7 +46,7 @@ from axolotl.utils.schemas.enums import CustomSupportedOptimizers
LOG = logging.getLogger(__name__)
with suppress(ImportError):
- import torch._dynamo # pylint: disable=ungrouped-imports
+ import torch._dynamo
class TrainerBuilderBase(abc.ABC):
@@ -136,6 +138,12 @@ class TrainerBuilderBase(abc.ABC):
callbacks.append(
SaveAxolotlConfigtoCometCallback(self.cfg.axolotl_config_path)
)
+ if self.cfg.use_otel_metrics and is_opentelemetry_available():
+ from axolotl.utils.callbacks.opentelemetry import (
+ OpenTelemetryMetricsCallback,
+ )
+
+ callbacks.append(OpenTelemetryMetricsCallback(self.cfg))
if self.cfg.save_first_step:
callbacks.append(SaveModelOnFirstStepCallback())
@@ -260,14 +268,14 @@ class TrainerBuilderBase(abc.ABC):
adam_kwargs["eps"] = training_args_kwargs.get("adam_epsilon")
if self.cfg.optimizer == "muon":
- from axolotl.contribs.mit.muon import ( # pylint: disable=no-name-in-module
+ from axolotl.contribs.mit.muon import (
MuonOptimizerFactory,
)
optimizer_cls = MuonOptimizerFactory
optimizer_kwargs.update(adam_kwargs)
elif self.cfg.optimizer == "dion":
- from axolotl.contribs.mit.dion import ( # pylint: disable=no-name-in-module
+ from axolotl.contribs.mit.dion import (
DionOptimizerFactory,
)
@@ -414,12 +422,8 @@ class TrainerBuilderBase(abc.ABC):
def _configure_torch_compile(self, training_args_kwargs: dict):
if self.cfg.torch_compile and getattr(torch, "_dynamo", None):
- torch._dynamo.config.suppress_errors = ( # pylint: disable=protected-access
- True
- )
- torch._dynamo.config.accumulated_cache_size_limit = ( # pylint: disable=protected-access
- 256
- )
+ torch._dynamo.config.suppress_errors = True
+ torch._dynamo.config.accumulated_cache_size_limit = 256
training_args_kwargs["torch_compile"] = self.cfg.torch_compile
if self.cfg.torch_compile_backend:
training_args_kwargs["torch_compile_backend"] = (
@@ -441,7 +445,7 @@ class TrainerBuilderBase(abc.ABC):
# don't use the HF gradient checkpointing, manually wrap
training_args_kwargs["gradient_checkpointing"] = False
training_args_kwargs["activation_offloading"] = True
- elif self.cfg.gradient_checkpointing:
+ elif self.cfg.gradient_checkpointing is not None:
training_args_kwargs["gradient_checkpointing"] = (
self.cfg.gradient_checkpointing
)
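The switch to `is not None` matters because an explicit `gradient_checkpointing: false` in the YAML should still be forwarded to the trainer rather than silently dropped. A two-line illustration:

```python
# Why `is not None` beats truthiness here: an explicit False must still be
# forwarded to the trainer instead of being treated like "unset".
for cfg_value in (True, False, None):
    old = bool(cfg_value)             # old check: drops explicit False
    new = cfg_value is not None       # new check: forwards True and False
    print(cfg_value, old, new)
```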
@@ -497,6 +501,7 @@ class TrainerBuilderBase(abc.ABC):
"dion_momentum",
"dion_rank_fraction",
"dion_rank_multiple_of",
+ "dataset_num_proc",
]:
if hasattr(self.cfg, arg) and getattr(self.cfg, arg) is not None:
training_args_kwargs[arg] = getattr(self.cfg, arg)
@@ -516,12 +521,10 @@ class TrainerBuilderBase(abc.ABC):
self.cfg.eval_batch_size
)
+ training_args_kwargs["include_tkps"] = self.cfg.include_tkps
training_args_kwargs["max_steps"] = self.cfg.max_steps or total_num_steps or -1
training_args_kwargs["num_train_epochs"] = self.cfg.num_epochs
- if self.cfg.dataset_processes:
- training_args_kwargs["dataset_num_proc"] = self.cfg.dataset_processes
-
# max_length is not used in CausalTrainer
if self.cfg.reward_model or self.cfg.rl:
training_args_kwargs["max_length"] = self.cfg.sequence_len
diff --git a/src/axolotl/core/builders/causal.py b/src/axolotl/core/builders/causal.py
index e5bc21762..7a06431dc 100644
--- a/src/axolotl/core/builders/causal.py
+++ b/src/axolotl/core/builders/causal.py
@@ -10,8 +10,9 @@ import transformers
from transformers import (
DataCollatorWithFlattening,
EarlyStoppingCallback,
+ Trainer,
)
-from trl.trainer.utils import RewardDataCollatorWithPadding
+from trl.trainer.reward_trainer import DataCollatorForPreference
from axolotl.core.builders.base import TrainerBuilderBase
from axolotl.core.trainers import (
@@ -27,7 +28,6 @@ from axolotl.processing_strategies import get_processing_strategy
from axolotl.utils import is_comet_available, is_mlflow_available
from axolotl.utils.callbacks import (
LossWatchDogCallback,
- SaveBetterTransformerModelCallback,
bench_eval_callback_factory,
causal_lm_bench_eval_callback_factory,
colab_inference_post_train_callback,
@@ -35,6 +35,7 @@ from axolotl.utils.callbacks import (
)
from axolotl.utils.callbacks.lisa import lisa_callback_factory
from axolotl.utils.callbacks.qat import QATCallback
+from axolotl.utils.callbacks.tokens_per_second import TokensPerSecondCallback
from axolotl.utils.chat_templates import get_chat_template_from_config
from axolotl.utils.collators import (
BatchSamplerDataCollatorForSeq2Seq,
@@ -61,12 +62,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
if self.cfg.relora:
callbacks.append(ReLoRACallback(self.cfg))
- if (
- hasattr(self.model, "use_bettertransformer")
- and self.model.use_bettertransformer is True
- ):
- callbacks.append(SaveBetterTransformerModelCallback())
-
# TODO: check if can move to base class
if self.cfg.loss_watchdog_threshold is not None:
callbacks.append(LossWatchDogCallback(self.cfg))
@@ -74,6 +69,12 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
if self.cfg.qat:
callbacks.append(QATCallback(self.cfg.qat))
+ if self.cfg.include_tkps:
+ callbacks.append(
+ TokensPerSecondCallback(
+ self.cfg.tensor_parallel_size, self.cfg.context_parallel_size
+ )
+ )
return callbacks
def get_post_trainer_create_callbacks(self, trainer):
@@ -340,20 +341,22 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
if self.cfg.reward_model:
training_args_cls = AxolotlRewardConfig
+ if self.cfg.center_rewards_coefficient is not None:
+ training_arguments_kwargs["center_rewards_coefficient"] = (
+ self.cfg.center_rewards_coefficient
+ )
elif self.cfg.process_reward_model:
training_args_cls = AxolotlPRMConfig
else:
training_args_cls = AxolotlTrainingArguments
- training_args = training_args_cls( # pylint: disable=unexpected-keyword-arg
+ training_args = training_args_cls(
**training_arguments_kwargs,
)
training_args = self.hook_post_create_training_args(training_args)
# unset run_name so wandb sets up experiment names
if self.cfg.use_wandb and training_args.run_name == training_args.output_dir:
- training_args.run_name = ( # pylint: disable=attribute-defined-outside-init
- None
- )
+ training_args.run_name = None
data_collator_kwargs = {
"padding": True, # True/"longest" is the default
@@ -385,10 +388,11 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
**data_collator_kwargs,
)
sig = inspect.signature(trainer_cls)
- if "processing_class" in sig.parameters:
+ if "processing_class" in sig.parameters or issubclass(trainer_cls, Trainer):
trainer_kwargs["processing_class"] = self.tokenizer
elif "tokenizer" in sig.parameters:
trainer_kwargs["tokenizer"] = self.tokenizer
+
if (
trainer_cls not in [AxolotlRewardTrainer, AxolotlPRMTrainer]
and self.cfg.datasets is not None
@@ -406,6 +410,9 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
**trainer_kwargs,
)
trainer = self.hook_post_create_trainer(trainer)
+ # if the trainer has the `axolotl_cfg` property, set it
+ if hasattr(trainer, "axolotl_cfg"):
+ trainer.axolotl_cfg = self.cfg
for callback in self.get_post_trainer_create_callbacks(trainer):
trainer.add_callback(callback)
@@ -446,7 +453,7 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
BatchSamplerDataCollatorForSeq2Seq,
DataCollatorForSeq2Seq,
DataCollatorWithFlattening,
- RewardDataCollatorWithPadding,
+ DataCollatorForPreference,
]
]
collator_args = [self.tokenizer]
@@ -463,7 +470,10 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
if kwargs and isinstance(kwargs, dict):
kwargs.update(collator_cls_and_kwargs[1])
elif self.cfg.reward_model:
- collator = RewardDataCollatorWithPadding
+ collator = DataCollatorForPreference
+ tokenizer = collator_args.pop(0)
+ kwargs["pad_token_id"] = tokenizer.pad_token_id
+ kwargs.pop("padding")
elif use_batch_sampler_collator:
# Use V2BatchSamplerDataCollatorForSeq2Seq for flex attention,
# supported multipack models, or non-flash-attention llama
diff --git a/src/axolotl/core/builders/rl.py b/src/axolotl/core/builders/rl.py
index bc7816807..0ceb80008 100644
--- a/src/axolotl/core/builders/rl.py
+++ b/src/axolotl/core/builders/rl.py
@@ -120,6 +120,11 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
if self.cfg.use_wandb:
training_args_kwargs["run_name"] = self.cfg.wandb_name
+ if self.cfg.max_prompt_len:
+ training_args_kwargs["max_prompt_length"] = self.cfg.max_prompt_len
+ else:
+ training_args_kwargs["max_prompt_length"] = self.cfg.sequence_len
+
training_args_cls = None
blocklist_args_kwargs = []
if self.cfg.rl is RLType.SIMPO:
@@ -129,10 +134,16 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
if self.cfg.cpo_alpha is not None:
training_args_kwargs["cpo_alpha"] = self.cfg.cpo_alpha
+ # Handle when max_prompt_length == max_length from defaults
+ # CPOTrainer requires strictly less than
+ if (
+ training_args_kwargs["max_prompt_length"]
+ == training_args_kwargs["max_length"]
+ ):
+ training_args_kwargs["max_prompt_length"] -= 1
+
elif self.cfg.rl is RLType.ORPO:
training_args_cls = AxolotlORPOConfig
- if self.cfg.max_prompt_len:
- training_args_kwargs["max_prompt_length"] = self.cfg.max_prompt_len
elif self.cfg.rl is RLType.KTO:
training_args_cls = AxolotlKTOConfig
@@ -144,9 +155,6 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
self.cfg.kto_undesirable_weight or 1.0
)
- if self.cfg.max_prompt_len:
- training_args_kwargs["max_prompt_length"] = self.cfg.max_prompt_len
-
elif self.cfg.rl is RLType.GRPO:
training_args_cls = GRPOStrategy.get_training_args_class()
training_args_kwargs.update(GRPOStrategy.set_training_args_kwargs(self.cfg))
@@ -168,16 +176,14 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
if plugin_training_args:
training_args_kwargs.update(plugin_training_args)
- training_args = training_args_cls( # pylint: disable=unexpected-keyword-arg
+ training_args = training_args_cls(
logging_first_step=True,
**training_args_kwargs,
)
# unset run_name so wandb sets up experiment names
if self.cfg.use_wandb and training_args.run_name == training_args.output_dir:
- training_args.run_name = ( # pylint: disable=attribute-defined-outside-init
- None
- )
+ training_args.run_name = None
return training_args, trainer_kwargs
diff --git a/src/axolotl/core/chat/format/chatml.py b/src/axolotl/core/chat/format/chatml.py
index 04c398fe8..deb8a9997 100644
--- a/src/axolotl/core/chat/format/chatml.py
+++ b/src/axolotl/core/chat/format/chatml.py
@@ -10,7 +10,7 @@ from .shared import wrap_tools
def format_message(
message: Messages,
- message_index: Optional[int] = None, # pylint: disable=unused-argument
+ message_index: Optional[int] = None,
) -> Messages:
if message.is_chat_formatted:
return message
diff --git a/src/axolotl/core/chat/messages.py b/src/axolotl/core/chat/messages.py
index 923b177c1..912a12ca1 100644
--- a/src/axolotl/core/chat/messages.py
+++ b/src/axolotl/core/chat/messages.py
@@ -15,11 +15,11 @@ class MessageRoles(str, Enum):
Message roles for the system, user, assistant, and tools
"""
- system = "system" # pylint: disable=invalid-name
- user = "user" # pylint: disable=invalid-name
- assistant = "assistant" # pylint: disable=invalid-name
- tool = "tool" # pylint: disable=invalid-name
- ipython = ( # pylint: disable=invalid-name
+ system = "system"
+ user = "user"
+ assistant = "assistant"
+ tool = "tool"
+ ipython = (
# for responses from builtin tools
"ipython"
)
@@ -30,12 +30,12 @@ class MessageContentTypes(str, Enum):
Message content types for text, image, audio, tool calls, and tool responses
"""
- special_token = "special_token" # pylint: disable=invalid-name # nosec B105
- text = "text" # pylint: disable=invalid-name
- image = "image" # pylint: disable=invalid-name
- audio = "audio" # pylint: disable=invalid-name
- tool_call = "tool_call" # pylint: disable=invalid-name # to differentiate regular responses from tool calls from the assistant
- tool_response = "tool_response" # pylint: disable=invalid-name
+ special_token = "special_token" # nosec B105
+ text = "text"
+ image = "image"
+ audio = "audio"
+ tool_call = "tool_call"
+ tool_response = "tool_response"
class SpecialToken(str, Enum):
@@ -43,8 +43,8 @@ class SpecialToken(str, Enum):
Special tokens for beginning of string and end of string
"""
- bos_token = "bos_token" # pylint: disable=invalid-name # nosec B105
- eos_token = "eos_token" # pylint: disable=invalid-name # nosec B105
+ bos_token = "bos_token" # nosec B105
+ eos_token = "eos_token" # nosec B105
class ToolCallFunction(BaseModel):
@@ -73,7 +73,7 @@ class ToolCallContents(BaseModel):
name: str
arguments: dict[str, Union[str, int]]
- id: Optional[str] = None # pylint: disable=invalid-name
+ id: Optional[str] = None
def __str__(self) -> str:
data = {"name": self.name, "arguments": self.arguments}
@@ -89,7 +89,7 @@ class ToolResponseContents(BaseModel):
name: str
content: Union[str, dict[str, Union[str, int, float]]]
- id: Optional[str] = None # pylint: disable=invalid-name
+ id: Optional[str] = None
def __str__(self) -> str:
data = {"name": self.name, "content": self.content}
diff --git a/src/axolotl/core/datasets/transforms/chat_builder.py b/src/axolotl/core/datasets/transforms/chat_builder.py
index 692fe3ebb..0de0ecb40 100644
--- a/src/axolotl/core/datasets/transforms/chat_builder.py
+++ b/src/axolotl/core/datasets/transforms/chat_builder.py
@@ -1,23 +1,17 @@
"""
-This module contains a function that builds a transform that takes a row from the dataset and converts it to a Chat.
+This module contains a function that builds a transform that takes a row from the
+dataset and converts it to a Chat.
"""
-from typing import Any, Mapping, Union
+from typing import Any, Mapping
-def chat_message_transform_builder( # pylint: disable=dangerous-default-value
+def chat_message_transform_builder(
train_on_inputs=False,
- conversations_field: str = "conversations",
- message_field_role: Union[str, list[str]] = ["role", "from"], # commonly "role"
- message_field_content: Union[str, list[str]] = [
- "value",
- "text",
- "content",
- ], # commonly "content"
- message_field_training: Union[str, list[str]] = [
- "train",
- "weight",
- ], # commonly "weight"
+ conversations_field: str = "messages",
+ message_field_role: str | list[str] | None = None, # commonly "role"
+ message_field_content: str | list[str] | None = None, # commonly "content"
+ message_field_training: str | list[str] | None = None, # commonly "weight"
):
"""Builds a transform that takes a row from the dataset and converts it to a Chat
@@ -26,19 +20,25 @@ def chat_message_transform_builder( # pylint: disable=dangerous-default-value
If True, the transform will train on the inputs. If False, the transform will train on the targets.
Defaults to False.
conversations_field (str, optional):
- The field name of the conversations. Defaults to "conversations".
+ The field name of the conversations. Defaults to "messages".
message_field_role (str | list[str], optional):
- The field name of the role. Defaults to "role".
+ The field name of the role.
message_field_content (str | list[str], optional):
- The field name of the message content. Defaults to "content".
+ The field name of the message content.
message_field_training (str | list[str], optional):
- The field name of the train/weight. Defaults to "weight".
+ The field name of the train/weight.
Returns:
Callable:
A function that takes a list of conversations and returns a list of messages.
"""
+ if message_field_training is None:
+ message_field_training = ["train", "weight"]
+ if message_field_content is None:
+ message_field_content = ["value", "text", "content"]
+ if message_field_role is None:
+ message_field_role = ["role", "from"]
message_field_role = (
[message_field_role]
if isinstance(message_field_role, str)
diff --git a/src/axolotl/core/trainers/__init__.py b/src/axolotl/core/trainers/__init__.py
index 5f97e387a..22d8b64f6 100644
--- a/src/axolotl/core/trainers/__init__.py
+++ b/src/axolotl/core/trainers/__init__.py
@@ -1,11 +1,9 @@
"""Init for axolotl.core.trainers"""
-# pylint: disable=unused-import
# flake8: noqa
from .base import AxolotlTrainer
from .dpo.trainer import AxolotlDPOTrainer
-from .grpo.trainer import AxolotlGRPOSequenceParallelTrainer, AxolotlGRPOTrainer
from .mamba import AxolotlMambaTrainer
from .trl import (
AxolotlCPOTrainer,
diff --git a/src/axolotl/core/trainers/base.py b/src/axolotl/core/trainers/base.py
index 0f9f6e4c4..7896c6088 100644
--- a/src/axolotl/core/trainers/base.py
+++ b/src/axolotl/core/trainers/base.py
@@ -1,7 +1,5 @@
"""Module for customized trainers"""
-# pylint: disable=too-many-lines
-
from __future__ import annotations
import os
@@ -44,12 +42,20 @@ from axolotl.core.trainers.utils import (
)
from axolotl.utils import get_not_null
from axolotl.utils.bench import get_gpu_memory_usage
-from axolotl.utils.distributed import is_main_process
+from axolotl.utils.dict import DictDefault
+from axolotl.utils.distributed import is_distributed, is_main_process
from axolotl.utils.logging import get_logger
from axolotl.utils.samplers import MultipackBatchSampler, get_dataset_lengths
LOG = get_logger(__name__)
+REDUCTION_FNS = {
+ "mean": torch.mean,
+ "min": torch.min,
+ "max": torch.max,
+ "sum": torch.sum,
+}
+
class AxolotlTrainer(
PackingMixin,
@@ -65,6 +71,15 @@ class AxolotlTrainer(
args = None # type: "AxolotlTrainingArguments" # type: ignore[name-defined]
tag_names = ["axolotl"]
+ _axolotl_cfg: DictDefault | None = None
+
+ @property
+ def axolotl_cfg(self):
+ return self._axolotl_cfg
+
+ @axolotl_cfg.setter
+ def axolotl_cfg(self, cfg):
+ self._axolotl_cfg = cfg
def __init__(
self,
@@ -80,9 +95,10 @@ class AxolotlTrainer(
self._signature_columns = None # workaround for pylint
super().__init__(*_args, **kwargs)
-
self.train_data_collator = self.data_collator
- self._stored_metrics = defaultdict(lambda: defaultdict(list))
+ self._stored_metrics = defaultdict(
+ lambda: defaultdict(lambda: {"values": [], "reduction": "mean"})
+ )
if self.args.orpo_alpha:
self.loss_fct = torch.nn.CrossEntropyLoss(reduction="none")
@@ -209,17 +225,6 @@ class AxolotlTrainer(
data_collator = self.data_collator if is_training else self.eval_data_collator
- if dataset.column_names and "length" in dataset.column_names:
- dataset = dataset.remove_columns(["length"])
- if (
- dataset.column_names
- and "position_ids" in dataset.column_names
- and "attention_mask" in dataset.column_names
- and self.args.sample_packing
- and self.args.sample_packing_drop_attention_mask
- ):
- dataset = dataset.remove_columns(["attention_mask"])
-
if isinstance(dataset, datasets.Dataset):
if is_training:
if not self.args.sample_packing or self.args.pretraining:
@@ -278,6 +283,18 @@ class AxolotlTrainer(
):
self.accelerator.even_batches = False
+ if dataset.column_names and "length" in dataset.column_names:
+ dataset = dataset.remove_columns(["length"])
+
+ if (
+ dataset.column_names
+ and "position_ids" in dataset.column_names
+ and "attention_mask" in dataset.column_names
+ and self.args.sample_packing
+ and self.args.sample_packing_drop_attention_mask
+ ):
+ dataset = dataset.remove_columns(["attention_mask"])
+
dataloader = DataLoader(dataset, **dataloader_params)
# Accelerator.free_memory() will destroy the references, so
@@ -285,9 +302,9 @@ class AxolotlTrainer(
# fmt: off
if dataloader_key is not None and self.args.dataloader_persistent_workers:
if hasattr(self, "_eval_dataloaders"):
- self._eval_dataloaders[dataloader_key] = dataloader # type: ignore # pylint: disable=access-member-before-definition
+ self._eval_dataloaders[dataloader_key] = dataloader # type: ignore
else:
- self._eval_dataloaders = {dataloader_key: dataloader} # pylint: disable=attribute-defined-outside-init
+ self._eval_dataloaders = {dataloader_key: dataloader}
# fmt: on
return self.accelerator.prepare(dataloader)
@@ -329,6 +346,27 @@ class AxolotlTrainer(
# outputs = model(**inputs)
# loss = trainer_weighted_loss(outputs, labels, shift_labels=True)
# return (loss, outputs) if return_outputs else loss
+
+ # track number of tokens for tokens per second calculation
+ if self.args.include_tkps:
+ inputs_key = "labels" if "labels" in inputs else "input_ids"
+ num_tokens = (inputs[inputs_key] != -100).sum()
+ if is_distributed():
+ torch.distributed.all_reduce(
+ num_tokens, op=torch.distributed.ReduceOp.SUM
+ )
+ if hasattr(self.state, "num_tokens"):
+ self.state.num_tokens = (
+ self.state.num_tokens + (inputs[inputs_key] != -100).sum().cpu()
+ )
+ else:
+ self.state.num_tokens = (inputs[inputs_key] != -100).sum().cpu()
+
+ if hasattr(self.state, "total_tokens"):
+ self.state.total_tokens += num_tokens
+ else:
+ self.state.total_tokens = num_tokens
+
if self.args.orpo_alpha:
return self.orpo_compute_loss(
model,
@@ -344,6 +382,11 @@ class AxolotlTrainer(
num_items_in_batch=num_items_in_batch,
)
+ @override
+ def evaluate(self, *args, **kwargs):
+ LOG.info("Running evaluation step...")
+ return super().evaluate(*args, **kwargs)
+
@staticmethod
def orpo_concatenate_inputs(inputs, label_pad_token=-100, pad_token=0, device=None):
concatenated_batch = {}
@@ -443,7 +486,7 @@ class AxolotlTrainer(
model,
inputs,
return_outputs=False,
- num_items_in_batch=None, # pylint: disable=unused-argument
+ num_items_in_batch=None,
):
concat_inputs = AxolotlTrainer.orpo_concatenate_inputs(
inputs,
@@ -524,23 +567,10 @@ class AxolotlTrainer(
accelerator_config = self.args.accelerator_config.to_dict()
use_configured_state = accelerator_config.get("use_configured_state", False)
if not use_configured_state:
- AcceleratorState._reset_state( # pylint: disable=protected-access
- reset_partial_state=True
- )
+ AcceleratorState._reset_state(reset_partial_state=True)
super().create_accelerator_and_postprocess()
- # now we need to put parallelism_config back on the PartialState since we rely on that info in other places
- # PartialState().parallelism_config = self.accelerator.state.parallelism_config
-
- if self.is_fsdp_enabled:
- if (
- "limit_all_gathers" in self.args.fsdp_config
- and self.args.fsdp_config["limit_all_gathers"]
- ):
- self.accelerator.state.fsdp_plugin.limit_all_gathers = True
-
- # pylint: disable=unused-argument
def additional_accelerator_args(
self, fp8: bool = False, enable_fsdp_float8_all_gather: bool = False, **kwargs
) -> dict[str, Any]:
@@ -573,29 +603,62 @@ class AxolotlTrainer(
"""
# logs either has 'loss' or 'eval_loss'
train_eval = "train" if "loss" in logs else "eval"
- # Add averaged stored metrics to logs
- for key, metrics in self._stored_metrics[train_eval].items():
- logs[key] = torch.tensor(metrics).mean().item()
+
+ for key, metric_data in self._stored_metrics[train_eval].items():
+ values = torch.tensor(metric_data["values"]) # type: ignore[arg-type]
+ reduction_type = metric_data["reduction"]
+
+ fn = REDUCTION_FNS.get(reduction_type)
+ if fn is None:
+ raise NotImplementedError(
+ "Metric reduction must be one of [mean, min, max, sum]"
+ )
+ logs[key] = round(fn(values).item(), 4)
if is_main_process():
# Add memory usage
try:
active, allocated, reserved = get_gpu_memory_usage()
- logs["memory/max_mem_active(gib)"] = round(active, 2)
- logs["memory/max_mem_allocated(gib)"] = round(allocated, 2)
- logs["memory/device_mem_reserved(gib)"] = round(reserved, 2)
+ logs["memory/max_active (GiB)"] = round(active, 2)
+ logs["memory/max_allocated (GiB)"] = round(allocated, 2)
+ logs["memory/device_reserved (GiB)"] = round(reserved, 2)
except (ValueError, TypeError, FileNotFoundError):
pass
+ if self.args.include_tkps and train_eval == "train":
+ # each rank will log its own tokens per second
+ # for logging_steps > 1 we obtain a moving average of this metric
+ logs["tokens_per_second_per_gpu"] = round(
+ self.state.last_tokens_per_second.item() / self.args.logging_steps, 2
+ )
+ logs["total_tokens"] = int(self.state.total_tokens.item())
+
del self._stored_metrics[train_eval]
return super().log(logs, start_time)
def store_metrics(
- self, metrics: dict[str, float], train_eval: Literal["train", "eval"] = "train"
+ self,
+ metrics: dict[str, float] | dict[str, tuple[int | float, str]],
+ train_eval: Literal["train", "eval"] = "train",
+ reduction: Literal["mean", "min", "max", "sum"] = "mean",
) -> None:
+ """
+ Store metrics with specified reduction type.
+
+ Args:
+ metrics: Dictionary of metric names to values, or metric names to (value,
+ reduction_type) tuples.
+            train_eval: Whether this is for training or evaluation.
+            reduction: Default reduction to apply when a metric value is not given
+                as a (value, reduction) tuple.
+ """
for key, value in metrics.items():
- self._stored_metrics[train_eval][key].append(value)
+ if isinstance(value, tuple):
+ value, _reduction = value # type: ignore[assignment]
+ else:
+ value, _reduction = value, reduction
+
+ self._stored_metrics[train_eval][key]["values"].append(value)
+ self._stored_metrics[train_eval][key]["reduction"] = _reduction
def _save_checkpoint(self, model, trial, **kwargs):
# make sure the checkpoint dir exists, since trainer is flakey
@@ -662,6 +725,11 @@ class AxolotlTrainer(
LOG.info(
"Saving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`"
)
- self.data_collator.tokenizer.save_pretrained(output_dir)
+ save_jinja_files = True
+ if self.axolotl_cfg:
+ save_jinja_files = self.axolotl_cfg.tokenizer_save_jinja_files
+ self.data_collator.tokenizer.save_pretrained(
+ output_dir, save_jinja_files=save_jinja_files
+ )
# Good practice: save your training arguments together with the trained model
torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))
diff --git a/src/axolotl/core/trainers/dpo/__init__.py b/src/axolotl/core/trainers/dpo/__init__.py
index 4b40d4085..3aa79c484 100644
--- a/src/axolotl/core/trainers/dpo/__init__.py
+++ b/src/axolotl/core/trainers/dpo/__init__.py
@@ -27,7 +27,6 @@ class DPOStrategy:
training_args_kwargs["label_smoothing"] = cfg.dpo_label_smoothing
training_args_kwargs["max_completion_length"] = None
training_args_kwargs["max_length"] = cfg.sequence_len
- training_args_kwargs["max_prompt_length"] = cfg.sequence_len
training_args_kwargs["generate_during_eval"] = cfg.dpo_generate_during_eval
if cfg.dpo_use_weighting is not None:
training_args_kwargs["use_weighting"] = cfg.dpo_use_weighting
diff --git a/src/axolotl/core/trainers/dpo/trainer.py b/src/axolotl/core/trainers/dpo/trainer.py
index b3067bb46..b04505d89 100644
--- a/src/axolotl/core/trainers/dpo/trainer.py
+++ b/src/axolotl/core/trainers/dpo/trainer.py
@@ -101,11 +101,11 @@ class AxolotlDPOTrainer(
) -> dict[str, torch.Tensor]:
if self.args.dpo_norm_loss:
# fmt: off
- loss_type: str = self.loss_type # type: ignore[has-type] # pylint: disable=access-member-before-definition
+ loss_type: str = self.loss_type # type: ignore[has-type]
# fmt: on
# concatenated_forward handles avg token logprob for ipo case already
- self.loss_type = "ipo" # pylint: disable=attribute-defined-outside-init
+ self.loss_type = "ipo"
res = super().concatenated_forward(model, batch, is_ref_model=is_ref_model)
- self.loss_type = loss_type # pylint: disable=attribute-defined-outside-init
+ self.loss_type = loss_type
return res
return super().concatenated_forward(model, batch, is_ref_model=is_ref_model)
diff --git a/src/axolotl/core/trainers/grpo/__init__.py b/src/axolotl/core/trainers/grpo/__init__.py
index 4106a2a7d..7f28cb8d4 100644
--- a/src/axolotl/core/trainers/grpo/__init__.py
+++ b/src/axolotl/core/trainers/grpo/__init__.py
@@ -52,6 +52,7 @@ class GRPOStrategy:
if trl.vllm_mode:
grpo_args_kwargs["vllm_mode"] = trl.vllm_mode
if trl.vllm_mode == "colocate":
+ grpo_args_kwargs["vllm_enable_sleep_mode"] = trl.vllm_enable_sleep_mode # type: ignore[attr-defined]
grpo_args_kwargs["vllm_gpu_memory_utilization"] = (
vllm_cfg.gpu_memory_utilization
)
@@ -125,12 +126,13 @@ class GRPOStrategy:
if trl.use_liger_loss is not None:
grpo_args_kwargs["use_liger_loss"] = trl.use_liger_loss
+ if trl.rollout_func:
+ grpo_args_kwargs["rollout_func"] = cls.get_rollout_func(trl.rollout_func)
+
return grpo_args_kwargs
@classmethod
- def set_trainer_args(
- cls, cfg: DictDefault
- ) -> list[Any]: # pylint: disable=unused-argument
+ def set_trainer_args(cls, cfg: DictDefault) -> list[Any]:
trainer_args = []
if cfg.trl and cfg.trl.reward_funcs:
reward_funcs = []
@@ -151,7 +153,7 @@ class GRPOStrategy:
return trainer_kwargs
@classmethod
- def get_collator(cls, *args, **kwargs): # pylint: disable=unused-argument
+ def get_collator(cls, *args, **kwargs):
# No data collation is needed in GRPO, handled by trl's trainer __init__
return None
@@ -202,3 +204,32 @@ class GRPOStrategy:
raise ValueError(
f"Reward function {reward_func_fqn} not found."
) from exc
+
+ @classmethod
+ def get_rollout_func(cls, rollout_func_fqn: str):
+ """
+ Returns the rollout function from the given fully qualified name.
+
+ Args:
+ rollout_func_fqn (str): Fully qualified name of the rollout function
+ (e.g. my_module.my_rollout_func)
+
+ Returns:
+ Callable rollout function
+ """
+ try:
+            rollout_func_name = rollout_func_fqn.split(".")[-1]
+            rollout_func_module = importlib.import_module(
+                ".".join(rollout_func_fqn.split(".")[:-1])
+            )
+            rollout_func = getattr(rollout_func_module, rollout_func_name)
+
+ if not callable(rollout_func):
+ raise ValueError(
+ f"Rollout function {rollout_func_fqn} must be callable"
+ )
+
+ return rollout_func
+
+        except (ModuleNotFoundError, AttributeError) as exc:
+ raise ValueError(f"Rollout function {rollout_func_fqn} not found.") from exc
diff --git a/src/axolotl/core/trainers/grpo/trainer.py b/src/axolotl/core/trainers/grpo/trainer.py
index 49caa6406..f9f5a695b 100644
--- a/src/axolotl/core/trainers/grpo/trainer.py
+++ b/src/axolotl/core/trainers/grpo/trainer.py
@@ -1,7 +1,5 @@
"""Axolotl GRPO trainers (with and without sequence parallelism handling)"""
-# pylint: disable=too-many-lines,duplicate-code,protected-access,no-member
-
import warnings
from functools import partial
from typing import Any
@@ -52,7 +50,6 @@ from axolotl.core.trainers.mixins.optimizer import OptimizerInitMixin, Optimizer
from axolotl.monkeypatch.ring_attn import get_ring_attn_group
if is_peft_available():
- # pylint: disable=unused-import
from peft import PeftConfig
@@ -253,7 +250,7 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer):
def get_train_dataloader(self) -> DataLoader:
"""Get dataloader for training"""
train_dataset = self.train_dataset
- # pylint: disable=access-member-before-definition
+
data_collator = self.data_collator # type: ignore
# Handle dataset preprocessing
@@ -266,7 +263,7 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer):
train_dataset, description="training"
)
else:
- self.data_collator = self._get_collator_with_removed_columns( # pylint: disable=attribute-defined-outside-init
+ self.data_collator = self._get_collator_with_removed_columns(
data_collator,
description="training",
)
@@ -308,10 +305,10 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer):
# Generate completions using either vLLM or regular generation
if self.args.use_vllm:
# First, have main process load weights if needed
- # pylint: disable=access-member-before-definition
+
if self.state.global_step != self._last_loaded_step: # type: ignore[has-type]
self._move_model_to_vllm()
- # pylint: disable=attribute-defined-outside-init
+
self._last_loaded_step = self.state.global_step
# Generate completions using vLLM: gather all prompts and use them in a single call in the main process
@@ -333,8 +330,9 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer):
# Extract prompts from this SP group, accounting for num_generations duplicates
# We only need prompts from one rank in each SP group
group_prompts = all_prompts_text[
- group_leader_rank
- * len(prompts_text) : (group_leader_rank + 1)
+ group_leader_rank * len(prompts_text) : (
+ group_leader_rank + 1
+ )
* len(prompts_text) : self.num_generations
]
@@ -485,7 +483,7 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer):
)
if is_conversational(inputs[0]):
completions = []
- for prompt, completion in zip(prompts, completions_text):
+ for prompt, completion in zip(prompts, completions_text, strict=False):
bootstrap = (
prompt.pop()["content"] if prompt[-1]["role"] == "assistant" else ""
)
@@ -503,6 +501,7 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer):
self.reward_funcs,
self.reward_processing_classes,
self.reward_func_names,
+ strict=False,
)
):
with profiling_context(self, reward_func_name):
@@ -511,14 +510,17 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer):
): # Module instead of PretrainedModel for compat with compiled models
if is_conversational(inputs[0]):
messages = [
- {"messages": p + c} for p, c in zip(prompts, completions)
+ {"messages": p + c}
+ for p, c in zip(prompts, completions, strict=False)
]
texts = [
apply_chat_template(x, reward_processing_class)["text"]
for x in messages
]
else:
- texts = [p + c for p, c in zip(prompts, completions)]
+ texts = [
+ p + c for p, c in zip(prompts, completions, strict=False)
+ ]
reward_inputs = reward_processing_class(
text=texts,
return_tensors="pt",
@@ -564,7 +566,8 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer):
row_reward_kwargs["completion"] = completions[nan_row_idx]
warnings.warn(
f"All reward functions returned None for the following kwargs: {row_reward_kwargs}. "
- "Please ensure that at least one reward function returns a valid reward."
+ "Please ensure that at least one reward function returns a valid reward.",
+ stacklevel=2,
)
# Gather the reward per function: this part is crucial, because the rewards are normalized per group and the
diff --git a/src/axolotl/core/trainers/mamba.py b/src/axolotl/core/trainers/mamba.py
index b475b26d9..dedda1b29 100644
--- a/src/axolotl/core/trainers/mamba.py
+++ b/src/axolotl/core/trainers/mamba.py
@@ -5,7 +5,6 @@ import torch
from axolotl.core.trainers.base import AxolotlTrainer
-# pylint: disable=too-many-ancestors
class AxolotlMambaTrainer(AxolotlTrainer):
"""Mamba specific trainer to handle loss calculation"""
@@ -15,8 +14,8 @@ class AxolotlMambaTrainer(AxolotlTrainer):
self,
model,
inputs,
- return_outputs=False, # pylint: disable=unused-argument
- num_items_in_batch=None, # pylint: disable=unused-argument
+ return_outputs=False,
+ num_items_in_batch=None,
):
input_ids = inputs.pop("input_ids")
lm_logits = model(input_ids).logits
diff --git a/src/axolotl/core/trainers/mixins/__init__.py b/src/axolotl/core/trainers/mixins/__init__.py
index b54577765..5fced1692 100644
--- a/src/axolotl/core/trainers/mixins/__init__.py
+++ b/src/axolotl/core/trainers/mixins/__init__.py
@@ -1,6 +1,5 @@
"""Init for axolotl.core.trainers.mixins"""
-# pylint: disable=unused-import
# flake8: noqa
from .activation_checkpointing import ActivationOffloadingMixin
diff --git a/src/axolotl/core/trainers/mixins/activation_checkpointing.py b/src/axolotl/core/trainers/mixins/activation_checkpointing.py
index 1bfdb49f7..b61c45fee 100644
--- a/src/axolotl/core/trainers/mixins/activation_checkpointing.py
+++ b/src/axolotl/core/trainers/mixins/activation_checkpointing.py
@@ -92,7 +92,7 @@ def get_lora_act_offloading_ctx_manager(
`contextlib.ContextDecorator`:
Activation offloading context manager for the model.
"""
- # pylint: disable=unnecessary-dunder-call
+
activations_handling_ctx = OffloadActivations(
use_pin_memory=use_pin_memory,
use_streams=use_streams,
diff --git a/src/axolotl/core/trainers/mixins/distributed_parallel.py b/src/axolotl/core/trainers/mixins/distributed_parallel.py
index d163e4eb5..77aee5236 100644
--- a/src/axolotl/core/trainers/mixins/distributed_parallel.py
+++ b/src/axolotl/core/trainers/mixins/distributed_parallel.py
@@ -26,7 +26,6 @@ class DistributedParallelMixin(Trainer):
self.accelerator.distributed_type == "FSDP"
and self.accelerator.state.fsdp_plugin is None
):
- # pylint: disable=protected-access
# handle Context Parallelism without FSDP
self.accelerator.state.distributed_type = "MULTI_GPU"
self.accelerator.state._shared_state["distributed_type"] = "MULTI_GPU"
diff --git a/src/axolotl/core/trainers/mixins/optimizer.py b/src/axolotl/core/trainers/mixins/optimizer.py
index a9a9a3992..850442c60 100644
--- a/src/axolotl/core/trainers/mixins/optimizer.py
+++ b/src/axolotl/core/trainers/mixins/optimizer.py
@@ -70,11 +70,11 @@ class OptimizerMixin(Trainer):
}
)
if params["embeddings"]:
- lr = optimizer_kwargs["lr"] # pylint: disable=invalid-name
+ lr = optimizer_kwargs["lr"]
if self.args.embedding_lr_scale:
- lr *= self.args.embedding_lr_scale # pylint: disable=invalid-name
+ lr *= self.args.embedding_lr_scale
elif self.args.embedding_lr:
- lr = self.args.embedding_lr # pylint: disable=invalid-name
+ lr = self.args.embedding_lr
optimizer_grouped_parameters.append(
{
"params": list(params["embeddings"].values()),
@@ -143,7 +143,7 @@ class OptimizerMixin(Trainer):
loraplus_lr_embedding = getattr(
self.args, "loraplus_lr_embedding", 1e-6
)
- self.optimizer = create_loraplus_optimizer( # pylint: disable=attribute-defined-outside-init
+ self.optimizer = create_loraplus_optimizer(
opt_model,
optimizer_cls,
loraplus_lr_ratio=loraplus_lr_ratio,
@@ -185,17 +185,15 @@ class OptimizerMixin(Trainer):
p.data_ptr(): p.numel() for p in module.parameters()
}.values()
)
- LOG.info(f"skipped {module}: {skipped/2**20}M params")
+ LOG.info(f"skipped {module}: {skipped / 2**20}M params")
manager.register_module_override(
module, "weight", {"optim_bits": 32}
)
LOG.debug(f"bitsandbytes: will optimize {module} in fp32")
- LOG.info(f"skipped: {skipped/2**20}M params")
+ LOG.info(f"skipped: {skipped / 2**20}M params")
if is_sagemaker_mp_enabled():
- self.optimizer = smp.DistributedOptimizer( # pylint: disable=attribute-defined-outside-init
- self.optimizer
- )
+ self.optimizer = smp.DistributedOptimizer(self.optimizer)
return self.optimizer
diff --git a/src/axolotl/core/trainers/mixins/scheduler.py b/src/axolotl/core/trainers/mixins/scheduler.py
index 399bf5947..fc2b0e59d 100644
--- a/src/axolotl/core/trainers/mixins/scheduler.py
+++ b/src/axolotl/core/trainers/mixins/scheduler.py
@@ -46,7 +46,7 @@ class SchedulerMixin(Trainer):
)
# fmt: off
- if self.lr_scheduler is None: # type: ignore # pylint: disable=access-member-before-definition
+ if self.lr_scheduler is None: # type: ignore
# fmt: on
plugin_manager = PluginManager.get_instance()
lr_scheduler: LRScheduler | None = plugin_manager.create_lr_scheduler(
@@ -90,7 +90,7 @@ class SchedulerMixin(Trainer):
LOG.warning(
"Both cosine quadratic warmup and min lr detected. Using quadratic warmup.")
- self.lr_scheduler = get_cosine_schedule_with_quadratic_warmup( # pylint: disable=attribute-defined-outside-init
+ self.lr_scheduler = get_cosine_schedule_with_quadratic_warmup(
optimizer,
num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
num_training_steps=num_training_steps,
@@ -98,7 +98,7 @@ class SchedulerMixin(Trainer):
elif self.args.cosine_min_lr_ratio and self.args.cosine_constant_lr_ratio and use_cosine_min_lr:
assert 0 <= self.args.cosine_min_lr_ratio <= 1.0, "cosine_min_lr_ratio must be between 0.0 and 1.0"
assert 0 <= self.args.cosine_constant_lr_ratio <= 1.0, "cosine_constant_lr_ratio must be between 0.0 and 1.0"
- self.lr_scheduler = get_cosine_schedule_with_warmup_decay_constant( # pylint: disable=attribute-defined-outside-init
+ self.lr_scheduler = get_cosine_schedule_with_warmup_decay_constant(
optimizer,
num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
num_training_steps=num_training_steps,
@@ -107,7 +107,7 @@ class SchedulerMixin(Trainer):
)
elif self.args.cosine_min_lr_ratio and use_cosine_min_lr:
assert 0 <= self.args.cosine_min_lr_ratio <= 1.0, "cosine_min_lr_ratio must be between 0.0 and 1.0"
- self.lr_scheduler = get_cosine_schedule_with_min_lr( # pylint: disable=attribute-defined-outside-init
+ self.lr_scheduler = get_cosine_schedule_with_min_lr(
optimizer,
num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
num_training_steps=num_training_steps,
@@ -133,7 +133,7 @@ class SchedulerMixin(Trainer):
)
if not self.lr_scheduler:
super().create_scheduler(num_training_steps, optimizer)
- self.lr_scheduler = JaggedLRRestartScheduler( # pylint: disable=attribute-defined-outside-init
+ self.lr_scheduler = JaggedLRRestartScheduler(
optimizer,
self.lr_scheduler,
self.args.jagged_restart_steps,
diff --git a/src/axolotl/core/training_args_base.py b/src/axolotl/core/training_args_base.py
index fd0859ae9..41ee8e91e 100644
--- a/src/axolotl/core/training_args_base.py
+++ b/src/axolotl/core/training_args_base.py
@@ -14,7 +14,6 @@ class AxolotlTrainingMixins:
Mixin class for the Axolotl training args.
"""
- # pylint: disable=duplicate-code
model_type: Optional[str] = field(
default=None, metadata={"help": "HF model configuration model_type."}
)
@@ -50,6 +49,12 @@ class AxolotlTrainingMixins:
default=False,
metadata={"help": "Use real batches for efficient training."},
)
+ include_tkps: bool = field(
+ default=True,
+ metadata={
+ "help": "Whether to include tokens per second in the training metrics."
+ },
+ )
eval_sample_packing: Optional[bool] = field(
default=None,
metadata={"help": "Use sample packing for efficient evals."},
diff --git a/src/axolotl/datasets.py b/src/axolotl/datasets.py
index c9d006ac8..20acb8521 100644
--- a/src/axolotl/datasets.py
+++ b/src/axolotl/datasets.py
@@ -1,18 +1,17 @@
-"""Module containing Dataset functionality"""
+"""
+Module containing dataset functionality.
+
+We want this to be a wrapper for an existing dataset that we have loaded. Let's use
+the concept of middlewares to wrap each dataset. We'll use the collators later on to
+pad the datasets.
+"""
-import torch
from datasets import Dataset, IterableDataset
from axolotl.utils.logging import get_logger
from .prompt_tokenizers import PromptTokenizingStrategy
-# We want this to be a wrapper for an existing dataset that we have loaded
-# lets use the concept of middlewares to wrap each dataset, for example
-# ConstantLengthDataset(ShuffledDataset([TokenizedPromptDataset(alpaca_dataset)]))
-# let's check to ensure we don't truncate an item in the middle, we'll use
-# the collators later on to pad the datasets
-
LOG = get_logger(__name__)
@@ -26,7 +25,7 @@ class TokenizedPromptDataset(Dataset):
keep_in_memory: Whether to keep the tokenized dataset in memory.
"""
- def __init__( # pylint: disable=super-init-not-called
+ def __init__(
self,
prompt_tokenizer: PromptTokenizingStrategy,
dataset: Dataset,
@@ -86,133 +85,3 @@ def wrap_dataset_for_tokenized_prompt(
**map_kwargs,
)
return TokenizedPromptDataset(prompt_tokenizer, dataset, **kwargs)
-
-
-# TODO this isn't the best since it can't interleave datasets
-class ConstantLengthDataset(IterableDataset):
- """Iterable dataset that returns constant length chunks of tokens from stream of
- text files.
-
- Args:
- tokenizer: The processor used for processing the data.
- dataset: Dataset with text files.
- seq_length: Length of token sequences to return.
- """
-
- def __init__( # pylint: disable=super-init-not-called
- self,
- tokenizer,
- datasets,
- seq_length=2048,
- ):
- self.tokenizer = tokenizer
- self.concat_token_id = tokenizer.eos_token_id
- self.datasets: list[IterableDataset] = datasets
- self.seq_length = seq_length
-
- vocab_size = len(tokenizer.get_vocab())
-
- if vocab_size <= torch.iinfo(torch.int16).max:
- self.tokens_dtype = torch.int16
- elif vocab_size <= torch.iinfo(torch.int32).max:
- self.tokens_dtype = torch.int32
- else:
- self.tokens_dtype = torch.int64
-
- def __iter__(self):
- buffer = {
- "input_ids": [],
- "attention_mask": [],
- "labels": [],
- "position_ids": [],
- }
- buffer_len = 0
- for dataset in self.datasets:
- idx = 0
- iterator = iter(dataset)
- more_examples = True
- while more_examples:
- try:
- example = next(iterator)
- idx += 1
- except StopIteration:
- more_examples = False
- example = None
-
- add_concat_token = False
- if example:
- example_len = len(example["input_ids"])
- add_concat_token = example["input_ids"][-1] != self.concat_token_id
- else:
- example_len = 0
-
- if not example_len or (
- buffer_len + int(add_concat_token) + example_len > self.seq_length
- ):
- if buffer["input_ids"]:
- input_ids = torch.cat(buffer["input_ids"], dim=-1)[
- : self.seq_length
- ]
- attention_mask = torch.cat(buffer["attention_mask"], dim=-1)[
- : self.seq_length
- ]
- position_ids = torch.cat(buffer["position_ids"], dim=-1)[
- : self.seq_length
- ]
- labels = torch.cat(buffer["labels"], dim=-1)[: self.seq_length]
- if labels.size() == input_ids.size() and (
- attention_mask.size() == input_ids.size()
- ):
- yield {
- "input_ids": input_ids,
- "labels": labels,
- "attention_mask": attention_mask,
- "position_ids": position_ids,
- }
- else:
- LOG.warning(
- "Dropping batch due to tensor size mismatch "
- f"input_ids: {input_ids.size()}, "
- f"labels: {labels.size()}, "
- f"attention_mask: {attention_mask.size()}"
- )
- buffer = {
- "input_ids": [],
- "attention_mask": [],
- "labels": [],
- "position_ids": [],
- }
- buffer_len = 0
- idx = 1
-
- if example:
- # FIXME
- # just going to drop data points that are too long
- if len(example["input_ids"]) <= self.seq_length:
- input_ids = example["input_ids"]
- attention_mask = example["attention_mask"]
- labels = example["labels"]
-
- if add_concat_token:
- input_ids.append(self.concat_token_id)
- attention_mask.append(1)
- labels.append(self.concat_token_id)
-
- input_ids_with_concat = torch.tensor(
- input_ids, dtype=self.tokens_dtype
- )
- attention_mask_with_concat = torch.tensor(
- [idx * m for m in attention_mask], dtype=torch.int16
- )
- labels_with_concat = torch.tensor(
- labels, dtype=self.tokens_dtype
- )
- position_ids = torch.arange(
- len(input_ids), dtype=self.tokens_dtype
- )
-
- buffer["input_ids"].append(input_ids_with_concat)
- buffer["attention_mask"].append(attention_mask_with_concat)
- buffer["labels"].append(labels_with_concat)
- buffer["position_ids"].append(position_ids)
- buffer_len += len(input_ids)
diff --git a/src/axolotl/evaluate.py b/src/axolotl/evaluate.py
index 2b5869939..e4496bee6 100644
--- a/src/axolotl/evaluate.py
+++ b/src/axolotl/evaluate.py
@@ -79,7 +79,7 @@ def evaluate(*, cfg: DictDefault, dataset_meta: TrainDatasetMeta) -> Dict[str, f
model, tokenizer, _, processor = setup_model_and_tokenizer(cfg)
# Get datasets
- # pylint: disable=duplicate-code
+
train_dataset = dataset_meta.train_dataset
eval_dataset = dataset_meta.eval_dataset
total_num_steps = dataset_meta.total_num_steps
diff --git a/src/axolotl/integrations/base.py b/src/axolotl/integrations/base.py
index 94ee8d4b1..c66bc01c6 100644
--- a/src/axolotl/integrations/base.py
+++ b/src/axolotl/integrations/base.py
@@ -76,7 +76,7 @@ class BasePlugin:
def __init__(self):
"""Initializes the BasePlugin."""
- def register(self, cfg: dict): # pylint: disable=unused-argument
+ def register(self, cfg: dict):
"""Registers the plugin with the given configuration as an unparsed dict.
Args:
@@ -104,14 +104,13 @@ class BasePlugin:
dataset_meta: The metadata for the training dataset.
"""
- def pre_model_load(self, cfg: DictDefault): # pylint: disable=unused-argument
+ def pre_model_load(self, cfg: DictDefault):
"""Performs actions before the model is loaded.
Args:
cfg: The configuration for the plugin.
"""
- # pylint: disable=unused-argument
def post_model_build(self, cfg: DictDefault, model: PreTrainedModel):
"""Performs actions after the model is built/loaded, but before any adapters are applied.
@@ -119,7 +118,6 @@ class BasePlugin:
cfg: The configuration for the plugin.
"""
- # pylint: disable=unused-argument
def pre_lora_load(self, cfg: DictDefault, model: PreTrainedModel):
"""Performs actions before LoRA weights are loaded.
@@ -128,7 +126,6 @@ class BasePlugin:
model: The loaded model.
"""
- # pylint: disable=unused-argument
def post_lora_load(self, cfg: DictDefault, model: PreTrainedModel | PeftModel):
"""Performs actions after LoRA weights are loaded.
@@ -137,7 +134,6 @@ class BasePlugin:
model: The loaded model.
"""
- # pylint: disable=unused-argument
def post_model_load(self, cfg: DictDefault, model: PreTrainedModel | PeftModel):
"""Performs actions after the model is loaded.
@@ -146,8 +142,7 @@ class BasePlugin:
model: The loaded model.
"""
- # pylint: disable=unused-argument
- def get_trainer_cls(self, cfg: DictDefault) -> Trainer | None:
+ def get_trainer_cls(self, cfg: DictDefault) -> type[Trainer] | None:
"""Returns a custom class for the trainer.
Args:
@@ -157,7 +152,6 @@ class BasePlugin:
The first non-`None` trainer class returned by a plugin.
"""
- # pylint: disable=unused-argument
def post_trainer_create(self, cfg: DictDefault, trainer: Trainer):
"""Performs actions after the trainer is created.
@@ -166,7 +160,7 @@ class BasePlugin:
trainer: The trainer object for training.
"""
- def get_training_args(self, cfg: DictDefault): # pylint: disable=unused-argument):
+ def get_training_args(self, cfg: DictDefault):
"""
Returns custom training arguments to set on TrainingArgs.
@@ -177,9 +171,7 @@ class BasePlugin:
object: dict containing the training arguments.
"""
- def get_collator_cls_and_kwargs(
- self, cfg: DictDefault, is_eval: bool = False
- ): # pylint: disable=unused-argument):
+ def get_collator_cls_and_kwargs(self, cfg: DictDefault, is_eval: bool = False):
"""
Returns a custom class for the collator.
@@ -191,7 +183,6 @@ class BasePlugin:
class: The class for the collator.
"""
- # pylint: disable=unused-argument
def create_optimizer(self, cfg: DictDefault, trainer: Trainer) -> Optimizer | None:
"""Creates and returns an optimizer for training.
@@ -203,7 +194,6 @@ class BasePlugin:
The created optimizer.
"""
- # pylint: disable=unused-argument
def create_lr_scheduler(
self,
cfg: DictDefault,
@@ -223,7 +213,6 @@ class BasePlugin:
The created learning rate scheduler.
"""
- # pylint: disable=unused-argument
def add_callbacks_pre_trainer(
self, cfg: DictDefault, model: PreTrainedModel
) -> list[Callable]:
@@ -238,7 +227,6 @@ class BasePlugin:
"""
return []
- # pylint: disable=unused-argument
def add_callbacks_post_trainer(
self, cfg: DictDefault, trainer: Trainer
) -> list[Callable]:
@@ -254,7 +242,6 @@ class BasePlugin:
"""
return []
- # pylint: disable=unused-argument
def post_train(self, cfg: DictDefault, model: PreTrainedModel | PeftModel):
"""Performs actions after training is complete.
@@ -263,7 +250,7 @@ class BasePlugin:
model: The loaded model.
"""
- def post_train_unload(self, cfg: DictDefault): # pylint: disable=unused-argument
+ def post_train_unload(self, cfg: DictDefault):
"""Performs actions after training is complete and the model is unloaded.
Args:
@@ -311,7 +298,7 @@ def load_plugin(plugin_name: str) -> BasePlugin:
return plugin
-class PluginManager: # pylint: disable=too-many-public-methods
+class PluginManager:
"""The `PluginManager` class is responsible for loading and managing plugins. It
should be a singleton so it can be accessed from anywhere in the codebase.
diff --git a/src/axolotl/integrations/config.py b/src/axolotl/integrations/config.py
index f5fc07e9e..8ae8aab39 100644
--- a/src/axolotl/integrations/config.py
+++ b/src/axolotl/integrations/config.py
@@ -20,8 +20,8 @@ from typing import Any, Dict, List, Type
from axolotl.utils.schemas.config import (
AxolotlConfigWCapabilities as AxolotlConfigWCapabilitiesBase,
+ AxolotlInputConfig as AxolotlInputConfigBase,
)
-from axolotl.utils.schemas.config import AxolotlInputConfig as AxolotlInputConfigBase
def merge_input_args():
@@ -50,15 +50,9 @@ def merge_input_args():
dynamic_input += f"class AxolotlInputConfig(AxolotlInputConfigBase, {', '.join(plugin_classes)}):\n pass\n"
namespace: Dict[Any, Any] = {}
- exec( # pylint: disable=exec-used # nosec B102
- dynamic_input, globals(), namespace
- )
- AxolotlInputConfig = namespace[ # pylint: disable=invalid-name
- "AxolotlInputConfig"
- ]
- AxolotlConfigWCapabilities = namespace[ # pylint: disable=invalid-name
- "AxolotlConfigWCapabilities"
- ]
+ exec(dynamic_input, globals(), namespace) # nosec B102
+ AxolotlInputConfig = namespace["AxolotlInputConfig"]
+ AxolotlConfigWCapabilities = namespace["AxolotlConfigWCapabilities"]
return AxolotlConfigWCapabilities, AxolotlInputConfig
return AxolotlConfigWCapabilitiesBase, AxolotlInputConfigBase
@@ -74,7 +68,7 @@ def merge_training_args() -> Type:
Returns:
tuple: A tuple containing the newly created classes, AxolotlTrainingMixins.
"""
- # pylint: disable=duplicate-code
+
from axolotl.core.training_args_base import (
AxolotlTrainingMixins as AxolotlTrainingMixinsBase,
)
@@ -93,11 +87,7 @@ def merge_training_args() -> Type:
namespace: Dict[Any, Any] = {}
local_vars = {"AxolotlTrainingMixinsBase": AxolotlTrainingMixinsBase}
- exec( # pylint: disable=exec-used # nosec B102
- dynamic_input, {**globals(), **local_vars}, namespace
- )
- AxolotlTrainingMixins = namespace[ # pylint: disable=invalid-name
- "AxolotlTrainingMixins"
- ]
+ exec(dynamic_input, {**globals(), **local_vars}, namespace) # nosec B102
+ AxolotlTrainingMixins = namespace["AxolotlTrainingMixins"]
return AxolotlTrainingMixins
return AxolotlTrainingMixinsBase
diff --git a/src/axolotl/integrations/cut_cross_entropy/README.md b/src/axolotl/integrations/cut_cross_entropy/README.md
index ac67ebf93..81dd6a3a3 100644
--- a/src/axolotl/integrations/cut_cross_entropy/README.md
+++ b/src/axolotl/integrations/cut_cross_entropy/README.md
@@ -19,7 +19,7 @@ python scripts/cutcrossentropy_install.py | sh
- If you are installing from pip
```bash
-pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@0ee9ee8"
+pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@8a1a0ec"
```
## Usage
@@ -31,6 +31,7 @@ plugins:
## Supported Models
+- apertus
- arcee
- cohere
- cohere2
@@ -44,14 +45,23 @@ plugins:
- glm
- glm4
- glm_moe
+- glm4_moe
+- glm4v
+- glm4v_moe
- gpt_oss
- granite
- granitemoe
+- granitemoehybrid
+- granitemoeshared
- hunyuan_v1_dense
- hunyuan_v1_moe
+- lfm2
+- lfm2_moe
+- lfm2_vl
- llama
- llama4
- llama4_text
+- llava
- mistral
- mistral3
- mixtral
@@ -65,7 +75,11 @@ plugins:
- qwen2_5_vl
- qwen3
- qwen3_moe
+- qwen3_next
+- qwen3_vl
+- qwen3_vl_moe
+- seed_oss
- smollm3
- voxtral
## Citation
diff --git a/src/axolotl/integrations/cut_cross_entropy/__init__.py b/src/axolotl/integrations/cut_cross_entropy/__init__.py
index 4689cc9a8..bd0124b93 100644
--- a/src/axolotl/integrations/cut_cross_entropy/__init__.py
+++ b/src/axolotl/integrations/cut_cross_entropy/__init__.py
@@ -18,6 +18,7 @@ Module for the Plugin for Cut Cross Entropy integration with Axolotl.
Cut Cross Entropy is an optimized implementation of cross entropy loss
from Apple's ML team.
"""
+
import importlib
from functools import partial
@@ -28,13 +29,13 @@ from axolotl.utils import get_pytorch_version
from axolotl.utils.callbacks.models import get_causal_lm_model_cls_prefix
from axolotl.utils.logging import get_logger
-from .args import CutCrossEntropyArgs # pylint: disable=unused-import. # noqa: F401
+from .args import CutCrossEntropyArgs as CutCrossEntropyArgs
LOG = get_logger(__name__)
_CCE_INSTALL_MESSAGE = (
"Please install Axolotl's fork of cut_cross_entropy with transformers support using "
- '`pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@0ee9ee8"`'
+ '`pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@8a1a0ec"`'
)
@@ -106,9 +107,7 @@ class CutCrossEntropyPlugin(BasePlugin):
"""
from cut_cross_entropy.transformers.patch import PATCH_FNS
- def patch_generic(
- maybe_model, patch_options, model_type: str
- ): # pylint: disable=unused-argument
+ def patch_generic(maybe_model, patch_options, model_type: str):
import cut_cross_entropy.transformers.llama
from cut_cross_entropy.transformers.llama import cce_forward
@@ -121,12 +120,10 @@ class CutCrossEntropyPlugin(BasePlugin):
)
model_cls = getattr(module, f"{model_cls_prefix}ForCausalLM")
- cut_cross_entropy.transformers.llama._PATCH_OPTS = ( # pylint: disable=protected-access
- patch_options
- )
+ cut_cross_entropy.transformers.llama._PATCH_OPTS = patch_options
model_cls.forward = cce_forward
- # pylint: disable=duplicate-code
+
except (ImportError, AttributeError) as e:
raise RuntimeError(
f"Could not import ForCausalLM class for model_type: {model_type}. "
diff --git a/src/axolotl/integrations/cut_cross_entropy/args.py b/src/axolotl/integrations/cut_cross_entropy/args.py
index 22852479a..3eeb9fac7 100644
--- a/src/axolotl/integrations/cut_cross_entropy/args.py
+++ b/src/axolotl/integrations/cut_cross_entropy/args.py
@@ -15,6 +15,7 @@
"""
Module for handling Cut Cross Entropy input arguments.
"""
+
from typing import Optional
from pydantic import BaseModel, model_validator
diff --git a/src/axolotl/integrations/diffusion/README.md b/src/axolotl/integrations/diffusion/README.md
new file mode 100644
index 000000000..c27f33de1
--- /dev/null
+++ b/src/axolotl/integrations/diffusion/README.md
@@ -0,0 +1,154 @@
+# Diffusion LM Training Plugin for Axolotl
+
+This plugin enables diffusion language model training using an approach inspired by
+LLaDA (Large Language Diffusion Models) within Axolotl.
+
+## Overview
+
+LLaDA is a diffusion-based approach to language model training that uses:
+- **Random token masking** during training instead of next-token prediction
+- **Bidirectional attention** to allow the model to attend to the full context
+- **Importance weighting** based on masking probabilities for stable training
+
+This approach can lead to more robust language models with better understanding of
+bidirectional context.
+
+## Installation
+
+The plugin is included with Axolotl. See our
+[installation docs](https://docs.axolotl.ai/docs/installation.html).
+
+## Quickstart
+
+Train with an example config (Llama-3.2 1B):
+ - Pretrain: `axolotl train examples/llama-3/diffusion-3.2-1b-pretrain.yaml`
+ - SFT: `axolotl train examples/llama-3/diffusion-3.2-1b-sft.yaml`
+
+### Basic Configuration
+
+You can also modify your existing configs to enable or customize diffusion training.
+
+Add the following to your Axolotl config:
+
+```yaml
+# Enable diffusion LM training plugin
+plugins:
+ - axolotl.integrations.diffusion.DiffusionPlugin
+```
+
+Then configure the nested `diffusion` block (defaults shown):
+
+```yaml
+diffusion:
+ noise_schedule: linear # or "cosine"
+ min_mask_ratio: 0.1
+ max_mask_ratio: 0.9
+ num_diffusion_steps: 128
+ eps: 1e-3
+ importance_weighting: true
+
+ # Mask token (training auto-adds if missing, avoid pad/eos)
+ mask_token_str: "<|diffusion_mask|>"
+ # Or use an existing special token id (e.g., 128002 for Llama-3.x)
+ # mask_token_id: 128002
+
+ # Sample generation during training (optional)
+ generate_samples: true
+ generation_interval: 100
+ num_generation_samples: 3
+ generation_steps: 128
+ generation_temperature: 0.0
+ generation_max_length: 100
+```
+
+## Supported Models
+
+Any models that support 4D attention masks should work out of the box. If not, please
+create an [issue](https://github.com/axolotl-ai-cloud/axolotl/issues) or open a
+[PR](https://github.com/axolotl-ai-cloud/axolotl/compare)!
+
+## How It Works
+
+### Random Masking
+During training, tokens are randomly masked:
+- Sample timestep `t` uniformly from [0, 1]
+- Calculate masking probability: `p = (1 - eps) * t + eps`
+- Randomly mask tokens with probability `p`
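+
+A minimal sketch of this forward process (the function and tensor names here are
+illustrative, not the plugin's internals):
+
+```python
+import torch
+
+
+def forward_mask(input_ids: torch.Tensor, mask_token_id: int, eps: float = 1e-3):
+    """Sample a timestep per sequence and mask tokens with probability p."""
+    batch, seq_len = input_ids.shape
+    t = torch.rand(batch, 1)  # t ~ U[0, 1], one per sequence
+    p_mask = (1 - eps) * t + eps  # masking probability in [eps, 1]
+    is_masked = torch.rand(batch, seq_len) < p_mask
+    noisy_ids = torch.where(
+        is_masked, torch.full_like(input_ids, mask_token_id), input_ids
+    )
+    return noisy_ids, is_masked, p_mask
+```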
+
+### Diffusion Loss
+
+Loss is computed only on masked tokens with (optional) importance weighting:
+
+```python
+loss = sum(cross_entropy(pred, target) / p_mask) / total_tokens
+```
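+
+Spelled out as a PyTorch sketch under the same notation (`logits`, `labels`,
+`is_masked`, and `p_mask` as in the masking step above; this mirrors the formula,
+not the plugin's exact implementation):
+
+```python
+import torch.nn.functional as F
+
+
+def diffusion_loss(logits, labels, is_masked, p_mask):
+    # Per-token cross entropy; no reduction so each token can be weighted.
+    ce = F.cross_entropy(logits.transpose(1, 2), labels, reduction="none")
+    # Weight each masked token by 1 / p_mask; unmasked tokens contribute zero.
+    weighted = (ce / p_mask) * is_masked
+    return weighted.sum() / labels.numel()
+```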
+
+## Sample Generation
+
+When `diffusion.generate_samples: true`, the plugin generates samples during training:
+
+```
+Sample 1:
+ Original (45 tokens): The quick brown fox jumps over the lazy dog...
+ Masked (18/45 tokens, 40.0%): The [MASK] [MASK] fox [MASK] over [MASK] lazy [MASK]...
+ Generated: The quick brown fox jumps over the lazy dog...
+```
+
+Samples are logged to console and wandb (if enabled).
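+
+Generation runs the reverse process: start from masked positions and iteratively
+re-predict them, committing the most confident tokens at each step. A rough greedy
+sketch (temperature 0, batch size 1; names are illustrative, and this is not the
+plugin's actual `generate` implementation):
+
+```python
+import torch
+
+
+@torch.no_grad()
+def unmask_iteratively(model, ids, mask_token_id, steps=128):
+    is_masked = ids == mask_token_id
+    per_step = max(1, int(is_masked.sum()) // steps)  # positions revealed per step
+    while is_masked.any():
+        logits = model(input_ids=ids).logits  # (1, seq_len, vocab)
+        conf, pred = logits.softmax(-1).max(-1)  # (1, seq_len)
+        conf = conf.masked_fill(~is_masked, -1.0)  # only rank masked positions
+        top = conf.view(-1).topk(min(per_step, int(is_masked.sum()))).indices
+        ids.view(-1)[top] = pred.view(-1)[top]
+        is_masked.view(-1)[top] = False
+    return ids
+```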
+
+## Inference
+
+Diffusion inference is integrated into the standard Axolotl CLI. Use the same config
+you trained with and run:
+
+```
+axolotl inference path/to/your-config.yaml
+```
+
+Optionally, pass `--gradio` to use a simple web interface.
+
+Interactive controls (prefix the prompt with commands):
+- `:complete N` → completion mode with N new masked tokens appended (default 64)
+- `:mask R` → random masking mode with target mask ratio R in [0.0, 1.0]
+
+Example session:
+
+```
+================================================================================
+Commands:
+:complete N -> completion mode with N tokens (default 64)
+:mask R -> random masking with ratio R (0.0–1.0)
+================================================================================
+Give me an instruction (Ctrl + D to submit):
+
+:mask 0.4 The quick brown fox jumps over the lazy dog
+
+Masked (40.0%):
+The [MASK] brown [MASK] jumps over the [MASK] dog
+
+Generated:
+The quick brown fox jumps over the loud dog
+```
+
+## Metrics and Monitoring
+
+The plugin adds (or modifies) several metrics to track diffusion training:
+
+- `train/loss`: Weighted diffusion loss
+- `train/accuracy`: Accuracy on masked tokens
+- `train/mask_ratio`: Average fraction of tokens masked
+- `train/num_masked_tokens`: Number of tokens masked
+- `train/avg_p_mask`: Average masking probability
+- `train/ce_loss`: Unweighted cross-entropy loss
+- `train/importance_weight_avg`: Average importance weight
+
+## Limitations
+
+- No flash attention support
+- No RL training support
+
+## References
+
+- [LLaDA Paper](https://arxiv.org/abs/2502.09992)
+- [Axolotl Documentation](https://docs.axolotl.ai/)
+- [API reference for plugin](https://docs.axolotl.ai/docs/api/integrations.diffusion.args.html#axolotl.integrations.diffusion.args)
diff --git a/src/axolotl/integrations/diffusion/__init__.py b/src/axolotl/integrations/diffusion/__init__.py
new file mode 100644
index 000000000..9e38cc5c1
--- /dev/null
+++ b/src/axolotl/integrations/diffusion/__init__.py
@@ -0,0 +1,19 @@
+"""Diffusion LM training plugin init."""
+
+from .args import DiffusionArgs, DiffusionConfig
+from .callbacks import DiffusionGenerationCallback
+from .generation import generate
+from .plugin import DiffusionPlugin
+from .trainer import DiffusionTrainer
+from .utils import create_bidirectional_attention_mask, resolve_mask_token_id
+
+__all__ = [
+ "DiffusionArgs",
+ "DiffusionPlugin",
+ "DiffusionTrainer",
+ "generate",
+ "resolve_mask_token_id",
+ "create_bidirectional_attention_mask",
+ "DiffusionGenerationCallback",
+ "DiffusionConfig",
+]
diff --git a/src/axolotl/integrations/diffusion/args.py b/src/axolotl/integrations/diffusion/args.py
new file mode 100644
index 000000000..4f5bfe499
--- /dev/null
+++ b/src/axolotl/integrations/diffusion/args.py
@@ -0,0 +1,95 @@
+"""Config args for diffusion LM training (nested under `diffusion:`)."""
+
+from __future__ import annotations
+
+from typing import Literal
+
+from pydantic import BaseModel, Field, model_validator
+
+
+class DiffusionConfig(BaseModel):
+ """Nested diffusion configuration available under the `diffusion` key."""
+
+ # Noise schedule config
+ noise_schedule: Literal["linear", "cosine"] = Field(
+ default="linear", description="Type of noise schedule for diffusion training"
+ )
+ min_mask_ratio: float = Field(
+ default=0.1,
+ ge=0.0,
+ le=1.0,
+ description="Minimum masking ratio for diffusion noise schedule",
+ )
+ max_mask_ratio: float = Field(
+ default=0.9,
+ ge=0.0,
+ le=1.0,
+ description="Maximum masking ratio for diffusion noise schedule",
+ )
+ num_diffusion_steps: int = Field(
+ default=128, ge=1, description="Number of diffusion timesteps"
+ )
+ eps: float = Field(
+ default=1e-3,
+ ge=0.0,
+ le=1.0,
+ description="Epsilon value for minimum masking probability in forward process",
+ )
+
+ # Training config
+ importance_weighting: bool = Field(
+ default=True,
+ description="Apply importance weighting to loss based on masking probability",
+ )
+ mask_token_id: int | None = Field(
+ default=None,
+ description=(
+ "Token ID to use for masking. Unset by default; can use one of the "
+ "tokenizer's special tokens here."
+ ),
+ )
+ mask_token_str: str | None = Field(
+ default=None,
+ description=(
+ "Token string to use as a mask. If `mask_token_id` is invalid or unset, "
+ "this token will be ensured to exist as an additional special token and "
+ "used. If absent, a default '<|diffusion_mask|>' will be added."
+ ),
+ )
+
+ # Sample generation config
+ generate_samples: bool = Field(
+ default=True, description="Enable sample generation during training"
+ )
+ generation_interval: int = Field(
+ default=100, ge=1, description="Generate samples every N steps"
+ )
+ num_generation_samples: int = Field(
+ default=3, ge=1, description="Number of samples to generate each time"
+ )
+ generation_steps: int = Field(
+ default=128, ge=1, description="Number of diffusion steps for generation"
+ )
+ generation_temperature: float = Field(
+ default=0.0,
+ ge=0.0,
+ description="Temperature for generation sampling (0.0 = deterministic)",
+ )
+ generation_max_length: int = Field(
+ default=100, ge=1, description="Maximum sequence length for generation"
+ )
+
+ @model_validator(mode="after")
+ def _validate_mask_ratios(self) -> "DiffusionConfig":
+ if self.min_mask_ratio > self.max_mask_ratio:
+ raise ValueError("min_mask_ratio must be ≤ max_mask_ratio")
+ return self
+
+
+class DiffusionArgs(BaseModel):
+ """Plugin entry that exposes the nested `diffusion` block to the core config."""
+
+ diffusion: DiffusionConfig = Field(
+ default_factory=DiffusionConfig,
+ description="Diffusion training configuration. Only nested block is supported.",
+ )
diff --git a/src/axolotl/integrations/diffusion/callbacks.py b/src/axolotl/integrations/diffusion/callbacks.py
new file mode 100644
index 000000000..18a64023b
--- /dev/null
+++ b/src/axolotl/integrations/diffusion/callbacks.py
@@ -0,0 +1,174 @@
+"""Callbacks for diffusion training."""
+
+import logging
+import sys
+
+import wandb
+from colorama import Fore, Style
+from transformers.trainer_callback import TrainerCallback, TrainerControl, TrainerState
+from transformers.training_args import TrainingArguments
+
+from .generation import generate_samples
+
+# Simpler logger for more readable sample generation
+logger = logging.getLogger(__name__)
+if not logger.handlers:
+ handler = logging.StreamHandler(sys.stdout)
+ handler.setFormatter(logging.Formatter("%(message)s"))
+ logger.addHandler(handler)
+ logger.propagate = False
+logger.setLevel(logging.INFO)
+
+
+class DiffusionGenerationCallback(TrainerCallback):
+ """Callback for generating samples during diffusion training."""
+
+ def __init__(self, trainer):
+ self.trainer = trainer
+
+ def on_step_end(
+ self,
+ args: TrainingArguments,
+ state: TrainerState,
+ control: TrainerControl,
+ **kwargs,
+ ):
+ """Generate samples at specified intervals."""
+ if (
+ state.global_step > 0
+ and state.global_step % self.trainer.cfg.diffusion.generation_interval == 0
+ ):
+ if not self.trainer.state.is_world_process_zero:
+ return
+
+ # Use eval dataloader if available, otherwise use train dataloader
+ dataloader = None
+ try:
+ if getattr(self.trainer, "eval_dataset", None) is not None:
+ dataloader = self.trainer.get_eval_dataloader()
+ except Exception:
+ dataloader = None
+ if dataloader is None:
+ dataloader = self.trainer.get_train_dataloader()
+
+ # Generate samples
+ diffusion_cfg = self.trainer.cfg.diffusion
+ samples = generate_samples(
+ model=self.trainer.model,
+ tokenizer=self.trainer.processing_class,
+ dataloader=dataloader,
+ num_generation_samples=diffusion_cfg.num_generation_samples,
+ max_length=diffusion_cfg.generation_max_length,
+ num_diffusion_steps=diffusion_cfg.generation_steps,
+ temperature=diffusion_cfg.generation_temperature,
+ mask_token_id=diffusion_cfg.mask_token_id,
+ )
+
+ # Log samples
+ self._log_samples(samples, state.global_step)
+
+ def _log_samples(self, samples: list, step: int):
+ """Log generated samples."""
+ if not samples:
+ return
+
+ logger.info("=" * 60)
+ logger.info("GENERATED SAMPLES")
+ logger.info("=" * 60)
+
+ for i, sample_data in enumerate(samples, 1):
+ original = sample_data["original"]
+ masked = sample_data["masked"]
+ generated = sample_data["generated"]
+ mask_ratio = sample_data["mask_ratio"]
+ masked_tokens = sample_data["masked_tokens"]
+ total_tokens = sample_data["total_tokens"]
+
+ logger.info(f"\nSample {i}:")
+ logger.info(f"\tOriginal ({total_tokens} tokens): {original}")
+ logger.info(
+ f"\tMasked ({masked_tokens}/{total_tokens} tokens, "
+ f"{mask_ratio:.1%}): {masked}"
+ )
+
+ try:
+ gen_ids = sample_data.get("generated_ids")
+ orig_ids = sample_data.get("orig_ids")
+ masked_positions = set(sample_data.get("masked_positions") or [])
+                if isinstance(gen_ids, list) and isinstance(orig_ids, list):
+                    # Color each generated token: green = masked token recovered
+                    # correctly, red = masked token predicted incorrectly,
+                    # dim = unmasked context token the model left unchanged.
+                    styles: list[str] = []
+                    for pos, tid in enumerate(gen_ids):
+                        if pos in masked_positions:
+                            if pos < len(orig_ids) and tid == orig_ids[pos]:
+                                styles.append("green")
+                            elif pos < len(orig_ids):
+                                styles.append("red")
+                            else:
+                                styles.append("normal")
+                        else:
+                            same = pos < len(orig_ids) and tid == orig_ids[pos]
+                            styles.append("dim" if same else "normal")
+
+                    # Merge consecutive tokens with the same style into spans so
+                    # each span is decoded and colored in one piece.
+                    spans: list[tuple[str, int, int]] = []
+                    if gen_ids:
+                        cur = styles[0]
+                        start = 0
+                        for pos in range(1, len(gen_ids)):
+                            s = styles[pos]
+                            if s != cur:
+                                spans.append((cur, start, pos))
+                                cur, start = s, pos
+                        spans.append((cur, start, len(gen_ids)))
+
+ parts = []
+ for style_name, a, b in spans:
+ chunk_text = self.trainer.processing_class.decode(
+ gen_ids[a:b], skip_special_tokens=False
+ )
+ if style_name == "green":
+ parts.append(Fore.GREEN + chunk_text + Style.RESET_ALL)
+ elif style_name == "red":
+ parts.append(Fore.RED + chunk_text + Style.RESET_ALL)
+                        elif style_name == "dim":
+                            parts.append(Style.DIM + chunk_text + Style.RESET_ALL)
+                        else:
+                            parts.append(chunk_text)
+ logger.info("\tGenerated:\n%s", "".join(parts))
+ else:
+ logger.info(f"\tGenerated: {generated}")
+ except Exception:
+ logger.info(f"\tGenerated: {generated}")
+
+ logger.info("=" * 60)
+
+ if self.trainer.cfg.use_wandb:
+ if wandb.run is not None:
+ wandb.log(
+ {
+ "generated_samples": wandb.Table(
+ columns=[
+ "step",
+ "original",
+ "masked",
+ "generated",
+ "mask_ratio",
+ "masked_tokens",
+ "total_tokens",
+ ],
+ data=[
+ [
+ step,
+ sample["original"],
+ sample["masked"],
+ sample["generated"],
+ f"{sample['mask_ratio']:.1%}",
+ sample["masked_tokens"],
+ sample["total_tokens"],
+ ]
+ for sample in samples
+ ],
+ )
+ },
+ step=step,
+ )
diff --git a/src/axolotl/integrations/diffusion/generation.py b/src/axolotl/integrations/diffusion/generation.py
new file mode 100644
index 000000000..ec517fd23
--- /dev/null
+++ b/src/axolotl/integrations/diffusion/generation.py
@@ -0,0 +1,409 @@
+"""Sample generation utilities for diffusion training."""
+
+import re
+from typing import Any, List, Literal, Optional
+
+import torch
+
+from axolotl.utils.logging import get_logger
+
+from .utils import create_bidirectional_attention_mask, shift_logits_to_input_positions
+
+LOG = get_logger(__name__)
+
+
+def generate_samples(
+ model: torch.nn.Module,
+ tokenizer: Any,
+ dataloader: Optional[Any] = None,
+ num_generation_samples: int = 3,
+ max_length: int = 100,
+ num_diffusion_steps: int = 128,
+ temperature: float = 0.0,
+ mask_token_id: int = 32000,
+ mode: Literal["random", "completion"] = "random",
+ completion_tokens: int = 0,
+ target_mask_ratio: Optional[float] = None,
+) -> List[dict]:
+ """
+ Generate text samples using the diffusion model by randomly masking sequences from
+ the given dataset and running the reverse diffusion process.
+
+ Args:
+ model: The wrapped or unwrapped model
+ tokenizer: Tokenizer for encoding/decoding
+ dataloader: Validation dataloader (for sampling sequences)
+ num_generation_samples: Number of samples to generate
+ max_length: Maximum length of sequences to use
+ num_diffusion_steps: Number of diffusion steps for generation
+ temperature: Temperature for sampling (0.0 = deterministic)
+        mask_token_id: Token ID used for masking
+        mode: "random" masks tokens in place; "completion" appends mask tokens
+        completion_tokens: Number of mask tokens to append in "completion" mode
+        target_mask_ratio: Fixed mask ratio for "random" mode; a ratio in
+            [0.1, 0.7] is drawn at random if None
+
+ Returns:
+ List of dictionaries with original text, masked text, and generated text
+ """
+ if dataloader is None:
+ LOG.warning("No validation dataloader provided, cannot generate samples")
+ return []
+
+ unwrapped_model = model.module if hasattr(model, "module") else model
+ training = unwrapped_model.training
+ unwrapped_model.eval()
+
+ # Resolve device robustly (some modules don't expose `.device`)
+ device = getattr(unwrapped_model, "device", None)
+ if device is None:
+ try:
+ device = next(unwrapped_model.parameters()).device
+ except StopIteration:
+ device = torch.device("cpu")
+ generations = []
+
+ # Sample sequences from validation dataset
+ sampled_sequences = _sample_sequences_from_dataloader(
+ dataloader, num_generation_samples, max_length, device
+ )
+    LOG.info(f"Sampled {len(sampled_sequences)} sequences from the dataloader")
+
+ # Generate samples using reverse diffusion process
+ with torch.no_grad():
+ for sample in sampled_sequences:
+ if isinstance(sample, dict):
+ original_sequence = sample.get("input_ids")
+ labels_seq = sample.get("labels")
+ attn_seq = sample.get("attention_mask")
+ else:
+ original_sequence = sample
+ labels_seq = None
+ attn_seq = None
+ generation_result = generate(
+ unwrapped_model,
+ tokenizer,
+ original_sequence,
+ num_diffusion_steps,
+ temperature,
+ mask_token_id,
+ mode=mode,
+ completion_tokens=completion_tokens,
+ target_mask_ratio=target_mask_ratio,
+ labels=labels_seq,
+ attention_mask=attn_seq,
+ )
+ generations.append(generation_result)
+
+    # Restore prior training state (the model was switched to eval above)
+    if training:
+        unwrapped_model.train()
+
+ return generations
+
+
+def _sample_sequences_from_dataloader(
+ dataloader: Any, num_samples: int, max_length: int, device: torch.device
+) -> List[Any]:
+ """Sample sequences from validation dataloader."""
+ sampled_sequences: list[dict[str, torch.Tensor] | torch.Tensor] = []
+ sample_count = 0
+
+ # Skip a random number of batches (we could be more clever about this)
+ skip_batches = torch.randint(0, 10, (1,)).item()
+ batch_count = 0
+
+ for batch in dataloader:
+ # Skip some batches for variety
+ if batch_count < skip_batches:
+ batch_count += 1
+ continue
+
+ if sample_count >= num_samples:
+ break
+
+ batch_count += 1
+ input_ids = batch["input_ids"]
+ attention_mask = batch.get("attention_mask")
+ labels = batch.get("labels")
+
+ # Randomly sample from sequences in this batch
+ batch_indices = torch.randperm(input_ids.size(0)).tolist()
+
+ for i in batch_indices:
+ if sample_count >= num_samples:
+ break
+
+ # Get actual sequence length (non-padded)
+ if attention_mask is not None:
+ seq_len = attention_mask[i].sum().item()
+ else:
+ seq_len = input_ids.size(1)
+
+            # Skip very short sequences; they make poor generation samples
+            if seq_len < 10:
+                continue
+
+ # Determine truncation length
+ max_total = min(seq_len, max_length)
+ if labels is not None:
+ labels_i = labels[i][:seq_len]
+ answer_mask = labels_i != -100
+ if not answer_mask.any():
+ # No answer tokens; skip for SFT masking
+ continue
+ first_ans_idx = int(
+ torch.nonzero(answer_mask, as_tuple=False)[0].item()
+ )
+ prompt_len = first_ans_idx
+ if prompt_len >= max_total:
+ # Prompt alone reaches cap; cannot include any answer
+ continue
+ remaining_answer = int(answer_mask[prompt_len:].sum().item())
+ allowed_answer = max_total - prompt_len
+ take_answer = min(remaining_answer, allowed_answer)
+ if take_answer <= 0:
+ continue
+ actual_length = prompt_len + take_answer
+ else:
+ actual_length = max_total
+
+ # Extract the (possibly truncated) sequence
+ sequence = input_ids[i][:actual_length].unsqueeze(0).to(device)
+ attn_seq = (
+ attention_mask[i][:actual_length].unsqueeze(0).to(device)
+ if attention_mask is not None
+ else None
+ )
+ if labels is not None:
+ labels_seq = labels[i][:actual_length].unsqueeze(0).to(device)
+ sampled_sequences.append(
+ {
+ "input_ids": sequence,
+ "labels": labels_seq,
+ "attention_mask": attn_seq,
+ }
+ )
+ else:
+ if attn_seq is not None:
+ sampled_sequences.append(
+ {"input_ids": sequence, "attention_mask": attn_seq}
+ )
+ else:
+ sampled_sequences.append(sequence)
+ sample_count += 1
+
+ return sampled_sequences
+
+
+def generate(
+ model: torch.nn.Module,
+ tokenizer: Any,
+ original_sequence: torch.Tensor,
+ num_diffusion_steps: int,
+ temperature: float,
+ mask_token_id: int,
+ *,
+ mode: Literal["random", "completion"] = "random",
+ completion_tokens: int = 0,
+ target_mask_ratio: Optional[float] = None,
+ labels: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+) -> dict:
+ """Generate a single sample using reverse diffusion."""
+ # Get original text for comparison
+ original_text = tokenizer.decode(
+ original_sequence[0].cpu(), skip_special_tokens=True
+ )
+
+ # Build masked sequence
+ if (
+ labels is not None
+ and labels.numel() > 0
+ and (labels == -100).any()
+ and (labels != -100).any()
+ ):
+ # SFT case: completely mask all answer tokens (labels != -100)
+ total_tokens = original_sequence.size(1)
+ masked_indices = (labels != -100).to(dtype=torch.bool)
+ masked_sequence = original_sequence.clone()
+ masked_sequence[masked_indices] = mask_token_id
+ masked_tokens = int(masked_indices.sum().item())
+ mask_ratio = masked_tokens / max(int(total_tokens), 1)
+ elif mode == "completion" and completion_tokens > 0:
+ # Append mask tokens to the right for completion
+ total_tokens = original_sequence.size(1) + int(completion_tokens)
+ masked_indices = torch.zeros(
+ 1, total_tokens, dtype=torch.bool, device=original_sequence.device
+ )
+ masked_indices[0, -int(completion_tokens) :] = True
+
+ append = torch.full(
+ (1, int(completion_tokens)), mask_token_id, device=original_sequence.device
+ )
+ masked_sequence = torch.cat([original_sequence, append], dim=1)
+ masked_tokens = int(completion_tokens)
+ mask_ratio = masked_tokens / total_tokens
+ else:
+ # Apply random masking with optional fixed ratio
+ total_tokens = original_sequence.size(1)
+ if target_mask_ratio is None:
+ min_ratio, max_ratio = 0.1, 0.7
+ target_mask_ratio = (
+ torch.rand(1).item() * (max_ratio - min_ratio) + min_ratio
+ )
+ target_masked_tokens = max(1, int(total_tokens * float(target_mask_ratio)))
+
+ # Create random mask indices
+ mask_positions = torch.randperm(total_tokens)[:target_masked_tokens]
+ masked_indices = torch.zeros(
+ 1, total_tokens, dtype=torch.bool, device=original_sequence.device
+ )
+ masked_indices[0, mask_positions] = True
+
+ # Create masked sequence
+ masked_sequence = original_sequence.clone()
+ masked_sequence[masked_indices] = mask_token_id
+
+ # Calculate actual mask ratio
+ masked_tokens = masked_indices.sum().item()
+ mask_ratio = masked_tokens / total_tokens
+
+ # Get masked text for comparison
+ masked_text = tokenizer.decode(masked_sequence[0].cpu(), skip_special_tokens=False)
+ masked_text = _clean_masked_text(masked_text, tokenizer, mask_token_id)
+
+ # Run reverse diffusion process
+ sequence = masked_sequence.clone()
+ attention_mask = create_bidirectional_attention_mask(
+ sequence, attention_mask, sample_packing=attention_mask is not None
+ )
+ for step in range(num_diffusion_steps):
+ sequence = _diffusion_step(
+ model,
+ sequence,
+ step,
+ num_diffusion_steps,
+ temperature,
+ mask_token_id,
+ attention_mask,
+ )
+ generated_text = tokenizer.decode(sequence[0].cpu(), skip_special_tokens=True)
+
+ # Collect diagnostic info
+ final_ids = sequence[0].detach().cpu().tolist()
+ orig_ids_for_render = original_sequence[0].detach().cpu().tolist()
+    # masked_indices is always a [1, total_tokens] bool tensor at this point
+    masked_positions = torch.where(masked_indices[0])[0].detach().cpu().tolist()
+
+ result = {
+ "original": original_text,
+ "masked": masked_text,
+ "generated": generated_text,
+ "mask_ratio": mask_ratio,
+ "masked_tokens": masked_tokens,
+ "total_tokens": total_tokens,
+ "generated_ids": final_ids,
+ "masked_positions": masked_positions,
+ "orig_ids": orig_ids_for_render,
+ "formatted": (
+ f"Original: '{original_text}' → Masked: '{masked_text}' "
+ f"({mask_ratio:.1%}) → Generated: '{generated_text}'"
+ ),
+ }
+
+ return result
+
+
+def _clean_masked_text(masked_text: str, tokenizer: Any, mask_token_id: int) -> str:
+ """Clean up masked text for display."""
+ mask_token_repr = tokenizer.decode([mask_token_id], skip_special_tokens=False)
+ cleaned = masked_text.replace(mask_token_repr, "[MASK]")
+
+ # Remove literal special token strings
+ if hasattr(tokenizer, "special_tokens_map"):
+ for token_value in tokenizer.special_tokens_map.values():
+ if token_value and isinstance(token_value, str):
+ cleaned = cleaned.replace(token_value, "")
+
+ # Normalize whitespace but preserve newlines
+ cleaned = cleaned.replace("\r\n", "\n").replace("\r", "\n")
+ cleaned = re.sub(r"[ \t]+", " ", cleaned)
+ cleaned = "\n".join(line.rstrip() for line in cleaned.split("\n")).strip()
+ return cleaned
+
+
+def _diffusion_step(
+ model: torch.nn.Module,
+ sequence: torch.Tensor,
+ step: int,
+ num_diffusion_steps: int,
+ temperature: float,
+ mask_token_id: int,
+ attention_mask: torch.Tensor | None = None,
+) -> torch.Tensor:
+ """Perform a single diffusion step with remasking."""
+ # Only process if there are masked tokens remaining
+ current_mask = sequence == mask_token_id
+ if not current_mask.any():
+ return sequence
+
+ # Create or use provided attention mask
+ if attention_mask is None:
+ batch_size, seq_len = sequence.shape
+ attention_mask = torch.ones(
+ batch_size, 1, seq_len, seq_len, dtype=torch.bool, device=sequence.device
+ )
+
+ # Forward pass
+ outputs = model(input_ids=sequence, attention_mask=attention_mask)
+ logits = shift_logits_to_input_positions(outputs.logits)
+
+ # Only sample at currently masked positions
+ if current_mask.any():
+ masked_logits = logits[current_mask]
+
+ # Apply temperature scaling
+ if temperature > 0:
+ scaled_logits = masked_logits / temperature
+ else:
+ scaled_logits = masked_logits
+
+ # Suppress mask token in outputs
+ scaled_logits[:, mask_token_id] = -float("inf")
+
+ if temperature > 0:
+ # Add Gumbel noise for sampling
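+            # (Gumbel-max trick: argmax(logits / T + Gumbel noise) is an exact
+            # sample from softmax(logits / T), avoiding an explicit multinomial)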
+ gumbel_noise = -torch.log(
+ -torch.log(torch.rand_like(scaled_logits, dtype=torch.float32))
+ )
+ gumbel_logits = scaled_logits + gumbel_noise
+ predicted_tokens = torch.argmax(gumbel_logits, dim=-1)
+ else:
+ predicted_tokens = torch.argmax(scaled_logits, dim=-1)
+
+ # Calculate probabilities for confidence scoring
+ probs = torch.softmax(scaled_logits, dim=-1)
+ predicted_token_probs = probs[range(len(predicted_tokens)), predicted_tokens]
+
+ # Determine how many tokens to unmask this step
+ remaining_masked = current_mask.sum().item()
+ if step == num_diffusion_steps - 1:
+ num_to_unmask = remaining_masked
+ else:
+ unmask_ratio = 1.0 / (num_diffusion_steps - step)
+ num_to_unmask = max(1, int(remaining_masked * unmask_ratio))
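+            # e.g. 10 tokens still masked with 4 steps remaining gives
+            # unmask_ratio = 0.25 and num_to_unmask = 2, spreading the
+            # unmasking across the remaining steps.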
+
+ # Select highest confidence predictions to unmask
+ if num_to_unmask >= remaining_masked:
+ sequence[current_mask] = predicted_tokens
+        else:
+            # Unmask only the highest-confidence predictions. This indexing
+            # assumes batch_size == 1, which holds since generation operates
+            # on single unsqueezed sequences.
+            _, top_indices = predicted_token_probs.topk(num_to_unmask)
+            mask_positions = torch.where(current_mask)[1]
+            positions_to_unmask = mask_positions[top_indices]
+            sequence[0, positions_to_unmask] = predicted_tokens[top_indices]
+
+ return sequence
diff --git a/src/axolotl/integrations/diffusion/plugin.py b/src/axolotl/integrations/diffusion/plugin.py
new file mode 100644
index 000000000..c31f48b03
--- /dev/null
+++ b/src/axolotl/integrations/diffusion/plugin.py
@@ -0,0 +1,41 @@
+"""Diffusion LM training plugin for Axolotl."""
+
+from peft import PeftModel
+from transformers import PreTrainedModel
+
+from axolotl.integrations.base import BasePlugin
+from axolotl.utils.dict import DictDefault
+from axolotl.utils.logging import get_logger
+
+from .trainer import DiffusionTrainer
+
+LOG = get_logger(__name__)
+
+
+class DiffusionPlugin(BasePlugin):
+ """
+ Plugin for diffusion language model training.
+
+ This plugin enables diffusion-based training using the LLaDA approach, which uses
+ random masking and bidirectional attention to train language models.
+ """
+
+ def __init__(self):
+ super().__init__()
+ self.cfg = None
+
+ def get_input_args(self) -> str:
+ """Returns the pydantic model for LLaDA plugin arguments."""
+ return "axolotl.integrations.diffusion.DiffusionArgs"
+
+ def post_model_load(self, cfg: DictDefault, model: PreTrainedModel | PeftModel):
+ """Perform actions after model is loaded."""
+ self.cfg = cfg
+
+ def get_trainer_cls(self, cfg: DictDefault) -> type[DiffusionTrainer] | None:
+ """Return custom trainer class for diffusion training."""
+ return DiffusionTrainer
+
+ def post_trainer_create(self, cfg: DictDefault, trainer: DiffusionTrainer):
+ """Configure trainer after creation."""
+ trainer.set_config(cfg)
diff --git a/src/axolotl/integrations/diffusion/trainer.py b/src/axolotl/integrations/diffusion/trainer.py
new file mode 100644
index 000000000..dfaef2a48
--- /dev/null
+++ b/src/axolotl/integrations/diffusion/trainer.py
@@ -0,0 +1,301 @@
+"""Custom trainer for diffusion LM training."""
+
+from typing import Any, Literal
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from axolotl.core.trainers.base import AxolotlTrainer
+from axolotl.utils.dict import DictDefault
+from axolotl.utils.logging import get_logger
+
+from .callbacks import DiffusionGenerationCallback
+from .utils import create_bidirectional_attention_mask, shift_logits_to_input_positions
+
+LOG = get_logger(__name__)
+
+
+class DiffusionTrainer(AxolotlTrainer):
+ """Custom trainer for diffusion LM training that overrides loss computation."""
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.cfg = None
+ self._special_token_ids = None
+
+ def set_config(self, config: DictDefault):
+ """Set config for diffusion training."""
+ self.cfg = config
+ self._cache_special_token_ids()
+ self._resolve_mask_token_id()
+
+        token_id = int(getattr(self.cfg.diffusion, "mask_token_id", None) or 0)
+ LOG.info(f"Diffusion: using mask_token_id={token_id}")
+
+ if getattr(config.diffusion, "generate_samples", True):
+ generation_callback = DiffusionGenerationCallback(self)
+ self.add_callback(generation_callback)
+
+ def _resolve_mask_token_id(self) -> None:
+ """Ensure mask_token_id is valid for the current tokenizer."""
+ from .utils import resolve_mask_token_id
+
+ tokenizer = getattr(self, "processing_class", None)
+ if tokenizer is None:
+ return
+
+ mid = resolve_mask_token_id(
+ tokenizer,
+ self.cfg,
+ allow_add=True,
+ model=getattr(self, "model", None),
+ )
+ try:
+ self.cfg.diffusion.mask_token_id = int(mid)
+ except Exception:
+ pass
+
+ def compute_loss(
+ self,
+ model: nn.Module,
+ inputs: dict[str, torch.Tensor],
+ return_outputs: bool = False,
+ num_items_in_batch: torch.Tensor | None = None,
+ ) -> torch.Tensor | tuple[torch.Tensor, dict[str, torch.Tensor]]:
+ """Override compute_loss to use diffusion loss."""
+ input_ids = inputs.get("input_ids")
+ attention_mask = inputs.get("attention_mask")
+ labels = inputs.get("labels")
+
+ if input_ids is None:
+ raise ValueError("input_ids is required for diffusion training")
+
+ loss, outputs = self._compute_diffusion_loss(
+ model, input_ids, attention_mask, labels
+ )
+
+ if return_outputs:
+ return loss, outputs
+ return loss
+
+ def _cache_special_token_ids(self):
+ """Cache special token IDs to avoid repeated tokenizer access."""
+ if self.processing_class is None:
+ self._special_token_ids = set()
+ return
+
+ tokenizer = self.processing_class
+ special_tokens = set()
+
+ if hasattr(tokenizer, "bos_token_id") and tokenizer.bos_token_id is not None:
+ special_tokens.add(tokenizer.bos_token_id)
+ if hasattr(tokenizer, "eos_token_id") and tokenizer.eos_token_id is not None:
+ special_tokens.add(tokenizer.eos_token_id)
+ if hasattr(tokenizer, "pad_token_id") and tokenizer.pad_token_id is not None:
+ special_tokens.add(tokenizer.pad_token_id)
+
+ self._special_token_ids = special_tokens
+
+ def _forward_process(
+ self,
+ input_ids: torch.Tensor,
+ attention_mask: torch.Tensor | None = None,
+ labels: torch.Tensor | None = None,
+ eps: float = 1e-3,
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ """
+ Forward noising process. A timestep is sampled along the process, and tokens are
+ masked with probability determined by the configured noise schedule.
+
+ Args:
+ input_ids: Input token ids [batch_size, seq_len].
+ attention_mask: Attention mask [batch_size, seq_len].
+ labels: Labels for SFT training [batch_size, seq_len].
+ eps: Small epsilon value for minimum masking probability.
+
+ Returns:
+ noisy_batch: Input with some tokens masked.
+ masked_indices: Boolean mask indicating which tokens were masked.
+ p_mask: Masking probabilities for each token [batch_size, seq_len].
+ """
+ batch_size, seq_len = input_ids.shape
+ device = input_ids.device
+
+ # Sample random timesteps for each sample in batch
+ t = torch.rand(batch_size, device=device)
+ p_mask = (1 - eps) * t + eps # [batch_size]
+ p_mask = p_mask[:, None].repeat(1, seq_len) # [batch_size, seq_len]
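+        # Linear noise schedule: p_mask ranges from eps (at t=0) to 1.0 (at
+        # t=1), so every position keeps at least an eps chance of being masked.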
+
+ # Don't mask padding tokens if attention_mask is provided
+ if attention_mask is not None:
+ valid_mask = attention_mask.bool()
+ p_mask = p_mask * valid_mask.float()
+
+ # Create mask to exclude special tokens
+ special_token_mask = torch.zeros_like(input_ids, dtype=torch.bool)
+ if self._special_token_ids:
+ for token_id in self._special_token_ids:
+ special_token_mask |= input_ids == token_id
+
+ # Create random mask based on p_mask
+ masked_indices = torch.rand((batch_size, seq_len), device=device) < p_mask
+ masked_indices = masked_indices & ~special_token_mask
+ if attention_mask is not None:
+ masked_indices = masked_indices & attention_mask.bool()
+
+ # For SFT data, only mask answer tokens
+ if labels is not None:
+ answer_mask = labels != -100
+ masked_indices = masked_indices & answer_mask
+
+ # Create masked input
+ mask_token_id = int(self.cfg.diffusion.mask_token_id)
+ mask_value = torch.full_like(input_ids, mask_token_id)
+ noisy_batch = torch.where(masked_indices, mask_value, input_ids)
+
+ return noisy_batch, masked_indices, p_mask
+
+ def _compute_diffusion_loss(
+ self,
+ model: nn.Module,
+ input_ids: torch.Tensor,
+ attention_mask: torch.Tensor | None = None,
+ labels: torch.Tensor | None = None,
+ ) -> tuple[torch.Tensor, torch.Tensor | Any]:
+ """
+ Compute diffusion loss.
+
+ Args:
+ model: The model to compute loss for.
+ input_ids: Ground truth token ids [batch_size, seq_len].
+ attention_mask: Attention mask [batch_size, seq_len].
+ labels: Labels for SFT training [batch_size, seq_len].
+
+ Returns:
+ loss: Cross-entropy loss.
+ metrics: Dictionary of metrics.
+ """
+ # Short-circuit empty sequences
+ if input_ids is None or input_ids.numel() == 0 or input_ids.shape[1] == 0:
+ zero = torch.tensor(
+ 0.0,
+ device=(input_ids.device if input_ids is not None else None),
+ requires_grad=True,
+ )
+ return zero, {}
+
+ # If an attention_mask is provided and all positions are padding for every
+ # sample in this batch, skip the step.
+ if attention_mask is not None:
+ if attention_mask.dim() == 2 and (attention_mask.sum(dim=1) == 0).all():
+ zero = torch.tensor(0.0, device=input_ids.device, requires_grad=True)
+ return zero, {}
+
+ # Apply forward process
+ noisy_batch, masked_indices, p_mask = self._forward_process(
+ input_ids, attention_mask, labels, self.cfg.diffusion.eps
+ )
+
+ # Create bidirectional attention mask
+ bidirectional_mask = create_bidirectional_attention_mask(
+ input_ids, attention_mask, sample_packing=self.cfg.sample_packing
+ )
+
+ # Forward pass
+ outputs = model(
+ input_ids=noisy_batch.long(),
+ attention_mask=bidirectional_mask,
+ )
+ logits = shift_logits_to_input_positions(outputs.logits)
+
+ if masked_indices.sum() > 0:
+ valid_indices = torch.where(masked_indices)
+ batch_indices, seq_indices = valid_indices
+
+ masked_logits = logits[batch_indices, seq_indices]
+ masked_targets = input_ids[batch_indices, seq_indices]
+ masked_p_mask = p_mask[batch_indices, seq_indices]
+
+ # Compute cross-entropy loss without reduction
+ token_loss = F.cross_entropy(
+ masked_logits.float(), masked_targets, reduction="none"
+ )
+
+ if self.cfg.diffusion.importance_weighting:
+ masked_p_mask = masked_p_mask.float()
+ weighted_loss = token_loss / masked_p_mask
+ else:
+ weighted_loss = token_loss
+
+ if labels is not None:
+ # For SFT data: normalize by answer token count per sample
+ answer_mask = labels != -100
+ answer_lengths = answer_mask.sum(dim=1).float() # [batch_size]
+
+                # Sum losses per sample and divide by answer length
+                batch_size = input_ids.shape[0]
+                loss_per_sample = torch.zeros(batch_size, device=input_ids.device)
+                for i in range(batch_size):
+                    sample_mask = batch_indices == i
+ if sample_mask.sum() > 0:
+ sample_loss = weighted_loss[sample_mask].sum()
+ denom = answer_lengths[i].clamp(min=1.0)
+ loss_per_sample[i] = sample_loss / denom
+
+ loss = loss_per_sample.mean()
+ else:
+ # Non-SFT: when importance weighting is enabled, use unbiased estimator
+ # (sum(loss/p) / total_tokens). Otherwise, average over masked tokens
+ # for stable scaling across varying mask ratios.
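+                # Intuition: a token masked with probability p contributes
+                # loss / p, so the expected sum over random masks equals the
+                # loss summed over all positions; dividing by batch * seq_len
+                # turns that into a per-token average.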
+ if self.cfg.diffusion.importance_weighting:
+ loss = weighted_loss.sum() / (
+ input_ids.shape[0] * input_ids.shape[1]
+ )
+ else:
+ loss = weighted_loss.mean()
+
+ ce_loss = token_loss.mean()
+
+ # Compute accuracy on masked tokens
+ with torch.no_grad():
+ pred_tokens = masked_logits.argmax(dim=-1)
+ accuracy = (pred_tokens == masked_targets).float().mean()
+ else:
+ loss = torch.tensor(0.0, device=input_ids.device, requires_grad=True)
+ accuracy = torch.tensor(0.0, device=input_ids.device)
+ ce_loss = torch.tensor(0.0, device=input_ids.device)
+ masked_p_mask = torch.tensor(1.0, device=input_ids.device)
+
+ avg_p_mask = (
+ p_mask[masked_indices].mean().item() if masked_indices.any() else 0.0
+ )
+ metrics = {
+ "loss": loss.item(),
+ "accuracy": accuracy.item(),
+ "mask_ratio": masked_indices.float().mean().item(),
+ "num_masked_tokens": (masked_indices.sum().item(), "sum"),
+ "avg_p_mask": avg_p_mask,
+ "ce_loss": ce_loss.item(),
+ }
+
+        # If doing SFT training (labels present), log answer-specific metrics
+        if labels is not None:
+            with torch.no_grad():
+                answer_mask = labels != -100
+                answer_lengths = answer_mask.sum(dim=1).float()
+                total_answer_tokens = answer_mask.sum().item()
+                total_tokens = labels.numel()
+                metrics["answer_ratio"] = total_answer_tokens / max(total_tokens, 1)
+                metrics["avg_answer_length"] = answer_lengths.mean().item()
+
+ if self.cfg.diffusion.importance_weighting:
+ metrics["importance_weight_avg"] = (1.0 / masked_p_mask).mean().item()
+
+ train_eval: Literal["train", "eval"] = "train" if model.training else "eval"
+ self.store_metrics(metrics, train_eval=train_eval)
+
+ return loss, outputs
diff --git a/src/axolotl/integrations/diffusion/utils.py b/src/axolotl/integrations/diffusion/utils.py
new file mode 100644
index 000000000..b6f71c07b
--- /dev/null
+++ b/src/axolotl/integrations/diffusion/utils.py
@@ -0,0 +1,166 @@
+"""Shared utilities for diffusion integration."""
+
+from __future__ import annotations
+
+from typing import Any, Optional
+
+import torch
+
+from axolotl.utils.dict import DictDefault
+
+
+def resolve_mask_token_id(
+ tokenizer: Any,
+ cfg: DictDefault,
+ *,
+ allow_add: bool,
+ model: Any | None = None,
+ default_token: str = "<|diffusion_mask|>",
+) -> int:
+ """Resolve mask token id. Training may add a new special token; inference won't."""
+ # Determine vocab size if available
+ vocab_size = None
+ if tokenizer is not None:
+ if hasattr(tokenizer, "vocab_size") and tokenizer.vocab_size is not None:
+ try:
+ vocab_size = int(tokenizer.vocab_size) # type: ignore[arg-type]
+ except Exception:
+ vocab_size = None
+ elif hasattr(tokenizer, "__len__"):
+ try:
+ vocab_size = int(len(tokenizer))
+ except Exception:
+ vocab_size = None
+
+ # Use explicit id from config if provided
+ diffusion_cfg = getattr(cfg, "diffusion", None)
+ # Fallback to top-level attr names only if nested missing (shouldn't happen)
+ cfg_id = (
+ getattr(diffusion_cfg, "mask_token_id", None)
+ if diffusion_cfg is not None
+ else getattr(cfg, "diffusion_mask_token_id", None)
+ )
+ if isinstance(cfg_id, int) and cfg_id >= 0:
+ if vocab_size is None or cfg_id < vocab_size:
+ return int(cfg_id)
+
+ def _existing_special_token_id(token_str: str | None) -> int | None:
+ """Attempt to resolve an existing special token string to a real ID."""
+ if not token_str or not hasattr(tokenizer, "convert_tokens_to_ids"):
+ return None
+ try:
+ token_id = tokenizer.convert_tokens_to_ids(token_str)
+ except Exception:
+ return None
+
+ if not isinstance(token_id, int) or token_id < 0:
+ return None
+
+ # Ensure it's registered as special and not UNK, and within vocab
+ unk_id = getattr(tokenizer, "unk_token_id", None)
+ specials = set(getattr(tokenizer, "all_special_tokens", []) or [])
+ addl = set(getattr(tokenizer, "additional_special_tokens", []) or [])
+ is_special = token_str in specials or token_str in addl
+ in_vocab = vocab_size is None or token_id < vocab_size
+ if (
+ (unk_id is not None and token_id == unk_id)
+ or not is_special
+ or not in_vocab
+ ):
+ return None
+ return token_id
+
+ # Try mask token string if provided
+ token_str = (
+ getattr(diffusion_cfg, "mask_token_str", None)
+ if diffusion_cfg is not None
+ else getattr(cfg, "diffusion_mask_token_str", None)
+ )
+ for candidate in (token_str, default_token):
+ token_id = _existing_special_token_id(candidate)
+ if isinstance(token_id, int):
+ try:
+ if diffusion_cfg is None:
+ cfg.diffusion_mask_token_id = int(token_id) # legacy fallback
+ else:
+ diffusion_cfg.mask_token_id = int(token_id)
+ except Exception:
+ pass
+ return int(token_id)
+
+ # Optionally add and return a dedicated special token during training
+ if allow_add and hasattr(tokenizer, "add_special_tokens"):
+ token_to_add = token_str or default_token
+ try:
+ tokenizer.add_special_tokens({"additional_special_tokens": [token_to_add]})
+
+ # Resize embeddings if possible
+ if (
+ model is not None
+ and hasattr(tokenizer, "__len__")
+ and hasattr(model, "resize_token_embeddings")
+ ):
+ try:
+ model.resize_token_embeddings(len(tokenizer))
+ except Exception:
+ pass
+ new_id = tokenizer.convert_tokens_to_ids(token_to_add)
+ if isinstance(new_id, int) and new_id >= 0:
+ try:
+ if diffusion_cfg is None:
+ cfg.diffusion_mask_token_id = int(new_id) # legacy fallback
+ else:
+ diffusion_cfg.mask_token_id = int(new_id)
+ except Exception:
+ pass
+ return int(new_id)
+ except Exception:
+ pass
+
+ # Fallback to unk or 0 (do not update cfg)
+ fallback = getattr(tokenizer, "unk_token_id", 0) or 0
+ return int(fallback)
+
+
+def create_bidirectional_attention_mask(
+ input_ids: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ sample_packing: bool = False,
+) -> torch.Tensor:
+ """
+ Create bidirectional attention mask to override default causal masking.
+ Handles sample-packed sequences where different samples are identified
+ by different attention mask values.
+
+ Args:
+ input_ids: Input token ids [batch_size, seq_len]
+ attention_mask: Attention mask [batch_size, seq_len]
+ sample_packing: Whether sample packing is enabled
+
+ Returns:
+ bidirectional_mask: 4D attention mask [batch_size, 1, seq_len, seq_len]
+ """
+ batch_size, seq_len = input_ids.shape
+ device = input_ids.device
+
+ if attention_mask is None or not sample_packing:
+ return torch.ones(
+ batch_size, 1, seq_len, seq_len, dtype=torch.bool, device=device
+ )
+
+ # Handle sample packing: tokens can only attend within their sample
+ mask_i = attention_mask.unsqueeze(2) # [batch_size, seq_len, 1]
+ mask_j = attention_mask.unsqueeze(1) # [batch_size, 1, seq_len]
+
+ # Tokens can attend to each other if they have the same non-zero sample ID
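+    # e.g. attention_mask [1, 1, 2, 2, 0] yields two bidirectional blocks
+    # (positions 0-1 and 2-3) and no attention to or from the padding position.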
+ bidirectional_mask = (mask_i == mask_j) & (mask_i > 0)
+
+ # Add head dimension: [batch_size, 1, seq_len, seq_len]
+ return bidirectional_mask.unsqueeze(1)
+
+
+def shift_logits_to_input_positions(logits: torch.Tensor) -> torch.Tensor:
+ """Align next-token logits with their input token positions for diffusion."""
+ if logits.size(1) <= 1:
+ return logits
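+    # Causal LMs emit logits predicting the *next* token; shifting right by one
+    # (duplicating position 0) lines logits[:, t] up with input position t.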
+ return torch.cat([logits[:, :1], logits[:, :-1]], dim=1)
diff --git a/src/axolotl/integrations/grokfast/__init__.py b/src/axolotl/integrations/grokfast/__init__.py
index 234d27226..df8cf2cf3 100644
--- a/src/axolotl/integrations/grokfast/__init__.py
+++ b/src/axolotl/integrations/grokfast/__init__.py
@@ -7,7 +7,7 @@ from transformers.trainer_callback import TrainerCallback
from axolotl.utils.logging import get_logger
from ..base import BasePlugin
-from .args import GrokfastArgs # pylint: disable=unused-import. # noqa: F401
+from .args import GrokfastArgs as GrokfastArgs
from .optimizer import gradfilter_ema
LOG = get_logger(__name__)
@@ -24,12 +24,10 @@ class GrokfastCallbackHandler(TrainerCallback):
self.alpha = alpha
self.lamb = lamb
- def on_train_begin(self, *args_, **kwargs): # pylint: disable=unused-argument
+ def on_train_begin(self, *args_, **kwargs):
self.grads = None
- def on_pre_optimizer_step(
- self, args_, state, control, **kwargs
- ): # pylint: disable=unused-argument
+ def on_pre_optimizer_step(self, args_, state, control, **kwargs):
model = kwargs.pop("model")
self.grads = gradfilter_ema(model, self.grads, alpha=self.alpha, lamb=self.lamb)
return control
diff --git a/src/axolotl/integrations/grokfast/optimizer.py b/src/axolotl/integrations/grokfast/optimizer.py
index 38cda2c93..c83ef43bc 100644
--- a/src/axolotl/integrations/grokfast/optimizer.py
+++ b/src/axolotl/integrations/grokfast/optimizer.py
@@ -1,7 +1,6 @@
# Copyright: MIT License (c) 2024 Jaerin Lee, Bong Gyun Kang, Kihoon Kim, Kyoung Mu Lee
# Reference: https://github.com/ironjr/grokfast
-# pylint: skip-file
from collections import deque
from typing import Dict, Literal, Optional
diff --git a/src/axolotl/integrations/kd/__init__.py b/src/axolotl/integrations/kd/__init__.py
index 4c8535a0a..b1a990553 100644
--- a/src/axolotl/integrations/kd/__init__.py
+++ b/src/axolotl/integrations/kd/__init__.py
@@ -15,6 +15,7 @@
"""
Plugin init to add KD support to Axolotl.
"""
+
from typing import Any
from transformers import Trainer
@@ -22,7 +23,7 @@ from transformers import Trainer
from axolotl.integrations.base import BasePlugin
from axolotl.integrations.kd.callbacks import KDTemperatureSchedulerCallback
-from .args import KDArgs # pylint: disable=unused-import. # noqa: F401
+from .args import KDArgs as KDArgs
class KDPlugin(BasePlugin):
diff --git a/src/axolotl/integrations/kd/args.py b/src/axolotl/integrations/kd/args.py
index 758bc8917..425d8ddf6 100644
--- a/src/axolotl/integrations/kd/args.py
+++ b/src/axolotl/integrations/kd/args.py
@@ -15,6 +15,7 @@
"""
Plugin args for KD support.
"""
+
from dataclasses import dataclass
from enum import Enum
@@ -26,8 +27,8 @@ class InferenceServerType(str, Enum):
Online inferences server types to handle different request args
"""
- vllm = "vllm" # pylint: disable=invalid-name
- sglang = "sglang" # pylint: disable=invalid-name
+ vllm = "vllm"
+ sglang = "sglang"
class KDArgs(BaseModel):
diff --git a/src/axolotl/integrations/kd/callbacks.py b/src/axolotl/integrations/kd/callbacks.py
index 911c3d517..c73d8a8bb 100644
--- a/src/axolotl/integrations/kd/callbacks.py
+++ b/src/axolotl/integrations/kd/callbacks.py
@@ -19,9 +19,7 @@ class KDTemperatureSchedulerCallback(TrainerCallback):
self.trainer = trainer
- def on_step_end(
- self, args, state, control, **kwargs
- ): # pylint: disable=unused-argument
+ def on_step_end(self, args, state, control, **kwargs):
# cosine decay temperature over the max steps
progress = state.global_step / state.max_steps
diff --git a/src/axolotl/integrations/kd/chat_template.py b/src/axolotl/integrations/kd/chat_template.py
index 6376ecb09..04f0f24a4 100644
--- a/src/axolotl/integrations/kd/chat_template.py
+++ b/src/axolotl/integrations/kd/chat_template.py
@@ -15,6 +15,7 @@
"""
Chat template prompt strategy loader with KD support
"""
+
import logging
from typing import Any, Dict
@@ -192,7 +193,6 @@ class ChatTemplateStrategyWithKDv2(ChatTemplateStrategyWithKD):
"""
Transform logprobs to target format for KD training
"""
- # pylint: disable=duplicate-code
logprobs = sample.pop(self.logprobs_field)
target_seq_len = len(logprobs)
@@ -240,7 +240,7 @@ class ChatTemplateStrategyWithKDv2(ChatTemplateStrategyWithKD):
target_mask.append([1] * top_k)
for token_pos_logprobs, pos_target_token_ids in zip(
- logprobs, sample["target_token_ids"]
+ logprobs, sample["target_token_ids"], strict=False
):
# Convert to a tensor for easier manipulation
position_logprobs_tensor = torch.tensor(
@@ -299,7 +299,7 @@ class KDStrategyLoader(StrategyLoader):
Load ChatTemplateStrategy with KD support using StrategyLoader.
"""
- def _get_strategy_cls(self, cfg): # pylint: disable=unused-argument
+ def _get_strategy_cls(self, cfg):
return ChatTemplateStrategyWithKD
def _get_strategy_params(self, cfg, ds_cfg: Dict[str, Any]):
@@ -319,7 +319,7 @@ class KDStrategyLoaderV2(KDStrategyLoader):
Load KD chat template datasets with pre-tokenized logprob data
"""
- def _get_strategy_cls(self, cfg): # pylint: disable=unused-argument
+ def _get_strategy_cls(self, cfg):
return ChatTemplateStrategyWithKDv2
diff --git a/src/axolotl/integrations/kd/collator.py b/src/axolotl/integrations/kd/collator.py
index 0cc745b78..675485d9d 100644
--- a/src/axolotl/integrations/kd/collator.py
+++ b/src/axolotl/integrations/kd/collator.py
@@ -37,7 +37,6 @@ class DataCollatorForKD(DataCollatorForSeq2Seq):
target_logprobs. It also creates a teacher_mask to indicate which entries are valid.
"""
- # pylint: disable=duplicate-code
tokenizer: PreTrainedTokenizerBase
model: Optional[Any] = None
padding: Union[bool, str, PaddingStrategy] = True
@@ -72,7 +71,7 @@ class DataCollatorForKD(DataCollatorForSeq2Seq):
// self.pad_to_multiple_of
) * self.pad_to_multiple_of
- for f in features: # pylint: disable=invalid-name
+ for f in features:
remainder = [pad_token_id] * (max_len - len(f[feature_name]))
if isinstance(f[feature_name], list):
f[feature_name] = (
@@ -101,7 +100,7 @@ class DataCollatorForKD(DataCollatorForSeq2Seq):
if has_teacher_data:
# Extract and remove from features
- for f in features: # pylint: disable=invalid-name
+ for f in features:
target_logprobs_list.append(f.pop("target_logprobs"))
target_token_ids_list.append(f.pop("target_token_ids"))
target_mask_list.append(f.pop("target_mask"))
@@ -117,24 +116,25 @@ class DataCollatorForKD(DataCollatorForSeq2Seq):
padded_teacher_mask_list = []
for t_logprobs, t_ids, t_mask in zip(
- target_logprobs_list, target_token_ids_list, target_mask_list
+ target_logprobs_list,
+ target_token_ids_list,
+ target_mask_list,
+ strict=False,
):
t_logprobs_padded = []
t_ids_padded = []
t_mask_padded = []
- for lp, ids, mask in zip( # pylint: disable=invalid-name
- t_logprobs, t_ids, t_mask
- ):
+ for lp, ids, mask in zip(t_logprobs, t_ids, t_mask, strict=False):
lp_len = len(lp)
if lp_len < max_k:
# Use -1e9 for padding logprobs and 0 for token_ids
pad_len = max_k - lp_len
- lp = lp + [-1e9] * pad_len # pylint: disable=invalid-name
+ lp = lp + [-1e9] * pad_len
ids = ids + [0] * pad_len
mask = mask + [0] * pad_len
else:
- lp = lp[:max_k] # pylint: disable=invalid-name
+ lp = lp[:max_k]
ids = ids[:max_k]
mask = mask[:max_k]
@@ -216,9 +216,7 @@ class KDBatchSamplerDataCollatorForSeq2Seq(DataCollatorForKD):
# We want to produce a single "merged" feature dict for each sub-batch.
out_features = [{} for _ in features]
- for i, sub_features in enumerate( # pylint: disable=too-many-nested-blocks
- features
- ):
+ for i, sub_features in enumerate(features):
# sub_features is a list of dicts, each dict = one sequence’s features
# We'll merge them into out_features[i].
#
@@ -255,9 +253,7 @@ class KDBatchSamplerDataCollatorForSeq2Seq(DataCollatorForKD):
if field_name in feat and isinstance(
feat[field_name], (list, torch.Tensor)
):
- if isinstance(
- feat[field_name][0], (dict, str)
- ): # pylint: disable=too-many-nested-blocks
+ if isinstance(feat[field_name][0], (dict, str)):
continue
arr = np.array(feat[field_name])
arrays.append(arr)
diff --git a/src/axolotl/integrations/kd/collator_online_teacher.py b/src/axolotl/integrations/kd/collator_online_teacher.py
index 584ace481..54e55a5e7 100644
--- a/src/axolotl/integrations/kd/collator_online_teacher.py
+++ b/src/axolotl/integrations/kd/collator_online_teacher.py
@@ -144,7 +144,7 @@ class OnlineTeacherCollator(KDBatchSamplerDataCollatorForSeq2Seq):
}
for sequence_data, seq_input_ids, seq_labels in zip(
- api_data, batch_input_ids, labels
+ api_data, batch_input_ids, labels, strict=False
):
current_target_logprobs = []
current_target_token_ids = []
@@ -165,7 +165,7 @@ class OnlineTeacherCollator(KDBatchSamplerDataCollatorForSeq2Seq):
assert len(seq_input_ids) == len(input_top_logprobs)
for i, _, label in zip(
- range(len(seq_input_ids)), seq_input_ids, seq_labels
+ range(len(seq_input_ids)), seq_input_ids, seq_labels, strict=False
):
if i < len(input_top_logprobs) and input_top_logprobs[i] is None:
# this is always the case for the first token.
@@ -202,7 +202,8 @@ class OnlineTeacherCollator(KDBatchSamplerDataCollatorForSeq2Seq):
# pos_top_logprobs: list of logprobs, pos_token_ids: list of token_ids
pos_logprobs_raw, pos_token_ids, _ = [
- list(row) for row in zip(*pos_top_logprobs_data)
+ list(row)
+ for row in zip(*pos_top_logprobs_data, strict=False)
]
# Ensure correct length (top_k)
@@ -317,7 +318,7 @@ class OnlineTeacherCollator(KDBatchSamplerDataCollatorForSeq2Seq):
}
for sequence_data, seq_input_ids, seq_labels in zip(
- choices, batch_input_ids, labels
+ choices, batch_input_ids, labels, strict=False
):
# seq_input_ids: List[int]
# seq_labels: List[int]
@@ -342,7 +343,9 @@ class OnlineTeacherCollator(KDBatchSamplerDataCollatorForSeq2Seq):
seq_len = len(seq_input_ids)
- for i, _, label in zip(range(seq_len), seq_input_ids, seq_labels):
+ for i, _, label in zip(
+ range(seq_len), seq_input_ids, seq_labels, strict=False
+ ):
if i < len(input_top_logprobs) and input_top_logprobs[i] is None:
# this is always the case for the first token.
# there is never logprob data for the first token since that's a true input
@@ -424,7 +427,7 @@ class OnlineTeacherCollator(KDBatchSamplerDataCollatorForSeq2Seq):
list(range(self.kd_online_topk))
)
current_target_mask.append([0] * self.kd_online_topk)
- for i in range(max(0, seq_len - len(current_target_logprobs))):
+ for _ in range(max(0, seq_len - len(current_target_logprobs))):
current_target_logprobs.append(
[-float("inf")] * self.kd_online_topk
)
diff --git a/src/axolotl/integrations/kd/kernels/liger.py b/src/axolotl/integrations/kd/kernels/liger.py
index 6356643c2..61ef3e10a 100644
--- a/src/axolotl/integrations/kd/kernels/liger.py
+++ b/src/axolotl/integrations/kd/kernels/liger.py
@@ -197,7 +197,7 @@ class LigerFusedLinearKLTopKLogprobFunction(LigerFusedLinearDistillationBase):
compute_ce_loss: bool = True,
normalize_topk: bool = True,
):
- CHUNK_SIZE = chunk_size # pylint: disable=invalid-name
+ CHUNK_SIZE = chunk_size
grad_weight_acc = torch.zeros_like(student_lm_head_weight)
grad_inputs_list = []
grad_bias_acc = (
@@ -298,8 +298,8 @@ class LigerFusedLinearKLTopKLogprobFunction(LigerFusedLinearDistillationBase):
accumulate_chunk_grads_compiled = accumulate_chunk_grads
# Use the same chunking logic as LigerFusedLinearDistillationBase.forward
- B, N, D = student_input.shape # pylint: disable=invalid-name
- K = target_token_ids.shape[-1] # pylint: disable=invalid-name
+ B, N, D = student_input.shape
+ K = target_token_ids.shape[-1]
student_input_flat = student_input.reshape(-1, student_input.shape[-1])
target_token_ids_flat = target_token_ids.reshape(-1, target_token_ids.shape[-1])
diff --git a/src/axolotl/integrations/kd/kernels/models.py b/src/axolotl/integrations/kd/kernels/models.py
index 4319f5f7d..badb3460d 100644
--- a/src/axolotl/integrations/kd/kernels/models.py
+++ b/src/axolotl/integrations/kd/kernels/models.py
@@ -40,10 +40,9 @@ def kldiv_forward_llama_like(
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
cache_position: Optional[torch.LongTensor] = None,
- logits_to_keep: Union[int, torch.Tensor] = 0, # pylint: disable=unused-argument
+ logits_to_keep: Union[int, torch.Tensor] = 0,
**kwargs: Unpack[TransformersKwargs], # type: ignore[misc]
) -> CausalLMOutputWithPast:
- # pylint: disable=duplicate-code
output_attentions = (
output_attentions
if output_attentions is not None
@@ -73,9 +72,9 @@ def kldiv_forward_llama_like(
# Only compute necessary logits, and do not upcast them to float if we are not computing the loss
# TODO, we can optimize this further by filtering hidden_states on sequence dimension using labels != -100
- # self.loss_function should be LigerFusedLinearKLTopKLogprobLoss
+ # self._loss_function should be LigerFusedLinearKLTopKLogprobLoss
- loss = self.loss_function(
+ loss = self._loss_function(
self.lm_head.weight,
hidden_states,
target_token_ids,
diff --git a/src/axolotl/integrations/kd/topk_logprob/forward_kl.py b/src/axolotl/integrations/kd/topk_logprob/forward_kl.py
index 74184455f..b79ba26f3 100644
--- a/src/axolotl/integrations/kd/topk_logprob/forward_kl.py
+++ b/src/axolotl/integrations/kd/topk_logprob/forward_kl.py
@@ -15,6 +15,7 @@
"""
loss for top_k KL divergence
"""
+
import torch
from torch import nn
@@ -117,7 +118,6 @@ class ChunkedTopKKDLoss(nn.Module):
target_mask: torch.Tensor, # [B, seq_len, K]
num_items_in_batch: int = -1, # optional batch size for normalization
) -> torch.Tensor:
-
# 1. Split along the "token" dimension (dim=1).
student_logits_chunks = student_logits.chunk(self.num_output_chunks, dim=1)
token_ids_chunks = target_token_ids.chunk(self.num_output_chunks, dim=1)
@@ -131,7 +131,11 @@ class ChunkedTopKKDLoss(nn.Module):
# 2. Loop over each chunk and compute a chunk-specific loss.
for st_chunk, tid_chunk, lp_chunk, msk_chunk in zip(
- student_logits_chunks, token_ids_chunks, logprobs_chunks, mask_chunks
+ student_logits_chunks,
+ token_ids_chunks,
+ logprobs_chunks,
+ mask_chunks,
+ strict=False,
):
# We pass num_items_in_batch=-1 so that the kd_loss
# will average over *this chunk's* valid tokens only.
diff --git a/src/axolotl/integrations/kd/trainer.py b/src/axolotl/integrations/kd/trainer.py
index c454b2a2c..0e98497a7 100644
--- a/src/axolotl/integrations/kd/trainer.py
+++ b/src/axolotl/integrations/kd/trainer.py
@@ -21,7 +21,6 @@ from axolotl.core.trainers.base import AxolotlTrainer
from .kernels.liger import LigerFusedLinearKLTopKLogprobLoss
-# pylint: disable=too-many-ancestors
class AxolotlKDTrainer(AxolotlTrainer):
"""
Custom trainer subclass for Knowledge Distillation (KD)
@@ -30,7 +29,8 @@ class AxolotlKDTrainer(AxolotlTrainer):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.model_accepts_loss_kwargs = True
- self.model._loss_function = LigerFusedLinearKLTopKLogprobLoss(
+
+ loss_fn = LigerFusedLinearKLTopKLogprobLoss(
self.args.kd_ce_alpha, # hard label loss
self.args.kd_alpha, # kd loss
self.args.kd_temperature,
@@ -38,6 +38,14 @@ class AxolotlKDTrainer(AxolotlTrainer):
compute_ce_loss=bool(self.args.kd_ce_alpha),
normalize_topk=self.args.kd_normalize_topk,
)
+ target = self.model
+
+ # Unwrap PEFT wrapper
+ if hasattr(target, "get_base_model"):
+ target = target.get_base_model()
+
+ # Set on the actual model instance
+ target._loss_function = loss_fn
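+        # kldiv_forward_llama_like reads `self._loss_function`, so it must
+        # live on the unwrapped model rather than on the PEFT wrapper.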
def _set_signature_columns_if_needed(self):
super()._set_signature_columns_if_needed()
diff --git a/src/axolotl/integrations/liger/README.md b/src/axolotl/integrations/liger/README.md
index c5cce8282..3a2d4bd04 100644
--- a/src/axolotl/integrations/liger/README.md
+++ b/src/axolotl/integrations/liger/README.md
@@ -18,6 +18,9 @@ liger_rms_norm: true
liger_glu_activation: true
liger_layer_norm: true
liger_fused_linear_cross_entropy: true
+
+# FLCE-specific
+liger_use_token_scaling: true
```
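+
+`liger_use_token_scaling` multiplies each token's loss by its predicted probability (detached from gradients); it requires `liger_fused_linear_cross_entropy: true`.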
## Supported Models
diff --git a/src/axolotl/integrations/liger/__init__.py b/src/axolotl/integrations/liger/__init__.py
index 86d56be80..c20f4545c 100644
--- a/src/axolotl/integrations/liger/__init__.py
+++ b/src/axolotl/integrations/liger/__init__.py
@@ -18,6 +18,7 @@ Module for the Plugin for LIGER integration with Axolotl.
Liger Kernel is the collection of Triton-native kernels for LLM Training.
It is designed to be performant, correct, and light-weight.
"""
+
from .args import LigerArgs
from .plugin import LigerPlugin
diff --git a/src/axolotl/integrations/liger/args.py b/src/axolotl/integrations/liger/args.py
index d5bb10cfd..eb7a6c59b 100644
--- a/src/axolotl/integrations/liger/args.py
+++ b/src/axolotl/integrations/liger/args.py
@@ -16,7 +16,7 @@
Module for handling LIGER input arguments.
"""
-from pydantic import BaseModel, model_validator
+from pydantic import BaseModel, Field, model_validator
from axolotl.utils.logging import get_logger
@@ -35,6 +35,15 @@ class LigerArgs(BaseModel):
liger_glu_activation: bool | None = None
liger_cross_entropy: bool | None = None
liger_fused_linear_cross_entropy: bool | None = None
+ liger_use_token_scaling: bool | None = Field(
+ default=None,
+ json_schema_extra={
+ "description": (
+ "Enables use_token_scaling in fused_linear_cross_entropy. "
+ "When True, each token's loss is multiplied by its predicted probability (detached from gradients)."
+ )
+ },
+ )
@model_validator(mode="before")
@classmethod
@@ -75,6 +84,18 @@ class LigerArgs(BaseModel):
)
return data
+ @model_validator(mode="before")
+ @classmethod
+ def check_liger_use_token_scaling_flce(cls, data):
+ if data.get("liger_use_token_scaling") and not data.get(
+ "liger_fused_linear_cross_entropy"
+ ):
+            raise ValueError(
+                "`liger_use_token_scaling: true` requires "
+                "`liger_fused_linear_cross_entropy: true`."
+            )
+
+ return data
+
@model_validator(mode="after")
def check_tensor_parallel_size_liger_fused_linear_cross_entropy(self):
# TODO @SalmanMohammadi this is a larger fix - investigate
diff --git a/src/axolotl/integrations/liger/models/base.py b/src/axolotl/integrations/liger/models/base.py
index f3cf4299a..a9dbe9412 100644
--- a/src/axolotl/integrations/liger/models/base.py
+++ b/src/axolotl/integrations/liger/models/base.py
@@ -41,7 +41,6 @@ def lce_forward(
This is useful when using packed tensor format (single dimension for batch and sequence length).
"""
- # pylint: disable=duplicate-code
output_attentions = (
output_attentions
if output_attentions is not None
@@ -181,7 +180,7 @@ def patch_lce_forward(
model_cls = getattr(module, f"{model_cls_prefix}ForCausalLM")
model_cls.forward = lce_forward
- # pylint: disable=duplicate-code
+
except (ImportError, AttributeError) as e:
raise RuntimeError(
f"Could not import ForCausalLM class for model_type: {model_type}. "
diff --git a/src/axolotl/integrations/liger/models/deepseekv2.py b/src/axolotl/integrations/liger/models/deepseekv2.py
index 2f0d2a704..99adce4a7 100644
--- a/src/axolotl/integrations/liger/models/deepseekv2.py
+++ b/src/axolotl/integrations/liger/models/deepseekv2.py
@@ -2,8 +2,6 @@
DeepseekV2 model with LigerFusedLinearCrossEntropyLoss
"""
-# pylint: disable=duplicate-code
-
from typing import List, Optional, Tuple, Union
import torch
diff --git a/src/axolotl/integrations/liger/models/jamba.py b/src/axolotl/integrations/liger/models/jamba.py
index d25529970..78689e40c 100644
--- a/src/axolotl/integrations/liger/models/jamba.py
+++ b/src/axolotl/integrations/liger/models/jamba.py
@@ -2,8 +2,6 @@
Jamba model with LigerFusedLinearCrossEntropyLoss
"""
-# pylint: disable=duplicate-code
-
from typing import Optional, Tuple, Union
import torch
diff --git a/src/axolotl/integrations/liger/models/llama4.py b/src/axolotl/integrations/liger/models/llama4.py
index 689823bb6..e51140265 100644
--- a/src/axolotl/integrations/liger/models/llama4.py
+++ b/src/axolotl/integrations/liger/models/llama4.py
@@ -46,7 +46,6 @@ def lce_forward(
Returns:
"""
- # pylint: disable=duplicate-code
output_attentions = (
output_attentions
if output_attentions is not None
@@ -78,9 +77,7 @@ def lce_forward(
hidden_states = outputs[0]
if hasattr(self.config, "pretraining_tp") and self.config.pretraining_tp > 1:
- raise Exception( # pylint: disable=broad-exception-raised
- "Liger Kernel does not support pretraining_tp!!"
- )
+ raise Exception("Liger Kernel does not support pretraining_tp!!")
logits = None
loss = None
@@ -128,7 +125,7 @@ def apply_liger_kernel_to_llama4(
rms_norm: bool = False,
glu_activation: bool = False,
layer_norm: bool = False,
- **kwargs, # pylint: disable=unused-argument
+ **kwargs,
) -> None:
"""
Apply Liger kernels to replace original implementation in HuggingFace Llama models (2 and 3)
@@ -144,15 +141,15 @@ def apply_liger_kernel_to_llama4(
layer_norm (bool): Whether to apply Liger's LayerNorm. Default is False.
"""
- import transformers.models.llama4.modeling_llama4 # noqa: F401 # pylint: disable=unused-import
+ import transformers.models.llama4.modeling_llama4 # noqa: F401
from liger_kernel.transformers.functional import liger_cross_entropy
from liger_kernel.transformers.layer_norm import LigerLayerNorm
from liger_kernel.transformers.rms_norm import LigerRMSNorm
from liger_kernel.transformers.swiglu import LigerSwiGLUMLP
- assert not (
- cross_entropy and fused_linear_cross_entropy
- ), "cross_entropy and fused_linear_cross_entropy cannot both be True."
+ assert not (cross_entropy and fused_linear_cross_entropy), (
+ "cross_entropy and fused_linear_cross_entropy cannot both be True."
+ )
modeling_llama4 = sys.modules["transformers.models.llama4.modeling_llama4"]
@@ -165,7 +162,7 @@ def apply_liger_kernel_to_llama4(
# clone config to avoid modifying the original
config = deepcopy(config)
if intermediate_size:
- setattr(config, "intermediate_size", intermediate_size)
+ config.intermediate_size = intermediate_size
return LigerSwiGLUMLP(config, **kwargs)
modeling_llama4.Llama4TextMLP = _liger_swiglu_mlp_wrapper
diff --git a/src/axolotl/integrations/liger/models/qwen3.py b/src/axolotl/integrations/liger/models/qwen3.py
index 1dc19eaf9..b008755da 100644
--- a/src/axolotl/integrations/liger/models/qwen3.py
+++ b/src/axolotl/integrations/liger/models/qwen3.py
@@ -43,7 +43,6 @@ def lce_forward(
Returns:
"""
- # pylint: disable=duplicate-code
output_attentions = (
output_attentions
if output_attentions is not None
@@ -113,9 +112,8 @@ def apply_liger_kernel_to_qwen3(
rms_norm: bool = False,
glu_activation: bool = False,
layer_norm: bool = False,
- **kwargs, # pylint: disable=unused-argument
+ **kwargs,
) -> None:
- # pylint: disable=duplicate-code
"""
Apply Liger kernels to replace original implementation in HuggingFace Llama models (2 and 3)
@@ -130,15 +128,15 @@ def apply_liger_kernel_to_qwen3(
layer_norm (bool): Whether to apply Liger's LayerNorm. Default is False.
"""
- import transformers.models.qwen3.modeling_qwen3 # noqa: F401 # pylint: disable=unused-import
+ import transformers.models.qwen3.modeling_qwen3 # noqa: F401
from liger_kernel.transformers.functional import liger_cross_entropy
from liger_kernel.transformers.layer_norm import LigerLayerNorm
from liger_kernel.transformers.rms_norm import LigerRMSNorm
from liger_kernel.transformers.swiglu import LigerSwiGLUMLP
- assert not (
- cross_entropy and fused_linear_cross_entropy
- ), "cross_entropy and fused_linear_cross_entropy cannot both be True."
+ assert not (cross_entropy and fused_linear_cross_entropy), (
+ "cross_entropy and fused_linear_cross_entropy cannot both be True."
+ )
modeling_qwen3 = sys.modules["transformers.models.qwen3.modeling_qwen3"]
diff --git a/src/axolotl/integrations/liger/models/qwen3_moe.py b/src/axolotl/integrations/liger/models/qwen3_moe.py
index 89bdc5bcc..40bee110c 100644
--- a/src/axolotl/integrations/liger/models/qwen3_moe.py
+++ b/src/axolotl/integrations/liger/models/qwen3_moe.py
@@ -45,7 +45,6 @@ def lce_forward(
Returns:
"""
- # pylint: disable=duplicate-code
output_attentions = (
output_attentions
if output_attentions is not None
@@ -135,9 +134,8 @@ def apply_liger_kernel_to_qwen3_moe(
rms_norm: bool = False,
glu_activation: bool = False,
layer_norm: bool = False,
- **kwargs, # pylint: disable=unused-argument
+ **kwargs,
) -> None:
- # pylint: disable=duplicate-code
"""
Apply Liger kernels to replace the original implementation in HuggingFace Qwen3 MoE models
@@ -152,15 +150,15 @@ def apply_liger_kernel_to_qwen3_moe(
layer_norm (bool): Whether to apply Liger's LayerNorm. Default is False.
"""
- import transformers.models.qwen3_moe.modeling_qwen3_moe # noqa: F401 # pylint: disable=unused-import
+ import transformers.models.qwen3_moe.modeling_qwen3_moe # noqa: F401
from liger_kernel.transformers.functional import liger_cross_entropy
from liger_kernel.transformers.layer_norm import LigerLayerNorm
from liger_kernel.transformers.rms_norm import LigerRMSNorm
from liger_kernel.transformers.swiglu import LigerSwiGLUMLP
- assert not (
- cross_entropy and fused_linear_cross_entropy
- ), "cross_entropy and fused_linear_cross_entropy cannot both be True."
+ assert not (cross_entropy and fused_linear_cross_entropy), (
+ "cross_entropy and fused_linear_cross_entropy cannot both be True."
+ )
modeling_qwen3_moe = sys.modules["transformers.models.qwen3_moe.modeling_qwen3_moe"]
@@ -174,7 +172,7 @@ def apply_liger_kernel_to_qwen3_moe(
# clone config to avoid modifying the original
config = deepcopy(config)
if intermediate_size:
- setattr(config, "intermediate_size", intermediate_size)
+ config.intermediate_size = intermediate_size
return LigerSwiGLUMLP(config, **kwargs)
modeling_qwen3_moe.Qwen3MoeMLP = _liger_swiglu_mlp_wrapper
diff --git a/src/axolotl/integrations/liger/plugin.py b/src/axolotl/integrations/liger/plugin.py
index 89f7c37b7..ac796c2c9 100644
--- a/src/axolotl/integrations/liger/plugin.py
+++ b/src/axolotl/integrations/liger/plugin.py
@@ -48,6 +48,33 @@ class LigerPlugin(BasePlugin):
"Cannot have both `liger_cross_entropy` and `liger_fused_linear_cross_entropy` set."
)
+ if cfg.liger_use_token_scaling:
+ # Patch FLCE to set use_token_scaling=True for both the function and class APIs
+ from liger_kernel.transformers import functional
+ from liger_kernel.transformers.fused_linear_cross_entropy import (
+ LigerFusedLinearCrossEntropyLoss,
+ )
+
+ old_liger_fused_linear_cross_entropy = (
+ functional.liger_fused_linear_cross_entropy
+ )
+
+ def patched_liger_fused_linear_cross_entropy(*args, **kwargs):
+ kwargs["use_token_scaling"] = True
+ return old_liger_fused_linear_cross_entropy(*args, **kwargs)
+
+ functional.liger_fused_linear_cross_entropy = (
+ patched_liger_fused_linear_cross_entropy
+ )
+
+ old_init = LigerFusedLinearCrossEntropyLoss.__init__
+
+ def patched_init(self, *args, **kwargs):
+ kwargs["use_token_scaling"] = True
+ return old_init(self, *args, **kwargs)
+
+ LigerFusedLinearCrossEntropyLoss.__init__ = patched_init
+
if cfg.model_config_type in MODEL_TYPE_TO_APPLY_LIGER_FN:
apply_liger_fn = MODEL_TYPE_TO_APPLY_LIGER_FN[cfg.model_config_type]
liger_fn_sig = inspect.signature(apply_liger_fn)
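
The token-scaling patch above follows a common wrap-and-reinstall pattern: keep a reference to the original callable, wrap it so the desired keyword is always set, then put the wrapper back where the original lived. A minimal, runnable sketch of that pattern, using illustrative names rather than the liger API:

# Illustrative stand-in for the patched callable; not the liger API.
def loss_fn(logits, labels, use_token_scaling=False):
    return use_token_scaling

_original_loss_fn = loss_fn

def patched_loss_fn(*args, **kwargs):
    kwargs["use_token_scaling"] = True  # force the flag for every caller
    return _original_loss_fn(*args, **kwargs)

loss_fn = patched_loss_fn
assert loss_fn(None, None) is True
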
diff --git a/src/axolotl/integrations/lm_eval/__init__.py b/src/axolotl/integrations/lm_eval/__init__.py
index 8db4dc634..0ab6b8697 100644
--- a/src/axolotl/integrations/lm_eval/__init__.py
+++ b/src/axolotl/integrations/lm_eval/__init__.py
@@ -7,7 +7,7 @@ import subprocess # nosec
from axolotl.integrations.base import BasePlugin
from axolotl.integrations.lm_eval.cli import build_lm_eval_command
-from .args import LMEvalArgs # pylint: disable=unused-import. # noqa: F401
+from .args import LMEvalArgs as LMEvalArgs
class LMEvalPlugin(BasePlugin):
@@ -20,7 +20,6 @@ class LMEvalPlugin(BasePlugin):
def post_train_unload(self, cfg):
if cfg.lm_eval_post_train:
- # pylint: disable=duplicate-code
for lm_eval_args in build_lm_eval_command(
cfg.lm_eval_tasks,
bfloat16=cfg.bfloat16 or cfg.bf16,
diff --git a/src/axolotl/integrations/lm_eval/cli.py b/src/axolotl/integrations/lm_eval/cli.py
index 19608e1d9..ead82dcb7 100644
--- a/src/axolotl/integrations/lm_eval/cli.py
+++ b/src/axolotl/integrations/lm_eval/cli.py
@@ -99,7 +99,6 @@ def lm_eval(config: str, cloud: Optional[str] = None):
with open(config, encoding="utf-8") as file:
cfg: DictDefault = DictDefault(yaml.safe_load(file))
- # pylint: disable=duplicate-code
for lm_eval_args in build_lm_eval_command(
cfg.lm_eval_tasks,
bfloat16=cfg.bfloat16 or cfg.bf16,
diff --git a/src/axolotl/integrations/spectrum/__init__.py b/src/axolotl/integrations/spectrum/__init__.py
index 9f66aef97..5e8f9128d 100644
--- a/src/axolotl/integrations/spectrum/__init__.py
+++ b/src/axolotl/integrations/spectrum/__init__.py
@@ -23,7 +23,7 @@ import requests
from axolotl.integrations.base import BasePlugin
from axolotl.utils.logging import get_logger
-from .args import SpectrumArgs # pylint: disable=unused-import. # noqa: F401
+from .args import SpectrumArgs as SpectrumArgs
LOG = get_logger(__name__)
@@ -46,7 +46,7 @@ def _generate_unfrozen_params_yaml(snr_data, top_fraction=0.5):
"^lm_head.weight$",
"^model.embed_tokens.weight$",
]
- for layer_type, layer_names in top_layers_by_type.items():
+ for _, layer_names in top_layers_by_type.items():
for layer_name in layer_names:
unfrozen_parameters.append(layer_name)
return unfrozen_parameters
@@ -84,7 +84,7 @@ class SpectrumPlugin(BasePlugin):
snr_data = json.load(fin)
except FileNotFoundError:
pass
- except Exception as exc: # pylint: disable=broad-exception-caught
+ except Exception as exc:
LOG.warning(f"Failed to read SNR data from {snr_path}: {exc}")
if not snr_data:
diff --git a/src/axolotl/integrations/spectrum/args.py b/src/axolotl/integrations/spectrum/args.py
index df5756038..be6ca4bfc 100644
--- a/src/axolotl/integrations/spectrum/args.py
+++ b/src/axolotl/integrations/spectrum/args.py
@@ -15,6 +15,7 @@
"""
Module for handling Spectrum input arguments.
"""
+
from typing import Optional
from pydantic import BaseModel, model_validator
diff --git a/src/axolotl/kernels/geglu.py b/src/axolotl/kernels/geglu.py
index 6acbea0d4..ee3260ebd 100644
--- a/src/axolotl/kernels/geglu.py
+++ b/src/axolotl/kernels/geglu.py
@@ -5,8 +5,6 @@ See "GLU Variants Improve Transformer" (https://arxiv.org/abs/2002.05202).
Credit to `unsloth` (https://unsloth.ai/) for inspiration for this implementation.
"""
-# pylint: disable=invalid-name,unnecessary-lambda-assignment,duplicate-code
-
import torch
import triton
import triton.language as tl
diff --git a/src/axolotl/kernels/lora.py b/src/axolotl/kernels/lora.py
index fb45f2aa7..c3356fb90 100644
--- a/src/axolotl/kernels/lora.py
+++ b/src/axolotl/kernels/lora.py
@@ -7,8 +7,6 @@ See "LoRA: Low-Rank Adaptation of Large Language Models"
Credit to `unsloth` (https://unsloth.ai/) for inspiration for this implementation.
"""
-# pylint: disable=invalid-name
-
from typing import Callable
import torch
diff --git a/src/axolotl/kernels/quantize.py b/src/axolotl/kernels/quantize.py
index b61603fbc..d094f2381 100644
--- a/src/axolotl/kernels/quantize.py
+++ b/src/axolotl/kernels/quantize.py
@@ -1,7 +1,5 @@
"""Dequantization utilities for `bitsandbytes` integration."""
-# pylint: disable=invalid-name,global-statement
-
import ctypes
import bitsandbytes as bnb
diff --git a/src/axolotl/kernels/swiglu.py b/src/axolotl/kernels/swiglu.py
index 43a798edc..b13bcd350 100644
--- a/src/axolotl/kernels/swiglu.py
+++ b/src/axolotl/kernels/swiglu.py
@@ -99,7 +99,6 @@ def _swiglu_bwd_kernel(
tl.store(up_ptr + offsets, grad_up, mask=mask) # grad wrt up
-# pylint: disable=unnecessary-lambda-assignment
def swiglu_forward(gate: torch.Tensor, up: torch.Tensor) -> torch.Tensor:
"""
SwiGLU forward pass. Computes SwiGLU activation: `x * sigmoid(x) * up`, where
@@ -128,7 +127,6 @@ def swiglu_forward(gate: torch.Tensor, up: torch.Tensor) -> torch.Tensor:
return out
-# pylint: disable=unnecessary-lambda-assignment
def swiglu_backward(
grad_output: torch.Tensor, gate: torch.Tensor, up: torch.Tensor
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
diff --git a/src/axolotl/loaders/__init__.py b/src/axolotl/loaders/__init__.py
index 3eef75e58..ae99bf16d 100644
--- a/src/axolotl/loaders/__init__.py
+++ b/src/axolotl/loaders/__init__.py
@@ -1,6 +1,5 @@
"""Init for axolotl.loaders module"""
-# pylint: disable=unused-import
# flake8: noqa
from .adapter import load_adapter, load_lora
diff --git a/src/axolotl/loaders/adapter.py b/src/axolotl/loaders/adapter.py
index db28206b6..bcde4bf96 100644
--- a/src/axolotl/loaders/adapter.py
+++ b/src/axolotl/loaders/adapter.py
@@ -14,6 +14,7 @@ from peft import (
PeftConfig,
PeftMixedModel,
PeftModel,
+ TaskType,
get_peft_model,
)
from transformers import PreTrainedModel
@@ -28,14 +29,12 @@ LOG = get_logger(__name__)
def setup_quantized_meta_for_peft(model: torch.nn.Module):
"""Replaces `quant_state.to` with a dummy function to prevent PEFT from moving `quant_state` to meta device"""
- def temp_to_method(self, *args, **kwargs): # pylint: disable=unused-argument
+ def temp_to_method(self, *args, **kwargs):
return self
for param in model.parameters():
if isinstance(param, Params4bit):
- param.quant_state._orig_to = ( # pylint: disable=protected-access
- param.quant_state.to
- )
+ param.quant_state._orig_to = param.quant_state.to
param.quant_state.to = types.MethodType(temp_to_method, param.quant_state)
@@ -43,10 +42,8 @@ def setup_quantized_peft_meta_for_training(model: torch.nn.Module):
"""Replaces dummy `quant_state.to` method with the original function to allow training to continue"""
for param in model.parameters():
if isinstance(param, Params4bit) and hasattr(param.quant_state, "_orig_to"):
- param.quant_state.to = (
- param.quant_state._orig_to # pylint: disable=protected-access
- )
- param.quant_state._orig_to = None # pylint: disable=protected-access
+ param.quant_state.to = param.quant_state._orig_to
+ param.quant_state._orig_to = None
def find_all_linear_names(model):
@@ -102,6 +99,17 @@ def load_lora(
lora_config_kwargs["use_rslora"] = cfg.peft_use_rslora
if cfg.peft_layer_replication:
lora_config_kwargs["layer_replication"] = cfg.peft_layer_replication
+ if cfg.peft_trainable_token_indices:
+ lora_config_kwargs["trainable_token_indices"] = cfg.peft_trainable_token_indices
+
+ # Determine the correct PEFT task type
+ model_cls = type(model).__name__
+ if "SequenceClassification" in model_cls:
+ task_type = TaskType.SEQ_CLS
+ elif "TokenClassification" in model_cls:
+ task_type = TaskType.TOKEN_CLS
+ else:
+ task_type = TaskType.CAUSAL_LM
lora_config = LoraConfig(
r=cfg.lora_r,
@@ -114,7 +122,7 @@ def load_lora(
fan_in_fan_out=cfg.lora_fan_in_fan_out,
modules_to_save=cfg.lora_modules_to_save if cfg.lora_modules_to_save else None,
bias="none",
- task_type="CAUSAL_LM",
+ task_type=task_type,
**lora_config_kwargs,
)
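
The task-type selection above keys off the model's class name. A standalone sketch of the same heuristic, using plain strings in place of peft.TaskType so it runs without PEFT installed:

def infer_task_type(model) -> str:
    # Mirrors the class-name heuristic in load_lora above.
    model_cls = type(model).__name__
    if "SequenceClassification" in model_cls:
        return "SEQ_CLS"
    if "TokenClassification" in model_cls:
        return "TOKEN_CLS"
    return "CAUSAL_LM"

class LlamaForSequenceClassification:  # hypothetical model class for the demo
    pass

assert infer_task_type(LlamaForSequenceClassification()) == "SEQ_CLS"
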
diff --git a/src/axolotl/loaders/constants.py b/src/axolotl/loaders/constants.py
index 3fabf9d94..4939cb28d 100644
--- a/src/axolotl/loaders/constants.py
+++ b/src/axolotl/loaders/constants.py
@@ -1,26 +1,13 @@
"""Shared constants for axolotl.loaders module"""
-from transformers import (
- Gemma3ForConditionalGeneration,
- Gemma3nForConditionalGeneration,
- Llama4ForConditionalGeneration,
- LlavaForConditionalGeneration,
- Mistral3ForConditionalGeneration,
- MllamaForConditionalGeneration,
- Qwen2_5_VLForConditionalGeneration,
- Qwen2VLForConditionalGeneration,
+from transformers import AutoModelForImageTextToText
+from transformers.models.auto.modeling_auto import (
+ MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES,
)
-MULTIMODAL_AUTO_MODEL_MAPPING = {
- "mllama": MllamaForConditionalGeneration,
- "llama4": Llama4ForConditionalGeneration,
- "llava": LlavaForConditionalGeneration,
- "qwen2_vl": Qwen2VLForConditionalGeneration,
- "qwen2_5_vl": Qwen2_5_VLForConditionalGeneration,
- "mistral3": Mistral3ForConditionalGeneration,
- "gemma3": Gemma3ForConditionalGeneration,
- "gemma3n": Gemma3nForConditionalGeneration,
-}
+MULTIMODAL_AUTO_MODEL_MAPPING = dict(MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES)
+
+MULTIMODAL_AUTO_MODEL_MAPPING["lfm2-vl"] = AutoModelForImageTextToText
try:
from transformers import VoxtralForConditionalGeneration
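
Note that MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES maps model_type strings to class-name strings rather than classes, which is why ModelLoader below falls back to AutoModelForImageTextToText when the looked-up value is a str. A quick illustration, assuming a transformers version that ships this mapping:

from transformers.models.auto.modeling_auto import (
    MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES,
)

# Values are class names such as "LlavaForConditionalGeneration", not classes.
for model_type, class_name in MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.items():
    assert isinstance(class_name, str)
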
diff --git a/src/axolotl/loaders/model.py b/src/axolotl/loaders/model.py
index 6bf1f149b..aeec46584 100644
--- a/src/axolotl/loaders/model.py
+++ b/src/axolotl/loaders/model.py
@@ -25,6 +25,7 @@ from peft import (
from torch.distributed import DeviceMesh
from transformers import (
AutoModelForCausalLM,
+ AutoModelForImageTextToText,
AutoModelForVision2Seq,
AwqConfig,
BitsAndBytesConfig,
@@ -101,7 +102,7 @@ class ModelLoader:
*,
inference: bool = False,
reference_model: bool = False,
- **kwargs, # pylint: disable=unused-argument
+ **kwargs,
):
"""Initializes the ModelLoader.
@@ -133,7 +134,7 @@ class ModelLoader:
# Init model config
self.model_config = load_model_config(cfg)
- self.auto_model_loader = AutoModelForCausalLM # pylint: disable=invalid-name
+ self.auto_model_loader = AutoModelForCausalLM
# Initialize the patch manager
self.patch_manager = PatchManager(
@@ -212,6 +213,7 @@ class ModelLoader:
self.model_kwargs["use_kernels"] = self.cfg.use_kernels
self._set_quantization_config()
self._set_attention_config()
+ self._check_model_requirements()
def _apply_post_model_load_setup(self):
"""Configure the model after it has been loaded."""
@@ -432,6 +434,8 @@ class ModelLoader:
self.auto_model_loader = MULTIMODAL_AUTO_MODEL_MAPPING.get(
self.model_config.model_type, AutoModelForVision2Seq
)
+ if isinstance(self.auto_model_loader, str):
+ self.auto_model_loader = AutoModelForImageTextToText
def _set_device_map_config(self):
"""Setup `device_map` according to config"""
@@ -511,9 +515,6 @@ class ModelLoader:
if self.cfg.model_quantization_config_kwargs:
mxfp4_kwargs = self.cfg.model_quantization_config_kwargs
self.model_kwargs["quantization_config"] = Mxfp4Config(**mxfp4_kwargs)
- else:
- self.model_kwargs["load_in_8bit"] = self.cfg.load_in_8bit
- self.model_kwargs["load_in_4bit"] = self.cfg.load_in_4bit
if self.cfg.gptq:
if not hasattr(self.model_config, "quantization_config"):
@@ -548,9 +549,7 @@ class ModelLoader:
self.model_kwargs["quantization_config"] = BitsAndBytesConfig(
**self.model_config.quantization_config
)
- elif self.cfg.adapter == "qlora" and self.model_kwargs.get(
- "load_in_4bit", False
- ):
+ elif self.cfg.adapter == "qlora" and self.cfg.load_in_4bit:
bnb_config = {
"load_in_4bit": True,
"llm_int8_threshold": 6.0,
@@ -576,9 +575,7 @@ class ModelLoader:
self.model_kwargs["quantization_config"] = BitsAndBytesConfig(
**bnb_config,
)
- elif self.cfg.adapter == "lora" and self.model_kwargs.get(
- "load_in_8bit", False
- ):
+ elif self.cfg.adapter == "lora" and self.cfg.load_in_8bit:
bnb_config = {
"load_in_8bit": True,
}
@@ -592,42 +589,39 @@ class ModelLoader:
**bnb_config,
)
- # no longer needed per https://github.com/huggingface/transformers/pull/26610
- if "quantization_config" in self.model_kwargs or self.cfg.gptq:
- self.model_kwargs.pop("load_in_8bit", None)
- self.model_kwargs.pop("load_in_4bit", None)
-
def _set_attention_config(self):
"""Sample packing uses custom FA2 patch"""
if self.cfg.attn_implementation:
self.model_kwargs["attn_implementation"] = self.cfg.attn_implementation
elif self.cfg.flex_attention:
self.model_kwargs["attn_implementation"] = "flex_attention"
- self.model_config._attn_implementation = ( # pylint: disable=protected-access
- "flex_attention"
- )
+ self.model_config._attn_implementation = "flex_attention"
elif self.cfg.flash_attention:
if not self.cfg.sample_packing and self.cfg.s2_attention:
pass
self.model_kwargs["attn_implementation"] = "flash_attention_2"
- self.model_config._attn_implementation = ( # pylint: disable=protected-access
- "flash_attention_2"
- )
+ self.model_config._attn_implementation = "flash_attention_2"
elif self.cfg.sdp_attention:
self.model_kwargs["attn_implementation"] = "sdpa"
- self.model_config._attn_implementation = ( # pylint: disable=protected-access
- "sdpa"
- )
+ self.model_config._attn_implementation = "sdpa"
elif self.cfg.eager_attention:
self.model_kwargs["attn_implementation"] = "eager"
- self.model_config._attn_implementation = ( # pylint: disable=protected-access
- "eager"
- )
+ self.model_config._attn_implementation = "eager"
if self.cfg.low_cpu_mem_usage:
self.model_kwargs["low_cpu_mem_usage"] = True
+ def _check_model_requirements(self):
+ if self.cfg.model_config_type in ["lfm2-vl", "lfm2"]:
+ from transformers.utils.import_utils import is_causal_conv1d_available
+
+ if is_causal_conv1d_available():
+ raise ImportError(
+ "The 'causal-conv1d' package is installed but causes compatibility issues with LFM2 models. "
+ "Please uninstall it by running: `pip uninstall -y causal-conv1d`"
+ )
+
def _configure_zero3_memory_efficient_loading(
self,
) -> HfTrainerDeepSpeedConfig | None:
@@ -667,6 +661,33 @@ class ModelLoader:
return hf_ds_cfg
+ def _load_model_from_config(self, model_loader_class=None) -> PreTrainedModel:
+ """
+ Load model with random initialization using from_config.
+
+ Uses the selected loader when provided; otherwise falls back to the auto loader.
+ """
+ loader = model_loader_class or self.auto_model_loader
+ if loader in [AutoModelForCausalLM, AutoModelForVision2Seq]:
+ model = loader.from_config(
+ config=self.model_config,
+ trust_remote_code=self.cfg.trust_remote_code or False,
+ )
+ else:
+ model = loader(config=self.model_config)
+
+ return model
+
+ def _load_model_from_pretrained(self, model_loader_class=None) -> PreTrainedModel:
+ """Load model from pretrained weights."""
+ loader = model_loader_class or self.auto_model_loader
+ kwargs = {
+ "config": self.model_config,
+ "trust_remote_code": self.cfg.trust_remote_code or False,
+ **self.model_kwargs,
+ }
+ return loader.from_pretrained(self.base_model, **kwargs)
+
def _build_model(self) -> bool:
"""Load model, with load strategy depending on config."""
skip_move_to_device = False
@@ -681,7 +702,8 @@ class ModelLoader:
if self.is_fsdp_enabled:
if self.cfg.fsdp_config.cpu_ram_efficient_loading:
skip_move_to_device = True
- # Don't delete device_map for QLoRA + FSDP - it was set correctly in _set_device_map
+ # Don't delete device_map for QLoRA + FSDP - it was set correctly in
+ # _set_device_map
if (
"device_map" in self.model_kwargs
and not self.is_qlora_and_fsdp_enabled
@@ -710,6 +732,11 @@ class ModelLoader:
or self.cfg.qlora_sharded_model_loading
)
):
+ if self.cfg.reinit_weights:
+ LOG.warning(
+ "reinit_weights is not supported with sharded quantized loading. "
+ "Loading from pretrained weights instead."
+ )
quant_storage = self.cfg.torch_dtype
quantization_config = getattr(
self.model_config, "quantization_config", None
@@ -725,35 +752,14 @@ class ModelLoader:
quantization_config=quantization_config,
)
skip_move_to_device = True
- elif (
- self.model_config.model_type in ["llama", "llama4"]
- and not self.cfg.trust_remote_code
- and not self.cfg.gptq
- ):
- # Please don't remove underscore binding without reading the fn docstring.
- _ = self._configure_zero3_memory_efficient_loading()
-
- # Load model with random initialization if specified
- if self.cfg.random_init_weights:
- # AutoModel classes support the from_config method
- if self.auto_model_loader in [
- AutoModelForCausalLM,
- AutoModelForVision2Seq,
- ]:
- self.model = self.auto_model_loader.from_config(
- config=self.model_config,
- )
- else:
- self.model = self.auto_model_loader(config=self.model_config)
- else:
- self.model = self.auto_model_loader.from_pretrained(
- self.base_model,
- config=self.model_config,
- **self.model_kwargs,
- )
elif self.model_type == "MambaLMHeadModel":
+ if self.cfg.reinit_weights:
+ LOG.warning(
+ "reinit_weights is not supported with MambaLMHeadModel. "
+ "Loading from pretrained weights instead."
+ )
# FIXME this is janky at best and hacked together to make it work
- MambaLMHeadModel = fix_mamba_attn_for_loss() # pylint: disable=invalid-name
+ MambaLMHeadModel = fix_mamba_attn_for_loss()
self.model_kwargs["dtype"] = self.model_kwargs["torch_dtype"]
self.model_kwargs["device"] = torch.cuda.current_device()
@@ -764,45 +770,30 @@ class ModelLoader:
self.base_model,
**self.model_kwargs,
)
- elif (
- self.model_type
- and self.model_type != "AutoModelForCausalLM"
- and not self.cfg.trust_remote_code
- ):
- if self.cfg.gptq:
- self.model = self.auto_model_loader.from_pretrained(
- self.base_model,
- config=self.model_config,
- trust_remote_code=self.cfg.trust_remote_code or False,
- **self.model_kwargs,
- )
- else:
- self.model = getattr(transformers, self.model_type).from_pretrained(
- self.base_model,
- config=self.model_config,
- trust_remote_code=self.cfg.trust_remote_code or False,
- **self.model_kwargs,
- )
- elif self.cfg.gptq:
- self.model = self.auto_model_loader.from_pretrained(
- self.base_model,
- config=self.model_config,
- trust_remote_code=self.cfg.trust_remote_code or False,
- **self.model_kwargs,
- )
else:
- # Please don't remove underscore binding without reading the fn docstring.
+ # Please don't remove underscore binding without reading the fn docstring
_ = self._configure_zero3_memory_efficient_loading()
- self.model = self.auto_model_loader.from_pretrained(
- self.base_model,
- config=self.model_config,
- trust_remote_code=self.cfg.trust_remote_code or False,
- **self.model_kwargs,
- )
+
+ if (
+ self.model_type
+ and self.model_type != "AutoModelForCausalLM"
+ and not self.cfg.trust_remote_code
+ and not self.cfg.gptq
+ ):
+ # Use model type from transformers
+ model_loader_class = getattr(transformers, self.model_type)
+ else:
+ # Use auto model loader (handles gptq and default cases)
+ model_loader_class = self.auto_model_loader
+
+ if self.cfg.reinit_weights:
+ self.model = self._load_model_from_config(model_loader_class)
+ else:
+ self.model = self._load_model_from_pretrained(model_loader_class)
+
if is_deepspeed_zero3_enabled():
skip_move_to_device = True
- # pylint: disable=protected-access
if self.cfg.tensor_parallel_size > 1:
# workaround for upstream 4.54.0 not setting _tp_size or _device_mesh
# TODO(wing): remove once 4.54.1 is released
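
The refactored load path above collapses the previous branching into a single dispatch: pick a loader class, then choose from_config (random init) or from_pretrained. A condensed sketch of that dispatch, with the relevant cfg fields as plain parameters (the real code additionally special-cases loaders without a from_config method):

def build_model(loader, base_model, config, reinit_weights: bool, **model_kwargs):
    if reinit_weights:
        # Random init: only the architecture described by `config` is used,
        # no pretrained weights are read.
        return loader.from_config(config=config)
    return loader.from_pretrained(base_model, config=config, **model_kwargs)
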
diff --git a/src/axolotl/loaders/patch_manager.py b/src/axolotl/loaders/patch_manager.py
index f1ca3c725..81e4dd786 100644
--- a/src/axolotl/loaders/patch_manager.py
+++ b/src/axolotl/loaders/patch_manager.py
@@ -4,6 +4,7 @@ Applies pre- and post-model load patches for various fixes and optimizations.
"""
import importlib.util
+import os
from functools import cached_property
import addict
@@ -66,11 +67,13 @@ class PatchManager:
self._apply_mistral_cross_entropy_patch()
self._apply_self_attention_lora_patch()
self._apply_fsdp2_bnb_patches()
+ self._apply_patch_deepspeed_zero3()
+ self._apply_voxtral_patches()
+ self._apply_apertus_patches()
def apply_post_plugin_pre_model_load_patches(self):
"""Apply post plugin-pre_model_load load patches based on config."""
self._apply_tiled_mlp(self.cfg.model_config_type)
- self._apply_voxtral_patches()
def _apply_transformers_patches(self):
from axolotl.monkeypatch.transformers.trainer_loss_calc import (
@@ -78,15 +81,16 @@ class PatchManager:
patch_maybe_log_save_evaluate,
)
- patch_fsdp2 = (
- self.cfg.torch_compile
- and self.cfg.fsdp_config
- and self.cfg.fsdp_version == 2
- )
-
- patch_evaluation_loop(patch_fsdp2)
+ patch_evaluation_loop()
patch_maybe_log_save_evaluate()
+ if self.cfg.context_parallel_size > 1:
+ from axolotl.monkeypatch.transformers.trainer_context_parallel import (
+ patch_prepare_context_parallel_inputs,
+ )
+
+ patch_prepare_context_parallel_inputs()
+
def apply_post_model_load_patches(self, model: PreTrainedModel):
"""Apply patches that require the model instance."""
self._apply_llama_flash_attn_patches(model)
@@ -147,14 +151,12 @@ class PatchManager:
def _apply_flex_attention_patches(self):
"""Apply patches for flexible attention."""
if self.cfg.flex_attention:
- # from axolotl.monkeypatch.attention.flex_attn import (
- # patch_flex_make_mask,
- # patch_flex_wrapper,
- # )
- #
- # flex_attn_compile_kwargs = self.cfg.flex_attn_compile_kwargs or {}
- # patch_flex_wrapper(**flex_attn_compile_kwargs)
- # patch_flex_make_mask()
+ from axolotl.monkeypatch.attention.flex_attn import (
+ patch_flex_wrapper,
+ )
+
+ flex_attn_compile_kwargs = self.cfg.flex_attn_compile_kwargs or {}
+ patch_flex_wrapper(**flex_attn_compile_kwargs)
if self.cfg.sample_packing:
from axolotl.core.attention.flex_block_mask import (
patch_create_causal_mask,
@@ -174,6 +176,20 @@ class PatchManager:
patch_llama4_linearized_modeling()
+ if self.cfg.model_config_type == "qwen3_next" and self.cfg.sample_packing:
+ from axolotl.monkeypatch.models.qwen3_next.modeling import (
+ patch_qwen3_next_modeling_packing,
+ )
+
+ patch_qwen3_next_modeling_packing()
+
+ if self.cfg.model_config_type == "mistral3" and self.cfg.processor_type:
+ from axolotl.monkeypatch.models.mistral3.mistral_common_tokenizer import (
+ apply_mistral_tokenizer_image_patch,
+ )
+
+ apply_mistral_tokenizer_image_patch()
+
def _apply_fp8_patches(self):
"""Apply patches for FP8 support."""
if self.cfg.fp8:
@@ -277,6 +293,14 @@ class PatchManager:
has_remote_code=has_remote_code,
)
+ if self.cfg.sample_packing:
+ from axolotl.monkeypatch.data.batch_dataset_fetcher import (
+ apply_multipack_dataloader_patch,
+ )
+
+ LOG.info("Applying multipack dataloader patch for sample packing...")
+ apply_multipack_dataloader_patch()
+
def _apply_fsdp2_bnb_patches(self):
"""Apply FSDP2 BNB patches."""
if (
@@ -285,12 +309,10 @@ class PatchManager:
and self.cfg.adapter == "qlora"
):
from axolotl.monkeypatch.fsdp2_qlora import (
- apply_bnb_torch_function_patch,
apply_init_sharded_param_patch,
apply_init_unsharded_param_patch,
)
- apply_bnb_torch_function_patch()
apply_init_sharded_param_patch()
apply_init_unsharded_param_patch()
@@ -334,6 +356,13 @@ class PatchManager:
replace_stablelm_attn_with_flash_attn(self.cfg.base_model)
+ if self.model_config.model_type in ("mistral3", "llava"):
+ from axolotl.monkeypatch.models.pixtral.modeling_flash_attention_utils import (
+ apply_patch_is_packed_sequence,
+ )
+
+ apply_patch_is_packed_sequence()
+
def _patch_loss_llama(self):
"""Patch loss functions and other optimizations for LLaMA models."""
if not self.cfg.is_llama_derived_model:
@@ -428,7 +457,7 @@ class PatchManager:
and self.cfg.flash_attention
and not self.inference
):
- # TODO(MengqingCao): split these patches seperately
+ # TODO(MengqingCao): split these patches separately
from axolotl.monkeypatch.llama_attn_hijack_flash import (
is_xformers_swiglu_available,
replace_llama_mlp_with_swiglu,
@@ -465,3 +494,26 @@ class PatchManager:
from axolotl.monkeypatch.lora_kernels import apply_lora_kernel_patches
apply_lora_kernel_patches(model=model, cfg=self.cfg)
+
+ def _apply_patch_deepspeed_zero3(self):
+ try:
+ from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
+
+ from axolotl.monkeypatch.deepspeed_utils import apply_deepspeed_patches
+
+ if self.cfg.activation_offloading is True and (
+ is_deepspeed_zero3_enabled()
+ or os.getenv("ACCELERATE_DEEPSPEED_ZERO_STAGE") == "3"
+ ):
+ apply_deepspeed_patches()
+ except ImportError as e:
+ LOG.warning(f"DeepSpeed patches not applied: {e}")
+
+ def _apply_apertus_patches(self):
+ """Apply patches for Apertus model."""
+ if self.cfg.model_config_type == "apertus":
+ from axolotl.monkeypatch.models.apertus.activation import (
+ patch_apertus_xielu_activation,
+ )
+
+ patch_apertus_xielu_activation()
diff --git a/src/axolotl/loaders/processor.py b/src/axolotl/loaders/processor.py
index 2e3ec8d7f..7580b2008 100644
--- a/src/axolotl/loaders/processor.py
+++ b/src/axolotl/loaders/processor.py
@@ -21,6 +21,13 @@ def load_processor(cfg: DictDefault, tokenizer: PreTrainedTokenizerBase):
if cfg.processor_type:
processor_cls = getattr(transformers, cfg.processor_type)
+ if cfg.tokenizer_use_mistral_common:
+ from axolotl.utils.mistral import Mistral3Processor
+
+ return Mistral3Processor(
+ tokenizer=tokenizer,
+ )
+
processor = processor_cls.from_pretrained(
cfg.processor_config,
trust_remote_code=cfg.trust_remote_code or False,
diff --git a/src/axolotl/loaders/tokenizer.py b/src/axolotl/loaders/tokenizer.py
index 0a486d023..69455dd77 100644
--- a/src/axolotl/loaders/tokenizer.py
+++ b/src/axolotl/loaders/tokenizer.py
@@ -50,7 +50,7 @@ def modify_tokenizer_files(
tokenizer_dir = os.path.join(output_dir, "tokenizer")
os.makedirs(tokenizer_dir, exist_ok=True)
- if is_local_main_process(): # pylint: disable=too-many-nested-blocks
+ if is_local_main_process():
# Load the tokenizer
temp_tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, use_fast=True)
@@ -73,9 +73,9 @@ def modify_tokenizer_files(
for token_id, new_value in token_id_mappings.items():
token_id_str = str(token_id)
if token_id_str in config_data["added_tokens_decoder"]:
- config_data["added_tokens_decoder"][token_id_str][
- "content"
- ] = new_value
+ config_data["added_tokens_decoder"][token_id_str]["content"] = (
+ new_value
+ )
else:
raise ValueError(
f"Token ID {token_id_str} not found in added_tokens_decoder"
@@ -124,13 +124,8 @@ def load_tokenizer(cfg: DictDefault) -> PreTrainedTokenizer:
def _load_mistral_common_tokenizer(cfg: DictDefault):
"""Load mistral-common tokenizer"""
- from transformers import tokenization_mistral_common
-
from axolotl.utils.mistral import HFMistralTokenizer
- # patch
- tokenization_mistral_common.MistralCommonTokenizer = HFMistralTokenizer
-
# Load the HF-compatible wrapper around MistralTokenizer
tokenizer = HFMistralTokenizer.from_pretrained(cfg.tokenizer_config)
@@ -215,7 +210,7 @@ def load_tokenizer(cfg: DictDefault) -> PreTrainedTokenizer:
for k, val in special_tokens.items():
# check if new special token is not already in tokenizer and
# is adapter training to make sure lora_modules_to_save is set
- # pylint: disable=too-many-boolean-expressions
+
if (
(getattr(tokenizer, k) is None or getattr(tokenizer, k) != val)
and (len(tokenizer.encode(val, add_special_tokens=False)) > 2)
@@ -296,7 +291,7 @@ def load_tokenizer(cfg: DictDefault) -> PreTrainedTokenizer:
)
tokenizer.chat_template = chat_template_string
- else:
+ elif getattr(tokenizer, "chat_template", None) is None:
LOG.info(
"No Chat template selected. Consider adding a chat template for easier inference."
)
diff --git a/src/axolotl/logging_config.py b/src/axolotl/logging_config.py
index 10c5ae9dc..67b1d32f1 100644
--- a/src/axolotl/logging_config.py
+++ b/src/axolotl/logging_config.py
@@ -1,10 +1,7 @@
-"""
-Common logging module for axolotl
-"""
+"""Common logging module for axolotl."""
import logging
import os
-import sys
from logging import Formatter, Logger, LogRecord
from logging.config import dictConfig
from typing import Any, Dict
@@ -17,9 +14,9 @@ DEFAULT_LOG_LEVEL = "WARNING"
class AxolotlOrWarnErrorFilter(logging.Filter):
"""
- Allows ANY WARNING or higher (unless overridden by LOG_LEVEL)
- Allows axolotl.* at INFO or higher (unless overridden by AXOLOTL_LOG_LEVEL)
- Drops all other records (i.e. non-axolotl.INFO, DEBUG, etc. by default)
+ Allows ANY WARNING or higher (unless overridden by LOG_LEVEL). Allows axolotl.* at
+ INFO or higher (unless overridden by AXOLOTL_LOG_LEVEL). Drops all other records
+ (i.e. non-axolotl.INFO, DEBUG, etc. by default).
"""
def __init__(self, **kwargs):
@@ -52,13 +49,12 @@ class AxolotlOrWarnErrorFilter(logging.Filter):
class AxolotlLogger(Logger):
- """A Logger that automatically rejects non-axolotl INFOs."""
+ """Logger that applies filtering to non-axolotl loggers."""
def __init__(self, name: str, level: int = logging.NOTSET):
super().__init__(name, level)
-
- # set global filter on the logger itself
- self.addFilter(AxolotlOrWarnErrorFilter())
+ if not name.startswith("axolotl"):
+ self.addFilter(AxolotlOrWarnErrorFilter())
class ColorfulFormatter(Formatter):
@@ -74,6 +70,7 @@ class ColorfulFormatter(Formatter):
def format(self, record):
record.rank = int(os.getenv("LOCAL_RANK", "0"))
+ record.rank_fmt = f" [RANK:{record.rank}]" if record.rank != 0 else ""
log_message = super().format(record)
return self.COLORS.get(record.levelname, "") + log_message + Fore.RESET
@@ -87,32 +84,54 @@ DEFAULT_LOGGING_CONFIG: Dict[str, Any] = {
},
"colorful": {
"()": ColorfulFormatter,
- "format": "[%(asctime)s] [%(levelname)s] [%(name)s.%(funcName)s:%(lineno)d] [PID:%(process)d] [RANK:%(rank)d] %(message)s",
+ "format": "[%(asctime)s] [%(levelname)s] [%(name)s.%(funcName)s:%(lineno)d] [PID:%(process)d]%(rank_fmt)s %(message)s",
+ },
+ "concise": {
+ "format": "[%(asctime)s] [%(levelname)s] [%(name)s] %(message)s",
+ },
+ "concise_color": {
+ "()": ColorfulFormatter,
+ "format": "[%(asctime)s] [%(levelname)s] [%(name)s]%(rank_fmt)s %(message)s",
+ },
+ },
+ "filters": {
+ "ax_or_warn": {
+ "()": "axolotl.logging_config.AxolotlOrWarnErrorFilter",
},
},
- "filters": {},
"handlers": {
"console": {
"class": "logging.StreamHandler",
- "formatter": "simple",
- "filters": [],
- "stream": sys.stdout,
+ "formatter": "concise",
+ "filters": ["ax_or_warn"],
+ "stream": "ext://sys.stdout",
},
"color_console": {
"class": "logging.StreamHandler",
- "formatter": "colorful",
- "filters": [],
- "stream": sys.stdout,
+ "formatter": "concise_color",
+ "filters": ["ax_or_warn"],
+ "stream": "ext://sys.stdout",
+ },
+ "ax_file_only": {
+ "class": "logging.StreamHandler",
+ "level": "DEBUG",
+ "formatter": "simple",
+ "stream": "ext://axolotl.utils.tee.file_only_stream",
+ },
+ "root_file_only": {
+ "class": "logging.StreamHandler",
+ "level": "DEBUG",
+ "formatter": "simple",
+ "stream": "ext://axolotl.utils.tee.file_only_stream",
},
},
- # log level will be superseded by the AxolotlLogger
"root": {
- "handlers": ["console"],
- "level": os.getenv("LOG_LEVEL", DEFAULT_LOG_LEVEL),
+ "handlers": ["console", "root_file_only"],
+ "level": os.getenv("LOG_LEVEL", DEFAULT_LOG_LEVEL).upper(),
},
"loggers": {
"axolotl": {
- "handlers": ["color_console"],
+ "handlers": ["color_console", "ax_file_only"],
"level": os.getenv("AXOLOTL_LOG_LEVEL", DEFAULT_AXOLOTL_LOG_LEVEL).upper(),
"propagate": False,
},
@@ -123,9 +142,15 @@ DEFAULT_LOGGING_CONFIG: Dict[str, Any] = {
def configure_logging():
"""Configure with default logging"""
init() # Initialize colorama
+
dictConfig(DEFAULT_LOGGING_CONFIG)
logging.setLoggerClass(AxolotlLogger)
- # set default `ACCELERATE_LOG_LEVEL` to `LOG_LEVEL` if available and not set
+ # Route Python warnings through logging so they reach file handlers
+ logging.captureWarnings(True)
+
+ # Set default `ACCELERATE_LOG_LEVEL` to `LOG_LEVEL` if available and not set
if "ACCELERATE_LOG_LEVEL" not in os.environ:
- os.environ["ACCELERATE_LOG_LEVEL"] = os.getenv("LOG_LEVEL", DEFAULT_LOG_LEVEL)
+ os.environ["ACCELERATE_LOG_LEVEL"] = os.getenv(
+ "LOG_LEVEL", DEFAULT_LOG_LEVEL
+ ).upper()
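
The ax_or_warn filter attached to both console handlers implements the policy stated in the AxolotlOrWarnErrorFilter docstring. A simplified, runnable sketch of that policy (the real filter also honors the LOG_LEVEL and AXOLOTL_LOG_LEVEL overrides):

import logging

def allows(record: logging.LogRecord) -> bool:
    # axolotl.* records pass at INFO and above; everything else needs WARNING+.
    if record.name.startswith("axolotl"):
        return record.levelno >= logging.INFO
    return record.levelno >= logging.WARNING

rec = logging.LogRecord("axolotl.train", logging.INFO, __file__, 1, "msg", None, None)
assert allows(rec)
rec = logging.LogRecord("urllib3", logging.INFO, __file__, 1, "msg", None, None)
assert not allows(rec)
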
diff --git a/src/axolotl/models/mamba/__init__.py b/src/axolotl/models/mamba/__init__.py
index fee88e3a4..d6bb40d99 100644
--- a/src/axolotl/models/mamba/__init__.py
+++ b/src/axolotl/models/mamba/__init__.py
@@ -21,4 +21,4 @@ def fix_mamba_attn_for_loss():
from .modeling_mamba import MambaLMHeadModel as MambaLMHeadModelFixed
mixer_seq_simple.MambaLMHeadModel = MambaLMHeadModelFixed
- return mixer_seq_simple.MambaLMHeadModel # pylint: disable=invalid-name
+ return mixer_seq_simple.MambaLMHeadModel
diff --git a/src/axolotl/models/mamba/modeling_mamba.py b/src/axolotl/models/mamba/modeling_mamba.py
index 70e9c88c8..2cfe11544 100644
--- a/src/axolotl/models/mamba/modeling_mamba.py
+++ b/src/axolotl/models/mamba/modeling_mamba.py
@@ -1,4 +1,3 @@
-# pylint: skip-file
import os
from collections import namedtuple
from functools import partial
@@ -112,7 +111,7 @@ class MambaLMHeadModel(nn.Module, GenerationMixin):
self,
save_directory: Union[str, os.PathLike],
state_dict: Optional[dict] = None,
- safe_serialization: Optional[bool] = None, # pylint: disable=unused-argument
+ safe_serialization: Optional[bool] = None,
):
if state_dict is None:
state_dict = self.state_dict()
diff --git a/src/axolotl/monkeypatch/accelerate/fsdp2.py b/src/axolotl/monkeypatch/accelerate/fsdp2.py
index efc388294..af6f24a63 100644
--- a/src/axolotl/monkeypatch/accelerate/fsdp2.py
+++ b/src/axolotl/monkeypatch/accelerate/fsdp2.py
@@ -4,6 +4,7 @@ monkeypatch for accelerate fsdp2 fix when modifying ordereddict during iteration
import copy
import functools
+import os
import sys
import torch
@@ -130,9 +131,9 @@ def get_state_dict(self, model, unwrap=True):
"Deepspeed TP requires deepspeed >= 0.16.4, Please update DeepSpeed via `pip install deepspeed -U`."
)
state_dict = (
- model._consolidated_16bit_state_dict() # pylint: disable=protected-access
+ model._consolidated_16bit_state_dict()
if tp_sharding
- else model._zero3_consolidated_16bit_state_dict() # pylint: disable=protected-access
+ else model._zero3_consolidated_16bit_state_dict()
)
else:
raise ValueError(
@@ -160,9 +161,11 @@ def get_state_dict(self, model, unwrap=True):
state_dict[param_name] = param.cpu()
torch.distributed.barrier()
elif self.distributed_type == DistributedType.FSDP:
- from torch.distributed.fsdp import FullStateDictConfig
- from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
- from torch.distributed.fsdp import StateDictType
+ from torch.distributed.fsdp import (
+ FullStateDictConfig,
+ FullyShardedDataParallel as FSDP,
+ StateDictType,
+ )
full_state_dict_config = FullStateDictConfig(
offload_to_cpu=True, rank0_only=True
@@ -187,7 +190,7 @@ def _process_lora_module_for_fsdp(module, fsdp2_kwargs):
# Linear4Bit will keep its bias term in fp32. If the weight dtype is bf16 we are not able to
# wrap this. Therefore we must ensure the bias has the same dtype as the weight
- if module.base_layer.bias is not None:
+ if hasattr(module.base_layer, "bias") and module.base_layer.bias is not None:
if module.base_layer.weight.dtype != module.base_layer.bias.dtype:
log_bias_dtype_mismatch = True
module.base_layer.bias.data = module.base_layer.bias.data.to(
@@ -231,8 +234,7 @@ def fsdp2_prepare_model(accelerator, model: torch.nn.Module) -> torch.nn.Module:
)
is_type_fsdp = isinstance(model, FSDPModule) or (
- is_compiled_module(model)
- and isinstance(model._orig_mod, FSDPModule) # pylint: disable=protected-access
+ is_compiled_module(model) and isinstance(model._orig_mod, FSDPModule)
)
if is_type_fsdp:
return model
@@ -276,6 +278,11 @@ def fsdp2_prepare_model(accelerator, model: torch.nn.Module) -> torch.nn.Module:
mesh = getattr(accelerator.state, "device_mesh", None)
+ # Disable memory pinning if requested
+ offload_to_cpu = isinstance(fsdp2_plugin.cpu_offload, CPUOffloadPolicy)
+ if offload_to_cpu and os.environ.get("FSDP_CPU_OFFLOAD_PIN_MEMORY", "") == "false":
+ fsdp2_plugin.cpu_offload.pin_memory = False
+
fsdp2_kwargs = {
"reshard_after_forward": fsdp2_plugin.reshard_after_forward,
"offload_policy": fsdp2_plugin.cpu_offload,
@@ -340,7 +347,6 @@ def fsdp2_prepare_model(accelerator, model: torch.nn.Module) -> torch.nn.Module:
)
if fsdp2_plugin.cpu_ram_efficient_loading:
- offload_to_cpu = isinstance(fsdp2_plugin.cpu_offload, CPUOffloadPolicy)
fsdp2_load_full_state_dict(
accelerator, model, original_sd, offload_to_cpu=offload_to_cpu
)
diff --git a/src/axolotl/monkeypatch/accelerate/parallelism_config.py b/src/axolotl/monkeypatch/accelerate/parallelism_config.py
index e3cafc87d..b2157fb6b 100644
--- a/src/axolotl/monkeypatch/accelerate/parallelism_config.py
+++ b/src/axolotl/monkeypatch/accelerate/parallelism_config.py
@@ -2,7 +2,6 @@
workaround to allow parallelism config for pure CP
"""
-# pylint: disable=protected-access
import os
import warnings
@@ -30,7 +29,7 @@ def _validate_accelerator(self, accelerator):
allow_parallelism_config = False
if (
- self.cp_size > 1 # pylint: disable=chained-comparison
+ self.cp_size > 1
and self.dp_shard_size <= 1
and os.environ.get("ACCELERATE_ALLOW_CP_STANDALONE", "false").lower() == "true"
):
@@ -55,6 +54,7 @@ def _validate_accelerator(self, accelerator):
warnings.warn(
"ParallelismConfig has the following warnings:\n" + "\n".join(_warnings),
UserWarning,
+ stacklevel=2,
)
diff --git a/src/axolotl/monkeypatch/attention/flex_attn.py b/src/axolotl/monkeypatch/attention/flex_attn.py
index 98aead832..678f65bee 100644
--- a/src/axolotl/monkeypatch/attention/flex_attn.py
+++ b/src/axolotl/monkeypatch/attention/flex_attn.py
@@ -1,10 +1,11 @@
"""Flex attention monkey patch"""
import sys
-from typing import Optional, Tuple, Union
import torch
import transformers
+from packaging import version
+from transformers.utils.import_utils import _torch_version, is_torch_less_or_equal
from axolotl.utils.logging import get_logger
@@ -46,167 +47,39 @@ def patch_flex_wrapper(**flex_attn_compile_kwargs):
"""
self.training = None
if not self._is_flex_compiled or training != self.training:
+ self.training = training
+ if is_torch_less_or_equal("2.5.1"):
+ self._compiled_flex_attention = torch.compile(
+ flex_attention, dynamic=False
+ )
# In PyTorch 2.6.0, there's a known issue with flex attention compilation which may
# cause errors. The suggested fix is to compile with "max-autotune-no-cudagraphs"
# see https://github.com/pytorch/pytorch/issues/146260 for training
- self.training = training
- LOG.info(
- "Compiling flex attention with kwargs: %s. This may take a while...",
- flex_attn_compile_kwargs,
- )
- self._compiled_flex_attention = torch.compile(
- flex_attention,
- **flex_attn_compile_kwargs,
- )
- LOG.info("Flex attention compiled successfully.")
+ elif version.parse(_torch_version).base_version == "2.6.0" and training:
+ self._compiled_flex_attention = torch.compile(
+ flex_attention, dynamic=False, mode="max-autotune-no-cudagraphs"
+ )
+ # Fallback, usually the most recent torch 2.7.x+ versions
+ else:
+ LOG.info(
+ "Compiling flex attention with kwargs: %s. This may take a while...",
+ flex_attn_compile_kwargs,
+ main_process_only=True,
+ )
+ self._compiled_flex_attention = torch.compile(
+ flex_attention,
+ **flex_attn_compile_kwargs,
+ )
+ LOG.info(
+ "Flex attention compiled successfully.", main_process_only=True
+ )
+
self._is_flex_compiled = True
def __call__(self):
return self._compiled_flex_attention
transformers.integrations.flex_attention.WrappedFlexAttention = WrappedFlexAttention
- setattr(
- sys.modules["transformers.integrations.flex_attention"],
- "WrappedFlexAttention",
- WrappedFlexAttention,
- )
-
-
-def patch_flex_make_mask():
- is_torch_2_6 = torch.__version__.startswith("2.6")
-
- if not is_torch_2_6:
- return
-
- from torch.nn.attention.flex_attention import (
- _DEFAULT_SPARSE_BLOCK_SIZE as flex_default_block_size,
- )
- from torch.nn.attention.flex_attention import (
- BlockMask,
- )
- from torch.nn.attention.flex_attention import (
- create_block_mask as create_block_causal_mask_flex,
- )
-
- Offset = Union[torch.Tensor, int]
-
- def patched_make_flex_block_causal_mask(
- attention_mask_2d: torch.Tensor,
- attention_chunk_size: Optional[int] = None,
- query_length=None,
- key_length=None,
- offsets: Optional[Tuple[Offset, Offset]] = None,
- ) -> "BlockMask":
- """
- Create a block causal document mask for a batch of sequences, both packed and unpacked.
- Create Block causal logic and passing it into :func:`torch.nn.attention.flex_attention.create_block_mask`.
- The resultant BlockMask is a compressed representation of the full block causal
- mask. BlockMask is essential for performant computation of flex attention.
- See: https://pytorch.org/blog/flexattention/
-
- Args:
- attention_mask_2d (torch.Tensor): Attention mask for packed and padded sequences
- of shape (batch_size, total_seq_len). e.g.
-
- For unpacked sequence:
- [[1, 1, 1, 1, 0, 0, 0],
- [1, 1, 1, 1, 1, 0, 0]]
-
- For packed sequence:
- [[1, 1, 1, 2, 2, 2, 0],
- [1, 1, 2, 2, 2, 3, 3]]
-
- Returns:
- BlockMask
- """
-
- batch_size, total_seq_len = attention_mask_2d.shape
- if not key_length:
- key_length = total_seq_len
- if not query_length:
- query_length = total_seq_len
- attention_mask_2d = torch.nn.functional.pad(
- attention_mask_2d,
- value=0,
- pad=(0, abs(total_seq_len - max(key_length, flex_default_block_size))),
- )
- device = attention_mask_2d.device
- document_ids = attention_mask_2d.clone()
-
- if attention_chunk_size is not None:
- # we create an arange, then we just // by chunk size to get [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]
- chunk_idxs = (document_ids.clone().fill_(1).cumsum(-1) - 1) // (
- attention_chunk_size
- )
-
- # Instead of passing a tensor mask, flex attention requires a mask_mod function
- # that determines which elements of QK^T should be included in the attention
- # computation prior to the softmax. For sample packing, we need both the
- # logic for both causal mask and document mask. See PyTorch's official
- # blog post for more details: https://pytorch.org/blog/flexattention/#mask-mods
- def causal_mask_mod(
- batch_idx, head_idx, q_idx, kv_idx
- ): # pylint: disable=unused-argument
- """
- Defines the logic of a block causal mask by combining both a standard causal mask
- and a block diagonal document mask.
-
- See :func:`~torchtune.modules.attention_utils.create_block_causal_mask`
- for an illustration.
- """
- causal_mask = q_idx >= kv_idx # not valid when decoding
- document_mask = (
- document_ids[batch_idx, q_idx] == document_ids[batch_idx, kv_idx]
- )
- padding_mask = attention_mask_2d[batch_idx, q_idx] > 0
- final_mask = causal_mask & padding_mask & document_mask
- return final_mask
-
- def chunk_causal_mask_mod(batch_idx, head_idx, q_idx, kv_idx):
- """
- Combines the chunk mask with the causal mask for chunked attention.
- """
- chunk_mask = chunk_idxs[batch_idx, q_idx] == chunk_idxs[batch_idx, kv_idx]
- causal_doc_mask = causal_mask_mod(batch_idx, head_idx, q_idx, kv_idx)
- return chunk_mask & causal_doc_mask
-
- mask_mod_maybe_combined = (
- causal_mask_mod if attention_chunk_size is None else chunk_causal_mask_mod
- )
-
- if offsets is not None:
- q_offset = offsets[0]
- kv_offset = offsets[1]
-
- def mask_mod(batch_idx, head_idx, q_idx, kv_idx):
- offset_q = q_idx + q_offset
- offset_kv = kv_idx + kv_offset
- return mask_mod_maybe_combined(batch_idx, head_idx, offset_q, offset_kv)
-
- else:
- mask_mod = mask_mod_maybe_combined
- return create_block_causal_mask_flex(
- mask_mod=mask_mod,
- B=batch_size,
- H=None, # attention head
- Q_LEN=query_length,
- KV_LEN=key_length,
- device=device,
- _compile=True,
- )
-
- for n in tuple(sys.modules):
- if ".modeling_" in n:
- if hasattr(sys.modules[n], "make_flex_block_causal_mask"):
- sys.modules[n].make_flex_block_causal_mask = (
- patched_make_flex_block_causal_mask
- )
- setattr(
- sys.modules[n],
- "make_flex_block_causal_mask",
- patched_make_flex_block_causal_mask,
- )
-
- transformers.integrations.flex_attention.make_flex_block_causal_mask = (
- patched_make_flex_block_causal_mask
- )
+ sys.modules[
+ "transformers.integrations.flex_attention"
+ ].WrappedFlexAttention = WrappedFlexAttention
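
The compile selection above branches on the installed torch version. The decision logic, isolated from the wrapper class as a runnable sketch (thresholds mirror the patch; torch itself is not imported here):

from packaging import version

def pick_compile_kwargs(torch_version: str, training: bool, user_kwargs: dict) -> dict:
    v = version.parse(torch_version).base_version
    if version.parse(v) <= version.parse("2.5.1"):
        return {"dynamic": False}
    if v == "2.6.0" and training:
        # Works around the torch 2.6.0 flex-attention compile issue noted above.
        return {"dynamic": False, "mode": "max-autotune-no-cudagraphs"}
    # Fallback for torch 2.7.x+ honors the user-supplied compile kwargs.
    return dict(user_kwargs)

assert pick_compile_kwargs("2.6.0", True, {})["mode"] == "max-autotune-no-cudagraphs"
assert pick_compile_kwargs("2.8.0", True, {"dynamic": False}) == {"dynamic": False}
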
diff --git a/src/axolotl/monkeypatch/attention/xformers.py b/src/axolotl/monkeypatch/attention/xformers.py
index 5901963f0..eca95797a 100644
--- a/src/axolotl/monkeypatch/attention/xformers.py
+++ b/src/axolotl/monkeypatch/attention/xformers.py
@@ -23,15 +23,15 @@ def xformers_attention_forward(
value: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
- dropout: float = 0.0, # pylint: disable=unused-argument
- scaling: Optional[float] = None, # pylint: disable=unused-argument
- sliding_window: Optional[int] = None, # pylint: disable=unused-argument
- softcap: Optional[float] = None, # pylint: disable=unused-argument
+ dropout: float = 0.0,
+ scaling: Optional[float] = None,
+ sliding_window: Optional[int] = None,
+ softcap: Optional[float] = None,
cu_seq_lens_q: Optional[torch.LongTensor] = None,
cu_seq_lens_k: Optional[torch.LongTensor] = None,
max_length_q: Optional[int] = None,
- max_length_k: Optional[int] = None, # pylint: disable=unused-argument
- **kwargs, # pylint: disable=unused-argument
+ max_length_k: Optional[int] = None,
+ **kwargs,
):
# Get dimensions
# query: [batch, heads, seq_len, hidden_dim]
diff --git a/src/axolotl/monkeypatch/btlm_attn_hijack_flash.py b/src/axolotl/monkeypatch/btlm_attn_hijack_flash.py
index 589980c8b..2c5077392 100644
--- a/src/axolotl/monkeypatch/btlm_attn_hijack_flash.py
+++ b/src/axolotl/monkeypatch/btlm_attn_hijack_flash.py
@@ -25,9 +25,7 @@ def replace_btlm_attn_with_flash_attn(model_name="cerebras/btlm-3b-8k-base"):
".configuration_btlm", ".modeling_btlm"
)
modeling_btlm = importlib.import_module(module_name)
- modeling_btlm.BTLMAttention._attn = ( # pylint: disable=protected-access
- flashattn_attn
- )
+ modeling_btlm.BTLMAttention._attn = flashattn_attn
def flashattn_attn(
@@ -35,9 +33,9 @@ def flashattn_attn(
query: torch.Tensor,
key: Optional[torch.Tensor] = None,
value: Optional[torch.Tensor] = None,
- attention_mask: Optional[torch.Tensor] = None, # pylint: disable=unused-argument
+ attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
- position_bias: Optional[torch.Tensor] = None, # pylint: disable=unused-argument
+ position_bias: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
softmax_scale = (
1 / (key.size(-1) ** self.attn_scale_power) if self.scale_attn_weights else None
diff --git a/src/axolotl/monkeypatch/data/batch_dataset_fetcher.py b/src/axolotl/monkeypatch/data/batch_dataset_fetcher.py
index df8d106fd..c426344a6 100644
--- a/src/axolotl/monkeypatch/data/batch_dataset_fetcher.py
+++ b/src/axolotl/monkeypatch/data/batch_dataset_fetcher.py
@@ -1,15 +1,23 @@
-"""monkey patches for the dataset fetcher to handle batches of packed indexes"""
-
-# pylint: disable=protected-access
+"""Monkey patches for the dataset fetcher to handle batches of packed indexes."""
import torch
from torch.utils.data._utils.fetch import _BaseDatasetFetcher
from torch.utils.data._utils.worker import _worker_loop
+_ORIGINAL_MAP_DATASET_FETCHER = None
+_ORIGINAL_WORKER_LOOP = None
+_IS_PATCHED = False
+
class _MapDatasetFetcher(_BaseDatasetFetcher):
+ """
+ Custom dataset fetcher that handles nested batch structures from
+ MultipackBatchSampler.
+ """
+
def fetch(self, possibly_batched_index):
if isinstance(possibly_batched_index[0], list):
+ # Handle nested structure from MultipackBatchSampler
data = [None for i in possibly_batched_index]
for i, possibly_batched_index_ in enumerate(possibly_batched_index):
if self.auto_collation:
@@ -23,6 +31,7 @@ class _MapDatasetFetcher(_BaseDatasetFetcher):
else:
data[i] = self.dataset[possibly_batched_index_]
else:
+ # Standard batch handling
if self.auto_collation:
if hasattr(self.dataset, "__getitems__") and self.dataset.__getitems__:
data = self.dataset.__getitems__(possibly_batched_index)
@@ -34,14 +43,54 @@ class _MapDatasetFetcher(_BaseDatasetFetcher):
def patch_fetchers():
+ """Apply patches to PyTorch's DataLoader components."""
torch.utils.data._utils.fetch._MapDatasetFetcher = _MapDatasetFetcher
torch.utils.data.dataloader._utils.fetch._MapDatasetFetcher = _MapDatasetFetcher
def patched_worker_loop(*args, **kwargs):
+ """Worker loop that ensures patches are applied in worker processes."""
patch_fetchers()
return _worker_loop(*args, **kwargs)
-torch.utils.data._utils.worker._worker_loop = patched_worker_loop
-patch_fetchers()
+def apply_multipack_dataloader_patch():
+ """
+ Patch PyTorch's DataLoader so it can correctly process batches that contain
+ multiple bins of packed sequences, as produced by MultipackBatchSampler.
+ """
+ # pylint: disable=global-statement
+ global _ORIGINAL_MAP_DATASET_FETCHER, _ORIGINAL_WORKER_LOOP, _IS_PATCHED
+
+ if _IS_PATCHED:
+ return
+
+ # Store original implementations
+ _ORIGINAL_MAP_DATASET_FETCHER = torch.utils.data._utils.fetch._MapDatasetFetcher
+ _ORIGINAL_WORKER_LOOP = torch.utils.data._utils.worker._worker_loop
+
+ # Apply patches
+ patch_fetchers()
+ torch.utils.data._utils.worker._worker_loop = patched_worker_loop
+
+ _IS_PATCHED = True
+
+
+def remove_multipack_dataloader_patch():
+ """Remove the monkeypatch and restore original PyTorch DataLoader behavior."""
+ # pylint: disable=global-statement
+ global _IS_PATCHED
+
+ if not _IS_PATCHED:
+ return
+
+ if _ORIGINAL_MAP_DATASET_FETCHER:
+ torch.utils.data._utils.fetch._MapDatasetFetcher = _ORIGINAL_MAP_DATASET_FETCHER
+ torch.utils.data.dataloader._utils.fetch._MapDatasetFetcher = (
+ _ORIGINAL_MAP_DATASET_FETCHER
+ )
+
+ if _ORIGINAL_WORKER_LOOP:
+ torch.utils.data._utils.worker._worker_loop = _ORIGINAL_WORKER_LOOP
+
+ _IS_PATCHED = False
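
apply_multipack_dataloader_patch and remove_multipack_dataloader_patch follow a standard reversible-monkeypatch shape: store the original once, install the replacement idempotently, and restore on demand. A generic, runnable sketch of the same shape against a hypothetical `fetch_fn` attribute (the real patch targets torch.utils.data internals):

from types import SimpleNamespace

_ORIGINAL = None
_IS_PATCHED = False

def apply_patch(target, replacement) -> None:
    global _ORIGINAL, _IS_PATCHED
    if _IS_PATCHED:  # idempotent: never double-patch
        return
    _ORIGINAL = target.fetch_fn
    target.fetch_fn = replacement
    _IS_PATCHED = True

def remove_patch(target) -> None:
    global _ORIGINAL, _IS_PATCHED
    if not _IS_PATCHED:
        return
    target.fetch_fn = _ORIGINAL
    _IS_PATCHED = False

target = SimpleNamespace(fetch_fn=lambda: "original")
apply_patch(target, lambda: "patched")
assert target.fetch_fn() == "patched"
remove_patch(target)
assert target.fetch_fn() == "original"
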
diff --git a/src/axolotl/monkeypatch/deepspeed_utils.py b/src/axolotl/monkeypatch/deepspeed_utils.py
new file mode 100644
index 000000000..d7e69e112
--- /dev/null
+++ b/src/axolotl/monkeypatch/deepspeed_utils.py
@@ -0,0 +1,67 @@
+import importlib
+import importlib.util
+
+from axolotl.utils.logging import get_logger
+
+LOG = get_logger(__name__)
+
+
+def patch_checkpoint_wrapper_setattr():
+ """
+ Patch CheckpointWrapper to properly forward DeepSpeed attributes to wrapped modules.
+
+ This fixes the issue where CheckpointWrapper doesn't forward ds_* attributes
+ (like ds_grads_remaining) to the actual wrapped module, causing DeepSpeed
+ ZeRO-3 to fail when gradient checkpointing is enabled.
+
+ This issue occurs specifically with:
+ - QLoRA + DeepSpeed ZeRO-3
+ - gradient_checkpointing: true
+ - activation_offloading: true
+
+ References:
+ - https://github.com/deepspeedai/DeepSpeed/issues/7203
+ - https://github.com/deepspeedai/DeepSpeed/blob/38d1a9eb64c9e01e32eccc50b25ba18925287441/deepspeed/runtime/zero/parameter_offload.py#L424-L458
+ - https://github.com/axolotl-ai-cloud/axolotl/pull/3102
+ """
+
+ try:
+ from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
+ CheckpointWrapper,
+ )
+
+ # Check if already patched
+ if hasattr(CheckpointWrapper, "_axolotl_setattr_patched"):
+ LOG.debug("CheckpointWrapper already patched")
+ return
+
+ original_setattr = CheckpointWrapper.__setattr__
+
+ def new_setattr(self, name: str, value) -> None:
+ if name.startswith("ds_") and hasattr(self, "_checkpoint_wrapped_module"):
+ setattr(self._checkpoint_wrapped_module, name, value)
+ LOG.debug(
+ f"Forwarded {name} to wrapped module {type(self._checkpoint_wrapped_module).__name__}"
+ )
+ else:
+ original_setattr(self, name, value)
+
+ CheckpointWrapper.__setattr__ = new_setattr
+ CheckpointWrapper._axolotl_setattr_patched = True
+
+ LOG.info("CheckpointWrapper patched to forward DeepSpeed attributes")
+
+ except ImportError as e:
+ LOG.debug(f"CheckpointWrapper not available: {e}")
+ except Exception as e:
+ LOG.warning(f"Failed to patch CheckpointWrapper: {e}")
+
+
+def apply_deepspeed_patches():
+ """
+ Apply DeepSpeed-related patches.
+ """
+ if importlib.util.find_spec("deepspeed") is not None:
+ patch_checkpoint_wrapper_setattr()
+ else:
+ LOG.debug("DeepSpeed not available, skipping patches")
diff --git a/src/axolotl/monkeypatch/fsdp2_qlora.py b/src/axolotl/monkeypatch/fsdp2_qlora.py
index a2cb7e472..04d0d1971 100644
--- a/src/axolotl/monkeypatch/fsdp2_qlora.py
+++ b/src/axolotl/monkeypatch/fsdp2_qlora.py
@@ -9,74 +9,12 @@ Params4bit parameters.
import importlib
import inspect
-import torch
-from torch.nn import Parameter
-
from axolotl.monkeypatch.utils import detab_code
from axolotl.utils.logging import get_logger
LOG = get_logger(__name__)
-def patched_torch_function(cls, func, types, args=(), kwargs=None):
- """
- Patched version of Params4bit.__torch_function__ for preserving Params4bit
- class identity and attributes.
- """
- if kwargs is None:
- kwargs = {}
-
- if func in [torch.chunk, torch.split]:
- tensor = args[0]
- result = Parameter.__torch_function__(func, types, args, kwargs)
-
- if isinstance(result, tuple):
- return tuple(
- cls(
- data=chunk,
- requires_grad=tensor.requires_grad,
- quant_state=tensor.quant_state,
- blocksize=tensor.blocksize,
- compress_statistics=tensor.compress_statistics,
- quant_type=tensor.quant_type,
- quant_storage=tensor.quant_storage,
- module=tensor.module,
- bnb_quantized=tensor.bnb_quantized,
- )
- for chunk in result
- )
-
- return cls(
- data=result,
- requires_grad=tensor.requires_grad,
- quant_state=tensor.quant_state,
- blocksize=tensor.blocksize,
- compress_statistics=tensor.compress_statistics,
- quant_type=tensor.quant_type,
- quant_storage=tensor.quant_storage,
- module=tensor.module,
- bnb_quantized=tensor.bnb_quantized,
- )
-
- return Parameter.__torch_function__(func, types, args, kwargs)
-
-
-# pylint: disable=protected-access
-def apply_bnb_torch_function_patch():
- """
- Patch Params4bit.__torch_function__ using Axolotl-style approach.
-
- Returns:
- True if patching succeeded, False otherwise.
- """
- from bitsandbytes.nn.modules import Params4bit
-
- Params4bit.__torch_function__ = classmethod(patched_torch_function)
-
- LOG.info("Successfully patched Params4bit.__torch_function__")
-
-
-# pylint: disable=protected-access
def apply_init_sharded_param_patch():
"""Apply patch to FSDPParam._init_sharded_param to support Params4bit."""
from torch.distributed.fsdp._fully_shard._fsdp_param import FSDPParam
@@ -127,14 +65,14 @@ def apply_init_sharded_param_patch():
if item in patched_source:
items_to_import.append(item)
- exec( # pylint: disable=exec-used # nosec B102
+ exec( # nosec B102
f"from {module_name} import ({', '.join(items_to_import)})",
globals(),
)
- exec(patched_source, globals()) # pylint: disable=exec-used # nosec B102
+ exec(patched_source, globals()) # nosec B102
# Replace the method
- FSDPParam._init_sharded_param = patched_init_sharded_param # pylint: disable=undefined-variable # noqa: F821
+ FSDPParam._init_sharded_param = patched_init_sharded_param
LOG.info("Successfully applied FSDP _init_sharded_param patch")
else:
LOG.warning("Could not find target code for _init_sharded_param patching")
@@ -192,14 +130,14 @@ def apply_init_unsharded_param_patch():
if item in patched_source:
items_to_import.append(item)
- exec( # pylint: disable=exec-used # nosec B102
+ exec( # nosec B102
f"from {module_name} import ({', '.join(items_to_import)})",
globals(),
)
- exec(patched_source, globals()) # pylint: disable=exec-used # nosec B102
+ exec(patched_source, globals()) # nosec B102
# Replace the method
- FSDPParam.init_unsharded_param = patched_init_unsharded_param # pylint: disable=undefined-variable # noqa: F821
+ FSDPParam.init_unsharded_param = patched_init_unsharded_param
LOG.info("Successfully applied FSDP init_unsharded_param patch")
else:
LOG.warning("Could not find target code for patching")
diff --git a/src/axolotl/monkeypatch/gradient_checkpointing/__init__.py b/src/axolotl/monkeypatch/gradient_checkpointing/__init__.py
index 3b090d5e5..b58bbb67c 100644
--- a/src/axolotl/monkeypatch/gradient_checkpointing/__init__.py
+++ b/src/axolotl/monkeypatch/gradient_checkpointing/__init__.py
@@ -25,9 +25,7 @@ else:
return False
-def hf_grad_checkpoint_offload_wrapper(
- decoder_layer, *args, use_reentrant=None
-): # pylint: disable=unused-argument
+def hf_grad_checkpoint_offload_wrapper(decoder_layer, *args, use_reentrant=None):
if uses_gc_layers(decoder_layer):
return CPU_Offloaded_Gradient_Checkpointer.apply(
decoder_layer,
@@ -44,9 +42,7 @@ def hf_grad_checkpoint_offload_wrapper(
)
-def hf_grad_checkpoint_disk_offload_wrapper(
- decoder_layer, *args, use_reentrant=None
-): # pylint: disable=unused-argument
+def hf_grad_checkpoint_disk_offload_wrapper(decoder_layer, *args, use_reentrant=None):
if uses_gc_layers(decoder_layer):
return Disco.apply(
decoder_layer,
diff --git a/src/axolotl/monkeypatch/gradient_checkpointing/offload_cpu.py b/src/axolotl/monkeypatch/gradient_checkpointing/offload_cpu.py
index bbcfb91e6..8d06f172d 100644
--- a/src/axolotl/monkeypatch/gradient_checkpointing/offload_cpu.py
+++ b/src/axolotl/monkeypatch/gradient_checkpointing/offload_cpu.py
@@ -35,9 +35,7 @@ else:
torch_cuda_amp_custom_bwd = torch.amp.custom_bwd(device_type="cuda")
-class CPU_Offloaded_Gradient_Checkpointer( # pylint: disable=invalid-name
- torch.autograd.Function
-):
+class CPU_Offloaded_Gradient_Checkpointer(torch.autograd.Function):
"""
Saves VRAM by smartly offloading to RAM.
Tiny hit to performance, since we mask the movement via non blocking calls.
@@ -66,6 +64,4 @@ class CPU_Offloaded_Gradient_Checkpointer( # pylint: disable=invalid-name
return (
None,
hidden_states.grad,
- ) + (
- None,
- ) * len(ctx.args)
+ ) + (None,) * len(ctx.args)
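# Editor's note: a simplified sketch of the offloaded-checkpoint idea behind
# CPU_Offloaded_Gradient_Checkpointer: run the layer without grad, stash the
# input on CPU, and recompute on the original device during backward. The
# real implementation also uses non-blocking copies and autocast wrappers,
# which are omitted here.
import torch


class OffloadedCheckpoint(torch.autograd.Function):
    @staticmethod
    def forward(ctx, fn, x):
        ctx.fn = fn
        ctx.device = x.device
        ctx.save_for_backward(x.detach().to("cpu"))  # offload the activation
        with torch.no_grad():
            return fn(x)

    @staticmethod
    def backward(ctx, grad_out):
        (x_cpu,) = ctx.saved_tensors
        x = x_cpu.to(ctx.device).requires_grad_(True)  # bring it back
        with torch.enable_grad():
            out = ctx.fn(x)  # recompute the forward pass
        torch.autograd.backward(out, grad_out)
        return None, x.grad


layer = torch.nn.Linear(4, 4)
inp = torch.randn(2, 4, requires_grad=True)
OffloadedCheckpoint.apply(layer, inp).sum().backward()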
diff --git a/src/axolotl/monkeypatch/gradient_checkpointing/offload_disk.py b/src/axolotl/monkeypatch/gradient_checkpointing/offload_disk.py
index 792d3c6ef..220799fbf 100644
--- a/src/axolotl/monkeypatch/gradient_checkpointing/offload_disk.py
+++ b/src/axolotl/monkeypatch/gradient_checkpointing/offload_disk.py
@@ -62,9 +62,9 @@ class DiskOffloadManager:
# Track tensor paths and their status
self.tensor_paths: deque = deque() # Ordered history of tensor paths (LIFO)
- self.file_locks: Dict[str, threading.Lock] = (
- {}
- ) # Maps file_path -> threading.Lock()
+ self.file_locks: Dict[
+ str, threading.Lock
+ ] = {} # Maps file_path -> threading.Lock()
# Maps file_path -> status ("saving", "ready", "prefetching", "loaded", "deleted")
self.file_status: Dict[str, str] = {}
@@ -236,7 +236,7 @@ class DiskOffloadManager:
self.tensor_paths.append(file_path)
# Acquire semaphore to limit concurrent save operations
- self.save_semaphore.acquire() # pylint: disable=consider-using-with
+ self.save_semaphore.acquire()
# Queue tensor for saving in background
self.save_queue.put((tensor.detach(), file_path))
diff --git a/src/axolotl/monkeypatch/llama_attn_hijack_flash.py b/src/axolotl/monkeypatch/llama_attn_hijack_flash.py
index 1316b5374..3953cb138 100644
--- a/src/axolotl/monkeypatch/llama_attn_hijack_flash.py
+++ b/src/axolotl/monkeypatch/llama_attn_hijack_flash.py
@@ -2,6 +2,7 @@
# copied from https://github.com/lm-sys/FastChat/blob/main/fastchat/train/llama_flash_attn_monkey_patch.py
+import importlib.util
import warnings
from typing import Optional, Tuple
@@ -19,7 +20,7 @@ from axolotl.monkeypatch.utils import set_module_name
from axolotl.utils.logging import get_logger
try:
- from flash_attn.flash_attn_interface import ( # pylint: disable=ungrouped-imports
+ from flash_attn.flash_attn_interface import (
flash_attn_varlen_qkvpacked_func,
)
except ImportError:
@@ -32,12 +33,7 @@ LOG = get_logger(__name__)
def is_xformers_available() -> bool:
- try:
- import xformers # pylint: disable=unused-import # noqa: F401
-
- return True
- except ImportError:
- return False
+ return importlib.util.find_spec("xformers") is not None
def is_xformers_swiglu_available() -> bool:
@@ -83,7 +79,7 @@ def patch_fa_llama_cross_entropy():
num_items_in_batch: int = None,
ignore_index: int = -100,
**kwargs,
- ): # pylint: disable=unused-argument
+ ):
reduction = "sum" if num_items_in_batch is not None else "mean"
loss, _ = flash_attn_cross_entropy_loss(
source, target, ignore_index=ignore_index
@@ -120,9 +116,7 @@ def replace_llama_attn_with_flash_attn(
rms_norm: Optional[bool] = False,
use_shifted_sparse_attn: Optional[bool] = False,
):
- transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = ( # pylint: disable=protected-access
- _prepare_decoder_attention_mask
- )
+ transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = _prepare_decoder_attention_mask
if use_shifted_sparse_attn:
transformers.models.llama.modeling_llama.LlamaAttention.forward = (
flashattn_forward_with_s2attn
@@ -145,7 +139,7 @@ def _prepare_decoder_attention_mask(
input_shape,
inputs_embeds,
past_key_values_length,
-): # pylint: disable=unused-argument
+):
# [bsz, seq_len]
return attention_mask
@@ -161,9 +155,9 @@ def flashattn_forward_with_s2attn(
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: bool = False,
use_cache: bool = False,
- padding_mask: Optional[torch.LongTensor] = None, # pylint: disable=unused-argument
- cu_seqlens: Optional[torch.Tensor] = None, # pylint: disable=unused-argument
- max_seqlen: Optional[torch.Tensor] = None, # pylint: disable=unused-argument
+ padding_mask: Optional[torch.LongTensor] = None,
+ cu_seqlens: Optional[torch.Tensor] = None,
+ max_seqlen: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
"""Input shape: Batch x Time x Channel
@@ -176,7 +170,8 @@ def flashattn_forward_with_s2attn(
"""
if output_attentions:
warnings.warn(
- "Output attentions is not supported for patched `LlamaAttention`, returning `None` instead."
+ "Output attentions is not supported for patched `LlamaAttention`, returning `None` instead.",
+ stacklevel=2,
)
bsz, q_len, _ = hidden_states.size()
@@ -198,7 +193,6 @@ def flashattn_forward_with_s2attn(
)
# [bsz, q_len, nh, hd]
# [bsz, nh, q_len, hd]
- # pylint: disable=duplicate-code
cos, sin = self.rotary_emb(value_states, position_ids=position_ids)
query_states, key_states = apply_rotary_pos_emb(
@@ -244,9 +238,7 @@ def flashattn_forward_with_s2attn(
.permute(0, 3, 1, 2, 4, 5)
.reshape(bsz * 2, q_len, 3, self.num_heads // 2, self.head_dim)
)
- x = rearrange( # pylint: disable=invalid-name
- qkv, "b s three h d -> b s (three h d)"
- )
+ x = rearrange(qkv, "b s three h d -> b s (three h d)")
x_unpad, indices, cu_q_lens, max_s = unpad_input(x, key_padding_mask)
cu_q_len_tmp = torch.arange(
0, max_s, group_size, device=key_padding_mask.device, dtype=cu_q_lens.dtype
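# Editor's note: the importlib.util.find_spec idiom now used by
# is_xformers_available checks availability without importing the package,
# so no import-time side effects occur. Minimal demonstration:
import importlib.util


def package_available(name: str) -> bool:
    return importlib.util.find_spec(name) is not None


assert package_available("math")
assert not package_available("surely_not_an_installed_package")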
diff --git a/src/axolotl/monkeypatch/llama_attn_hijack_xformers.py b/src/axolotl/monkeypatch/llama_attn_hijack_xformers.py
index 28223eee3..332242e2c 100644
--- a/src/axolotl/monkeypatch/llama_attn_hijack_xformers.py
+++ b/src/axolotl/monkeypatch/llama_attn_hijack_xformers.py
@@ -32,10 +32,9 @@ def xformers_forward(
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: bool = False,
use_cache: bool = False,
- padding_mask: Optional[torch.LongTensor] = None, # pylint: disable=unused-argument
- **kwargs, # pylint: disable=unused-argument
+ padding_mask: Optional[torch.LongTensor] = None,
+ **kwargs,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
- # pylint: disable=duplicate-code
bsz, q_len, _ = hidden_states.size()
if not hasattr(self, "pretraining_tp"):
@@ -102,7 +101,8 @@ def xformers_forward(
if output_attentions:
warnings.warn(
- "Output attentions is not supported for patched `LlamaAttention`, returning `None` instead."
+ "Output attentions is not supported for patched `LlamaAttention`, returning `None` instead.",
+ stacklevel=2,
)
#
diff --git a/src/axolotl/monkeypatch/llama_expand_mask.py b/src/axolotl/monkeypatch/llama_expand_mask.py
index 0277c212a..5cfb7818e 100644
--- a/src/axolotl/monkeypatch/llama_expand_mask.py
+++ b/src/axolotl/monkeypatch/llama_expand_mask.py
@@ -21,6 +21,4 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int]
def hijack_expand_mask():
import transformers
- transformers.models.llama.modeling_llama._expand_mask = ( # pylint: disable=protected-access
- _expand_mask
- )
+ transformers.models.llama.modeling_llama._expand_mask = _expand_mask
diff --git a/src/axolotl/monkeypatch/llama_patch_multipack.py b/src/axolotl/monkeypatch/llama_patch_multipack.py
index cfd525367..8d234881f 100644
--- a/src/axolotl/monkeypatch/llama_patch_multipack.py
+++ b/src/axolotl/monkeypatch/llama_patch_multipack.py
@@ -12,15 +12,15 @@ def hijack_llama_prepare_4d_mask():
from transformers import modeling_attn_mask_utils
from transformers.models.llama import modeling_llama
- modeling_llama._prepare_4d_causal_attention_mask_for_sdpa = ( # pylint: disable=protected-access
+ modeling_llama._prepare_4d_causal_attention_mask_for_sdpa = (
patched_prepare_4d_causal_attention_mask_for_sdpa
)
- modeling_attn_mask_utils._prepare_4d_causal_attention_mask_for_sdpa = ( # pylint: disable=protected-access
+ modeling_attn_mask_utils._prepare_4d_causal_attention_mask_for_sdpa = (
patched_prepare_4d_causal_attention_mask_for_sdpa
)
- modeling_llama._prepare_4d_causal_attention_mask = ( # pylint: disable=protected-access
+ modeling_llama._prepare_4d_causal_attention_mask = (
patched_prepare_4d_causal_attention_mask
)
- modeling_attn_mask_utils._prepare_4d_causal_attention_mask = ( # pylint: disable=protected-access
+ modeling_attn_mask_utils._prepare_4d_causal_attention_mask = (
patched_prepare_4d_causal_attention_mask
)
diff --git a/src/axolotl/monkeypatch/lora_kernels.py b/src/axolotl/monkeypatch/lora_kernels.py
index be1e1f2ff..8e335fe4c 100644
--- a/src/axolotl/monkeypatch/lora_kernels.py
+++ b/src/axolotl/monkeypatch/lora_kernels.py
@@ -30,48 +30,36 @@ QKV_PATCHES = [
query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
-""".lstrip(
- "\n"
- ),
+""".lstrip("\n"),
"""
query_states, key_states, value_states = self.apply_qkv(hidden_states)
query_states = query_states.view(hidden_shape).transpose(1, 2)
key_states = key_states.view(hidden_shape).transpose(1, 2)
value_states = value_states.view(hidden_shape).transpose(1, 2)
-""".lstrip(
- "\n"
- ),
+""".lstrip("\n"),
),
(
"""
query_states = self.q_norm(self.q_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
key_states = self.k_norm(self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
-""".lstrip(
- "\n"
- ),
+""".lstrip("\n"),
"""
query_states, key_states, value_states = self.apply_qkv(hidden_states)
query_states = self.q_norm(query_states.view(hidden_shape)).transpose(1, 2)
key_states = self.k_norm(key_states.view(hidden_shape)).transpose(1, 2)
value_states = value_states.view(hidden_shape).transpose(1, 2)
-""".lstrip(
- "\n"
- ),
+""".lstrip("\n"),
),
]
ORIGINAL_O_CODE = """
attn_output = self.o_proj(attn_output)
-""".lstrip(
- "\n"
-)
+""".lstrip("\n")
PATCHED_O_CODE = """
attn_output = self.apply_o(attn_output)
-""".lstrip(
- "\n"
-)
+""".lstrip("\n")
SUPPORTED_ACTIVATIONS = ["silu", "gelu"]
APPLY_FN_MAPPING = {
@@ -146,6 +134,11 @@ def get_attention_cls_from_config(cfg: DictDefault) -> Type[nn.Module]:
return Qwen2Attention
+ if model_type == "qwen3_vl":
+ from transformers.models.qwen3_vl.modeling_qwen3_vl import Qwen3VLTextAttention
+
+ return Qwen3VLTextAttention
+
if model_type == "mllama":
from transformers.models.mllama.modeling_mllama import MllamaTextSelfAttention
@@ -161,6 +154,11 @@ def get_attention_cls_from_config(cfg: DictDefault) -> Type[nn.Module]:
return MistralAttention
+ if model_type == "gemma3_text":
+ from transformers.models.gemma3.modeling_gemma3 import Gemma3Attention
+
+ return Gemma3Attention
+
try:
# Dynamically import the module and attention class
module_path = f"transformers.models.{model_type}.modeling_{model_type}"
@@ -176,7 +174,6 @@ def get_attention_cls_from_config(cfg: DictDefault) -> Type[nn.Module]:
) from e
-# pylint: disable=protected-access
def patch_self_attn_lora(cfg: DictDefault):
"""
Given an `axolotl` config, this method patches the inferred attention class forward
@@ -203,9 +200,9 @@ def patch_self_attn_lora(cfg: DictDefault):
attention_cls._original_forward = self_attn_forward
self_attn_forward, _ = detab_code(self_attn_forward)
- assert any(
- qkv_options[0] in self_attn_forward for qkv_options in QKV_PATCHES
- ), "Original QKV code not found"
+ assert any(qkv_options[0] in self_attn_forward for qkv_options in QKV_PATCHES), (
+ "Original QKV code not found"
+ )
assert ORIGINAL_O_CODE in self_attn_forward, "Original O code not found"
for qkv_orig, qkv_patched in QKV_PATCHES:
@@ -231,16 +228,14 @@ def patch_self_attn_lora(cfg: DictDefault):
if item in self_attn_forward:
items_to_import.append(item)
- exec( # pylint: disable=exec-used # nosec B102
+ exec(
f"from {module_name} import ({', '.join(items_to_import)})",
globals(),
)
- exec(self_attn_forward, globals()) # pylint: disable=exec-used # nosec B102
+ exec(self_attn_forward, globals())
LOG.info(f"Patched attention class with LoRA optims: {attention_cls.__name__}")
- attention_cls.forward = (
- axolotl_attn_forward # pylint: disable=undefined-variable # noqa: F821
- )
+ attention_cls.forward = axolotl_attn_forward
def find_self_attn_in_layer(
@@ -277,9 +272,13 @@ def find_mlp_in_layer(
layer.feedforward.experts.gate_projs,
layer.feedforward.experts.up_projs,
layer.feedforward.experts.down_projs,
+ strict=False,
):
- yield gate_proj, up_proj, down_proj, FakeMLP(
- gate_proj, up_proj, down_proj
+ yield (
+ gate_proj,
+ up_proj,
+ down_proj,
+ FakeMLP(gate_proj, up_proj, down_proj),
)
@@ -337,9 +336,9 @@ def apply_lora_kernel_patches(
# Get active LoRA adapter config
if hasattr(model, "active_adapters"):
- assert (
- len(model.active_adapters) == 1
- ), "Axolotl currently does not support LoRA Triton kernels for multiple adapters"
+ assert len(model.active_adapters) == 1, (
+ "Axolotl currently does not support LoRA Triton kernels for multiple adapters"
+ )
active_adapter = model.active_adapters[0]
else:
active_adapter = model.active_adapter
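# Editor's note: an illustrative sketch of the dynamic-import fallback in
# get_attention_cls_from_config, assuming transformers is installed. The
# naming convention shown (capitalized model_type + "Attention") is an
# assumption for this example, not a guarantee for every architecture.
import importlib

model_type = "llama"
module = importlib.import_module(
    f"transformers.models.{model_type}.modeling_{model_type}"
)
attention_cls = getattr(module, f"{model_type.capitalize()}Attention")
print(attention_cls.__name__)  # LlamaAttention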
diff --git a/src/axolotl/monkeypatch/loss/chunked.py b/src/axolotl/monkeypatch/loss/chunked.py
index 0a9d0de82..26a52f898 100644
--- a/src/axolotl/monkeypatch/loss/chunked.py
+++ b/src/axolotl/monkeypatch/loss/chunked.py
@@ -25,7 +25,7 @@ class CEWithChunkedOutputLoss(torch.nn.Module):
self,
logits: torch.Tensor,
labels: torch.Tensor,
- normalize: bool = True, # pylint: disable=unused-argument
+ normalize: bool = True,
) -> torch.Tensor:
"""
Upcast logits to fp32 and compute cross entropy loss.
@@ -63,7 +63,7 @@ class CEWithChunkedOutputLoss(torch.nn.Module):
# compute one chunk at a time
total_loss = 0.0
- for logits_chunk, labels_chunk in zip(logits, labels):
+ for logits_chunk, labels_chunk in zip(logits, labels, strict=False):
total_loss += self.compute_cross_entropy(logits_chunk, labels_chunk)
if reduction == "sum":
@@ -88,9 +88,9 @@ def get_causal_lm_loss(num_output_chunks: int = 8, ignore_index: int = -100):
num_items_in_batch: int = None,
ignore_index: int = -100,
**kwargs,
- ): # pylint: disable=unused-argument
+ ):
reduction = "sum" if num_items_in_batch is not None else "mean"
- logit_chunks = [ # pylint: disable=unnecessary-comprehension
+ logit_chunks = [
chunk for chunk in source.chunk(loss_fn_ce.num_output_chunks, dim=1)
]
loss = loss_fn_ce(logit_chunks, target, reduction=reduction)
@@ -101,7 +101,7 @@ def get_causal_lm_loss(num_output_chunks: int = 8, ignore_index: int = -100):
def for_causal_lm_chunked_loss(
logits,
labels,
- vocab_size: int = None, # pylint: disable=unused-argument
+ vocab_size: int = None,
num_items_in_batch: Optional[int] = None,
ignore_index: int = -100,
shift_labels: Optional[torch.Tensor] = None,
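# Editor's note: a quick numerical check of the chunked-loss idea above.
# Summing per-chunk cross-entropy with reduction="sum" matches the unchunked
# loss; shapes here are toy values.
import torch
import torch.nn.functional as F

logits = torch.randn(1, 16, 10)
labels = torch.randint(0, 10, (1, 16))

full = F.cross_entropy(logits.view(-1, 10), labels.view(-1), reduction="sum")
chunked = sum(
    F.cross_entropy(lc.reshape(-1, 10), tc.reshape(-1), reduction="sum")
    for lc, tc in zip(logits.chunk(4, dim=1), labels.chunk(4, dim=1), strict=False)
)
torch.testing.assert_close(full, chunked)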
diff --git a/src/axolotl/monkeypatch/mistral_attn_hijack_flash.py b/src/axolotl/monkeypatch/mistral_attn_hijack_flash.py
index e1be424a3..0994da91c 100644
--- a/src/axolotl/monkeypatch/mistral_attn_hijack_flash.py
+++ b/src/axolotl/monkeypatch/mistral_attn_hijack_flash.py
@@ -1,7 +1,5 @@
"""Flash attention monkey patch for mistral model"""
-# pylint: disable=duplicate-code
-
from functools import partial
import transformers
diff --git a/src/axolotl/monkeypatch/mixtral/__init__.py b/src/axolotl/monkeypatch/mixtral/__init__.py
index 5b8054000..b353b12cf 100644
--- a/src/axolotl/monkeypatch/mixtral/__init__.py
+++ b/src/axolotl/monkeypatch/mixtral/__init__.py
@@ -31,14 +31,12 @@ def patch_mixtral_moe_forward_zero3() -> None:
topk_weight = topk_weight.to(hidden_states.dtype)
hidden_states = hidden_states.repeat_interleave(self.top_k, dim=0)
- y = torch.empty_like(hidden_states) # pylint: disable=invalid-name
+ y = torch.empty_like(hidden_states)
flat_topk_idx = topk_idx.view(-1)
for i in range(self.num_experts):
expert = self.experts[i]
y[flat_topk_idx == i] = expert(hidden_states[flat_topk_idx == i])
- y = ( # pylint: disable=invalid-name
- y.view(*topk_weight.shape, -1) * topk_weight.unsqueeze(-1)
- ).sum(dim=1)
+ y = (y.view(*topk_weight.shape, -1) * topk_weight.unsqueeze(-1)).sum(dim=1)
final_hidden_states = y.reshape(batch_size, sequence_length, hidden_dim)
return final_hidden_states, router_logits
diff --git a/src/axolotl/monkeypatch/models/apertus/__init__.py b/src/axolotl/monkeypatch/models/apertus/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/src/axolotl/monkeypatch/models/apertus/activation.py b/src/axolotl/monkeypatch/models/apertus/activation.py
new file mode 100644
index 000000000..d5470aceb
--- /dev/null
+++ b/src/axolotl/monkeypatch/models/apertus/activation.py
@@ -0,0 +1,52 @@
+"""Monkeypatch for Apertus to dtype mismatch in XIELU act"""
+
+from torch import Tensor
+
+
+def patch_apertus_xielu_activation():
+ try:
+ from transformers.activations import XIELUActivation
+ except ImportError as err:
+ raise ImportError(
+ "Cannot import XIELUActivation. "
+ "Please make sure to update your transformers version >= 4.56.1."
+ ) from err
+
+ from transformers.activations import logger
+
+ # Store the original method
+ old_fn = XIELUActivation._xielu_cuda
+
+ def _xielu_cuda_fixed(self, x: Tensor) -> Tensor:
+ """Firewall function to prevent torch.compile from seeing .item() calls"""
+ original_shape = x.shape
+ # CUDA kernel expects 3D tensors, reshape if needed
+ while x.dim() < 3:
+ x = x.unsqueeze(0)
+ if x.dim() > 3:
+ x = x.view(-1, 1, x.size(-1))
+ if original_shape != x.shape:
+ logger.warning_once(
+ "Warning: xIELU input tensor expects 3 dimensions but got (shape: %s). Reshaping to (shape: %s).",
+ original_shape,
+ x.shape,
+ )
+ result = self._xielu_cuda_obj.forward(
+ x,
+ self.alpha_p.to(x.dtype),
+ self.alpha_n.to(x.dtype),
+ # Temporary until xIELU CUDA fully implemented -> self.{beta,eps}.item()
+ self._beta_scalar,
+ self._eps_scalar,
+ self.with_vector_loads,
+ )
+ return result.view(original_shape)
+
+ # Apply the patch
+ XIELUActivation._xielu_cuda = _xielu_cuda_fixed
+
+ def unpatch():
+ """Restore the original method"""
+ XIELUActivation._xielu_cuda = old_fn
+
+ return unpatch
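# Editor's note: a tiny check of the reshape logic in _xielu_cuda_fixed.
# Inputs are lifted to exactly three dims for the CUDA kernel and the
# result is restored to the original shape afterwards.
import torch


def to_3d(x):
    original_shape = x.shape
    while x.dim() < 3:
        x = x.unsqueeze(0)
    if x.dim() > 3:
        x = x.view(-1, 1, x.size(-1))
    return x, original_shape


x4d = torch.randn(2, 3, 4, 5)
x3d, shape = to_3d(x4d)
assert x3d.shape == (24, 1, 5)
assert x3d.view(shape).shape == (2, 3, 4, 5)  # round-trips cleanly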
diff --git a/src/axolotl/monkeypatch/models/llama4/modeling.py b/src/axolotl/monkeypatch/models/llama4/modeling.py
index 4127793e7..0fc8f5699 100644
--- a/src/axolotl/monkeypatch/models/llama4/modeling.py
+++ b/src/axolotl/monkeypatch/models/llama4/modeling.py
@@ -95,18 +95,12 @@ def patch_llama4_linearized_modeling():
old_lamma_4_text_experts = modeling_llama4.Llama4TextExperts
modeling_llama4.Llama4TextExperts = Llama4TextExperts
- setattr(
- sys.modules["transformers.models.llama4"],
- "Llama4TextExperts",
- Llama4TextExperts,
- )
+ sys.modules["transformers.models.llama4"].Llama4TextExperts = Llama4TextExperts
def unpatch():
modeling_llama4.Llama4TextExperts = old_lamma_4_text_experts
- setattr(
- sys.modules["transformers.models.llama4"],
- "Llama4TextExperts",
- old_lamma_4_text_experts,
- )
+ sys.modules[
+ "transformers.models.llama4"
+ ].Llama4TextExperts = old_lamma_4_text_experts
return unpatch
diff --git a/src/axolotl/monkeypatch/models/mistral3/__init__.py b/src/axolotl/monkeypatch/models/mistral3/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/src/axolotl/monkeypatch/models/mistral3/mistral_common_tokenizer.py b/src/axolotl/monkeypatch/models/mistral3/mistral_common_tokenizer.py
new file mode 100644
index 000000000..9e7259a05
--- /dev/null
+++ b/src/axolotl/monkeypatch/models/mistral3/mistral_common_tokenizer.py
@@ -0,0 +1,85 @@
+"""
+Monkeypatch to fix inefficient tensor conversion in MistralCommonTokenizer.apply_chat_template
+"""
+
+import importlib
+import inspect
+
+from axolotl.monkeypatch.utils import detab_code
+from axolotl.utils.logging import get_logger
+
+LOG = get_logger(__name__)
+
+
+def apply_mistral_tokenizer_image_patch():
+ """Apply patch to MistralCommonTokenizer.apply_chat_template to fix image tensor conversion."""
+ from transformers.tokenization_mistral_common import MistralCommonTokenizer
+
+ # Get original source
+ original_source = inspect.getsource(MistralCommonTokenizer.apply_chat_template)
+ original_source, _ = detab_code(original_source)
+
+ # Define the replacement
+ original_tensor_conversion = (
+ " pixel_values = torch.tensor(images)"
+ )
+
+ patched_tensor_conversion = """ if isinstance(images, list) and len(images) > 0 and isinstance(images[0], np.ndarray):
+ pixel_values = torch.tensor(np.array(images))
+ else:
+ pixel_values = torch.tensor(images)"""
+
+ # Apply the replacement
+ if original_tensor_conversion in original_source:
+ patched_source = original_source.replace(
+ original_tensor_conversion, patched_tensor_conversion
+ )
+ patched_source = patched_source.replace(
+ "def apply_chat_template(",
+ "def patched_apply_chat_template(",
+ 1,
+ )
+
+ # Load necessary imports from the module
+ module_name = MistralCommonTokenizer.__module__
+ module = importlib.import_module(module_name)
+
+ # Detect what needs to be imported
+ items_to_import = []
+ for item in dir(module):
+ if item in patched_source and not item.startswith("_"):
+ items_to_import.append(item)
+
+ # Execute imports in global scope
+ if items_to_import:
+ exec( # nosec B102
+ f"from {module_name} import ({', '.join(items_to_import)})",
+ globals(),
+ )
+
+ # Also need standard imports that might be used
+ exec("import numpy as np", globals()) # nosec B102
+ exec("import torch", globals()) # nosec B102
+ exec("from typing import Union, Optional, List, Dict, Any, Callable", globals()) # nosec B102
+ exec("from pathlib import Path", globals()) # nosec B102
+
+ # Import other dependencies that might be needed
+ try:
+ exec("from transformers.utils import is_torch_available", globals()) # nosec B102
+ exec(
+ "from transformers.tokenization_utils_base import BatchEncoding, PaddingStrategy, TensorType",
+ globals(),
+ ) # nosec B102
+ exec("from transformers.utils import logging", globals()) # nosec B102
+ exec("logger = logging.get_logger(__name__)", globals()) # nosec B102
+ except ImportError as e:
+ LOG.warning(f"Could not import some dependencies: {e}")
+
+ # Execute the patched source
+ exec(patched_source, globals()) # nosec B102
+
+ # Replace the method
+ MistralCommonTokenizer.apply_chat_template = patched_apply_chat_template
+ LOG.info("Successfully applied MistralCommonTokenizer tensor conversion patch")
+ else:
+ LOG.warning("Could not find target code for MistralCommonTokenizer patching")
diff --git a/src/axolotl/monkeypatch/models/pixtral/__init__.py b/src/axolotl/monkeypatch/models/pixtral/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/src/axolotl/monkeypatch/models/pixtral/modeling_flash_attention_utils.py b/src/axolotl/monkeypatch/models/pixtral/modeling_flash_attention_utils.py
new file mode 100644
index 000000000..d2b482f19
--- /dev/null
+++ b/src/axolotl/monkeypatch/models/pixtral/modeling_flash_attention_utils.py
@@ -0,0 +1,42 @@
+"""Monkeypatch for FA utils to accept 1D position_ids from Pixtral's position_ids_in_meshgrid"""
+
+import torch
+
+
+def apply_patch_is_packed_sequence():
+ """Apply patch to FA utils to accept 1D position_ids from Pixtral's position_ids_in_meshgrid"""
+ from transformers import modeling_flash_attention_utils
+
+ def fixed_is_packed_sequence(position_ids, batch_size):
+ """
+        Check whether the position ids indicate packed sequences:
+        1. Position ids exist
+        2. Only flattened sequences are supported
+        3. Compile-friendly equivalent of `not (torch.diff(position_ids, dim=-1) >= 0).all()`, i.e. the ids contain multiple increasing sequences
+ """
+ if position_ids is None:
+ return False
+
+ if position_ids.ndim == 1:
+ position_ids = position_ids.unsqueeze(0) # [N] -> [1, N]
+
+ increasing_position_sequences = (
+ torch.arange(position_ids.shape[1], device=position_ids.device)
+ + position_ids.min()
+ )
+ return (
+ batch_size == 1
+ and (increasing_position_sequences - position_ids).abs().sum().bool().item()
+ )
+
+ # Store original method
+ old_fn = modeling_flash_attention_utils._is_packed_sequence
+
+ # Apply the patch
+ modeling_flash_attention_utils._is_packed_sequence = fixed_is_packed_sequence
+
+ def unpatch():
+ """Restore the original method"""
+ modeling_flash_attention_utils._is_packed_sequence = old_fn
+
+ return unpatch
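# Editor's note: behavioral sketch of fixed_is_packed_sequence. A single
# monotonically increasing sequence is not "packed"; position ids that
# restart mid-row are.
import torch


def is_packed(position_ids, batch_size):
    if position_ids is None:
        return False
    if position_ids.ndim == 1:
        position_ids = position_ids.unsqueeze(0)
    increasing = (
        torch.arange(position_ids.shape[1], device=position_ids.device)
        + position_ids.min()
    )
    return batch_size == 1 and (increasing - position_ids).abs().sum().bool().item()


assert not is_packed(torch.arange(6), batch_size=1)  # one plain sequence
assert is_packed(torch.tensor([0, 1, 2, 0, 1, 2]), batch_size=1)  # two packed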
diff --git a/src/axolotl/monkeypatch/models/qwen3_next/__init__.py b/src/axolotl/monkeypatch/models/qwen3_next/__init__.py
new file mode 100644
index 000000000..39bcd4115
--- /dev/null
+++ b/src/axolotl/monkeypatch/models/qwen3_next/__init__.py
@@ -0,0 +1 @@
+"""Qwen3_Next model monkeypatches."""
diff --git a/src/axolotl/monkeypatch/models/qwen3_next/modeling.py b/src/axolotl/monkeypatch/models/qwen3_next/modeling.py
new file mode 100644
index 000000000..d68992d0e
--- /dev/null
+++ b/src/axolotl/monkeypatch/models/qwen3_next/modeling.py
@@ -0,0 +1,317 @@
+"""Monkeypatch for Qwen3_Next model to pass position_ids to linear attention."""
+
+from typing import Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+
+from axolotl.utils.logging import get_logger
+
+LOG = get_logger(__name__)
+
+
+def get_cu_seqlens(position_ids):
+ """
+ Adapted from transformers.modeling_flash_attention_utils.prepare_fa_kwargs_from_position_ids.
+
+ https://github.com/huggingface/transformers/blob/0f1b128d3359a26bd18be99c26d7f04fb3cba914/src/transformers/modeling_flash_attention_utils.py#L316
+ """
+ tensor_kwargs = {"dtype": torch.int32, "device": position_ids.device}
+
+ position_ids = position_ids.view(-1)
+ indices_q = (position_ids == 0).nonzero().view(-1)
+
+ cu_seq_lens_q = torch.cat(
+ (
+ indices_q.to(**tensor_kwargs),
+ torch.tensor(position_ids.size(), **tensor_kwargs),
+ )
+ )
+
+ return cu_seq_lens_q
+
+
+def patch_qwen3_next_decoder_layer():
+ """Patch Qwen3NextDecoderLayer to pass position_ids to linear attention."""
+ try:
+ from transformers.models.qwen3_next.modeling_qwen3_next import (
+ Qwen3NextDecoderLayer,
+ )
+ except ImportError:
+ LOG.warning("Qwen3Next model not found, skipping patch")
+ return
+
+ # Store original forward method
+ original_decoder_forward = Qwen3NextDecoderLayer.forward
+
+ def patched_decoder_forward(
+ self,
+ hidden_states: torch.Tensor,
+ position_embeddings: Tuple[torch.Tensor, torch.Tensor],
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[Tuple[torch.Tensor]] = None,
+ cache_position: Optional[torch.LongTensor] = None,
+ **kwargs,
+ ) -> torch.FloatTensor:
+ residual = hidden_states
+
+ hidden_states = self.input_layernorm(hidden_states)
+
+ # Token Mixer
+ if self.layer_type == "linear_attention":
+ hidden_states = self.linear_attn(
+ hidden_states=hidden_states,
+ cache_params=past_key_values,
+ cache_position=cache_position,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ )
+ elif self.layer_type == "full_attention":
+ # Self Attention
+ hidden_states, _ = self.self_attn(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_values=past_key_values,
+ cache_position=cache_position,
+ position_embeddings=position_embeddings,
+ **kwargs,
+ )
+
+ hidden_states = residual + hidden_states
+
+ # Fully Connected
+ residual = hidden_states
+ hidden_states = self.post_attention_layernorm(hidden_states)
+ hidden_states = self.mlp(hidden_states)
+ # For the MoE layers, we need to unpack
+        if isinstance(hidden_states, tuple):
+ hidden_states, _ = hidden_states
+ hidden_states = residual + hidden_states
+
+ return hidden_states
+
+ # Apply the patches
+ Qwen3NextDecoderLayer.forward = patched_decoder_forward
+
+ def unpatch():
+ """Restore the original forward method"""
+ Qwen3NextDecoderLayer.forward = original_decoder_forward
+
+ return unpatch
+
+
+def patch_qwen3_next_gateddelta_layer():
+ """Patch Qwen3NextGatedDeltaNet to parse cu_seqlens and pass to chunk_gated_delta_rule"""
+ try:
+ from transformers.models.qwen3_next.modeling_qwen3_next import (
+ Qwen3NextDynamicCache,
+ Qwen3NextGatedDeltaNet,
+ apply_mask_to_padding_states,
+ )
+ except ImportError:
+ LOG.warning("Qwen3Next model not found, skipping patch")
+ return
+
+ # Store original forward method
+ original_gated_delta_net_forward = Qwen3NextGatedDeltaNet.forward
+
+ def patched_gated_delta_net_forward(
+ self,
+ hidden_states: torch.Tensor,
+ cache_params: Optional[Qwen3NextDynamicCache] = None,
+ cache_position: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ ):
+ hidden_states = apply_mask_to_padding_states(hidden_states, attention_mask)
+
+ # Set up dimensions for reshapes later
+ batch_size, seq_len, _ = hidden_states.shape
+
+ use_precomputed_states = (
+ cache_params is not None
+ and cache_params.has_previous_state
+ and seq_len == 1
+ and cache_position is not None
+ )
+
+ # getting projected states from cache if it exists
+ if cache_params is not None:
+ conv_state = cache_params.conv_states[self.layer_idx]
+ recurrent_state = cache_params.recurrent_states[self.layer_idx]
+
+ projected_states_qkvz = self.in_proj_qkvz(hidden_states)
+ projected_states_ba = self.in_proj_ba(hidden_states)
+ query, key, value, z, b, a = self.fix_query_key_value_ordering(
+ projected_states_qkvz, projected_states_ba
+ )
+ query, key, value = (
+ x.reshape(x.shape[0], x.shape[1], -1) for x in (query, key, value)
+ )
+
+ mixed_qkv = torch.cat((query, key, value), dim=-1)
+ mixed_qkv = mixed_qkv.transpose(1, 2)
+
+ if use_precomputed_states:
+ # 2. Convolution sequence transformation
+ # NOTE: the conv state is updated in `causal_conv1d_update`
+ mixed_qkv = self.causal_conv1d_update(
+ mixed_qkv,
+ conv_state,
+ self.conv1d.weight.squeeze(1),
+ self.conv1d.bias,
+ self.activation,
+ )
+ else:
+ if cache_params is not None:
+ conv_state = F.pad(
+ mixed_qkv, (self.conv_kernel_size - mixed_qkv.shape[-1], 0)
+ )
+ cache_params.conv_states[self.layer_idx] = conv_state
+ if self.causal_conv1d_fn is not None:
+ mixed_qkv = self.causal_conv1d_fn(
+ x=mixed_qkv,
+ weight=self.conv1d.weight.squeeze(1),
+ bias=self.conv1d.bias,
+ activation=self.activation,
+ seq_idx=None,
+ )
+ else:
+ mixed_qkv = F.silu(self.conv1d(mixed_qkv)[:, :, :seq_len])
+
+ mixed_qkv = mixed_qkv.transpose(1, 2)
+ query, key, value = torch.split(
+ mixed_qkv,
+ [
+ self.key_dim,
+ self.key_dim,
+ self.value_dim,
+ ],
+ dim=-1,
+ )
+ query = query.reshape(query.shape[0], query.shape[1], -1, self.head_k_dim)
+ key = key.reshape(key.shape[0], key.shape[1], -1, self.head_k_dim)
+ value = value.reshape(value.shape[0], value.shape[1], -1, self.head_v_dim)
+
+ beta = b.sigmoid()
+ # If the model is loaded in fp16, without the .float() here, A might be -inf
+ g = -self.A_log.float().exp() * F.softplus(a.float() + self.dt_bias)
+ if self.num_v_heads // self.num_k_heads > 1:
+ query = query.repeat_interleave(self.num_v_heads // self.num_k_heads, dim=2)
+ key = key.repeat_interleave(self.num_v_heads // self.num_k_heads, dim=2)
+
+ if not use_precomputed_states:
+ cu_seqlens = get_cu_seqlens(position_ids=position_ids)
+ core_attn_out, last_recurrent_state = self.chunk_gated_delta_rule(
+ query,
+ key,
+ value,
+ g=g,
+ beta=beta,
+ initial_state=None,
+ output_final_state=cache_params is not None,
+ use_qk_l2norm_in_kernel=True,
+ cu_seqlens=cu_seqlens,
+ )
+
+ else:
+ core_attn_out, last_recurrent_state = self.recurrent_gated_delta_rule(
+ query,
+ key,
+ value,
+ g=g,
+ beta=beta,
+ initial_state=recurrent_state,
+ output_final_state=cache_params is not None,
+ use_qk_l2norm_in_kernel=True,
+ )
+
+ # Update cache
+ if cache_params is not None:
+ cache_params.recurrent_states[self.layer_idx] = last_recurrent_state
+
+ z_shape_og = z.shape
+ # reshape input data into 2D tensor
+ core_attn_out = core_attn_out.reshape(-1, core_attn_out.shape[-1])
+ z = z.reshape(-1, z.shape[-1])
+ core_attn_out = self.norm(core_attn_out, z)
+ core_attn_out = core_attn_out.reshape(z_shape_og)
+ core_attn_out = core_attn_out.reshape(
+ core_attn_out.shape[0], core_attn_out.shape[1], -1
+ )
+
+ output = self.out_proj(core_attn_out)
+ return output
+
+ # Apply the patches
+ Qwen3NextGatedDeltaNet.forward = patched_gated_delta_net_forward
+
+ def unpatch():
+ """Restore the original forward method"""
+ Qwen3NextGatedDeltaNet.forward = original_gated_delta_net_forward
+
+ return unpatch
+
+
+def patch_qwen3_next_imports():
+ """Patch Qwen3Next imports to use try/except instead of is_flash_linear_attention_available."""
+ try:
+ import transformers.models.qwen3_next.modeling_qwen3_next as qwen3_modeling
+ except ImportError:
+ LOG.warning("Qwen3Next model not found, skipping import patch")
+ return
+
+ # Save original values for unpatch
+ original_FusedRMSNormGated = getattr(qwen3_modeling, "FusedRMSNormGated", None)
+ original_chunk_gated_delta_rule = getattr(
+ qwen3_modeling, "chunk_gated_delta_rule", None
+ )
+ original_fused_recurrent_gated_delta_rule = getattr(
+ qwen3_modeling, "fused_recurrent_gated_delta_rule", None
+ )
+ original_is_fast_path_available = getattr(
+ qwen3_modeling, "is_fast_path_available", False
+ )
+
+ try:
+ from fla.modules import FusedRMSNormGated
+ from fla.ops.gated_delta_rule import (
+ chunk_gated_delta_rule,
+ fused_recurrent_gated_delta_rule,
+ )
+
+ qwen3_modeling.FusedRMSNormGated = FusedRMSNormGated
+ qwen3_modeling.chunk_gated_delta_rule = chunk_gated_delta_rule
+ qwen3_modeling.fused_recurrent_gated_delta_rule = (
+ fused_recurrent_gated_delta_rule
+ )
+
+ # Force is_fast_path_available to be True
+ # fla has triton kernels for causal_conv1d
+ qwen3_modeling.is_fast_path_available = True
+ except ImportError:
+ qwen3_modeling.chunk_gated_delta_rule = None
+ qwen3_modeling.fused_recurrent_gated_delta_rule = None
+ qwen3_modeling.FusedRMSNormGated = None
+
+ def unpatch():
+ """Restore the original import values"""
+ qwen3_modeling.FusedRMSNormGated = original_FusedRMSNormGated
+ qwen3_modeling.chunk_gated_delta_rule = original_chunk_gated_delta_rule
+ qwen3_modeling.fused_recurrent_gated_delta_rule = (
+ original_fused_recurrent_gated_delta_rule
+ )
+ qwen3_modeling.is_fast_path_available = original_is_fast_path_available
+
+ return unpatch
+
+
+def patch_qwen3_next_modeling_packing():
+ """Apply all Qwen3Next model patches."""
+ patch_qwen3_next_imports()
+ patch_qwen3_next_decoder_layer()
+ patch_qwen3_next_gateddelta_layer()
+
+ LOG.info("Applied Qwen3Next patch for packing")
diff --git a/src/axolotl/monkeypatch/multipack.py b/src/axolotl/monkeypatch/multipack.py
index 791f551bc..9e5c4b324 100644
--- a/src/axolotl/monkeypatch/multipack.py
+++ b/src/axolotl/monkeypatch/multipack.py
@@ -11,6 +11,7 @@ from axolotl.monkeypatch.mixtral import patch_mixtral_moe_forward_zero3
from axolotl.monkeypatch.utils import get_unpad_data
SUPPORTED_MULTIPACK_MODEL_TYPES = [
+ "apertus",
"mllama_text_model",
"llama",
"llama4",
@@ -20,6 +21,7 @@ SUPPORTED_MULTIPACK_MODEL_TYPES = [
"qwen2_moe",
"qwen3",
"qwen3_moe",
+ "qwen3_next",
"falcon",
"phi",
"phi3",
@@ -37,8 +39,17 @@ SUPPORTED_MULTIPACK_MODEL_TYPES = [
"glm4",
"glm4_moe",
"smollm3",
+ "granite",
+ "granitemoe",
+ "granitemoeshared",
+ "granitemoehybrid",
+ "hunyuan_v1_dense",
+ "hunyuan_v1_moe",
"gpt_oss",
"arcee",
+ "seed_oss",
+ "lfm2",
+ "lfm2_moe",
]
@@ -50,9 +61,7 @@ def patch_for_multipack(model_type, model_name=None, has_remote_code=False):
assert hasattr(
transformers.modeling_flash_attention_utils, "_get_unpad_data"
), "transformers api changed for _get_unpad_data for flash attention"
- transformers.modeling_flash_attention_utils._get_unpad_data = ( # pylint: disable=protected-access
- get_unpad_data
- )
+ transformers.modeling_flash_attention_utils._get_unpad_data = get_unpad_data
if model_type == "mixtral" and is_deepspeed_zero3_enabled():
patch_mixtral_moe_forward_zero3()
@@ -68,6 +77,4 @@ def patch_remote(model_name):
module_name = ".".join(parts)
modeling_arch = importlib.import_module(module_name)
if hasattr(modeling_arch, "_get_unpad_data"):
- modeling_arch._get_unpad_data = ( # pylint: disable=protected-access
- get_unpad_data
- )
+ modeling_arch._get_unpad_data = get_unpad_data
diff --git a/src/axolotl/monkeypatch/peft/utils.py b/src/axolotl/monkeypatch/peft/utils.py
index 0c571fbd2..d1011f5eb 100644
--- a/src/axolotl/monkeypatch/peft/utils.py
+++ b/src/axolotl/monkeypatch/peft/utils.py
@@ -49,9 +49,7 @@ def patch_peft_prep_code():
prep_code = get_peft_prep_code()
except OSError:
return
- peft.utils.other._original_create_accelerator_and_postprocess = ( # pylint: disable=protected-access
- prep_code
- )
+ peft.utils.other._original_create_accelerator_and_postprocess = prep_code
prep_code, _ = detab_code(prep_code)
if ORIGINAL_PREPARE_CODE not in prep_code:
return
@@ -68,11 +66,15 @@ def patch_peft_prep_code():
if item in prep_code:
items_to_import.append(item)
- exec( # pylint: disable=exec-used # nosec B102
+ exec(
"from peft.utils.other import (" + ", ".join(x for x in items_to_import) + ")",
globals(),
)
- exec(prep_code, globals()) # pylint: disable=exec-used # nosec B102
+ exec(prep_code, globals())
LOG.info("patching prepare_model_for_kbit_training to allow for overrides")
- peft.utils.other.prepare_model_for_kbit_training = fixed_prepare_model_for_kbit_training # pylint: disable=protected-access # pylint: disable=undefined-variable # noqa: F821
- axolotl.loaders.model.prepare_model_for_kbit_training = fixed_prepare_model_for_kbit_training # pylint: disable=protected-access # pylint: disable=undefined-variable # noqa: F821
+ peft.utils.other.prepare_model_for_kbit_training = (
+ fixed_prepare_model_for_kbit_training
+ )
+ axolotl.loaders.model.prepare_model_for_kbit_training = (
+ fixed_prepare_model_for_kbit_training
+ )
diff --git a/src/axolotl/monkeypatch/relora.py b/src/axolotl/monkeypatch/relora.py
index 0028a0cf6..a01d850b3 100644
--- a/src/axolotl/monkeypatch/relora.py
+++ b/src/axolotl/monkeypatch/relora.py
@@ -91,9 +91,9 @@ class ReLoRACallback(TrainerCallback):
if not os.path.exists(self.last_full_model):
self.last_full_model = str(Path(snapshot_download(cfg.base_model)))
- assert os.path.exists(
- self.last_full_model
- ), "for ReLORA base_model must be a local path"
+ assert os.path.exists(self.last_full_model), (
+ "for ReLORA base_model must be a local path"
+ )
self.num_lora_restarts = 0
self.need_full_save = False
@@ -293,7 +293,6 @@ def find_lora_modules(model: peft.LoraModel) -> Dict[str, peft.tuners.lora.LoraL
key_list = [key for key, _ in model.model.named_modules() if "lora" not in key]
for key in key_list:
try:
- # pylint: disable=protected-access
_parent, target, _target_name = peft.utils._get_submodules(model.model, key)
except AttributeError:
continue
@@ -341,7 +340,7 @@ def merge_and_save(
modules = find_lora_modules(model)
if not quantized:
- for module_name, target in modules.items():
+ for _, target in modules.items():
active_adapter = target.active_adapter
if isinstance(active_adapter, list):
active_adapter = active_adapter[0]
diff --git a/src/axolotl/monkeypatch/ring_attn/__init__.py b/src/axolotl/monkeypatch/ring_attn/__init__.py
index 736378b16..1c14776c9 100644
--- a/src/axolotl/monkeypatch/ring_attn/__init__.py
+++ b/src/axolotl/monkeypatch/ring_attn/__init__.py
@@ -1,6 +1,5 @@
"""Init for ring attention monkeypatch module"""
-# pylint: disable=unused-import
# flake8: noqa
from .patch import (
diff --git a/src/axolotl/monkeypatch/ring_attn/adapters/batch.py b/src/axolotl/monkeypatch/ring_attn/adapters/batch.py
index ebed9ebdc..74d33ed4a 100644
--- a/src/axolotl/monkeypatch/ring_attn/adapters/batch.py
+++ b/src/axolotl/monkeypatch/ring_attn/adapters/batch.py
@@ -7,8 +7,6 @@ Our implementation closely follows the structure of that module, but we've minif
somewhat to support only the latest versions of transformers.
"""
-# pylint: disable=protected-access,cyclic-import
-
import os
from typing import Callable
@@ -23,9 +21,12 @@ from transformers.modeling_flash_attention_utils import is_flash_attn_greater_or
try:
from transformers.modeling_flash_attention_utils import _flash_supports_window
except ImportError:
- from transformers.modeling_flash_attention_utils import (
- _flash_supports_window_size as _flash_supports_window,
- )
+ try:
+ from transformers.modeling_flash_attention_utils import (
+ _flash_supports_window_size as _flash_supports_window,
+ )
+ except ImportError:
+ _flash_supports_window = True
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
@@ -56,7 +57,7 @@ def create_flash_attn_forward_varlen_llama3(
"""
# transformers 4.48+
- # pylint: disable=unused-argument
+
def _flash_attention_forward(
query_states: torch.Tensor,
key_states: torch.Tensor,
diff --git a/src/axolotl/monkeypatch/ring_attn/patch.py b/src/axolotl/monkeypatch/ring_attn/patch.py
index 934687a16..e1fd10b3a 100644
--- a/src/axolotl/monkeypatch/ring_attn/patch.py
+++ b/src/axolotl/monkeypatch/ring_attn/patch.py
@@ -18,9 +18,12 @@ from torch.distributed import DeviceMesh
try:
from transformers.modeling_flash_attention_utils import _flash_supports_window
except ImportError:
- from transformers.modeling_flash_attention_utils import (
- _flash_supports_window_size as _flash_supports_window,
- )
+ try:
+ from transformers.modeling_flash_attention_utils import (
+ _flash_supports_window_size as _flash_supports_window,
+ )
+ except ImportError:
+ _flash_supports_window = True
from axolotl.monkeypatch.utils import get_cu_seqlens_from_pos_ids
from axolotl.utils.logging import get_logger
@@ -40,7 +43,7 @@ def get_ring_attn_group() -> dist.ProcessGroup:
def set_ring_attn_group(ring_attn_group: dist.ProcessGroup | None):
"""Setter for ring attention group on this rank."""
- global RING_ATTN_GROUP # pylint: disable=global-statement
+ global RING_ATTN_GROUP
RING_ATTN_GROUP = ring_attn_group
@@ -54,29 +57,24 @@ def create_ring_flash_attention_forward(
query_states: torch.Tensor,
key_states: torch.Tensor,
value_states: torch.Tensor,
- attention_mask: torch.Tensor, # pylint: disable=unused-argument
+ attention_mask: torch.Tensor,
query_length: int,
is_causal: bool,
dropout: float = 0.0,
- position_ids: Optional[torch.Tensor] = None, # pylint: disable=unused-argument
+ position_ids: Optional[torch.Tensor] = None,
softmax_scale: Optional[float] = None,
sliding_window: Optional[int] = None,
use_top_left_mask: bool = False,
softcap: Optional[float] = None,
deterministic: bool = None,
- cu_seq_lens_q: Optional[
- torch.LongTensor
- ] = None, # pylint: disable=unused-argument
- cu_seq_lens_k: Optional[
- torch.LongTensor
- ] = None, # pylint: disable=unused-argument
- max_length_q: Optional[int] = None, # pylint: disable=unused-argument
- max_length_k: Optional[int] = None, # pylint: disable=unused-argument
- target_dtype: Optional[torch.dtype] = None, # pylint: disable=unused-argument
- attn_implementation: Optional[str] = None, # pylint: disable=unused-argument
- **kwargs, # pylint: disable=unused-argument
+ cu_seq_lens_q: Optional[torch.LongTensor] = None,
+ cu_seq_lens_k: Optional[torch.LongTensor] = None,
+ max_length_q: Optional[int] = None,
+ max_length_k: Optional[int] = None,
+ target_dtype: Optional[torch.dtype] = None,
+ attn_implementation: Optional[str] = None,
+ **kwargs,
):
- # pylint: disable=duplicate-code
if not use_top_left_mask:
causal = is_causal
else:
@@ -98,9 +96,9 @@ def create_ring_flash_attention_forward(
if deterministic is None:
deterministic = os.environ.get("FLASH_ATTENTION_DETERMINISTIC", "0") == "1"
flash_kwargs["deterministic"] = deterministic
- assert (
- softcap is None
- ), "llama3_flash_attn_varlen_func does not support softcap yet."
+ assert softcap is None, (
+ "llama3_flash_attn_varlen_func does not support softcap yet."
+ )
# flash_kwargs["softcap"] = softcap
flash_kwargs["group"] = process_group
@@ -190,7 +188,7 @@ def register_ring_attn_from_device_mesh(
# fmt: off
import ring_flash_attn.adapters.hf_adapter
- from ring_flash_attn.adapters.hf_adapter import ( # isort: skip # pylint: disable=unused-import
+ from ring_flash_attn.adapters.hf_adapter import ( # isort: skip
create_ring_flash_attention_forward as create_ring_flash_attention_forward_orig,
)
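# Editor's note: the nested try/except imports above implement a
# version-tolerant lookup: prefer the current name, fall back to the older
# one, and finally default to a permissive constant. Generic runnable
# analogue (the first name is deliberately missing so the fallback fires):
try:
    from math import not_a_real_name as circle_constant  # newest API name
except ImportError:
    try:
        from math import tau as circle_constant  # older fallback
    except ImportError:
        circle_constant = 6.283185307179586  # last-resort default
assert circle_constant == 6.283185307179586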
diff --git a/src/axolotl/monkeypatch/stablelm_attn_hijack_flash.py b/src/axolotl/monkeypatch/stablelm_attn_hijack_flash.py
index 85454fe2e..0fa6d6424 100644
--- a/src/axolotl/monkeypatch/stablelm_attn_hijack_flash.py
+++ b/src/axolotl/monkeypatch/stablelm_attn_hijack_flash.py
@@ -16,8 +16,8 @@
# This code is based off the following work:
# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py
# https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py
-# pylint: disable=duplicate-code
"""PyTorch StableLM Epoch model."""
+
import importlib
import math
from typing import Optional, Tuple, Union
@@ -26,7 +26,7 @@ import torch
import torch.utils.checkpoint
from accelerate import init_empty_weights
from einops import rearrange
-from flash_attn.flash_attn_interface import ( # pylint: disable=ungrouped-imports
+from flash_attn.flash_attn_interface import (
flash_attn_varlen_qkvpacked_func,
)
from torch import nn
@@ -49,27 +49,21 @@ def replace_stablelm_attn_with_flash_attn(model_name="stabilityai/stablelm-3b-4e
".configuration_stablelm_epoch", ".modeling_stablelm_epoch"
)
modeling_stablelm = importlib.import_module(module_name)
- modeling_stablelm.Attention.forward = ( # pylint: disable=protected-access
- flashattn_attn
- )
- modeling_stablelm.StableLMEpochModel.forward = ( # pylint: disable=protected-access
- stablelm_model_forward
- )
- modeling_stablelm.DecoderLayer.forward = ( # pylint: disable=protected-access
- decoder_layer_forward
- )
+ modeling_stablelm.Attention.forward = flashattn_attn
+ modeling_stablelm.StableLMEpochModel.forward = stablelm_model_forward
+ modeling_stablelm.DecoderLayer.forward = decoder_layer_forward
def rotate_half(x: torch.Tensor):
"""Rotates half the hidden dims of the input."""
- # pylint: disable=invalid-name
+
x1, x2 = torch.chunk(x, 2, dim=-1)
return torch.cat((-x2, x1), dim=-1)
def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
# The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
- # pylint: disable=invalid-name
+
cos = cos.squeeze(1).squeeze(0) # [seq_len, dim]
sin = sin.squeeze(1).squeeze(0) # [seq_len, dim]
cos = cos[position_ids].unsqueeze(1) # [batch_size, 1, seq_len, dim]
@@ -99,7 +93,7 @@ def flashattn_attn(
attention_mask: torch.FloatTensor,
position_ids: torch.LongTensor,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
- output_attentions: Optional[bool] = False, # pylint: disable=unused-argument
+ output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = False,
cu_seqlens: Optional[torch.Tensor] = None,
max_seqlen: Optional[torch.Tensor] = None,
@@ -216,7 +210,6 @@ def decoder_layer_forward(
) -> Union[
Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]
]:
- # pylint: disable=duplicate-code
residual = hidden_states
hidden_states = self.input_layernorm(hidden_states)
@@ -263,7 +256,6 @@ def stablelm_model_forward(
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPast]:
- # pylint: disable=duplicate-code
output_attentions = (
output_attentions
if output_attentions is not None
@@ -326,13 +318,11 @@ def stablelm_model_forward(
dtype=torch.bool,
device=inputs_embeds.device,
)
- attention_mask = (
- self._prepare_decoder_attention_mask( # pylint: disable=protected-access
- attention_mask,
- (batch_size, seq_length),
- inputs_embeds,
- past_key_values_length,
- )
+ attention_mask = self._prepare_decoder_attention_mask(
+ attention_mask,
+ (batch_size, seq_length),
+ inputs_embeds,
+ past_key_values_length,
)
hidden_states = inputs_embeds
diff --git a/src/axolotl/monkeypatch/tiled_mlp/base.py b/src/axolotl/monkeypatch/tiled_mlp/base.py
index 3b7326bdb..2c9dc8e4c 100644
--- a/src/axolotl/monkeypatch/tiled_mlp/base.py
+++ b/src/axolotl/monkeypatch/tiled_mlp/base.py
@@ -8,6 +8,94 @@ from typing import List
import torch
+class DeepSpeedTiledMLPMoE(torch.autograd.Function):
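+    """
+    TiledMLP variant for MoE-style blocks whose forward returns a tuple
+    (e.g. hidden states plus router logits): the first element is
+    re-concatenated along the sequence dim (1), the second along dim 0.
+    """
+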
+ @staticmethod
+ def forward(
+ ctx,
+ fn,
+ self,
+ x,
+ shards,
+ compute_params,
+ ) -> torch.Tensor:
+ ctx.fn = fn
+ ctx.self = self
+ ctx.shards = shards
+ ctx.compute_params = [p for p in compute_params if p.requires_grad]
+ ctx.save_for_backward(x)
+
+ x_shards = list(torch.chunk(x, chunks=shards, dim=1))
+ with torch.no_grad():
+ output_shards = [fn(self, x_shard) for x_shard in x_shards]
+
+ ctx.is_tuple_output = isinstance(output_shards[0], tuple)
+ if isinstance(output_shards[0], tuple):
+ tuple_dim_idx = [1, 0]
+ output_unsharded = tuple(
+ torch.cat(
+ [output_shard[i] for output_shard in output_shards],
+ dim=tuple_dim_idx[i],
+ )
+ for i in range(len(output_shards[0]))
+ )
+ else:
+ output_unsharded = torch.cat(output_shards, dim=1)
+
+ return output_unsharded
+
+ @staticmethod
+    def backward(ctx, *grads) -> tuple:
+ fn = ctx.fn
+ (x,) = ctx.saved_tensors
+ self = ctx.self
+ shards = ctx.shards
+ compute_params = ctx.compute_params
+ is_tuple_output = ctx.is_tuple_output
+
+ x_requires_grad = x.requires_grad
+ x = x.detach()
+ # detach() unsets `x.requires_grad`, so restore it
+ x.requires_grad_(x_requires_grad)
+
+ incoming_grad = grads[0]
+ x_grad = torch.zeros_like(x)
+ x_shards = list(torch.chunk(x, chunks=shards, dim=1))
+
+ shard_step = x_shards[0].numel()
+ for i, x_shard in enumerate(x_shards):
+ # Tell deepspeed not to add a new grad to its ipg bucket until the last shard is run
+ if compute_params is not None:
+ if i + 1 < shards:
+ for param in compute_params:
+ param.ds_grad_is_ready = False
+ else:
+ # last shard, can add the grad
+ for param in compute_params:
+ param.ds_grad_is_ready = True
+
+ x_shard.requires_grad_(x_requires_grad)
+
+ shard_offset = i * shard_step
+ x_shard.grad = (
+ x_grad.view(-1)
+ .narrow(0, shard_offset, x_shard.numel())
+ .view_as(x_shard)
+ )
+ incoming_grad_shard = (
+ incoming_grad.view(-1)
+ .narrow(0, shard_offset, x_shard.numel())
+ .view_as(x_shard)
+ )
+ with torch.enable_grad():
+ output = fn(self, x_shard)
+ if is_tuple_output:
+ torch.autograd.backward(output[0], incoming_grad_shard)
+ else:
+ torch.autograd.backward(output, incoming_grad_shard)
+
+ return (None, None, x_grad, None, None)
+
+
class TiledMLP(torch.autograd.Function):
"""
TiledMLP implementation using gradient hooks
@@ -31,7 +119,18 @@ class TiledMLP(torch.autograd.Function):
x_shards = list(torch.chunk(x, chunks=shards, dim=1))
with torch.no_grad():
output_shards = [fn(self, x_shard) for x_shard in x_shards]
- output_unsharded = torch.cat(output_shards, dim=1)
+ ctx.is_tuple_output = isinstance(output_shards[0], tuple)
+ if isinstance(output_shards[0], tuple):
+ tuple_dim_idx = [1, 0]
+ output_unsharded = tuple(
+ torch.cat(
+ [output_shard[i] for output_shard in output_shards],
+ dim=tuple_dim_idx[i],
+ )
+ for i in range(len(output_shards[0]))
+ )
+ else:
+ output_unsharded = torch.cat(output_shards, dim=1)
return output_unsharded
@@ -42,6 +141,7 @@ class TiledMLP(torch.autograd.Function):
self = ctx.self
shards = ctx.shards
compute_params = ctx.compute_params
+ is_tuple_output = ctx.is_tuple_output
x_requires_grad = x.requires_grad
x = x.detach()
@@ -76,7 +176,10 @@ class TiledMLP(torch.autograd.Function):
with torch.enable_grad():
output = fn(self, x_shard)
- torch.autograd.backward(output, incoming_grad_shard)
+ if is_tuple_output:
+ torch.autograd.backward(output[0], incoming_grad_shard)
+ else:
+ torch.autograd.backward(output, incoming_grad_shard)
# Clean up hooks
grad_accumulator.cleanup()
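# Editor's note: a sanity check for the sequence-dim tiling used by TiledMLP.
# Because the MLP acts position-wise, running it on chunks and concatenating
# reproduces the untiled output.
import torch

mlp = torch.nn.Sequential(
    torch.nn.Linear(8, 32), torch.nn.GELU(), torch.nn.Linear(32, 8)
)
x = torch.randn(2, 12, 8)
tiled = torch.cat([mlp(shard) for shard in torch.chunk(x, chunks=3, dim=1)], dim=1)
torch.testing.assert_close(tiled, mlp(x))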
diff --git a/src/axolotl/monkeypatch/tiled_mlp/patch.py b/src/axolotl/monkeypatch/tiled_mlp/patch.py
index 419c73104..c0f89236b 100644
--- a/src/axolotl/monkeypatch/tiled_mlp/patch.py
+++ b/src/axolotl/monkeypatch/tiled_mlp/patch.py
@@ -17,7 +17,7 @@ def patch_tiled_mlp(model_type, use_original_mlp=True, cfg_num_shards=None):
TiledMLP as DeepSpeedTiledMLP,
)
- from axolotl.monkeypatch.tiled_mlp.base import TiledMLP
+ from axolotl.monkeypatch.tiled_mlp.base import DeepSpeedTiledMLPMoE, TiledMLP
try:
# Dynamically import the module and MLP class
@@ -40,7 +40,6 @@ def patch_tiled_mlp(model_type, use_original_mlp=True, cfg_num_shards=None):
is_distributed = int(os.environ.get("WORLD_SIZE", 1)) > 1
def tiled_mlp_forward(self, x):
- # pylint: disable=protected-access
input_shape = x.shape
seqlen = input_shape[-2]
hidden = input_shape[-1]
@@ -65,7 +64,10 @@ def patch_tiled_mlp(model_type, use_original_mlp=True, cfg_num_shards=None):
for p in self._compute_params
)
) or os.environ.get("ACCELERATE_USE_DEEPSPEED", "false") == "true":
- self._tiled_mlp_dist_impl = DeepSpeedTiledMLP
+ if model_type == "gpt_oss":
+ self._tiled_mlp_dist_impl = DeepSpeedTiledMLPMoE
+ else:
+ self._tiled_mlp_dist_impl = DeepSpeedTiledMLP
else:
self._tiled_mlp_dist_impl = TiledMLP
@@ -79,14 +81,13 @@ def patch_tiled_mlp(model_type, use_original_mlp=True, cfg_num_shards=None):
return down_res
mlp_cls.forward = tiled_mlp_forward
- mlp_cls._compute_params = [] # pylint: disable=protected-access
- mlp_cls._tiled_mlp_dist_impl = None # pylint: disable=protected-access
+ mlp_cls._compute_params = []
+ mlp_cls._tiled_mlp_dist_impl = None
LOG.info(
f"Successfully monkey-patched TiledMLP for model_type: {model_type}",
main_process_only=True,
)
except (ImportError, AttributeError) as e:
raise RuntimeError(
- f"Could not import MLP class for model_type: {model_type}. "
- f"Error: {str(e)}"
+ f"Could not import MLP class for model_type: {model_type}. Error: {str(e)}"
) from e
diff --git a/src/axolotl/monkeypatch/trainer/lr.py b/src/axolotl/monkeypatch/trainer/lr.py
index 9afc23c46..c33674cee 100644
--- a/src/axolotl/monkeypatch/trainer/lr.py
+++ b/src/axolotl/monkeypatch/trainer/lr.py
@@ -39,4 +39,4 @@ def _get_learning_rate(self):
def patch_trainer_get_lr():
from transformers.trainer import Trainer
- Trainer._get_learning_rate = _get_learning_rate # pylint: disable=protected-access
+ Trainer._get_learning_rate = _get_learning_rate
diff --git a/src/axolotl/monkeypatch/trainer_accelerator_args.py b/src/axolotl/monkeypatch/trainer_accelerator_args.py
index 819a66255..9fc6e38c6 100644
--- a/src/axolotl/monkeypatch/trainer_accelerator_args.py
+++ b/src/axolotl/monkeypatch/trainer_accelerator_args.py
@@ -47,9 +47,7 @@ def patch_create_accelerate_code_for_fp8(enable_fsdp_float8_all_gather: bool):
create_code = get_create_accelerate_code()
except OSError:
return
- Trainer._original_create_accelerator_and_postprocess = ( # pylint: disable=protected-access
- create_code
- )
+ Trainer._original_create_accelerator_and_postprocess = create_code
create_code, _ = detab_code(create_code)
if ORIGINAL_TRAINER_CODE not in create_code:
return
@@ -72,12 +70,14 @@ def patch_create_accelerate_code_for_fp8(enable_fsdp_float8_all_gather: bool):
if item in create_code:
items_to_import.append(item)
- exec( # pylint: disable=exec-used # nosec B102
+ exec(
"from transformers.trainer import ("
+ ", ".join(x for x in items_to_import)
+ ")",
globals(),
)
- exec(create_code, globals()) # pylint: disable=exec-used # nosec B102
+ exec(create_code, globals())
LOG.info("patching create_accelerator_and_postprocess to allow for overrides")
- Trainer.create_accelerator_and_postprocess = fixed_create_accelerator_and_postprocess # pylint: disable=protected-access # pylint: disable=undefined-variable # noqa: F821
+ Trainer.create_accelerator_and_postprocess = (
+ fixed_create_accelerator_and_postprocess
+ )
diff --git a/src/axolotl/monkeypatch/trainer_fsdp_optim.py b/src/axolotl/monkeypatch/trainer_fsdp_optim.py
index 1c2511524..692f754d7 100644
--- a/src/axolotl/monkeypatch/trainer_fsdp_optim.py
+++ b/src/axolotl/monkeypatch/trainer_fsdp_optim.py
@@ -23,9 +23,7 @@ PATCHED_TRAINER_CODE = """
def get_training_loop_code() -> str:
- training_loop = inspect.getsource(
- Trainer._inner_training_loop # pylint: disable=protected-access
- )
+ training_loop = inspect.getsource(Trainer._inner_training_loop)
return training_loop
@@ -44,9 +42,7 @@ def patch_training_loop_for_fsdp():
training_loop = get_training_loop_code()
except OSError:
return
- Trainer._original_inner_training_loop = ( # pylint: disable=protected-access
- training_loop
- )
+ Trainer._original_inner_training_loop = training_loop
training_loop, _ = detab_code(training_loop)
if ORIGINAL_TRAINER_CODE not in training_loop:
return
@@ -66,14 +62,12 @@ def patch_training_loop_for_fsdp():
if item in training_loop:
items_to_import.append(item)
- exec( # pylint: disable=exec-used # nosec B102
+ exec(
"from transformers.trainer import ("
+ ", ".join(x for x in items_to_import)
+ ")",
globals(),
)
- exec(training_loop, globals()) # pylint: disable=exec-used # nosec B102
+ exec(training_loop, globals())
LOG.info("patching _inner_training_loop for fsdp optimizer save")
- Trainer._inner_training_loop = ( # pylint: disable=protected-access
- _fixed_inner_training_loop # pylint: disable=undefined-variable # noqa: F821
- )
+ Trainer._inner_training_loop = _fixed_inner_training_loop
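
These trainer monkeypatches all follow the same source-rewrite pattern: fetch the method source with inspect.getsource, string-replace the target snippet, exec the renamed function, and rebind it on Trainer; the `if ORIGINAL_TRAINER_CODE not in ...: return` guard keeps the patch a no-op when transformers drifts. A toy, self-contained sketch of the pattern (names hypothetical; axolotl uses its own detab_code helper in place of textwrap.dedent):

    import inspect
    import textwrap

    class Greeter:
        def greet(self):
            return "hello"

    # 1. grab and dedent the original source
    source = textwrap.dedent(inspect.getsource(Greeter.greet))
    # 2. rewrite the body and rename it so the original symbol survives
    patched = source.replace('"hello"', '"hello, world"')
    patched = patched.replace("def greet(", "def patched_greet(", 1)
    # 3. exec the new definition and rebind it on the class
    exec(patched, globals())  # nosec B102 - sketch only
    Greeter.greet = patched_greet  # noqa: F821 - defined by the exec above
    assert Greeter().greet() == "hello, world"
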
diff --git a/src/axolotl/monkeypatch/transformers/trainer_context_parallel.py b/src/axolotl/monkeypatch/transformers/trainer_context_parallel.py
new file mode 100644
index 000000000..ba8b16dda
--- /dev/null
+++ b/src/axolotl/monkeypatch/transformers/trainer_context_parallel.py
@@ -0,0 +1,66 @@
+"""Monkey patch to allow context parallelism with FlashAttention in HF Trainer."""
+
+from __future__ import annotations
+
+import importlib
+import inspect
+
+from transformers import Trainer
+
+from axolotl.monkeypatch.utils import detab_code
+from axolotl.utils.logging import get_logger
+
+LOG = get_logger(__name__)
+
+GUARD_PATTERN = 'if model.config._attn_implementation != "sdpa":'
+PATCHED_GUARD = 'if (attn_impl := (getattr(model.config, "_attn_implementation", None) or getattr(model.model.config, "_attn_implementation", None))) and attn_impl not in ("sdpa", "flash_attention_2"):'
+
+
+def patch_prepare_context_parallel_inputs() -> None:
+ """Relax the SDPA-only guard when running context parallelism with FlashAttention."""
+ if getattr(Trainer, "_axolotl_prepare_context_parallel_inputs_patched", False):
+ LOG.debug("Trainer._prepare_context_parallel_inputs already patched")
+ return
+
+ try:
+ original_source = inspect.getsource(Trainer._prepare_context_parallel_inputs)
+ except OSError as exc: # pragma: no cover - occurs when source is unavailable
+ LOG.warning("Unable to patch Trainer._prepare_context_parallel_inputs: %s", exc)
+ return
+
+ if GUARD_PATTERN not in original_source:
+ LOG.warning(
+ "Expected guard not found in Trainer._prepare_context_parallel_inputs; \n"
+ "skipping FlashAttention context parallelism patch"
+ )
+ return
+
+ patched_source = original_source.replace(GUARD_PATTERN, PATCHED_GUARD)
+ patched_source, _ = detab_code(patched_source)
+ patched_source = patched_source.replace(
+ "def _prepare_context_parallel_inputs(",
+ "def axolotl_prepare_context_parallel_inputs(",
+ 1,
+ )
+
+ module_name = Trainer.__module__
+ module = importlib.import_module(module_name)
+
+ # import symbols referenced in the method so exec can succeed
+ items_to_import = []
+ for item in dir(module):
+ if item in patched_source:
+ items_to_import.append(item)
+
+ exec(f"from {module_name} import ({', '.join(items_to_import)})", globals())
+ exec(patched_source, globals())
+
+ Trainer._original_prepare_context_parallel_inputs = (
+ Trainer._prepare_context_parallel_inputs
+ )
+ Trainer._prepare_context_parallel_inputs = axolotl_prepare_context_parallel_inputs
+ Trainer._axolotl_prepare_context_parallel_inputs_source = patched_source
+ Trainer._axolotl_prepare_context_parallel_inputs_patched = True
+ LOG.debug(
+ "Patched Trainer._prepare_context_parallel_inputs for FlashAttention + CP"
+ )
diff --git a/src/axolotl/monkeypatch/transformers/trainer_loss_calc.py b/src/axolotl/monkeypatch/transformers/trainer_loss_calc.py
index 75f4158b3..b8172bbe6 100644
--- a/src/axolotl/monkeypatch/transformers/trainer_loss_calc.py
+++ b/src/axolotl/monkeypatch/transformers/trainer_loss_calc.py
@@ -28,15 +28,6 @@ PATCHED_EVAL_CODE = {
"array": 'metrics[f"{metric_key_prefix}_loss"] = np.nanmean(all_losses).item()',
}
-ORIGINAL_FSDP2_CODE = """
- model.eval()
-"""
-
-PATCHED_FSDP2_CODE = """
- if hasattr(model, "eval") and callable(model.eval):
- self.model.eval()
-"""
-
ORIGINAL_MAYBE_CODE = "tr_loss_scalar = self._nested_gather(tr_loss).mean().item()"
PATCHED_MAYBE_CODE = "tr_loss_scalar = self._nested_gather(tr_loss).nanmean().item()"
@@ -46,18 +37,11 @@ def check_evaluation_loop_is_patchable() -> bool:
return all(value in evaluation_loop_source for value in ORIGINAL_EVAL_CODE.values())
-def check_evaluation_loop_is_fsdp2_patchable() -> bool:
- evaluation_loop_source = inspect.getsource(Trainer.evaluation_loop)
- evaluation_loop_source, _ = detab_code(evaluation_loop_source)
- return ORIGINAL_FSDP2_CODE in evaluation_loop_source
-
-
-# pylint: disable=protected-access
-def patch_evaluation_loop(patch_fsdp2: bool):
+def patch_evaluation_loop():
"""Patch the evaluation_loop method."""
# Check if already patched
if hasattr(Trainer, "_original_evaluation_loop"):
- LOG.info("Trainer.evaluation_loop already patched")
+ LOG.debug("Trainer.evaluation_loop already patched")
return
# Check if the patterns exist
@@ -76,13 +60,6 @@ def patch_evaluation_loop(patch_fsdp2: bool):
ORIGINAL_EVAL_CODE["array"], PATCHED_EVAL_CODE["array"]
)
- # Apply FSDP2 eval guard patch if needed
- if patch_fsdp2 and ORIGINAL_FSDP2_CODE in evaluation_loop_source:
- evaluation_loop_source = evaluation_loop_source.replace(
- ORIGINAL_FSDP2_CODE, PATCHED_FSDP2_CODE
- )
- LOG.info("Applied FSDP2 eval guard patch to evaluation_loop")
-
# Rename the function to avoid conflicts
evaluation_loop_source = evaluation_loop_source.replace(
"def evaluation_loop(",
@@ -101,16 +78,14 @@ def patch_evaluation_loop(patch_fsdp2: bool):
items_to_import.append(item)
# Execute the imports and patched method
- exec( # pylint: disable=exec-used # nosec B102
+ exec(
f"from {module_name} import ({', '.join(items_to_import)})",
globals(),
)
- exec(evaluation_loop_source, globals()) # pylint: disable=exec-used # nosec B102
+ exec(evaluation_loop_source, globals())
- LOG.info("Patched Trainer.evaluation_loop with nanmean loss calculation")
- Trainer.evaluation_loop = (
- axolotl_evaluation_loop # pylint: disable=undefined-variable # noqa: F821
- )
+ LOG.debug("Patched Trainer.evaluation_loop with nanmean loss calculation")
+ Trainer.evaluation_loop = axolotl_evaluation_loop
def check_maybe_log_save_evaluate_is_patchable() -> bool:
@@ -118,7 +93,6 @@ def check_maybe_log_save_evaluate_is_patchable() -> bool:
return ORIGINAL_MAYBE_CODE in maybe_log_source
-# pylint: disable=protected-access
def patch_maybe_log_save_evaluate():
"""Patch the _maybe_log_save_evaluate method."""
# Check if already patched
@@ -155,11 +129,11 @@ def patch_maybe_log_save_evaluate():
items_to_import.append(item)
# Execute the imports and patched method
- exec( # pylint: disable=exec-used # nosec B102
+ exec(
f"from {module_name} import ({', '.join(items_to_import)})",
globals(),
)
- exec(maybe_log_source, globals()) # pylint: disable=exec-used # nosec B102
+ exec(maybe_log_source, globals())
- LOG.info("Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation")
- Trainer._maybe_log_save_evaluate = axolotl_maybe_log_save_evaluate # pylint: disable=undefined-variable # noqa: F821
+ LOG.debug("Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation")
+ Trainer._maybe_log_save_evaluate = axolotl_maybe_log_save_evaluate
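
For context on the mean -> nanmean swap: when any gathered loss entry is NaN (for example a rank that contributed no valid eval batches), a plain mean propagates the NaN into the logged metric, while nanmean averages only the finite entries:

    import torch

    tr_loss = torch.tensor([0.52, 0.49, float("nan"), 0.51])
    print(tr_loss.mean().item())     # nan - one bad entry hides all signal
    print(tr_loss.nanmean().item())  # ~0.5067 - NaN entries are ignored
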
diff --git a/src/axolotl/monkeypatch/unsloth_.py b/src/axolotl/monkeypatch/unsloth_.py
index 146047e95..59f32c6f5 100644
--- a/src/axolotl/monkeypatch/unsloth_.py
+++ b/src/axolotl/monkeypatch/unsloth_.py
@@ -17,27 +17,19 @@ ORIGINAL_QKV_CODE = """
query_states = self.q_proj(hidden_states)
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
-""".lstrip(
- "\n"
-)
+""".lstrip("\n")
PATCHED_QKV_CODE = """
query_states, key_states, value_states = self.apply_qkv(self, hidden_states)
-""".lstrip(
- "\n"
-)
+""".lstrip("\n")
ORIGINAL_O_CODE = """
attn_output = self.o_proj(attn_output)
-""".lstrip(
- "\n"
-)
+""".lstrip("\n")
PATCHED_O_CODE = """
attn_output = self.apply_o(self, attn_output)
-""".lstrip(
- "\n"
-)
+""".lstrip("\n")
def original_apply_qkv(self, hidden_states):
@@ -66,13 +58,13 @@ def check_self_attn_is_patchable() -> bool:
def integrate_cross_entropy_loss_patch(model_type: str = "llama") -> None:
from unsloth.kernels.cross_entropy_loss import fast_cross_entropy_loss
- def UnslothForCausalLMLoss( # pylint: disable=invalid-name
+ def UnslothForCausalLMLoss(
logits,
labels,
- vocab_size: int, # pylint: disable=unused-argument
+ vocab_size: int,
num_items_in_batch: int = None,
- ignore_index: int = -100, # pylint: disable=unused-argument
- **kwargs, # pylint: disable=unused-argument
+ ignore_index: int = -100,
+ **kwargs,
):
# Upcast to float if we need to compute the loss to avoid potential precision issues
logits = logits.float()
@@ -93,18 +85,16 @@ def integrate_cross_entropy_loss_patch(model_type: str = "llama") -> None:
raise ValueError("Unsupported model type")
-self_attn_lora_patched = False # pylint: disable=invalid-name
+self_attn_lora_patched = False
def patch_self_attn_lora():
- global self_attn_lora_patched # pylint: disable=global-statement
+ global self_attn_lora_patched
if self_attn_lora_patched:
# prevent patching multiple times
return
self_attn_forward = get_self_attn_code()
- LlamaFlashAttention2._original_forward = ( # pylint: disable=protected-access
- self_attn_forward
- )
+ LlamaFlashAttention2._original_forward = self_attn_forward
self_attn_forward, _ = detab_code(self_attn_forward)
assert ORIGINAL_QKV_CODE in self_attn_forward, "Original qkv code not found"
assert ORIGINAL_O_CODE in self_attn_forward, "Original o code not found"
@@ -125,27 +115,25 @@ def patch_self_attn_lora():
if item in self_attn_forward:
items_to_import.append(item)
- exec( # pylint: disable=exec-used # nosec B102
+ exec(
"from transformers.models.llama.modeling_llama import ("
+ ", ".join(x for x in items_to_import)
+ ")",
globals(),
)
- exec(self_attn_forward, globals()) # pylint: disable=exec-used # nosec B102
+ exec(self_attn_forward, globals())
self_attn_lora_patched = True
LOG.info("patching unsloth attn lora")
- LlamaFlashAttention2.forward = (
- unsloth_attn_forward # pylint: disable=undefined-variable # noqa: F821
- )
+ LlamaFlashAttention2.forward = unsloth_attn_forward
def integrate_rope_embeddings():
import transformers.models.llama.modeling_llama
from unsloth.kernels.rope_embedding import fast_rope_embedding
- def apply_rotary_pos_emb( # pylint: disable=unused-argument
- q, # pylint: disable=invalid-name
- k, # pylint: disable=invalid-name
+ def apply_rotary_pos_emb(
+ q,
+ k,
cos,
sin,
position_ids=None,
diff --git a/src/axolotl/monkeypatch/xformers_/__init__.py b/src/axolotl/monkeypatch/xformers_/__init__.py
index a052ea49e..6f5b43f77 100644
--- a/src/axolotl/monkeypatch/xformers_/__init__.py
+++ b/src/axolotl/monkeypatch/xformers_/__init__.py
@@ -36,7 +36,7 @@ class FusedMLP(torch.nn.Module):
self.swiglu.w3.weight.data = down_proj.weight.data
def _post_training(self, model, name):
- w1, w2 = torch.split( # pylint: disable=invalid-name
+ w1, w2 = torch.split(
self.swiglu.w12.weight.data, self.config.intermediate_size, dim=0
)
@@ -48,5 +48,5 @@ class FusedMLP(torch.nn.Module):
set_module_name(model, name, new_mlp)
- def forward(self, x: torch.Tensor) -> torch.Tensor: # pylint: disable=invalid-name
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
return self.swiglu(x)
diff --git a/src/axolotl/processing_strategies.py b/src/axolotl/processing_strategies.py
index 4cc5e85a1..07b114163 100644
--- a/src/axolotl/processing_strategies.py
+++ b/src/axolotl/processing_strategies.py
@@ -6,11 +6,14 @@ from typing import Optional
from PIL import Image, ImageOps
from PIL.Image import Resampling
from torch import Tensor, zeros_like
-from transformers import ProcessorMixin, VoxtralProcessor
+from transformers import ProcessorMixin
from transformers.image_utils import load_image
+from transformers.models.smolvlm import SmolVLMProcessor
+from transformers.models.voxtral import VoxtralProcessor
from axolotl.utils.dict import remove_none_values
from axolotl.utils.logging import get_logger
+from axolotl.utils.mistral.mistral3_processor import Mistral3Processor
LOG = get_logger(__name__)
@@ -138,7 +141,7 @@ class ProcessingStrategy:
image_key = key
break
- # if the image key exists, add the image to the first message
+ # if the image key exists, add the image to the first user message
if image_key is not None and processed_example[image_key] is not None:
# TODO: check if it's normal to be single image only for common datasets
# From observation, it's usually a list of single image but some datasets may have several columns for images
@@ -156,9 +159,9 @@ class ProcessingStrategy:
image_value = load_image(image_value)
if self.image_size is not None:
- assert hasattr(
- image_value, "resize"
- ), "Image does not have a resize method"
+ assert hasattr(image_value, "resize"), (
+ "Image does not have a resize method"
+ )
if isinstance(self.image_size, tuple):
image_value = image_value.resize(
@@ -179,26 +182,34 @@ class ProcessingStrategy:
# Look for any image type in the first message
# some dataset have an {type: "image"} in the first message
+ msg_ind_to_add = None
ind_to_add = None
+ first_user_idx = None
- for i, content in enumerate(
- processed_example["messages"][0]["content"]
- ):
- # Usually datasets created with image columns, don't have it in the messages itself
- if content["type"] == "image" and all(
- k not in content for k in ["image", "url", "path", "base64"]
+ for msg_idx, msg_content in enumerate(processed_example["messages"]):
+ if first_user_idx is None and msg_content["role"] == "user":
+ first_user_idx = msg_idx
+ for i, content in enumerate(
+ processed_example["messages"][msg_idx]["content"]
):
- ind_to_add = i
- break
+ # Usually datasets created with image columns, don't have it in the messages itself
+ if content["type"] == "image" and all(
+ k not in content for k in ["image", "url", "path", "base64"]
+ ):
+ msg_ind_to_add = msg_idx
+ ind_to_add = i
+ break
# If an image type is found, add the image to that index
- if ind_to_add is not None:
- processed_example["messages"][0]["content"][ind_to_add][
- "image"
- ] = image_value
+ if ind_to_add is not None and msg_ind_to_add is not None:
+ processed_example["messages"][msg_ind_to_add]["content"][
+ ind_to_add
+ ]["image"] = image_value
else:
- # if no image type is found, add it to end of the first message
- processed_example["messages"][0]["content"].append(
+ # if no image type is found, add it to end of the first user message
+ if first_user_idx is None:
+ first_user_idx = 0
+ processed_example["messages"][first_user_idx]["content"].append(
{
"type": "image",
"image": image_value,
@@ -395,6 +406,54 @@ class VoxtralProcessingStrategy(ProcessingStrategy):
return labels
+class SmolVLM2ProcessingStrategy(ProcessingStrategy):
+ """Processing Strategy class for SmolVLM2"""
+
+ def __init__(
+ self,
+ processor: ProcessorMixin,
+ chat_template: Optional[str] = None,
+ image_size: int | tuple[int, int] | None = None,
+ image_resize_algorithm: Resampling | None = None,
+ ):
+ super().__init__(processor, chat_template, image_size, image_resize_algorithm)
+ self.image_token = "
](https://github.com/axolotl-ai-cloud/axolotl)"""
transformers.modelcard.AUTOGENERATED_TRAINER_COMMENT += f"\n{badge_markdown}"
- if getattr(cfg, "axolotl_config_path"):
+ if cfg.axolotl_config_path:
raw_axolotl_cfg = Path(cfg.axolotl_config_path)
version = importlib.metadata.version("axolotl")
if raw_axolotl_cfg.is_file():
@@ -470,7 +473,9 @@ def handle_untrained_tokens_fix(
)
-def setup_model_and_trainer(cfg: DictDefault, dataset_meta: TrainDatasetMeta) -> tuple[
+def setup_model_and_trainer(
+ cfg: DictDefault, dataset_meta: TrainDatasetMeta
+) -> tuple[
"HFRLTrainerBuilder" | "HFCausalTrainerBuilder",
PeftModel | PreTrainedModel,
PreTrainedTokenizer,
@@ -520,6 +525,17 @@ def setup_model_and_trainer(cfg: DictDefault, dataset_meta: TrainDatasetMeta) ->
plugin_manager = PluginManager.get_instance()
plugin_manager.post_trainer_create(cfg, trainer)
+ if cfg.use_ray:
+ try:
+ import ray.train.huggingface.transformers
+
+ trainer = ray.train.huggingface.transformers.prepare_trainer(trainer)
+ except ImportError:
+ LOG.warning(
+ "The Ray integration with Hugging Face Transformers is not available. "
+ "To use Ray, install the 'ray[train]' package."
+ )
+
return (
trainer,
model,
@@ -564,7 +580,7 @@ def train(
setup_model_card(cfg)
# Execute the training
- resume_from_checkpoint = determine_resume_checkpoint(cfg)
+ resume_from_checkpoint = determine_last_checkpoint(cfg)
execute_training(cfg, trainer, resume_from_checkpoint)
# clear cache
@@ -573,6 +589,9 @@ def train(
# Save the trained model and cleanup
save_trained_model(cfg, trainer, model, safe_serialization)
+ tokenizer.save_pretrained(
+ str(Path(cfg.output_dir)), save_jinja_files=cfg.tokenizer_save_jinja_files
+ )
create_model_card(cfg, trainer)
if not cfg.use_ray:
cleanup_distributed()
diff --git a/src/axolotl/utils/__init__.py b/src/axolotl/utils/__init__.py
index e669413f8..72f8173f3 100644
--- a/src/axolotl/utils/__init__.py
+++ b/src/axolotl/utils/__init__.py
@@ -17,7 +17,13 @@ def is_comet_available():
return importlib.util.find_spec("comet_ml") is not None
-# pylint: disable=duplicate-code
+def is_opentelemetry_available():
+ return (
+ importlib.util.find_spec("opentelemetry") is not None
+ and importlib.util.find_spec("prometheus_client") is not None
+ )
+
+
def get_pytorch_version() -> tuple[int, int, int]:
"""
Get Pytorch version as a tuple of (major, minor, patch).
@@ -45,15 +51,6 @@ def set_pytorch_cuda_alloc_conf():
)
-def patch_optimized_env():
- """
- Patch environment variables to improve VRAM usage and increase download speed
- """
- if os.getenv("HF_HUB_ENABLE_HF_TRANSFER") is None:
- os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
- set_pytorch_cuda_alloc_conf()
-
-
def get_not_null(value, default=None):
"""
return the value if it's not None, otherwise return the default value
diff --git a/src/axolotl/utils/bench.py b/src/axolotl/utils/bench.py
index dd3a85b8c..0a4594991 100644
--- a/src/axolotl/utils/bench.py
+++ b/src/axolotl/utils/bench.py
@@ -60,13 +60,14 @@ def gpu_memory_usage_all(device=0):
active = torch.cuda.memory_stats().get("active_bytes.all.peak", 0) / 1024.0**3
allocated = torch.cuda.max_memory_allocated(device) / 1024.0**3
reserved = torch.cuda.max_memory_reserved(device) / 1024.0**3
+ torch.cuda.reset_peak_memory_stats(device)
return active, allocated, reserved
def mps_memory_usage_all():
- usage = torch.mps.current_allocated_memory() / 1024.0**3
- reserved = torch.mps.driver_allocated_memory() / 1024.0**3
- return usage, reserved - usage, 0
+ active = torch.mps.current_allocated_memory() / 1024.0**3
+ allocated = torch.mps.driver_allocated_memory() / 1024.0**3
+ return active, allocated, 0
def npu_memory_usage_all(device=0):
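
The added reset_peak_memory_stats call makes each report a per-interval peak instead of a monotone high-water mark for the whole run, so later measurements are no longer dominated by an early spike. A sketch of the difference (requires a CUDA device; sizes illustrative):

    import torch

    def peak_gib(device=0):
        peak = torch.cuda.max_memory_allocated(device) / 1024.0**3
        torch.cuda.reset_peak_memory_stats(device)  # next reading starts fresh
        return peak

    big = torch.empty(1024, 1024, 256, device="cuda")  # ~1 GiB fp32 spike
    del big
    print(peak_gib())  # captures the spike
    print(peak_gib())  # near zero - the old peak no longer lingers
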
diff --git a/src/axolotl/utils/callbacks/__init__.py b/src/axolotl/utils/callbacks/__init__.py
index d3f3126b5..36370ef13 100644
--- a/src/axolotl/utils/callbacks/__init__.py
+++ b/src/axolotl/utils/callbacks/__init__.py
@@ -16,8 +16,8 @@ import pandas as pd
import torch
import torch.distributed as dist
import wandb
+import yaml
from datasets import load_dataset
-from optimum.bettertransformer import BetterTransformer
from tqdm import tqdm
from transformers import (
GenerationConfig,
@@ -28,8 +28,6 @@ from transformers import (
TrainingArguments,
)
from transformers.trainer_utils import (
- PREFIX_CHECKPOINT_DIR,
- IntervalStrategy,
SaveStrategy,
)
from trl.models import unwrap_model_for_generation
@@ -56,42 +54,6 @@ IGNORE_INDEX = -100
LOG = get_logger(__name__)
-class SaveBetterTransformerModelCallback(
- TrainerCallback
-): # pylint: disable=too-few-public-methods
- """Callback to save the BetterTransformer wrapped model"""
-
- def on_step_end(
- self,
- args: TrainingArguments,
- state: TrainerState,
- control: TrainerControl,
- **kwargs,
- ) -> TrainerControl:
- # Save
- if (
- args.save_strategy == IntervalStrategy.STEPS
- and args.save_steps > 0
- and state.global_step % args.save_steps == 0
- ):
- control.should_save = True
-
- if control.should_save:
- checkpoint_folder = os.path.join(
- args.output_dir,
- f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}",
- )
-
- model = BetterTransformer.reverse(kwargs["model"])
- model.save_pretrained(checkpoint_folder)
- # FIXME - need to cleanup old checkpoints
-
- # since we're saving here, we don't need the trainer loop to attempt to save too b/c
- # the trainer will raise an exception since it can't save a BetterTransformer wrapped model
- control.should_save = False
- return control
-
-
class LossWatchDogCallback(TrainerCallback):
"""Callback to track loss and stop training if loss is too high"""
@@ -103,7 +65,7 @@ class LossWatchDogCallback(TrainerCallback):
def on_step_end(
self,
- args: TrainingArguments, # pylint: disable=unused-argument
+ args: TrainingArguments,
state: TrainerState,
control: TrainerControl,
**_kwargs,
@@ -126,7 +88,7 @@ class SaveModelOnFirstStepCallback(TrainerCallback):
def on_step_end(
self,
- args: TrainingArguments, # pylint: disable=unused-argument
+ args: TrainingArguments,
state: TrainerState,
control: TrainerControl,
**_kwargs,
@@ -239,10 +201,10 @@ def bench_eval_callback_factory(trainer, tokenizer):
def on_evaluate(
self,
args: AxolotlTrainingArguments,
- state: TrainerState, # pylint: disable=unused-argument
- control: TrainerControl, # pylint: disable=unused-argument
- metrics: Dict[str, float], # pylint: disable=unused-argument
- **kwargs, # pylint: disable=unused-argument
+ state: TrainerState,
+ control: TrainerControl,
+ metrics: Dict[str, float],
+ **kwargs,
):
data_loader = trainer.get_bench_dataloader(
bench_dataset.remove_columns(["input", "subject", "output", "name"])
@@ -272,7 +234,7 @@ def bench_eval_callback_factory(trainer, tokenizer):
# Extract results by subject.
bench_name = bench_dataset["name"]
bench_names: dict = {s: {"refs": [], "preds": []} for s in set(bench_name)}
- for s, p, r in zip(bench_name, preds, refs): # pylint: disable=invalid-name
+ for s, p, r in zip(bench_name, preds, refs, strict=False):
bench_names[s]["preds"].append(p)
bench_names[s]["refs"].append(r)
barrier()
@@ -310,9 +272,7 @@ def bench_eval_callback_factory(trainer, tokenizer):
bench_scores = []
bench_refs = []
bench_preds = []
- for (
- bench_name
- ) in combined_bench_names: # pylint: disable=consider-using-dict-items
+ for bench_name in combined_bench_names:
bench_score = accuracy.compute(
references=combined_bench_names[bench_name]["refs"],
predictions=combined_bench_names[bench_name]["preds"],
@@ -361,18 +321,18 @@ def causal_lm_bench_eval_callback_factory(trainer: Trainer, tokenizer):
else:
try:
metrics[metric] = evaluate.load(metric)
- except Exception as exc: # pylint: disable=broad-exception-caught
+ except Exception as exc:
LOG.warning(f"{metric}: {exc.args}")
return metrics
def on_evaluate(
self,
- args: AxolotlTrainingArguments, # pylint: disable=unused-argument
+ args: AxolotlTrainingArguments,
state: TrainerState,
control: TrainerControl,
- train_dataloader, # pylint: disable=unused-argument
+ train_dataloader,
eval_dataloader,
- **kwargs, # pylint: disable=unused-argument
+ **kwargs,
):
trainer.model_wrapped.eval()
@@ -380,7 +340,6 @@ def causal_lm_bench_eval_callback_factory(trainer: Trainer, tokenizer):
self.cfg.device
) # Use this instead of trainer.model_wrapped.device as it may return cpu if fsdp offloaded
- # pylint: disable=duplicate-code
generation_config = GenerationConfig(
max_new_tokens=self.cfg.eval_max_new_tokens,
bos_token_id=tokenizer.bos_token_id,
@@ -411,9 +370,7 @@ def causal_lm_bench_eval_callback_factory(trainer: Trainer, tokenizer):
try:
# Only pass the kwargs that are in the metric's feature list
metric_kwargs = {
- k: kwargs[k]
- for k in metric._feature_names() # pylint: disable=protected-access
- if k in kwargs
+ k: kwargs[k] for k in metric._feature_names() if k in kwargs
}
if isinstance(metric, Perplexity):
@@ -425,7 +382,7 @@ def causal_lm_bench_eval_callback_factory(trainer: Trainer, tokenizer):
if "score" in metric_score
else metric_score["mean_score"]
)
- except Exception: # pylint: disable=broad-exception-caught
+ except Exception:
traceback.print_exc()
LOG.debug(
f"Failed to compute metric {metric.name} with kwargs {kwargs.keys()}"
@@ -473,6 +430,7 @@ def causal_lm_bench_eval_callback_factory(trainer: Trainer, tokenizer):
batch_input_ids,
batch_labels,
batch_pos_ids,
+ strict=False,
):
if pos_ids is None:
pos_ranges = [(0, len(input_ids_all) - 1)]
@@ -523,7 +481,7 @@ def causal_lm_bench_eval_callback_factory(trainer: Trainer, tokenizer):
prediction_all_tokens = predictions["sequences"].cpu().tolist()
prediction_without_prompt_tokens_list = []
for prompt_token_ids, prediction_tokens in zip(
- prompt_token_ids_list, prediction_all_tokens
+ prompt_token_ids_list, prediction_all_tokens, strict=False
):
prediction_without_prompt_tokens = prediction_tokens[
len(prompt_token_ids) :
@@ -561,12 +519,12 @@ def log_prediction_callback_factory(trainer: Trainer, tokenizer, logger: str):
def on_evaluate(
self,
- args: AxolotlTrainingArguments, # pylint: disable=unused-argument
+ args: AxolotlTrainingArguments,
state: TrainerState,
control: TrainerControl,
- train_dataloader, # pylint: disable=unused-argument
+ train_dataloader,
eval_dataloader,
- **kwargs, # pylint: disable=unused-argument
+ **kwargs,
):
eval_table_size = self.cfg.eval_table_size
@@ -576,7 +534,6 @@ def log_prediction_callback_factory(trainer: Trainer, tokenizer, logger: str):
trainer.model.eval()
device = torch.device(self.cfg.device)
- # pylint: disable=duplicate-code
generation_config = GenerationConfig(
max_new_tokens=self.cfg.eval_max_new_tokens,
bos_token_id=tokenizer.bos_token_id,
@@ -644,6 +601,7 @@ def log_prediction_callback_factory(trainer: Trainer, tokenizer, logger: str):
batch_labels,
batch_pos_ids,
batch_logits,
+ strict=False,
):
if pos_ids is None:
pos_ranges = [(0, len(input_ids_all) - 1)]
@@ -697,7 +655,7 @@ def log_prediction_callback_factory(trainer: Trainer, tokenizer, logger: str):
prediction_all_tokens = predictions["sequences"].cpu().tolist()
prediction_without_prompt_tokens_list = []
for prompt_token_ids, prediction_tokens in zip(
- prompt_token_ids_list, prediction_all_tokens
+ prompt_token_ids_list, prediction_all_tokens, strict=False
):
prediction_without_prompt_tokens = prediction_tokens[
len(prompt_token_ids) :
@@ -716,7 +674,11 @@ def log_prediction_callback_factory(trainer: Trainer, tokenizer, logger: str):
prediction_text,
pred_step_text,
) in zip(
- prompt_texts, completion_texts, predicted_texts, pred_step_texts
+ prompt_texts,
+ completion_texts,
+ predicted_texts,
+ pred_step_texts,
+ strict=False,
):
table_data["id"].append(row_index)
table_data["Prompt"].append(prompt_text)
@@ -774,10 +736,10 @@ class SaveAxolotlConfigtoWandBCallback(TrainerCallback):
def on_train_begin(
self,
- args: AxolotlTrainingArguments, # pylint: disable=unused-argument
- state: TrainerState, # pylint: disable=unused-argument
+ args: AxolotlTrainingArguments,
+ state: TrainerState,
control: TrainerControl,
- **kwargs, # pylint: disable=unused-argument
+ **kwargs,
):
if state.is_world_process_zero:
try:
@@ -798,6 +760,37 @@ class SaveAxolotlConfigtoWandBCallback(TrainerCallback):
except (FileNotFoundError, ConnectionError) as err:
LOG.warning(f"Error while saving Axolotl config to WandB: {err}")
+ try:
+ with open(self.axolotl_config_path, "r", encoding="utf-8") as f:
+ cfg = yaml.safe_load(f) or {}
+
+ chat_tpl = cfg.get("chat_template_jinja")
+ if chat_tpl:
+ with NamedTemporaryFile(
+ mode="w", delete=True, suffix=".jinja", prefix="chat_template_"
+ ) as temp_ct_file:
+ if (
+ isinstance(chat_tpl, str)
+ and os.path.exists(chat_tpl)
+ and os.path.isfile(chat_tpl)
+ ):
+ copyfile(chat_tpl, temp_ct_file.name)
+ else:
+ temp_ct_file.write(str(chat_tpl))
+ temp_ct_file.flush()
+
+ artifact = wandb.Artifact(
+ f"chat-template-{wandb.run.id}", type="jinja-template"
+ )
+ artifact.add_file(temp_ct_file.name)
+ wandb.log_artifact(artifact)
+ wandb.save(temp_ct_file.name)
+ LOG.info(
+ "The chat_template_jinja has been saved to the WandB run under files."
+ )
+ except (FileNotFoundError, ConnectionError, yaml.YAMLError) as err:
+ LOG.warning(f"Error while saving chat_template_jinja to WandB: {err}")
+
if args.deepspeed:
try:
# sync config to top level in run, cannot delete file right away because wandb schedules it to be synced even w/policy = 'now', so let OS delete it later.
@@ -845,19 +838,30 @@ class GCCallback(TrainerCallback):
gc.collect()
def on_train_begin(
- self, args, state, control, **kwargs # pylint: disable=unused-argument
+ self,
+ args,
+ state,
+ control,
+ **kwargs,
):
self._gc()
def on_step_begin(
- self, args, state, control, **kwargs # pylint: disable=unused-argument
+ self,
+ args,
+ state,
+ control,
+ **kwargs,
):
- # pylint: disable=consider-using-in
if self.next_gc_on_begin_step == state.global_step or state.global_step == 0:
self._gc()
def on_step_end(
- self, args, state, control, **kwargs # pylint: disable=unused-argument
+ self,
+ args,
+ state,
+ control,
+ **kwargs,
):
if control.should_evaluate:
# automatically GC before evals so the eval memory spike from the CEL doesn't OOM the trainer
@@ -879,7 +883,11 @@ class GCCallback(TrainerCallback):
self._gc()
def on_epoch_end(
- self, args, state, control, **kwargs # pylint: disable=unused-argument
+ self,
+ args,
+ state,
+ control,
+ **kwargs,
):
self._gc()
@@ -892,16 +900,12 @@ def colab_inference_post_train_callback(trainer: Trainer):
self.gpu_name = torch.cuda.get_device_name(0)
self.cfg = cfg
- def on_train_end(
- self, args, state, control, **kwargs
- ): # pylint: disable=unused-argument
+ def on_train_end(self, args, state, control, **kwargs):
"""
handle T4 gpu, we need to convert attention to eager for inference
"""
if "Tesla T4" in self.gpu_name and self.cfg.xformers_attention:
- trainer.model.config._attn_implementation = ( # pylint: disable=protected-access
- "eager"
- )
+ trainer.model.config._attn_implementation = "eager"
trainer.model.gradient_checkpointing_disable()
trainer.model.config.use_cache = True
trainer.model.eval()
diff --git a/src/axolotl/utils/callbacks/comet_.py b/src/axolotl/utils/callbacks/comet_.py
index 7dce95145..cd3bcf70e 100644
--- a/src/axolotl/utils/callbacks/comet_.py
+++ b/src/axolotl/utils/callbacks/comet_.py
@@ -22,10 +22,10 @@ class SaveAxolotlConfigtoCometCallback(TrainerCallback):
def on_train_begin(
self,
- args: "AxolotlTrainingArguments", # pylint: disable=unused-argument
- state: TrainerState, # pylint: disable=unused-argument
+ args: "AxolotlTrainingArguments",
+ state: TrainerState,
control: TrainerControl,
- **kwargs, # pylint: disable=unused-argument
+ **kwargs,
):
if is_main_process():
try:
diff --git a/src/axolotl/utils/callbacks/lisa.py b/src/axolotl/utils/callbacks/lisa.py
index 348cdf2da..03f189d80 100644
--- a/src/axolotl/utils/callbacks/lisa.py
+++ b/src/axolotl/utils/callbacks/lisa.py
@@ -55,9 +55,7 @@ def lisa_callback_factory(trainer: "AxolotlTrainer"):
for param in layer.parameters():
param.requires_grad = False
- def on_step_begin(
- self, args, state, control, **kwargs
- ): # pylint: disable=unused-argument
+ def on_step_begin(self, args, state, control, **kwargs):
# Check if it's time to switch active layers, including at step 0
if state.global_step % self.step_interval == 0 or state.global_step == 1:
self.switch_active_layers()
diff --git a/src/axolotl/utils/callbacks/mlflow_.py b/src/axolotl/utils/callbacks/mlflow_.py
index ac72f5e6d..30120a87d 100644
--- a/src/axolotl/utils/callbacks/mlflow_.py
+++ b/src/axolotl/utils/callbacks/mlflow_.py
@@ -23,7 +23,6 @@ def should_log_artifacts() -> bool:
class SaveAxolotlConfigtoMlflowCallback(TrainerCallback):
- # pylint: disable=duplicate-code
"""Callback to save axolotl config to mlflow"""
def __init__(self, axolotl_config_path):
@@ -31,10 +30,10 @@ class SaveAxolotlConfigtoMlflowCallback(TrainerCallback):
def on_train_begin(
self,
- args: "AxolotlTrainingArguments", # pylint: disable=unused-argument
- state: TrainerState, # pylint: disable=unused-argument
+ args: "AxolotlTrainingArguments",
+ state: TrainerState,
control: TrainerControl,
- **kwargs, # pylint: disable=unused-argument
+ **kwargs,
):
if is_main_process():
try:
diff --git a/src/axolotl/utils/callbacks/opentelemetry.py b/src/axolotl/utils/callbacks/opentelemetry.py
new file mode 100644
index 000000000..3f7e56b78
--- /dev/null
+++ b/src/axolotl/utils/callbacks/opentelemetry.py
@@ -0,0 +1,238 @@
+"""OpenTelemetry metrics callback for Axolotl training"""
+
+import threading
+from typing import Dict, Optional
+
+from transformers import (
+ TrainerCallback,
+ TrainerControl,
+ TrainerState,
+ TrainingArguments,
+)
+
+from axolotl.utils.logging import get_logger
+
+LOG = get_logger(__name__)
+
+try:
+ from opentelemetry import metrics
+ from opentelemetry.exporter.prometheus import PrometheusMetricReader
+ from opentelemetry.metrics import set_meter_provider
+ from opentelemetry.sdk.metrics import MeterProvider as SDKMeterProvider
+ from prometheus_client import start_http_server
+
+ OPENTELEMETRY_AVAILABLE = True
+except ImportError:
+ LOG.warning("OpenTelemetry not available. pip install [opentelemetry]")
+ OPENTELEMETRY_AVAILABLE = False
+
+
+class OpenTelemetryMetricsCallback(TrainerCallback):
+ """
+ TrainerCallback that exports training metrics to OpenTelemetry/Prometheus.
+
+ This callback automatically tracks key training metrics including:
+ - Training loss
+ - Evaluation loss
+ - Learning rate
+ - Epoch progress
+ - Global step count
+ - Gradient norm
+
+ Metrics are exposed via an HTTP endpoint for Prometheus scraping.
+ """
+
+ def __init__(self, cfg):
+ if not OPENTELEMETRY_AVAILABLE:
+ LOG.warning("OpenTelemetry not available, metrics will not be collected")
+ self.metrics_enabled = False
+ return
+
+ self.cfg = cfg
+ self.metrics_host = getattr(cfg, "otel_metrics_host", "localhost")
+ self.metrics_port = getattr(cfg, "otel_metrics_port", 8000)
+ self.metrics_enabled = True
+ self.server_started = False
+ self.metrics_lock = threading.Lock()
+
+ try:
+ # Create Prometheus metrics reader
+ prometheus_reader = PrometheusMetricReader()
+
+ # Create meter provider with Prometheus exporter
+ provider = SDKMeterProvider(metric_readers=[prometheus_reader])
+ set_meter_provider(provider)
+
+ # Get meter for creating metrics
+ self.meter = metrics.get_meter("axolotl.training")
+
+ # Create metrics
+ self._create_metrics()
+
+ except Exception as e:
+ LOG.warning(f"Failed to initialize OpenTelemetry metrics: {e}")
+ self.metrics_enabled = False
+
+ def _create_metrics(self):
+ """Create all metrics that will be tracked"""
+ self.train_loss_gauge = self.meter.create_gauge(
+ name="axolotl_train_loss",
+ description="Current training loss",
+ unit="1",
+ )
+
+ self.eval_loss_gauge = self.meter.create_gauge(
+ name="axolotl_eval_loss",
+ description="Current evaluation loss",
+ unit="1",
+ )
+
+ self.learning_rate_gauge = self.meter.create_gauge(
+ name="axolotl_learning_rate",
+ description="Current learning rate",
+ unit="1",
+ )
+
+ self.epoch_gauge = self.meter.create_gauge(
+ name="axolotl_epoch",
+ description="Current training epoch",
+ unit="1",
+ )
+
+ self.global_step_counter = self.meter.create_counter(
+ name="axolotl_global_steps",
+ description="Total training steps completed",
+ unit="1",
+ )
+
+ self.grad_norm_gauge = self.meter.create_gauge(
+ name="axolotl_gradient_norm",
+ description="Gradient norm",
+ unit="1",
+ )
+
+ self.memory_usage_gauge = self.meter.create_gauge(
+ name="axolotl_memory_usage",
+ description="Current memory usage in MB",
+ unit="MB",
+ )
+
+ def _start_metrics_server(self):
+ """Start the HTTP server for metrics exposure"""
+ if self.server_started:
+ return
+
+ try:
+ start_http_server(self.metrics_port, addr=self.metrics_host)
+ self.server_started = True
+ LOG.info(
+ f"OpenTelemetry metrics server started on http://{self.metrics_host}:{self.metrics_port}/metrics"
+ )
+
+ except Exception as e:
+ LOG.error(f"Failed to start OpenTelemetry metrics server: {e}")
+
+ def on_train_begin(
+ self,
+ args: TrainingArguments,
+ state: TrainerState,
+ control: TrainerControl,
+ **kwargs,
+ ):
+ """Called at the beginning of training"""
+ if not self.metrics_enabled:
+ return
+
+ self._start_metrics_server()
+ LOG.info("OpenTelemetry metrics collection started")
+
+ def on_log(
+ self,
+ args: TrainingArguments,
+ state: TrainerState,
+ control: TrainerControl,
+ logs: Optional[Dict[str, float]] = None,
+ **kwargs,
+ ):
+ """Called when logging occurs"""
+ if not self.metrics_enabled or not logs:
+ return
+
+ if "loss" in logs:
+ self.train_loss_gauge.set(logs["loss"])
+
+ if "eval_loss" in logs:
+ self.eval_loss_gauge.set(logs["eval_loss"])
+
+ if "learning_rate" in logs:
+ self.learning_rate_gauge.set(logs["learning_rate"])
+
+ if "epoch" in logs:
+ self.epoch_gauge.set(logs["epoch"])
+
+ if "grad_norm" in logs:
+ self.grad_norm_gauge.set(logs["grad_norm"])
+ if "memory_usage" in logs:
+ self.memory_usage_gauge.set(logs["memory_usage"])
+
+ def on_step_end(
+ self,
+ args: TrainingArguments,
+ state: TrainerState,
+ control: TrainerControl,
+ **kwargs,
+ ):
+ """Called at the end of each training step"""
+ if not self.metrics_enabled:
+ return
+
+ # Update step counter and epoch
+ self.global_step_counter.add(1)
+ if state.epoch is not None:
+ self.epoch_gauge.set(state.epoch)
+
+ def on_evaluate(
+ self,
+ args: TrainingArguments,
+ state: TrainerState,
+ control: TrainerControl,
+ metrics: Optional[Dict[str, float]] = None,
+ **kwargs,
+ ):
+ """Called after evaluation"""
+ if not self.metrics_enabled or not metrics:
+ return
+
+ if "eval_loss" in metrics:
+ self.eval_loss_gauge.set(metrics["eval_loss"])
+
+ # Record any other eval metrics as gauges, caching the instruments so
+ # the same gauge is not re-created on every evaluation pass
+ if not hasattr(self, "_eval_gauges"):
+ self._eval_gauges = {}
+ for key, value in metrics.items():
+ if key.startswith("eval_") and isinstance(value, (int, float)):
+ gauge_name = f"axolotl_{key}"
+ try:
+ if gauge_name not in self._eval_gauges:
+ self._eval_gauges[gauge_name] = self.meter.create_gauge(
+ name=gauge_name,
+ description=f"Evaluation metric: {key}",
+ unit="1",
+ )
+ self._eval_gauges[gauge_name].set(value)
+ except Exception as e:
+ LOG.warning(f"Failed to create/update metric {gauge_name}: {e}")
+
+ def on_train_end(
+ self,
+ args: TrainingArguments,
+ state: TrainerState,
+ control: TrainerControl,
+ **kwargs,
+ ):
+ """Called at the end of training"""
+ if not self.metrics_enabled:
+ return
+
+ LOG.info("Training completed. OpenTelemetry metrics collection finished.")
+ LOG.info(
+ f"Metrics are still available at http://{self.metrics_host}:{self.metrics_port}/metrics"
+ )
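
Once attached, the callback serves plain Prometheus text metrics over HTTP. A hypothetical wiring sketch; the cfg attribute names mirror the getattr defaults above, and the Trainer instance is assumed to exist elsewhere:

    from types import SimpleNamespace

    from axolotl.utils.callbacks.opentelemetry import OpenTelemetryMetricsCallback

    # minimal cfg stub matching the callback's getattr defaults
    cfg = SimpleNamespace(otel_metrics_host="localhost", otel_metrics_port=8000)
    callback = OpenTelemetryMetricsCallback(cfg)

    # trainer = Trainer(...)        # built elsewhere
    # trainer.add_callback(callback)
    # then scrape during training:
    #   curl http://localhost:8000/metrics | grep axolotl_train_loss
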
diff --git a/src/axolotl/utils/callbacks/profiler.py b/src/axolotl/utils/callbacks/profiler.py
index d26b7f9dd..2cf5e0f4f 100644
--- a/src/axolotl/utils/callbacks/profiler.py
+++ b/src/axolotl/utils/callbacks/profiler.py
@@ -26,58 +26,50 @@ class PytorchProfilerCallback(TrainerCallback):
if profiler_steps_start == 0:
# start recording memory allocations before everything is allocated, because if we start
# at the beginning of step 0, we won't have any memory allocations in the traces
- torch.cuda.memory._record_memory_history( # pylint: disable=protected-access
- enabled="all"
- )
+ torch.cuda.memory._record_memory_history(enabled="all")
profiler_steps_start = -1
self.profiler_steps_start = profiler_steps_start
- def on_step_begin( # pylint: disable=unused-argument
+ def on_step_begin(
self,
- args: TrainingArguments, # pylint: disable=unused-argument
+ args: TrainingArguments,
state: TrainerState,
- control: TrainerControl, # pylint: disable=unused-argument
- **kwargs, # pylint: disable=unused-argument
+ control: TrainerControl,
+ **kwargs,
):
if state.global_step == self.profiler_steps_start:
- torch.cuda.memory._record_memory_history( # pylint: disable=protected-access
- enabled="all"
- )
+ torch.cuda.memory._record_memory_history(enabled="all")
- def on_step_end( # pylint: disable=unused-argument
+ def on_step_end(
self,
- args: TrainingArguments, # pylint: disable=unused-argument
+ args: TrainingArguments,
state: TrainerState,
- control: TrainerControl, # pylint: disable=unused-argument
- **kwargs, # pylint: disable=unused-argument
+ control: TrainerControl,
+ **kwargs,
):
if state.global_step == self.profiler_steps_end:
- snapshot = torch.cuda.memory._snapshot() # pylint: disable=protected-access
+ snapshot = torch.cuda.memory._snapshot()
with open(Path(args.output_dir) / "snapshot.pickle", "wb") as fout:
dump(snapshot, fout)
# tell CUDA to stop recording memory allocations now
- torch.cuda.memory._record_memory_history( # pylint: disable=protected-access
- enabled=None
- )
+ torch.cuda.memory._record_memory_history(enabled=None)
- def on_train_end( # pylint: disable=unused-argument
+ def on_train_end(
self,
- args: TrainingArguments, # pylint: disable=unused-argument
+ args: TrainingArguments,
state: TrainerState,
- control: TrainerControl, # pylint: disable=unused-argument
- **kwargs, # pylint: disable=unused-argument
+ control: TrainerControl,
+ **kwargs,
):
# make sure to record if we happen to have more steps than steps to profile
if (
state.global_step >= self.profiler_steps_start
and state.global_step < self.profiler_steps_end
):
- snapshot = torch.cuda.memory._snapshot() # pylint: disable=protected-access
+ snapshot = torch.cuda.memory._snapshot()
with open(Path(args.output_dir) / "snapshot.pickle", "wb") as fout:
dump(snapshot, fout)
# tell CUDA to stop recording memory allocations now
- torch.cuda.memory._record_memory_history( # pylint: disable=protected-access
- enabled=None
- )
+ torch.cuda.memory._record_memory_history(enabled=None)
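
The snapshot.pickle written by this callback is the standard CUDA caching-allocator snapshot; it can be inspected offline, e.g. by dropping it onto pytorch.org/memory_viz. A sketch of poking at it directly (the path is an assumption, and the segment fields shown are the usual snapshot keys, which may vary across PyTorch versions):

    import pickle

    with open("output/snapshot.pickle", "rb") as f:
        snapshot = pickle.load(f)

    # per-segment view of what the caching allocator held at capture time
    for seg in snapshot["segments"][:3]:
        print(seg["device"], seg["total_size"], seg["segment_type"])
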
diff --git a/src/axolotl/utils/callbacks/qat.py b/src/axolotl/utils/callbacks/qat.py
index cf4d9a937..70746d6be 100644
--- a/src/axolotl/utils/callbacks/qat.py
+++ b/src/axolotl/utils/callbacks/qat.py
@@ -38,9 +38,7 @@ class QATCallback(TrainerCallback):
def __init__(self, cfg: QATConfig):
self.cfg = cfg
- def on_step_begin(
- self, args, state, control, model, **kwargs
- ): # pylint: disable=unused-argument
+ def on_step_begin(self, args, state, control, model, **kwargs):
if self.cfg.fake_quant_after_n_steps is not None:
if state.global_step == 0:
LOG.info(f"Disabling fake quantization at step {state.global_step}")
diff --git a/src/axolotl/utils/callbacks/tokens_per_second.py b/src/axolotl/utils/callbacks/tokens_per_second.py
new file mode 100644
index 000000000..ead129240
--- /dev/null
+++ b/src/axolotl/utils/callbacks/tokens_per_second.py
@@ -0,0 +1,64 @@
+"""A callback for calculating tokens per second during training."""
+
+import time
+
+import torch
+from transformers import (
+ TrainerCallback,
+ TrainerControl,
+ TrainerState,
+ TrainingArguments,
+)
+
+
+class TokensPerSecondCallback(TrainerCallback):
+ """
+ A callback to measure and log tokens per second during training.
+ """
+
+ def __init__(self, tensor_parallel_size, context_parallel_size):
+ super().__init__()
+ self.step_time = 0.0
+ self.start_time = 0.0
+ self.non_data_parallel_size = 1
+ if tensor_parallel_size is not None:
+ self.non_data_parallel_size *= tensor_parallel_size
+ if context_parallel_size is not None:
+ self.non_data_parallel_size *= context_parallel_size
+
+ def on_step_begin(
+ self,
+ args: TrainingArguments,
+ state: TrainerState,
+ control: TrainerControl,
+ **kwargs,
+ ):
+ self.start_time = time.perf_counter()
+ state.last_tokens_per_second = torch.zeros(1)
+
+ def on_step_end(
+ self,
+ args: TrainingArguments,
+ state: TrainerState,
+ control: TrainerControl,
+ **kwargs,
+ ):
+ if hasattr(state, "num_tokens"):
+ step_time = time.perf_counter() - self.start_time
+ num_tokens_per_device = state.num_tokens.clone()
+ # non-data-parallel groups have duplicated tokens, so we avoid double-counting
+ num_tokens_per_device = num_tokens_per_device / self.non_data_parallel_size
+ state.last_tokens_per_second = num_tokens_per_device / step_time
+
+ def on_log(
+ self,
+ args: TrainingArguments,
+ state: TrainerState,
+ control: TrainerControl,
+ logs=None,
+ **kwargs,
+ ):
+ # after logging, clear the running metrics
+ if hasattr(state, "last_tokens_per_second"):
+ state.last_tokens_per_second.zero_()
+ state.num_tokens = torch.zeros(1)
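
The division by non_data_parallel_size exists because tensor- and context-parallel ranks process the same tokens; only data-parallel replicas see distinct data. A worked example under assumed parallelism sizes:

    import torch

    tensor_parallel_size, context_parallel_size = 2, 2
    non_data_parallel_size = tensor_parallel_size * context_parallel_size  # 4

    num_tokens = torch.tensor([8192.0])  # tokens counted on this rank this step
    step_time = 0.5                      # seconds, illustrative

    unique_tokens = num_tokens / non_data_parallel_size  # 2048 unique tokens
    print((unique_tokens / step_time).item())            # 4096.0 tok/s per device
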
diff --git a/src/axolotl/utils/collators/__init__.py b/src/axolotl/utils/collators/__init__.py
index 8c60f223c..d5e6ad17d 100644
--- a/src/axolotl/utils/collators/__init__.py
+++ b/src/axolotl/utils/collators/__init__.py
@@ -1,11 +1,17 @@
-"""
-shared axolotl collators for multipack, mamba, multimodal
-"""
+"""Shared axolotl collators for multipacking, mamba, multimodal."""
-from .batching import ( # noqa: F401
+from .batching import (
BatchSamplerDataCollatorForSeq2Seq,
DataCollatorForSeq2Seq,
PretrainingBatchSamplerDataCollatorForSeq2Seq,
V2BatchSamplerDataCollatorForSeq2Seq,
)
-from .mamba import MambaDataCollator # noqa: F401
+from .mamba import MambaDataCollator
+
+__all__ = [
+ "DataCollatorForSeq2Seq",
+ "BatchSamplerDataCollatorForSeq2Seq",
+ "V2BatchSamplerDataCollatorForSeq2Seq",
+ "PretrainingBatchSamplerDataCollatorForSeq2Seq",
+ "MambaDataCollator",
+]
diff --git a/src/axolotl/utils/collators/mm_chat.py b/src/axolotl/utils/collators/mm_chat.py
index 0075d4830..542918527 100644
--- a/src/axolotl/utils/collators/mm_chat.py
+++ b/src/axolotl/utils/collators/mm_chat.py
@@ -5,7 +5,6 @@ Collators for multi-modal chat messages and packing
from dataclasses import dataclass
from typing import Any, Optional, Union
-import torch
from torch import Tensor
from transformers import PreTrainedTokenizerBase
from transformers.data.data_collator import DataCollatorMixin
@@ -42,62 +41,19 @@ class MultiModalChatDataCollator(DataCollatorMixin):
examples = self.processing_strategy(examples)
# Initialize batch
- batch: dict[str, Any] = {}
+ messages = [ex["messages"] for ex in examples]
- # Process each example
- for example in examples:
- # Apply chat template to process the example
- # This method requires transformers>=4.49.0
- result = self.processing_strategy.processor.apply_chat_template(
- example["messages"],
- add_generation_prompt=False,
- tokenize=True,
- return_tensors="pt",
- padding=True,
- return_dict=True,
- chat_template=self.processing_strategy.chat_template,
- )
-
- # TODO: Check if need handling for len(input_ids) > sequence_len
-
- # Add the processed tensors to our batch
- for key in result.keys():
- if key not in batch:
- batch[key] = []
-
- batch[key].append(result[key].squeeze(0))
-
- # Pad sequences to the same length
- input_ids = torch.nn.utils.rnn.pad_sequence(
- batch["input_ids"],
- batch_first=True,
- padding_value=self.tokenizer.pad_token_id,
+ batch = self.processing_strategy.processor.apply_chat_template(
+ messages,
+ add_generation_prompt=False,
+ tokenize=True,
+ return_tensors="pt",
+ padding=True,
+ return_dict=True,
+ chat_template=self.processing_strategy.chat_template,
)
- attention_mask = torch.nn.utils.rnn.pad_sequence(
- batch["attention_mask"], batch_first=True, padding_value=0
- )
-
- # Create the final batch
- final_batch = {
- "input_ids": input_ids,
- "attention_mask": attention_mask,
- }
-
- for key, val in batch.items():
- if key in ["input_ids", "attention_mask"]:
- continue
-
- if key in ["token_type_ids", "cross_attention_mask"]:
- final_batch[key] = torch.nn.utils.rnn.pad_sequence(
- val, batch_first=True, padding_value=0
- )
- else:
- final_batch[key] = torch.stack(val)
-
# Process the labels
- final_batch["labels"] = self.processing_strategy.process_labels(
- final_batch["input_ids"]
- )
+ batch["labels"] = self.processing_strategy.process_labels(batch["input_ids"])
- return final_batch
+ return batch
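
The collator now relies on the processor's batched chat templating instead of tokenizing one example at a time and hand-padding: a list of conversations with padding=True and return_dict=True comes back already padded, including any modality tensors the processor emits. A sketch of the call shape (examples, processor, and make_labels are stand-ins, not runnable as-is):

    # one message list per example in the batch
    conversations = [ex["messages"] for ex in examples]

    batch = processor.apply_chat_template(
        conversations,
        add_generation_prompt=False,  # training, not generation
        tokenize=True,
        return_tensors="pt",
        padding=True,                 # pad to the longest example in the batch
        return_dict=True,
    )
    batch["labels"] = make_labels(batch["input_ids"])  # e.g. mask prompt tokens
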
diff --git a/src/axolotl/utils/config/__init__.py b/src/axolotl/utils/config/__init__.py
index c9613c39b..7a2bbd6f9 100644
--- a/src/axolotl/utils/config/__init__.py
+++ b/src/axolotl/utils/config/__init__.py
@@ -17,8 +17,8 @@ from axolotl.utils.dict import DictDefault
from axolotl.utils.logging import get_logger
from axolotl.utils.schemas.config import (
AxolotlConfigWCapabilities as AxolotlConfigWCapabilitiesBase,
+ AxolotlInputConfig as AxolotlInputConfigBase,
)
-from axolotl.utils.schemas.config import AxolotlInputConfig as AxolotlInputConfigBase
from axolotl.utils.schemas.datasets import DPODataset, KTODataset, SFTDataset
LOG = get_logger(__name__)
@@ -37,7 +37,7 @@ def choose_device(cfg):
return f"npu:{cfg.local_rank}"
raise SystemError("No CUDA/mps/npu device found")
- except Exception: # pylint: disable=broad-exception-caught
+ except Exception:
return "cpu"
cfg.device = get_device()
@@ -77,7 +77,7 @@ def resolve_dtype(cfg):
if cfg.device == "mps":
cfg.load_in_8bit = False
cfg.tf32 = False
- if cfg.bf16:
+ if cfg.bf16 and cfg.fp16 is not False:
cfg.fp16 = True
cfg.bf16 = False
else:
@@ -266,14 +266,16 @@ def validate_config(
if cfg.plugins:
(
- AxolotlConfigWCapabilities, # pylint: disable=invalid-name
- AxolotlInputConfig, # pylint: disable=invalid-name
+ AxolotlConfigWCapabilities,
+ AxolotlInputConfig,
) = merge_input_args()
# Convert datasets to proper format if needed
if cfg.get("datasets"):
for idx, ds_cfg in enumerate(cfg["datasets"]):
- if cfg.get("rl") in ["dpo", "simpo"] and not isinstance(ds_cfg, DPODataset):
+ if cfg.get("rl") in ["dpo", "ipo", "simpo"] and not isinstance(
+ ds_cfg, DPODataset
+ ):
cfg["datasets"][idx] = DPODataset(**ds_cfg)
elif cfg.get("rl") == "kto" and not isinstance(ds_cfg, KTODataset):
cfg["datasets"][idx] = KTODataset(**dict(ds_cfg))
diff --git a/src/axolotl/utils/ctx_managers/__init__.py b/src/axolotl/utils/ctx_managers/__init__.py
index e544621b5..6ffda9e55 100644
--- a/src/axolotl/utils/ctx_managers/__init__.py
+++ b/src/axolotl/utils/ctx_managers/__init__.py
@@ -1,6 +1,5 @@
"""Init for context manager submodule"""
-# pylint: disable=unused-import
# flake8: noqa
from .sequence_parallel import SequenceParallelContextManager
diff --git a/src/axolotl/utils/ctx_managers/sequence_parallel.py b/src/axolotl/utils/ctx_managers/sequence_parallel.py
index 029d991dd..78b3d1cae 100644
--- a/src/axolotl/utils/ctx_managers/sequence_parallel.py
+++ b/src/axolotl/utils/ctx_managers/sequence_parallel.py
@@ -26,7 +26,7 @@ def apply_sequence_parallelism(
local_rank: int,
local_world_size: int,
gradient_accumulation_steps: int,
- ring_attn_func: RingAttnFunc, # pylint: disable=unused-argument
+ ring_attn_func: RingAttnFunc,
) -> tuple[dict[str, torch.Tensor], int, int]:
"""
Apply sequence parallelism slicing to a batch.
@@ -48,10 +48,10 @@ def apply_sequence_parallelism(
- The original sequence length before padding.
- The number of padding tokens added.
"""
- original_seq_len = batch["input_ids"].size(1)
+ batch_size, original_seq_len = batch["input_ids"].shape
# Update ring attention params if needed
- if batch.get("position_ids") is not None:
+ if batch.get("position_ids") is not None and batch_size == 1:
update_ring_attn_params(position_ids=batch["position_ids"])
else:
# If position_ids aren't already in the batch, create them
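
The batch_size == 1 guard restricts the ring-attention position_ids update to the sample-packed case, where a single row's position_ids encode document boundaries (each reset to 0 starts a new document) and the varlen kernels can derive cumulative sequence lengths from them. A sketch of recovering those boundaries:

    import torch

    # one packed row: three documents of lengths 3, 2, 4
    position_ids = torch.tensor([[0, 1, 2, 0, 1, 0, 1, 2, 3]])

    starts = (position_ids[0] == 0).nonzero().flatten()
    cu_seqlens = torch.cat([starts, torch.tensor([position_ids.shape[1]])])
    print(cu_seqlens)  # tensor([0, 3, 5, 9])
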
diff --git a/src/axolotl/utils/data/__init__.py b/src/axolotl/utils/data/__init__.py
index d162a7d0b..8b9e4e91d 100644
--- a/src/axolotl/utils/data/__init__.py
+++ b/src/axolotl/utils/data/__init__.py
@@ -1,19 +1,19 @@
"""Init for `axolotl.utils.data` module."""
-from axolotl.utils.data.pretraining import (
- encode_pretraining,
- wrap_pretraining_dataset,
-)
from axolotl.utils.data.rl import prepare_preference_datasets
from axolotl.utils.data.sft import (
get_dataset_wrapper,
prepare_datasets,
)
+from axolotl.utils.data.streaming import (
+ encode_streaming,
+ wrap_streaming_dataset,
+)
from axolotl.utils.data.utils import md5
__all__ = [
- "encode_pretraining",
- "wrap_pretraining_dataset",
+ "encode_streaming",
+ "wrap_streaming_dataset",
"prepare_preference_datasets",
"get_dataset_wrapper",
"prepare_datasets",
diff --git a/src/axolotl/utils/data/rl.py b/src/axolotl/utils/data/rl.py
index 6fd539758..f7a5ec04c 100644
--- a/src/axolotl/utils/data/rl.py
+++ b/src/axolotl/utils/data/rl.py
@@ -113,7 +113,7 @@ def _map_dataset(
dataset = dataset.map(
ds_transform_fn,
- num_proc=cfg.dataset_processes,
+ num_proc=cfg.dataset_num_proc,
load_from_cache_file=not cfg.is_preprocess,
desc="Mapping RL Dataset",
**map_kwargs,
@@ -234,7 +234,7 @@ def _load_split(cfg: DictDefault, split: Literal["train", "test"]) -> Dataset:
prior_len = len(split_datasets[i])
split_datasets[i] = split_datasets[i].filter(
drop_long,
- num_proc=cfg.dataset_processes,
+ num_proc=cfg.dataset_num_proc,
load_from_cache_file=not cfg.is_preprocess,
desc="Dropping Long Sequences",
)
@@ -255,7 +255,6 @@ def _load_split(cfg: DictDefault, split: Literal["train", "test"]) -> Dataset:
return dataset
-# pylint: disable=duplicate-code
def _load_or_create_dataset_split(
cfg: DictDefault, tokenizer: PreTrainedTokenizer, split: Literal["train", "test"]
) -> Dataset:
diff --git a/src/axolotl/utils/data/sft.py b/src/axolotl/utils/data/sft.py
index 975f26e71..ba5aec2d6 100644
--- a/src/axolotl/utils/data/sft.py
+++ b/src/axolotl/utils/data/sft.py
@@ -9,13 +9,13 @@ from datasets import (
Dataset,
DatasetDict,
IterableDataset,
+ IterableDatasetDict,
load_dataset,
)
from transformers import PreTrainedTokenizer, ProcessorMixin
from axolotl.prompters import Prompter
from axolotl.utils.data.lock import FileLockLoader
-from axolotl.utils.data.pretraining import wrap_pretraining_dataset
from axolotl.utils.data.shared import (
create_train_validation_split,
datasets_with_name_generator,
@@ -26,9 +26,10 @@ from axolotl.utils.data.shared import (
save_preprocessed_dataset,
try_load_from_hub,
)
+from axolotl.utils.data.streaming import wrap_streaming_dataset
from axolotl.utils.data.utils import (
deduplicate_and_log_datasets,
- drop_long_seq_in_dataset,
+ handle_long_seq_in_dataset,
retry_on_request_exceptions,
)
from axolotl.utils.data.wrappers import get_dataset_wrapper
@@ -48,7 +49,6 @@ def prepare_datasets(
cfg: DictDefault,
tokenizer: PreTrainedTokenizer,
processor: ProcessorMixin | None = None,
- preprocess_iterable: bool = False,
) -> tuple[IterableDataset | Dataset, Dataset | None, int, list[Prompter | None]]:
"""Prepare training and evaluation datasets based on configuration.
@@ -56,23 +56,19 @@ def prepare_datasets(
cfg: Dictionary mapping `axolotl` config keys to values.
tokenizer: Tokenizer to use for processing text.
processor: Optional processor for multimodal datasets.
- preprocess_iterable: Whether to use iterable preprocessing.
Returns:
Tuple of (train_dataset, eval_dataset, total_steps, prompters).
"""
- if cfg.pretraining_dataset:
- return _prepare_pretraining_dataset(
- cfg, tokenizer, processor, preprocess_iterable
- )
- return _prepare_standard_dataset(cfg, tokenizer, processor, preprocess_iterable)
+ if cfg.streaming or cfg.pretraining_dataset:
+ return _prepare_streaming_dataset(cfg, tokenizer, processor)
+ return _prepare_standard_dataset(cfg, tokenizer, processor)
def _prepare_standard_dataset(
cfg: DictDefault,
tokenizer: PreTrainedTokenizer,
processor: ProcessorMixin | None,
- preprocess_iterable: bool,
) -> tuple[Dataset, Dataset | None, int, list[Prompter | None]]:
"""Prepare standard (non-pretraining) datasets."""
@@ -83,7 +79,6 @@ def _prepare_standard_dataset(
cfg,
split="train",
processor=processor,
- preprocess_iterable=preprocess_iterable,
)
# Overwrite eval_dataset if test data exists
@@ -93,7 +88,6 @@ def _prepare_standard_dataset(
cfg,
split="test",
processor=processor,
- preprocess_iterable=preprocess_iterable,
)
return train_dataset, eval_dataset, prompters
@@ -128,22 +122,40 @@ def _prepare_standard_dataset(
return train_dataset, eval_dataset, total_num_steps, prompters
-def _prepare_pretraining_dataset(
+def _prepare_streaming_dataset(
cfg: DictDefault,
tokenizer: PreTrainedTokenizer,
processor: ProcessorMixin | None,
- preprocess_iterable: bool,
) -> tuple[IterableDataset, Dataset | None, int, list[Prompter | None]]:
"""
- Prepare dataset for pretraining mode.
+ Prepare dataset for streaming mode.
- Note: Pre-training datasets are streamed from the HuggingFace Hub.
+ Note: Streaming datasets are loaded incrementally from the source.
"""
- # Extract pretraining dataset configuration
- pretraining_config = _extract_pretraining_config(cfg)
+ if cfg.pretraining_dataset:
+ dataset_config = _extract_pretraining_config(cfg)
+ train_dataset = _load_streaming_dataset(dataset_config, cfg, tokenizer)
+ elif cfg.sample_packing:
+ # TODO(djsaunde): Implement for multiple datasets
+ dataset_config = DictDefault(cfg.datasets[0])
- # Load streaming dataset for training
- train_dataset = _load_pretraining_dataset(pretraining_config, cfg, tokenizer)
+ # Ensure a split is set; default to "train" if not specified
+ # (DictDefault returns None for missing keys, so a plain truthiness check suffices)
+ if not dataset_config.split:
+ dataset_config.split = "train"
+ train_dataset = _load_streaming_dataset(dataset_config, cfg, tokenizer)
+ else:
+ # Use legacy loading function for non-packed streaming datasets
+ train_dataset, eval_dataset, prompters = _load_and_prepare_datasets(
+ tokenizer,
+ cfg,
+ split="train",
+ processor=processor,
+ streaming=True,
+ )
+
+ # Return early for non-packed streaming datasets
+ total_num_steps = cfg.max_steps if cfg.max_steps else -1
+ return train_dataset, eval_dataset, total_num_steps, prompters
# Load evaluation dataset if specified
eval_dataset = None
@@ -153,14 +165,12 @@ def _prepare_pretraining_dataset(
cfg,
split="test",
processor=processor,
- preprocess_iterable=preprocess_iterable,
+ streaming=False,
)
- if cfg.dataset_exact_deduplication:
- LOG.info("Deduplication not available for pretrained datasets")
-
- # For pretraining, we return max_steps directly from config
- return train_dataset, eval_dataset, cfg.max_steps, []
+ # For streaming, we return max_steps directly from config or -1 if not set
+ total_num_steps = cfg.max_steps if cfg.max_steps else -1
+ return train_dataset, eval_dataset, total_num_steps, []
def _extract_pretraining_config(cfg: DictDefault) -> DictDefault:
@@ -192,7 +202,7 @@ def _extract_pretraining_config(cfg: DictDefault) -> DictDefault:
)
-def _load_pretraining_dataset(
+def _load_streaming_dataset(
pretraining_config: DictDefault, cfg: DictDefault, tokenizer: PreTrainedTokenizer
) -> IterableDataset:
"""Load and prepare a streaming dataset for pretraining."""
@@ -227,15 +237,11 @@ def _load_pretraining_dataset(
iter_dataset = iter_dataset.skip(pretraining_config["skip"])
# Wrap the dataset for pretraining
- train_dataset = wrap_pretraining_dataset(
+ train_dataset = wrap_streaming_dataset(
iter_dataset,
tokenizer,
cfg,
dataset_wrapper_partial,
- max_tokens=cfg.sequence_len,
- batch_size=cfg.micro_batch_size,
- seed=cfg.seed,
- buffer_size=cfg.pretrain_multipack_buffer_size or 10_000,
)
# Format for PyTorch
@@ -256,7 +262,7 @@ def _load_tokenized_prepared_datasets(
cfg: DictDefault,
split: Literal["train", "test"] = "train",
processor: ProcessorMixin | None = None,
- preprocess_iterable: bool = False,
+ streaming: bool = False,
) -> tuple[Dataset | DatasetDict, list[Prompter | None]]:
"""Load or create tokenized and prepared datasets for training or testing.
@@ -265,7 +271,7 @@ def _load_tokenized_prepared_datasets(
cfg: Configuration object.
split: Dataset split to load ('train' or 'test').
processor: Optional processor for multimodal datasets.
- preprocess_iterable: Whether to use iterable preprocessing.
+ streaming: Whether to use iterable preprocessing.
Returns:
Tuple of (dataset, prompters list).
@@ -296,7 +302,7 @@ def _load_tokenized_prepared_datasets(
tokenizer,
split,
processor,
- preprocess_iterable,
+ streaming,
)
return dataset, prompters
@@ -308,7 +314,7 @@ def _load_raw_datasets(
tokenizer: PreTrainedTokenizer,
split: str,
processor: ProcessorMixin | None = None,
- preprocess_iterable: bool = False,
+ streaming: bool = False,
) -> tuple[Dataset, list[Prompter | None]]:
"""Load, process, merge, and save raw datasets."""
LOG.info("Loading raw datasets...", main_process_only=False)
@@ -329,7 +335,7 @@ def _load_raw_datasets(
split=split,
seed=cfg.seed,
processor=processor,
- preprocess_iterable=preprocess_iterable,
+ streaming=streaming,
)
datasets.append(dataset_wrapper)
prompters.append(dataset_prompter)
@@ -337,11 +343,11 @@ def _load_raw_datasets(
# Merge datasets
dataset = merge_datasets(datasets, cfg)
- if not cfg.skip_prepare_dataset:
+ if not cfg.skip_prepare_dataset and not streaming:
if split == "test" and cfg.eval_sequence_len:
- dataset = drop_long_seq_in_dataset(dataset, cfg.eval_sequence_len, cfg)
+ dataset = handle_long_seq_in_dataset(dataset, cfg.eval_sequence_len, cfg)
else:
- dataset = drop_long_seq_in_dataset(dataset, cfg.sequence_len, cfg)
+ dataset = handle_long_seq_in_dataset(dataset, cfg.sequence_len, cfg)
if cfg.sample_packing:
dataset, _ = process_datasets_for_packing(cfg, dataset, None)
@@ -361,19 +367,19 @@ def _load_and_process_single_dataset(
split: str,
seed: int,
processor: ProcessorMixin | None = None,
- preprocess_iterable: bool = False,
+ streaming: bool = False,
) -> tuple[Dataset | IterableDataset, Prompter | None]:
"""Load and process a single dataset based on the passed config."""
# Load the dataset
dataset = load_dataset_with_config(
- dataset_config, cfg.hf_use_auth_token, streaming=preprocess_iterable
+ dataset_config, cfg.hf_use_auth_token, streaming=streaming
)
# Parse dataset type
d_base_type, d_prompt_style = _parse_dataset_type(dataset_config.type)
# Select the appropriate split
- if isinstance(dataset, DatasetDict):
+ if isinstance(dataset, (DatasetDict, IterableDatasetDict)):
if dataset_config.split and dataset_config.split in dataset:
dataset = dataset[dataset_config.split]
elif split in dataset:
@@ -479,7 +485,7 @@ def _load_and_prepare_datasets(
cfg: DictDefault,
split: Literal["train", "test"] = "train",
processor: ProcessorMixin | None = None,
- preprocess_iterable: bool = False,
+ streaming: bool = False,
) -> tuple[Dataset | None, Dataset | None, list[Prompter | None]]:
"""Load and prepare datasets with optional validation split and sharding.
@@ -488,7 +494,7 @@ def _load_and_prepare_datasets(
cfg: Configuration object.
split: Dataset split to load ('train' or 'test').
processor: Optional processor for multimodal datasets.
- preprocess_iterable: Whether to use iterable preprocessing.
+ streaming: Whether to use iterable preprocessing.
Returns:
Tuple of (train_dataset, eval_dataset, prompters).
@@ -499,7 +505,7 @@ def _load_and_prepare_datasets(
cfg,
split=split,
processor=processor,
- preprocess_iterable=preprocess_iterable,
+ streaming=streaming,
)
# Apply dataset sharding if configured using shared function
diff --git a/src/axolotl/utils/data/shared.py b/src/axolotl/utils/data/shared.py
index 21c8e472b..a8ed55ae2 100644
--- a/src/axolotl/utils/data/shared.py
+++ b/src/axolotl/utils/data/shared.py
@@ -236,11 +236,14 @@ def _load_from_local_path(
try:
return load_from_disk(dataset_config.path)
except FileNotFoundError:
- load_dataset_kwargs["streaming"] = False
return load_dataset(dataset_config.path, **load_dataset_kwargs)
elif local_path.is_file():
dataset_type = get_dataset_type(dataset_config)
- load_dataset_kwargs["streaming"] = False
+
+ # For single file datasets, HF always creates only a "train" split
+ if dataset_type in ("json", "csv", "text"):
+ load_dataset_kwargs["split"] = "train"
+
return load_dataset(
dataset_type,
data_files=dataset_config.path,
@@ -337,7 +340,7 @@ def generate_split_fingerprints(
dataset: Dataset, val_set_size: int | float, seed: int
) -> tuple[str, str]:
"""Generate consistent fingerprints for train/test splits."""
- fingerprint = dataset._fingerprint # pylint: disable=protected-access
+ fingerprint = dataset._fingerprint
train_hash_input = f"{fingerprint}|{val_set_size}|train|{seed}"
test_hash_input = f"{fingerprint}|{val_set_size}|test|{seed}"
@@ -411,7 +414,7 @@ def save_preprocessed_dataset(
) -> None:
"""Save preprocessed dataset to disk and optionally push to the HF Hub."""
prepared_ds_path = get_prepared_dataset_path(cfg, dataset_hash)
- num_workers = cfg.dataset_processes or get_default_process_count()
+ num_workers = cfg.dataset_num_proc or get_default_process_count()
if isinstance(dataset, IterableDataset):
ds_from_iter = Dataset.from_generator(
functools.partial(_generate_from_iterable_dataset, dataset),
@@ -497,7 +500,7 @@ def try_load_from_hub(
token=cfg.hf_use_auth_token,
)
return dataset[split]
- except Exception: # pylint: disable=broad-except # nosec
+ except Exception:
LOG.info("Unable to find prepared dataset in HuggingFace Hub")
return None
diff --git a/src/axolotl/utils/data/pretraining.py b/src/axolotl/utils/data/streaming.py
similarity index 86%
rename from src/axolotl/utils/data/pretraining.py
rename to src/axolotl/utils/data/streaming.py
index f3422f990..2cb35ee7c 100644
--- a/src/axolotl/utils/data/pretraining.py
+++ b/src/axolotl/utils/data/streaming.py
@@ -1,4 +1,4 @@
-"""data handling specific to pretraining"""
+"""Data handling specific to streaming datasets."""
import functools
from collections import defaultdict
@@ -17,10 +17,10 @@ from axolotl.utils.trainer import process_pretraining_datasets_for_packing
LOG = get_logger(__name__)
-def encode_pretraining(
+def encode_streaming(
+ examples: Dict[str, List],
tokenizer: PreTrainedTokenizerBase,
max_tokens: int,
- examples: Dict[str, List],
text_column: str = "text",
concatenate: bool = True,
) -> Dict[str, List]:
@@ -67,7 +67,7 @@ def encode_pretraining(
buffer_labels = torch.tensor([], dtype=torch.long)
buffer_attention_mask = torch.tensor([], dtype=torch.long)
- for ids, labels, mask in zip(input_ids, targets, attention_mask):
+ for ids, labels, mask in zip(input_ids, targets, attention_mask, strict=False):
if buffer_input_ids.numel() == max_tokens:
new_input_ids.append(buffer_input_ids)
new_labels.append(buffer_labels)
@@ -176,45 +176,57 @@ def encode_pretraining(
return ret
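+
+# Usage sketch for `encode_streaming` (illustrative; `tok` is assumed to be a
+# loaded PreTrainedTokenizerBase and `ds` a streaming text dataset):
+#
+# encode = functools.partial(encode_streaming, tokenizer=tok, max_tokens=2048)
+# ds = ds.map(encode, batched=True, remove_columns=["text"])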
-def wrap_pretraining_dataset(
+def wrap_streaming_dataset(
dataset,
tokenizer,
cfg,
ds_wrapper_fn,
- max_tokens=2048,
- batch_size=1,
- seed=42,
- buffer_size=10_000,
):
if cfg.sample_packing:
+ # For SFT (non-pretraining) datasets, always use multipack_attn=True to ensure
+ # attention isolation between packed sequences
+ multipack_attn = (
+ True if not cfg.pretraining_dataset else cfg.pretrain_multipack_attn
+ )
+
collate_fn = PretrainingBatchSamplerDataCollatorForSeq2Seq(
tokenizer,
return_tensors="pt",
padding=True,
- pad_to_multiple_of=max_tokens,
- multipack_attn=cfg.pretrain_multipack_attn,
+ pad_to_multiple_of=cfg.sequence_len,
+ multipack_attn=multipack_attn,
)
encode = functools.partial(
- encode_packed_pretraining,
+ encode_packed_streaming,
collate_fn,
ds_wrapper_fn,
- max_seq_length=max_tokens,
- batch_size=batch_size,
- multipack_attn=cfg.pretrain_multipack_attn,
+ max_seq_length=cfg.sequence_len,
+ batch_size=cfg.micro_batch_size,
+ multipack_attn=multipack_attn,
)
- # set this to 1 so downstream data_loader doesn't try to increase the batch again
+
+ # Set this to 1 so downstream data_loader doesn't try to increase the batch size
+ # again
cfg.micro_batch_size = 1
else:
+ # NOTE: This is not reachable for SFT datasets since we use the pre-existing
+ # loading function for non-packed streaming datasets. Refer to
+ # `_prepare_streaming_dataset` in sft.py for that code path.
+ text_column = (
+ getattr(cfg.pretraining_dataset[0], "text_column", "text") or "text"
+ )
encode = functools.partial(
- encode_pretraining,
- tokenizer,
- max_tokens,
- text_column=cfg.pretraining_dataset[0].text_column or "text",
+ encode_streaming,
+ tokenizer=tokenizer,
+ max_tokens=cfg.sequence_len,
+ text_column=text_column,
concatenate=cfg.pretraining_sample_concatenation is True,
)
if cfg.shuffle_merged_datasets:
- dataset = dataset.shuffle(seed=seed, buffer_size=buffer_size)
+ dataset = dataset.shuffle(
+ seed=cfg.seed, buffer_size=cfg.streaming_multipack_buffer_size
+ )
else:
LOG.debug("NOT shuffling merged pretraining datasets")
@@ -232,14 +244,13 @@ def wrap_pretraining_dataset(
dataset = dataset.map(
encode,
batched=True,
- batch_size=buffer_size,
- # input_columns="text",
+ batch_size=cfg.streaming_multipack_buffer_size,
remove_columns=remove_columns,
)
return dataset
-def encode_packed_pretraining(
+def encode_packed_streaming(
collate_fn,
ds_wrapper: Callable,
examples: Dict[str, List],
@@ -247,7 +258,6 @@ def encode_packed_pretraining(
batch_size: int = 4,
multipack_attn: Optional[bool] = True,
) -> Dict[str, List]:
- # pylint: disable=duplicate-code
# tokenize all the examples
# rows get split with stride (overlap)
train_dataset = ds_wrapper(dataset=Dataset.from_dict(examples))[0]
@@ -275,8 +285,6 @@ def encode_packed_pretraining(
for batch in sampler:
for data in batch:
features = train_dataset[data]
- if "num_truncated_tokens" in features:
- del features["num_truncated_tokens"]
if "num_truncated_tokens" in features:
del features["num_truncated_tokens"]
if "overflow_to_sample_mapping" in features:
diff --git a/src/axolotl/utils/data/utils.py b/src/axolotl/utils/data/utils.py
index c0efb7a42..2d0ca9d0e 100644
--- a/src/axolotl/utils/data/utils.py
+++ b/src/axolotl/utils/data/utils.py
@@ -44,7 +44,7 @@ def retry_on_request_exceptions(
def decorator(func):
@functools.wraps(func)
- def wrapper(*args, **kwargs): # pylint: disable=inconsistent-return-statements
+ def wrapper(*args, **kwargs):
for attempt in range(max_retries):
try:
return func(*args, **kwargs)
@@ -148,7 +148,36 @@ def deduplicate_and_log_datasets(
return dataset, other_dataset
-def drop_long_seq_in_dataset(
+def truncate_long_seq(sample, sequence_len=2048):
+ """
+ Truncate over-long sequences in a batch to `sequence_len` tokens.
+
+ Intended for `Dataset.map(..., batched=True)`: `Dataset.filter` discards
+ in-place mutations of its input batch, so truncation has to happen in a
+ `map` step rather than inside the length filter.
+ """
+ for key in ("input_ids", "attention_mask", "labels", "position_ids"):
+ if key in sample:
+ sample[key] = [seq[:sequence_len] for seq in sample[key]]
+ return sample
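+# Minimal example of `truncate_long_seq` on a batch (illustrative):
+#
+# batch = {"input_ids": [[1, 2, 3, 4], [5, 6]], "labels": [[1, 2, 3, 4], [5, 6]]}
+# truncate_long_seq(batch, sequence_len=3)
+# # -> {"input_ids": [[1, 2, 3], [5, 6]], "labels": [[1, 2, 3], [5, 6]]}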
+
+
+def handle_long_seq_in_dataset(
dataset: Dataset, sequence_len: int, cfg: DictDefault
) -> Dataset:
"""Remove sequences longer than configured maximum from dataset.
@@ -161,12 +190,21 @@ def drop_long_seq_in_dataset(
Returns:
Filtered dataset with long sequences removed.
"""
- if "input_ids" not in dataset.column_names:
+ if (
+ hasattr(dataset, "column_names")
+ and dataset.column_names
+ and "input_ids" not in dataset.column_names
+ ):
LOG.warning(
"Dataset does not contain 'input_ids' column. Skip drop long seq. This is "
"expected for reward modeling."
)
return dataset
+ elif not hasattr(dataset, "column_names") or dataset.column_names is None:
+ LOG.info(
+ "Dataset is streaming (IterableDataset), skipping long sequence handling"
+ )
+ return dataset
drop_long = functools.partial(
drop_long_seq,
@@ -185,15 +223,28 @@ def drop_long_seq_in_dataset(
filter_map_kwargs = {}
if not isinstance(dataset, IterableDataset):
- filter_map_kwargs["num_proc"] = cfg.dataset_processes
+ filter_map_kwargs["num_proc"] = cfg.dataset_num_proc
filter_map_kwargs["load_from_cache_file"] = not cfg.is_preprocess
drop_long_kwargs = {}
if filter_map_kwargs:
drop_long_kwargs["desc"] = f"Dropping Long Sequences (>{sequence_len})"
+ excess_length_strategy = (cfg.excess_length_strategy or "drop").lower()
+ if excess_length_strategy == "truncate":
+ # `Dataset.filter` ignores mutations to its input batch, so truncate in a
+ # separate `map` step; the filter below then only drops samples shorter
+ # than `min_sample_len`
+ truncate_kwargs = dict(drop_long_kwargs)
+ if "desc" in truncate_kwargs:
+ truncate_kwargs["desc"] = (
+ f"Truncating Sequences (target_len={sequence_len})"
+ )
+ dataset = dataset.map(
+ functools.partial(truncate_long_seq, sequence_len=sequence_len),
+ batched=True,
+ **filter_map_kwargs,
+ **truncate_kwargs,
+ )
+
 dataset = dataset.filter(
 drop_long,
 batched=True,
 **filter_map_kwargs,
 **drop_long_kwargs,
@@ -201,6 +252,11 @@ def drop_long_seq_in_dataset(
if prior_len:
dropped = prior_len - len(dataset)
if dropped:
- LOG.warning(f"Dropped {dropped} long samples from dataset")
+ reason = (
+ "too-short" if excess_length_strategy == "truncate" else "long"
+ )
+ LOG.warning(f"Dropped {dropped} {reason} samples from dataset")
return dataset
diff --git a/src/axolotl/utils/data/wrappers.py b/src/axolotl/utils/data/wrappers.py
index b6dc42c71..3a10bde00 100644
--- a/src/axolotl/utils/data/wrappers.py
+++ b/src/axolotl/utils/data/wrappers.py
@@ -54,7 +54,6 @@ def handle_unknown_dataset_strategy(dataset_config: DictDefault) -> NoReturn:
raise ValueError(error_message)
-# pylint: disable=too-many-return-statements
def get_dataset_wrapper(
dataset_config: DictDefault,
tokenizer: PreTrainedTokenizer,
@@ -62,7 +61,7 @@ def get_dataset_wrapper(
dataset_base_type: str | None,
dataset: Dataset | IterableDataset,
dataset_prompt_style: str | None = None,
- processor: ProcessorMixin | None = None, # pylint: disable=unused-argument
+ processor: ProcessorMixin | None = None,
) -> tuple[Dataset | IterableDataset, Prompter | None]:
"""Create an appropriate dataset wrapper and prompter based on dataset
configuration.
@@ -81,7 +80,7 @@ def get_dataset_wrapper(
"""
# Common parameters for dataset wrapping
dataset_kwargs: dict[str, Any] = {
- "process_count": cfg.dataset_processes,
+ "process_count": cfg.dataset_num_proc,
"keep_in_memory": cfg.dataset_keep_in_memory is True,
}
diff --git a/src/axolotl/utils/datasets.py b/src/axolotl/utils/datasets.py
index 93e1a2416..9b8a8e25a 100644
--- a/src/axolotl/utils/datasets.py
+++ b/src/axolotl/utils/datasets.py
@@ -4,6 +4,8 @@ import os
def get_default_process_count():
+ if axolotl_dataset_num_proc := os.environ.get("AXOLOTL_DATASET_NUM_PROC"):
+ return int(axolotl_dataset_num_proc)
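+ # e.g. `AXOLOTL_DATASET_NUM_PROC=8 axolotl preprocess config.yml` caps
+ # preprocessing at 8 workers (illustrative invocation)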
if axolotl_dataset_processes := os.environ.get("AXOLOTL_DATASET_PROCESSES"):
return int(axolotl_dataset_processes)
if runpod_cpu_count := os.environ.get("RUNPOD_CPU_COUNT"):
diff --git a/src/axolotl/utils/dict.py b/src/axolotl/utils/dict.py
index c2670dfeb..7d146c7a9 100644
--- a/src/axolotl/utils/dict.py
+++ b/src/axolotl/utils/dict.py
@@ -17,15 +17,15 @@ class DictDefault(Dict):
def __setitem__(self, name, value):
# workaround for pickle/unpickle issues and __frozen not being available
try:
- isFrozen = hasattr( # pylint: disable=invalid-name
+ isFrozen = hasattr(self, "__frozen") and object.__getattribute__(
self, "__frozen"
- ) and object.__getattribute__(self, "__frozen")
+ )
except AttributeError:
- isFrozen = False # pylint: disable=invalid-name
+ isFrozen = False
if isFrozen and name not in super().keys():
raise KeyError(name)
- super(Dict, self).__setitem__(name, value) # pylint: disable=bad-super-call
+ super(Dict, self).__setitem__(name, value)
try:
p = object.__getattribute__(self, "__parent")
key = object.__getattribute__(self, "__key")
diff --git a/src/axolotl/utils/distributed.py b/src/axolotl/utils/distributed.py
index 48771fd97..840772d91 100644
--- a/src/axolotl/utils/distributed.py
+++ b/src/axolotl/utils/distributed.py
@@ -15,7 +15,7 @@ from transformers.utils.import_utils import (
is_torch_npu_available,
)
-distributed_state = None # pylint: disable=invalid-name
+distributed_state = None
def get_device_type() -> torch.device:
@@ -48,7 +48,7 @@ def get_current_device() -> int:
def init_distributed_state():
- global distributed_state # pylint: disable=global-statement
+ global distributed_state
if distributed_state is None:
timeout = int(os.environ.get("AXOLOTL_NCCL_TIMEOUT", 1800))
try:
@@ -137,7 +137,7 @@ def zero_first(is_main: bool):
barrier()
-def gather_scalar_from_all_ranks(fn, world_size=1): # pylint: disable=invalid-name
+def gather_scalar_from_all_ranks(fn, world_size=1):
"""
Run a callable 'fn' on all ranks and gather the results on the specified rank.
@@ -201,7 +201,7 @@ def broadcast_dict(vals: dict):
return vals
-def compute_and_broadcast(fn): # pylint: disable=invalid-name
+def compute_and_broadcast(fn):
"""
Compute a value using the function 'fn' only on the specified rank (default is 0).
The value is then broadcasted to all other ranks.
@@ -234,7 +234,7 @@ def compute_and_broadcast(fn): # pylint: disable=invalid-name
return float(value_tensor.item())
-def gather_from_all_ranks(fn, world_size=1): # pylint: disable=invalid-name
+def gather_from_all_ranks(fn, world_size=1):
"""
Run a callable 'fn' on all ranks and gather the results on the specified rank.
diff --git a/src/axolotl/utils/environment.py b/src/axolotl/utils/environment.py
index 3c83c87cb..d5f2d9f78 100644
--- a/src/axolotl/utils/environment.py
+++ b/src/axolotl/utils/environment.py
@@ -2,32 +2,47 @@
utils to get GPU info for the current environment
"""
+import os
from importlib.metadata import version
+import torch
from accelerate.utils.environment import (
check_cuda_p2p_ib_support as accelerate_check_cuda_p2p_ib_support,
)
-from accelerate.utils.environment import (
- get_gpu_info,
-)
from packaging.version import Version, parse
+from axolotl.utils.logging import get_logger
+
+LOG = get_logger(__name__)
+
def check_cuda_p2p_ib_support():
if not accelerate_check_cuda_p2p_ib_support():
return False
- unsupported_devices = {"RTX 6000 Ada", "L40S"}
+ if not check_cuda_p2p_support():
+ return False
+ return True
+
+
+def check_cuda_p2p_support() -> bool:
try:
- device_names, device_count = get_gpu_info()
- if 1 < device_count < 8:
- if any(
- unsupported_device in device_name
- for device_name in device_names
- for unsupported_device in unsupported_devices
- ):
- return False
- except Exception: # pylint: disable=broad-except # nosec
- pass
+ world_size = int(os.environ.get("WORLD_SIZE", "1"))
+ local_rank = int(os.environ.get("LOCAL_RANK", "0"))
+ except ValueError:
+ return True
+
+ if world_size > 1:
+ node_world_size = int(os.environ.get("NODE_WORLD_SIZE", "8"))
+ local_other_rank = (local_rank // node_world_size) * node_world_size
+ local_other_rank += 1 if (local_rank % node_world_size) == 0 else 0
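+ # e.g. with node_world_size=8, local rank 0 probes peer access against
+ # rank 1 while ranks 1-7 probe against rank 0 on the same node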
+ try:
+ can_p2p = torch.cuda.can_device_access_peer(local_rank, local_other_rank)
+ except AssertionError as exc:
+ # invalid device index (e.g. fewer visible GPUs than expected); assume p2p is fine
+ LOG.warning(exc)
+ return True
+ return can_p2p
+
return True
diff --git a/src/axolotl/utils/logging.py b/src/axolotl/utils/logging.py
index 7cc3530ae..35810897a 100644
--- a/src/axolotl/utils/logging.py
+++ b/src/axolotl/utils/logging.py
@@ -2,7 +2,6 @@
import functools
import logging
-import os
from axolotl.utils.distributed import is_main_process
@@ -40,10 +39,6 @@ class MultiProcessAdapter(logging.LoggerAdapter):
def get_logger(name: str, log_level: str | None = None) -> MultiProcessAdapter:
- if log_level is None:
- log_level = os.environ.get("AXOLOTL_LOG_LEVEL", None)
logger = logging.getLogger(name)
- if log_level is not None:
- logger.setLevel(log_level.upper())
- logger.root.setLevel(log_level.upper())
+ logger.setLevel(log_level.upper() if log_level else logging.DEBUG)
return MultiProcessAdapter(logger, extra={})
diff --git a/src/axolotl/utils/lora.py b/src/axolotl/utils/lora.py
index 759c17ac2..6ae481b6b 100644
--- a/src/axolotl/utils/lora.py
+++ b/src/axolotl/utils/lora.py
@@ -15,6 +15,7 @@
"""
module to get the state dict of a merged lora model
"""
+
import torch
from peft.tuners.tuners_utils import onload_layer
from peft.utils import ModulesToSaveWrapper, _get_submodules
diff --git a/src/axolotl/utils/mistral/__init__.py b/src/axolotl/utils/mistral/__init__.py
index eb1e2df89..eb51031ec 100644
--- a/src/axolotl/utils/mistral/__init__.py
+++ b/src/axolotl/utils/mistral/__init__.py
@@ -1,5 +1,6 @@
"""Init for `axolotl.utils.mistral` module."""
+from axolotl.utils.mistral.mistral3_processor import Mistral3Processor
from axolotl.utils.mistral.mistral_tokenizer import HFMistralTokenizer
-__all__ = ["HFMistralTokenizer"]
+__all__ = ["HFMistralTokenizer", "Mistral3Processor"]
diff --git a/src/axolotl/utils/mistral/mistral3_processor.py b/src/axolotl/utils/mistral/mistral3_processor.py
new file mode 100644
index 000000000..85479ca7b
--- /dev/null
+++ b/src/axolotl/utils/mistral/mistral3_processor.py
@@ -0,0 +1,169 @@
+"""Processor for Mistral3 multimodal models with image support"""
+
+from typing import Any, Dict, Optional, Union
+
+import torch
+from transformers import ProcessorMixin
+from transformers.feature_extraction_utils import BatchFeature
+from transformers.processing_utils import ProcessingKwargs
+from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
+
+from axolotl.utils.mistral.mistral_tokenizer import HFMistralTokenizer
+
+
+class Mistral3ProcessorKwargs(ProcessingKwargs):
+ _defaults: Dict[str, Dict[str, Any]] = {
+ "text_kwargs": {
+ "padding": True,
+ },
+ "common_kwargs": {
+ "return_tensors": "pt",
+ "return_dict": True,
+ "tokenize": True,
+ },
+ }
+
+
+class Mistral3Processor(ProcessorMixin):
+ """
+ Processor for Mistral3 multimodal models that handles text and images.
+ Wraps HFMistralTokenizer and adds image processing capabilities.
+ """
+
+ attributes = ["tokenizer"]
+ tokenizer_class = "HFMistralTokenizer"
+
+ def __init__(self, tokenizer: HFMistralTokenizer):
+ # Don't call super().__init__ to avoid the class validation issue
+ self.tokenizer = tokenizer
+
+ @property
+ def chat_template(self) -> None:
+ """Chat template is not supported. Dummy method to satisfy HuggingFace API."""
+ return None
+
+ @property
+ def audio_tokenizer(self) -> None:
+ """Audio tokenizer is not supported. Dummy method to satisfy HuggingFace API."""
+ return None
+
+ def _merge_kwargs(
+ self, processor_kwargs_class: Any, **kwargs: Any
+ ) -> Dict[str, Dict[str, Any]]:
+ """Merge kwargs with defaults similar to ProcessorMixin"""
+ defaults = processor_kwargs_class._defaults
+ output_kwargs: Dict[str, Dict[str, Any]] = {}
+
+ for kwarg_type, default_values in defaults.items():
+ output_kwargs[kwarg_type] = {**default_values}
+
+ # Update with provided kwargs
+ for key, value in kwargs.items():
+ # Try to match key to appropriate kwarg type
+ if key in ["padding", "truncation", "max_length"]:
+ output_kwargs.setdefault("text_kwargs", {}).update({key: value})
+ elif key in ["return_tensors", "return_dict", "tokenize"]:
+ output_kwargs.setdefault("common_kwargs", {}).update({key: value})
+ else:
+ # Add to text_kwargs by default
+ output_kwargs.setdefault("text_kwargs", {}).update({key: value})
+
+ return output_kwargs
+
+ def apply_chat_template(
+ self,
+ conversation: Union[list[dict[str, str]], list[list[dict[str, str]]]],
+ **kwargs: Any,
+ ) -> Union[BatchFeature, str, list[str]]:
+ """
+ Apply chat template with image support for Mistral3.
+
+ Similar to VoxtralProcessor, this method extracts images from the conversation,
+ calls the tokenizer's apply_chat_template, then adds pixel_values and image_sizes
+ to the result.
+ """
+ output_kwargs = self._merge_kwargs(Mistral3ProcessorKwargs, **kwargs)
+ text_kwargs = output_kwargs["text_kwargs"]
+ common_kwargs = output_kwargs["common_kwargs"]
+
+ return_tensors = common_kwargs.pop("return_tensors", "pt")
+ if return_tensors != "pt":
+ raise ValueError(
+ f"{self.__class__.__name__} only supports `return_tensors='pt'`."
+ )
+
+ return_dict = common_kwargs.pop("return_dict", False)
+ tokenize = common_kwargs.pop("tokenize", False)
+
+ # Determine if batched
+ if isinstance(conversation, (list, tuple)) and (
+ isinstance(conversation[0], (list, tuple))
+ or hasattr(conversation[0], "content")
+ ):
+ is_batched = True
+ conversations = conversation
+ else:
+ is_batched = False
+ conversations = [conversation] # type: ignore
+
+ # Call tokenizer's apply_chat_template
+ tokenizer_kwargs = {**text_kwargs, **common_kwargs}
+ tokenizer_kwargs["return_tensors"] = return_tensors
+ tokenizer_kwargs["tokenize"] = tokenize
+ tokenizer_kwargs["return_dict"] = return_dict
+
+ encoded_instruct_inputs = self.tokenizer.apply_chat_template(
+ conversations,
+ **tokenizer_kwargs,
+ )
+
+ if tokenize:
+ if return_dict:
+ # The tokenizer already handles pixel_values, we just need to add image_sizes
+ if hasattr(encoded_instruct_inputs, "items"):
+ data: Dict[str, Any] = dict(encoded_instruct_inputs) # type: ignore
+ elif hasattr(encoded_instruct_inputs, "data"):
+ data = encoded_instruct_inputs.data # type: ignore
+ else:
+ raise ValueError("Unknown data type")
+
+ if "pixel_values" in data:
+ pixel_values = data["pixel_values"]
+
+ # MistralTokenizer returns float64 pixel values, so we convert to fp32
+ data["pixel_values"] = pixel_values.to(dtype=torch.float32)
+
+ # Always batched: [B, C, H, W] -> image_sizes: [B, 2]
+ # Since tensor is homogeneous, all images have same H, W
+ batch_size = pixel_values.shape[0]
+ image_sizes = torch.tensor([pixel_values.shape[-2:]] * batch_size)
+ data["image_sizes"] = image_sizes
+
+ return BatchFeature(data=data, tensor_type=return_tensors)
+
+ if not is_batched:
+ return encoded_instruct_inputs[0]
+
+ return encoded_instruct_inputs
+
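+ # Usage sketch (illustrative; `tok` is assumed to be a constructed
+ # HFMistralTokenizer):
+ #
+ # processor = Mistral3Processor(tok)
+ # features = processor.apply_chat_template(
+ # [{"role": "user", "content": "Hello!"}],
+ # tokenize=True,
+ # return_dict=True,
+ # )
+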
+ def __call__(
+ self,
+ text: Optional[
+ Union[
+ TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]
+ ]
+ ],
+ **kwargs: Any,
+ ) -> BatchFeature:
+ """
+ Forward text processing to the tokenizer.
+ This method does not support images - use apply_chat_template instead.
+ """
+ output_kwargs = self._merge_kwargs(Mistral3ProcessorKwargs, **kwargs)
+ text_kwargs = output_kwargs["text_kwargs"]
+ common_kwargs = output_kwargs["common_kwargs"]
+
+ out = self.tokenizer(text, **text_kwargs)
+ return BatchFeature(
+ data=out, tensor_type=common_kwargs.pop("return_tensors", None)
+ )
diff --git a/src/axolotl/utils/mistral/mistral_tokenizer.py b/src/axolotl/utils/mistral/mistral_tokenizer.py
index 61cbdc5b0..0414ece78 100644
--- a/src/axolotl/utils/mistral/mistral_tokenizer.py
+++ b/src/axolotl/utils/mistral/mistral_tokenizer.py
@@ -53,7 +53,7 @@ class HFMistralTokenizer(MistralCommonTokenizer):
"""
# Check if MistralRequestValidator has a _mode attribute.
# This is a private API and may change in the future.
- # pylint: disable=protected-access
+
from mistral_common.protocol.instruct.validator import MistralRequestValidator
if not (
@@ -74,7 +74,7 @@ class HFMistralTokenizer(MistralCommonTokenizer):
def apply_chat_template( # type: ignore
self,
conversation: list[dict] | list[list[dict]],
- chat_template: str | None = None, # pylint: disable=unused-argument
+ chat_template: str | None = None,
add_generation_prompt: bool = False,
**kwargs,
) -> str | list[int]:
diff --git a/src/axolotl/utils/model_shard_quant.py b/src/axolotl/utils/model_shard_quant.py
index 5c5006eda..ca152113a 100644
--- a/src/axolotl/utils/model_shard_quant.py
+++ b/src/axolotl/utils/model_shard_quant.py
@@ -46,13 +46,11 @@ def _replace_linear(
if isinstance(module, torch.nn.Linear) and name not in skip_modules:
if issubclass(linear_replacement, Linear4bit):
- model._modules[name] = ( # pylint: disable=protected-access
- linear_replacement(
- module.in_features,
- module.out_features,
- module.bias is not None,
- **kwargs,
- )
+ model._modules[name] = linear_replacement(
+ module.in_features,
+ module.out_features,
+ module.bias is not None,
+ **kwargs,
)
else:
raise ValueError(
@@ -150,8 +148,8 @@ def load_sharded_model(
model = AutoModelForCausalLM.from_pretrained(
model_name,
use_cache=False,
- torch_dtype=torch.float32,
- _attn_implementation=model_config._attn_implementation, # pylint: disable=protected-access
+ dtype=torch.float32,
+ _attn_implementation=model_config._attn_implementation,
trust_remote_code=cfg.trust_remote_code,
)
dtype = torch_dtype if not cfg.float32 else None
@@ -160,7 +158,7 @@ def load_sharded_model(
with init_empty_weights():
model = AutoModelForCausalLM.from_config(
model_config,
- torch_dtype=torch_dtype,
+ dtype=torch_dtype,
trust_remote_code=cfg.trust_remote_code,
)
return model
diff --git a/src/axolotl/utils/optimizers/adopt.py b/src/axolotl/utils/optimizers/adopt.py
index 6f064abbf..20ddfa7ec 100644
--- a/src/axolotl/utils/optimizers/adopt.py
+++ b/src/axolotl/utils/optimizers/adopt.py
@@ -6,7 +6,6 @@ Taniguchi, Shohei and Harada, Keno and Minegishi, Gouki and Oshima, Yuta and Jeo
"""
# mypy: ignore-errors
-# pylint: skip-file
# flake8: noqa
# mypy: allow-untyped-decorators
# mypy: allow-untyped-defs
@@ -288,7 +287,9 @@ def _single_tensor_adopt(
assert (
param.device.type == step_t.device.type
and param.device.type in capturable_supported_devices
- ), f"If capturable=True, params and state_steps must be on supported devices: {capturable_supported_devices}."
+ ), (
+ f"If capturable=True, params and state_steps must be on supported devices: {capturable_supported_devices}."
+ )
step = step_t if capturable or differentiable else _get_value(step_t)
@@ -365,7 +366,9 @@ def _multi_tensor_adopt(
p.device.type == step.device.type
and p.device.type in capturable_supported_devices
for p, step in zip(params, state_steps)
- ), f"If capturable=True, params and state_steps must be on supported devices: {capturable_supported_devices}."
+ ), (
+ f"If capturable=True, params and state_steps must be on supported devices: {capturable_supported_devices}."
+ )
assert grad_scale is None and found_inf is None
diff --git a/src/axolotl/utils/quantization.py b/src/axolotl/utils/quantization.py
index f9a30b660..6c29a5442 100644
--- a/src/axolotl/utils/quantization.py
+++ b/src/axolotl/utils/quantization.py
@@ -3,30 +3,47 @@ Utilities for quantization including QAT and PTQ using torchao.
"""
import torch
-from torch import nn
+from packaging import version
from torchao.core.config import AOBaseConfig
from torchao.quantization import quantize_
from torchao.quantization.qat import (
- FakeQuantizeConfig,
- FromIntXQuantizationAwareTrainingConfig,
- IntXQuantizationAwareTrainingConfig,
+ QATConfig,
)
from torchao.quantization.quant_api import (
- Int4DynamicActivationInt4WeightConfig,
- Int4WeightOnlyConfig,
+ Float8DynamicActivationFloat8WeightConfig,
+ Float8DynamicActivationInt4WeightConfig,
Int8DynamicActivationInt4WeightConfig,
- Int8DynamicActivationInt8WeightConfig,
- Int8WeightOnlyConfig,
- UIntXWeightOnlyConfig,
- _is_linear,
)
-from axolotl.utils.schemas.enums import TorchIntDType
+from axolotl.utils.schemas.enums import TorchAOQuantDType
+
+quantization_config_to_str = {
+ Int8DynamicActivationInt4WeightConfig: "int8int4",
+ Float8DynamicActivationFloat8WeightConfig: "fp8fp8",
+ Float8DynamicActivationInt4WeightConfig: "fp8int4",
+}
+
+if version.parse(torch.__version__) >= version.parse("2.8.0"):
+ try:
+ from torchao.prototype.mx_formats import NVFP4InferenceConfig
+
+ quantization_config_to_str[NVFP4InferenceConfig] = "nvfp4"
+ except Exception:
+ pass
+
+ # int4 weight config imports will fail on machines with fbgemm-gpu installed
+ # without a CUDA runtime available, so we guard the import
+ try:
+ from torchao.quantization.quant_api import Int4WeightOnlyConfig
+
+ quantization_config_to_str[Int4WeightOnlyConfig] = "int4"
+ except Exception:
+ pass
-def get_ptq_config(
- weight_dtype: TorchIntDType,
- activation_dtype: TorchIntDType | None = None,
+def get_quantization_config(
+ weight_dtype: TorchAOQuantDType,
+ activation_dtype: TorchAOQuantDType | None = None,
group_size: int | None = None,
) -> AOBaseConfig:
"""
@@ -45,44 +62,101 @@ def get_ptq_config(
- or if the group size is not specified for int8 or int4 weight only quantization.
+ or if the requested combination is not supported by torchao QAT.
"""
if activation_dtype is None:
- if not weight_dtype.value.is_signed: # type: ignore[attr-defined,union-attr]
- return UIntXWeightOnlyConfig(
- dtype=weight_dtype.value,
- group_size=group_size,
- set_inductor_config=False,
- )
- if weight_dtype == TorchIntDType.int8:
- if group_size is None:
- raise ValueError(
- "group_size must be specified for int8 weight only quantization"
- )
- return Int8WeightOnlyConfig(
- group_size=group_size,
- )
- if weight_dtype == TorchIntDType.int4:
- if group_size is None:
- raise ValueError(
- "group_size must be specified for int4 weight only quantization"
- )
- return Int4WeightOnlyConfig(
- group_size=group_size,
- )
- if activation_dtype == TorchIntDType.int4 and weight_dtype == TorchIntDType.int4:
- return Int4DynamicActivationInt4WeightConfig()
- if activation_dtype == TorchIntDType.int8 and weight_dtype == TorchIntDType.int8:
- return Int8DynamicActivationInt8WeightConfig()
- if activation_dtype == TorchIntDType.int8 and weight_dtype == TorchIntDType.int4:
- return Int8DynamicActivationInt4WeightConfig()
+ if weight_dtype == TorchAOQuantDType.int8:
+ raise ValueError("Int8WeightOnlyConfig is not supported by torchao QAT.")
+ if weight_dtype == TorchAOQuantDType.int4:
+ from torchao.quantization.quant_api import Int4WeightOnlyConfig
+
+ if group_size is not None:
+ return Int4WeightOnlyConfig(group_size=group_size, version=2)
+ else:
+ return Int4WeightOnlyConfig(version=2)
+ if (
+ activation_dtype == TorchAOQuantDType.int4
+ and weight_dtype == TorchAOQuantDType.int4
+ ):
+ raise ValueError(
+ "Int4DynamicActivationInt4WeightConfig is not supported by torchao QAT."
+ )
+ if (
+ activation_dtype == TorchAOQuantDType.int8
+ and weight_dtype == TorchAOQuantDType.int8
+ ):
+ raise ValueError(
+ "Int8DynamicActivationInt8WeightConfig is not supported by torchao QAT."
+ )
+ if (
+ activation_dtype == TorchAOQuantDType.int8
+ and weight_dtype == TorchAOQuantDType.int4
+ ):
+ if group_size is not None:
+ return Int8DynamicActivationInt4WeightConfig(group_size=group_size)
+ else:
+ return Int8DynamicActivationInt4WeightConfig()
+ if (
+ activation_dtype == TorchAOQuantDType.float8_e4m3fn
+ and weight_dtype == TorchAOQuantDType.float8_e4m3fn
+ ):
+ return Float8DynamicActivationFloat8WeightConfig()
+ if (
+ activation_dtype == TorchAOQuantDType.float8_e4m3fn
+ and weight_dtype == TorchAOQuantDType.int4
+ ):
+ return Float8DynamicActivationInt4WeightConfig()
+ if weight_dtype == TorchAOQuantDType.nvfp4:
+ from torchao.prototype.mx_formats import NVFP4InferenceConfig
+
+ if group_size is not None and group_size != 16:
+ raise ValueError("NVFP4 quantization must use a group_size of 16")
+ return NVFP4InferenceConfig()
raise ValueError(
f"Invalid activation/weight dtype combination: {activation_dtype}/{weight_dtype}"
)
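+# Example (illustrative): int8 dynamic activations with int4 weights, group size 32
+#
+# config = get_quantization_config(
+# weight_dtype=TorchAOQuantDType.int4,
+# activation_dtype=TorchAOQuantDType.int8,
+# group_size=32,
+# )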
+def quantize_model(
+ model,
+ weight_dtype: TorchAOQuantDType,
+ group_size: int | None = None,
+ activation_dtype: TorchAOQuantDType | None = None,
+ quantize_embedding: bool | None = None,
+):
+ """
+ This function is used to quantize a model.
+
+ Args:
+ model: The model to quantize.
+ weight_dtype: The dtype to use for weight quantization.
+ group_size: The group size to use for weight quantization.
+ activation_dtype: The dtype to use for activation quantization.
+ quantize_embedding: Whether to quantize the model's embedding weights.
+
+ """
+ linear_ptq_config = get_quantization_config(
+ weight_dtype=weight_dtype,
+ activation_dtype=activation_dtype,
+ group_size=group_size,
+ )
+ quantize_(model, linear_ptq_config)
+ if quantize_embedding:
+ # activation fake quantization is not supported for embedding layers
+ embedding_quantize_config = get_quantization_config(
+ weight_dtype=weight_dtype,
+ activation_dtype=None,
+ group_size=group_size,
+ )
+ quantize_(
+ model,
+ embedding_quantize_config,
+ filter_fn=lambda m, _: isinstance(m, torch.nn.Embedding),
+ )
+
+
def prepare_model_for_qat(
model,
- weight_dtype: TorchIntDType,
- group_size: int,
- activation_dtype: TorchIntDType | None = None,
+ weight_dtype: TorchAOQuantDType,
+ group_size: int | None = None,
+ activation_dtype: TorchAOQuantDType | None = None,
quantize_embedding: bool = False,
):
"""
@@ -100,86 +174,40 @@ def prepare_model_for_qat(
Raises:
ValueError: If the activation/weight dtype combination is invalid.
"""
- if activation_dtype:
- activation_config = FakeQuantizeConfig(
- dtype=activation_dtype.value, granularity="per_token", is_symmetric=False
- )
- weight_config = FakeQuantizeConfig(dtype=weight_dtype.value, group_size=group_size)
- linear_quantize_config = IntXQuantizationAwareTrainingConfig(
- activation_config=None if activation_dtype is None else activation_config,
- weight_config=weight_config,
- )
- quantize_(model, linear_quantize_config)
- if quantize_embedding:
- # activation fake quantization is not supported for embedding layers
- embedding_quantize_config = IntXQuantizationAwareTrainingConfig(
- activation_config=None,
- weight_config=weight_config,
- )
- quantize_(
- model,
- embedding_quantize_config,
- filter_fn=lambda m, _: isinstance(m, torch.nn.Embedding),
- )
-
-
-def quantize_model_for_ptq(
- model,
- weight_dtype: TorchIntDType,
- group_size: int | None = None,
- activation_dtype: TorchIntDType | None = None,
- quantize_embedding: bool | None = None,
-):
- """
- This function is used to quantize a model for post-training quantization.
- It swaps the model's linear layers with fake quantized linear layers.
- If `quantize_embedding` is True, it will also swap the model's embedding weights with fake quantized embedding weights.
-
- Args:
- model: The model to quantize.
- weight_dtype: The dtype to use for weight quantization.
- group_size: The group size to use for weight quantization.
- activation_dtype: The dtype to use for activation quantization.
- quantize_embedding: Whether to quantize the model's embedding weights.
-
- """
- linear_ptq_config = get_ptq_config(
+ base_config = get_quantization_config(
weight_dtype=weight_dtype,
activation_dtype=activation_dtype,
group_size=group_size,
)
- quantize_(model, linear_ptq_config)
+ qat_config = QATConfig(base_config)
+ quantize_(model, qat_config)
if quantize_embedding:
- embedding_quantize_config = get_ptq_config(
+ # activation fake quantization is not supported for embedding layers
+ embedding_base_config = get_quantization_config(
weight_dtype=weight_dtype,
activation_dtype=None,
group_size=group_size,
)
+ embedding_qat_config = QATConfig(embedding_base_config)
quantize_(
model,
- embedding_quantize_config,
+ embedding_qat_config,
filter_fn=lambda m, _: isinstance(m, torch.nn.Embedding),
)
-def convert_qat_model_for_ptq(
+def convert_qat_model(
model,
- *,
- quantize_embedding: bool | None = None,
+ quantize_embedding: bool = False,
):
"""
- This function is used to convert a swap fake-quantized modules in a model
- which has been trained with QAT back to the original modules, ready for PTQ.
-
- Args:
- model: The model to convert.
- quantize_embedding: Whether to quantize the model's embedding weights.
+ Swap the fake-quantized modules of a QAT-trained model back to the original
+ modules, ready for PTQ.
"""
+ config = QATConfig(step="convert")
+ quantize_(model, config)
if quantize_embedding:
-
- def filter_fn(m, _):
- return isinstance(m, nn.Embedding) or _is_linear(m)
-
- else:
- filter_fn = _is_linear
- quantize_(model, FromIntXQuantizationAwareTrainingConfig(), filter_fn=filter_fn)
+ quantize_(
+ model,
+ config,
+ filter_fn=lambda m, _: isinstance(m, torch.nn.Embedding),
+ )
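+
+# End-to-end QAT flow sketch (illustrative):
+#
+# prepare_model_for_qat(model, weight_dtype=TorchAOQuantDType.int4, group_size=32)
+# # ... train with fake-quantized layers ...
+# convert_qat_model(model) # swap back to the original modules for PTQ/export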
diff --git a/src/axolotl/utils/samplers/multipack.py b/src/axolotl/utils/samplers/multipack.py
index af62c0a4f..662c63caa 100644
--- a/src/axolotl/utils/samplers/multipack.py
+++ b/src/axolotl/utils/samplers/multipack.py
@@ -5,6 +5,7 @@ into fixed-capacity batches to optimize memory usage and training throughput.
import gc
import math
+import os
import time
from concurrent.futures import ProcessPoolExecutor
from multiprocessing import cpu_count, get_context
@@ -268,7 +269,7 @@ class MultipackBatchSampler(BatchSampler):
num_processes: int | None = None, # Number of processes for parallel packing
safe_mode: bool = True, # Conservative packing to prevent training instability
mp_start_method: str = "fork",
- **kwargs, # pylint: disable=unused-argument
+ **kwargs,
):
super().__init__(sampler, batch_size, drop_last)
self.batch_size = batch_size
@@ -291,7 +292,10 @@ class MultipackBatchSampler(BatchSampler):
self.total_token_slots = 0
# The number of times to calculate batches to determine minimum packed dataset length
- self.num_count_samples = num_count_samples
+ world_size = int(os.environ.get("WORLD_SIZE", "1"))
+ self.num_count_samples = (
+ 1 if world_size >= num_count_samples else num_count_samples
+ )
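+ # a single counting pass suffices once at least `num_count_samples` ranks
+ # contribute length estimates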
if self.sequential and not isinstance(sampler, SequentialSampler):
LOG.warning(
@@ -317,9 +321,7 @@ class MultipackBatchSampler(BatchSampler):
return self._batches
# Get indices from the sampler
- indices = [ # pylint: disable=unnecessary-comprehension
- idx for idx in self.sampler
- ]
+ indices = list(self.sampler)
# Get lengths of the selected sequences
lengths = self.lengths[indices]
@@ -417,7 +419,7 @@ class MultipackBatchSampler(BatchSampler):
# Gather efficiency from all ranks and apply the calculation function
sample_packing_actual_eff_all = reduce_and_broadcast(
- lambda: float(self.efficiency()), # pylint: disable=unnecessary-lambda
+ lambda: float(self.efficiency()),
calc_sample_packing_eff_est,
)
diff --git a/src/axolotl/utils/schedulers.py b/src/axolotl/utils/schedulers.py
index cdaf92271..83a993089 100644
--- a/src/axolotl/utils/schedulers.py
+++ b/src/axolotl/utils/schedulers.py
@@ -107,9 +107,7 @@ class InterpolatingLogScheduler(LRScheduler):
self.num_steps = num_steps
self.min_lr = min_lr
self.max_lr = max_lr
- self.q = (max_lr / min_lr) ** ( # pylint: disable=invalid-name
- 1 / (num_steps - 1)
- )
+ self.q = (max_lr / min_lr) ** (1 / (num_steps - 1))
super().__init__(optimizer, last_epoch)
def get_lr(self):
@@ -310,7 +308,6 @@ class JaggedLRRestartScheduler(LRScheduler):
jagged_restart_anneal_steps: int = 1,
min_lr_scale: float = 0.001,
) -> None:
- # pylint: disable=duplicate-code
self.inner_schedule = inner_schedule
self.restarts_steps = jagged_restart_steps
self.warmup_steps = jagged_restart_warmup_steps
diff --git a/src/axolotl/utils/schemas/config.py b/src/axolotl/utils/schemas/config.py
index 21e99c048..86b3aa17b 100644
--- a/src/axolotl/utils/schemas/config.py
+++ b/src/axolotl/utils/schemas/config.py
@@ -1,7 +1,5 @@
"""Module with Pydantic models for configuration."""
-# pylint: disable=too-many-lines
-
from typing import Annotated, Any, Literal
from annotated_types import MinLen
@@ -26,11 +24,13 @@ from axolotl.utils.schemas.datasets import (
)
from axolotl.utils.schemas.deprecated import DeprecatedParameters, RemappedParameters
from axolotl.utils.schemas.enums import ChatTemplate, RingAttnFunc, RLType
+from axolotl.utils.schemas.fsdp import FSDPConfig
from axolotl.utils.schemas.integrations import (
CometConfig,
GradioConfig,
LISAConfig,
MLFlowConfig,
+ OpenTelemetryConfig,
RayConfig,
WandbConfig,
)
@@ -51,7 +51,6 @@ from axolotl.utils.schemas.vllm import VllmConfig
LOG = get_logger(__name__)
-# pylint: disable=too-many-ancestors
class AxolotlInputConfig(
ModelInputConfig,
ModelOutputConfig,
@@ -62,6 +61,7 @@ class AxolotlInputConfig(
WandbConfig,
MLFlowConfig,
CometConfig,
+ OpenTelemetryConfig,
LISAConfig,
GradioConfig,
RayConfig,
@@ -109,6 +109,12 @@ class AxolotlInputConfig(
"description": "Don't upcast the embeddings to float32 when using PEFT. Useful for low-VRAM GPUs"
},
)
+ reinit_weights: bool | None = Field(
+ default=None,
+ json_schema_extra={
+ "description": "Reinitialize model weights randomly instead of loading pretrained weights"
+ },
+ )
trainer_cls: str | None = Field(
default=None,
@@ -124,10 +130,10 @@ class AxolotlInputConfig(
},
)
trl: TRLConfig | None = Field(
- default_factory=lambda: TRLConfig(), # pylint: disable=unnecessary-lambda
+ default_factory=lambda: TRLConfig(),
)
vllm: VllmConfig | None = Field(
- default_factory=lambda: VllmConfig(), # pylint: disable=unnecessary-lambda
+ default_factory=lambda: VllmConfig(),
)
qat: QATConfig | None = None
quantization: PTQConfig | None = None
@@ -141,6 +147,12 @@ class AxolotlInputConfig(
"description": "Process reward modelling: `True` or `False`"
},
)
+ center_rewards_coefficient: float | None = Field(
+ default=None,
+ json_schema_extra={
+ "description": "Coefficient to incentivize the reward model to output mean-zero rewards (proposed by https://huggingface.co/papers/2312.09244, Eq. 2). Recommended value: `0.01`."
+ },
+ )
num_labels: int | None = None
# Whether to use weighting in DPO trainer.
# If `None`, default is `False` in the trainer.
@@ -224,6 +236,7 @@ class AxolotlInputConfig(
)
dataset_processes: int | None = Field(
default=None,
+ deprecated="Use `dataset_num_proc` instead. This parameter will be removed in a future version.",
json_schema_extra={
"description": (
"The maximum number of processes to use while preprocessing your input dataset. This defaults to `os.cpu_count()` if not set.\n"
@@ -231,6 +244,16 @@ class AxolotlInputConfig(
)
},
)
+ dataset_num_proc: int | None = Field(
+ default=None,
+ json_schema_extra={
+ "description": (
+ "The maximum number of processes to use while preprocessing your input dataset. This defaults to `os.cpu_count()` if not set.\n"
+ "For Runpod VMs, it will default to number of vCPUs via RUNPOD_CPU_COUNT."
+ )
+ },
+ )
+
dataset_exact_deduplication: bool | None = Field(
default=None,
json_schema_extra={
@@ -414,6 +437,12 @@ class AxolotlInputConfig(
"description": "The maximum length of an input to train with, this should typically be less than 2048 as most models have a token/context limit of 2048"
},
)
+ excess_length_strategy: Literal["drop", "truncate"] | None = Field(
+ default=None,
+ json_schema_extra={
+ "description": "What to do when a tokenized row exceeds sequence_len. 'drop' removes the row; 'truncate' slices tensors to sequence_len. Defaults to 'drop' for backward compatibility."
+ },
+ )
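+ # e.g. in the YAML config (illustrative):
+ # excess_length_strategy: truncate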
eval_sequence_len: int | None = Field(
default=None,
json_schema_extra={
@@ -421,8 +450,8 @@ class AxolotlInputConfig(
},
)
min_sample_len: int | None = None
- max_prompt_len: int = Field(
- default=512,
+ max_prompt_len: int | None = Field(
+ default=None,
json_schema_extra={"description": "maximum prompt length for RL training"},
)
sample_packing: bool | None = Field(
@@ -472,12 +501,6 @@ class AxolotlInputConfig(
},
)
multipack_real_batches: bool | None = None
- pretraining_sample_concatenation: bool | None = Field(
- default=None,
- json_schema_extra={
- "description": "whether to concatenate samples during pretraining",
- },
- )
batch_flattening: Literal["auto"] | bool | None = Field(
default=None,
@@ -492,13 +515,34 @@ class AxolotlInputConfig(
pose_max_context_len: int | None = None
pose_num_chunks: int | None = None
- pretrain_multipack_buffer_size: int | None = 10_000
+ # Deprecated: Use streaming_multipack_buffer_size instead
+ pretrain_multipack_buffer_size: int | None = Field(
+ default=None,
+ deprecated="Deprecated in v0.13.0, will be removed in v0.14.0. Use streaming_multipack_buffer_size instead",
+ )
pretrain_multipack_attn: bool | None = Field(
default=True,
json_schema_extra={
"description": "whether to prevent cross attention for packed sequences during pretraining",
},
)
+ pretraining_sample_concatenation: bool | None = Field(
+ default=None,
+ json_schema_extra={
+ "description": "whether to concatenate samples during pretraining",
+ },
+ )
+
+ streaming: bool | None = Field(
+ default=None,
+ json_schema_extra={"description": "Use streaming mode for loading datasets"},
+ )
+ streaming_multipack_buffer_size: int | None = Field(
+ default=10_000,
+ json_schema_extra={
+ "description": "Buffer size for multipack streaming datasets"
+ },
+ )
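+ # e.g. in the YAML config (illustrative):
+ # streaming: true
+ # streaming_multipack_buffer_size: 10000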
xformers_attention: bool | None = Field(
default=None,
@@ -637,8 +681,7 @@ class AxolotlInputConfig(
json_schema_extra={"description": "FSDP configuration"},
deprecated="Configuring FSDP using `fsdp` is deprecated. Please use `fsdp_config` instead. ",
)
- # TODO @SalmanMohammadi strongly type this as its own schema
- fsdp_config: dict[str, Any] | None = Field(
+ fsdp_config: FSDPConfig | None = Field(
default=None, json_schema_extra={"description": "FSDP configuration options"}
)
fsdp_version: int | None = Field(
@@ -827,10 +870,15 @@ class AxolotlInputConfig(
include_tokens_per_second: bool | None = Field(
default=None,
json_schema_extra={
- "description": "bool of whether to include tokens trainer per second in the training metrics. This iterates over the entire dataset once, so it takes some time."
+ "description": "bool of whether to report tokens per second at the end of training. This is not supported with pre-training datasets."
+ },
+ )
+ include_tkps: bool | None = Field(
+ default=True,
+ json_schema_extra={
+ "description": "bool of whether to report tokens per second per-gpu during training by measuring throughput of non-padding tokens."
},
)
-
neftune_noise_alpha: float | None = Field(
default=None,
json_schema_extra={
@@ -924,7 +972,15 @@ class AxolotlInputConfig(
},
)
- fix_untrained_tokens: int | list[int] | None = None
+ fix_untrained_tokens: int | list[int] | None = Field(
+ default=None,
+ json_schema_extra={
+ "description": (
+ "Token index or indices to adjust embedding weights to the mean of the other tokens. "
+ "This is useful when the model has untrained embeddings."
+ )
+ },
+ )
# INTERNALS - document for now, generally not set externally
is_preprocess: bool | None = None
@@ -983,6 +1039,26 @@ class AxolotlInputConfig(
return [ds_config.model_dump(exclude_none=True) for ds_config in ds_configs]
return None
+ @model_validator(mode="before")
+ @classmethod
+ def warn_peft_trainable_token_to_fix_untrained(cls, data):
+ if (
+ peft_trainable_token_indices := data.get("peft_trainable_token_indices")
+ ) and (fix_untrained_tokens := data.get("fix_untrained_tokens")):
+ if isinstance(fix_untrained_tokens, int):
+ fix_untrained_tokens = (fix_untrained_tokens,)
+
+ if isinstance(peft_trainable_token_indices, int):
+ peft_trainable_token_indices = (peft_trainable_token_indices,)
+
+ for untrained_token_id in fix_untrained_tokens:
+ if untrained_token_id not in peft_trainable_token_indices:
+ LOG.warning_once(
+ f"Token {untrained_token_id} is fixed via `fix_untrained_tokens`, yet not in `peft_trainable_token_indices: ` list. "
+ "Please add it, otherwise the token won't be trained on."
+ )
+ return data
+
class AxolotlConfigWCapabilities(AxolotlInputConfig):
"""wrapper to valdiate GPU capabilities with the configured options"""
@@ -1029,7 +1105,6 @@ class AxolotlConfigWCapabilities(AxolotlInputConfig):
return data
- # pylint: disable=duplicate-code
@model_validator(mode="before")
@classmethod
def check_multigpu_unsloth(cls, data):
@@ -1045,7 +1120,6 @@ class AxolotlConfigWCapabilities(AxolotlInputConfig):
)
return data
- # pylint: disable=duplicate-code
@model_validator(mode="before")
@classmethod
def check_multigpu_lora_kernels(cls, data):
@@ -1253,8 +1327,31 @@ class AxolotlConfigWCapabilities(AxolotlInputConfig):
@model_validator(mode="before")
@classmethod
- def default_dataset_processes(cls, data):
- if data.get("dataset_processes") is None:
- data["dataset_processes"] = get_default_process_count()
-
+ def default_dataset_num_proc(cls, data):
+ if data.get("dataset_processes") is not None:
+ if data.get("dataset_num_proc") is None:
+ data["dataset_num_proc"] = data["dataset_processes"]
+ LOG.warning(
+ "dataset_processes is deprecated and will be removed in a future version. "
+ "Please use dataset_num_proc instead."
+ )
+ else:
+ LOG.warning(
+ "Both dataset_processes and dataset_num_proc are set. "
+ "Using dataset_num_proc and ignoring dataset_processes."
+ )
+ del data["dataset_processes"]
+ elif data.get("dataset_num_proc") is None:
+ data["dataset_num_proc"] = get_default_process_count()
+ return data
+
+ @model_validator(mode="before")
+ @classmethod
+ def check_deduplication_with_streaming(cls, data):
+ if data.get("dataset_exact_deduplication") and (
+ data.get("streaming") or data.get("pretraining_dataset")
+ ):
+ raise NotImplementedError(
+ "dataset_exact_deduplication is not available for streaming datasets. "
+ )
return data
diff --git a/src/axolotl/utils/schemas/datasets.py b/src/axolotl/utils/schemas/datasets.py
index d9c8042d4..e32468706 100644
--- a/src/axolotl/utils/schemas/datasets.py
+++ b/src/axolotl/utils/schemas/datasets.py
@@ -203,7 +203,6 @@ class SFTDataset(BaseModel):
@model_validator(mode="before")
@classmethod
- # pylint: disable=duplicate-code
def check_chat_template_config(cls, data):
if isinstance(data, BaseModel):
data = data.model_dump()
diff --git a/src/axolotl/utils/schemas/enums.py b/src/axolotl/utils/schemas/enums.py
index cf2a8b484..bcd03e1a2 100644
--- a/src/axolotl/utils/schemas/enums.py
+++ b/src/axolotl/utils/schemas/enums.py
@@ -1,24 +1,25 @@
"""Enums for Axolotl input config"""
-# pylint: disable=invalid-name
-
from enum import Enum
import torch
-class TorchIntDType(Enum):
- """Torch integer data types - `getattr` guards against torch < 2.6 which does not support int4"""
+class TorchAOQuantDType(Enum):
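+ """Data types supported for TorchAO quantization; nvfp4 has no torch dtype and is kept as a string"""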
+ int4 = torch.int4
+ int8 = torch.int8
+ float8_e4m3fn = torch.float8_e4m3fn
+ nvfp4 = "nvfp4"
- uint1 = getattr(torch, "uint1", None)
- uint2 = getattr(torch, "uint2", None)
- uint3 = getattr(torch, "uint3", None)
- uint4 = getattr(torch, "uint4", None)
- uint5 = getattr(torch, "uint5", None)
- uint6 = getattr(torch, "uint6", None)
- uint7 = getattr(torch, "uint7", None)
- int4 = getattr(torch, "int4", None)
- int8 = getattr(torch, "int8", None)
+ @staticmethod
+ def from_string(value):
+ if value == "int4":
+ return TorchAOQuantDType.int4
+ if value == "int8":
+ return TorchAOQuantDType.int8
+ if value in ["float8_e4m3fn", "fp8", "float8"]:
+ return TorchAOQuantDType.float8_e4m3fn
+ if value == "nvfp4":
+ return TorchAOQuantDType.nvfp4
+ raise ValueError(f"Unknown TorchAO quantization dtype: {value}")
class RLType(str, Enum):
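Usage sketch for the renamed enum's from_string helper (import path per this diff):

    from axolotl.utils.schemas.enums import TorchAOQuantDType

    dtype = TorchAOQuantDType.from_string("fp8")
    assert dtype is TorchAOQuantDType.float8_e4m3fn  # "fp8"/"float8" alias here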
diff --git a/src/axolotl/utils/schemas/fsdp.py b/src/axolotl/utils/schemas/fsdp.py
new file mode 100644
index 000000000..f34f40e8e
--- /dev/null
+++ b/src/axolotl/utils/schemas/fsdp.py
@@ -0,0 +1,71 @@
+"""
+FSDP Configuration Schema
+"""
+
+from typing import Literal
+
+from pydantic import BaseModel, Field
+
+
+class FSDPConfig(BaseModel):
+ """
+ FSDP Configuration Schema
+ """
+
+ activation_checkpointing: bool | None = Field(
+ default=None,
+ description="Enable activation checkpointing to reduce memory usage during forward passes",
+ )
+ offload_params: bool | None = Field(
+ default=None,
+ description="Offload parameters to CPU to reduce GPU memory usage",
+ )
+ sync_module_states: bool | None = Field(
+ default=None,
+ description="Synchronize module states across all processes",
+ )
+ cpu_ram_efficient_loading: bool | None = Field(
+ default=None,
+ description="Enable CPU RAM efficient loading to reduce memory usage during model loading",
+ )
+ cpu_offload_pin_memory: bool | None = Field(
+ default=None,
+ description="Disabling this enables swap memory usage for resource-constrained setups when offload_params is enabled.",
+ )
+ use_orig_params: bool | None = Field(
+ default=None,
+ description="Use original parameters instead of flattened parameters",
+ )
+
+ state_dict_type: (
+ Literal["FULL_STATE_DICT", "LOCAL_STATE_DICT", "SHARDED_STATE_DICT"] | None
+ ) = Field(
+ default=None,
+ description="Type of state dict to use for saving/loading checkpoints",
+ )
+ final_state_dict_type: (
+ Literal["FULL_STATE_DICT", "LOCAL_STATE_DICT", "SHARDED_STATE_DICT"] | None
+ ) = Field(
+ default=None,
+ description="Final state dict type to use after training completion",
+ )
+
+ auto_wrap_policy: Literal["TRANSFORMER_BASED_WRAP", "SIZE_BASED_WRAP"] | None = (
+ Field(
+ default=None,
+ description="Policy for automatically wrapping modules with FSDP",
+ )
+ )
+ transformer_layer_cls_to_wrap: str | None = Field(
+ default=None,
+ description="Class name of transformer layers to wrap (e.g., 'LlamaDecoderLayer')",
+ )
+
+ reshard_after_forward: bool | None = Field(
+ default=None,
+ description="Reshard parameters after forward pass to save memory",
+ )
+ mixed_precision_policy: str | None = Field(
+ default=None,
+ description="Mixed precision policy for FSDP (e.g., 'fp16', 'bf16')",
+ )
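A minimal usage sketch for the new FSDPConfig schema (field names as defined above; values illustrative):

    from axolotl.utils.schemas.fsdp import FSDPConfig

    fsdp = FSDPConfig(
        activation_checkpointing=True,
        state_dict_type="SHARDED_STATE_DICT",
        auto_wrap_policy="TRANSFORMER_BASED_WRAP",
        transformer_layer_cls_to_wrap="LlamaDecoderLayer",
    )
    assert fsdp.offload_params is None  # unset fields default to None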
diff --git a/src/axolotl/utils/schemas/integrations.py b/src/axolotl/utils/schemas/integrations.py
index 7332c7d39..97d675569 100644
--- a/src/axolotl/utils/schemas/integrations.py
+++ b/src/axolotl/utils/schemas/integrations.py
@@ -176,3 +176,27 @@ class RayConfig(BaseModel):
"help": "The resources per worker for Ray training. Default is to use 1 GPU per worker."
},
)
+
+
+class OpenTelemetryConfig(BaseModel):
+ """OpenTelemetry configuration subset"""
+
+ use_otel_metrics: bool | None = Field(
+ default=False,
+ json_schema_extra={
+ "description": "Enable OpenTelemetry metrics collection and Prometheus export"
+ },
+ )
+ otel_metrics_host: str | None = Field(
+ default="localhost",
+ json_schema_extra={
+ "title": "OpenTelemetry Metrics Host",
+ "description": "Host to bind the OpenTelemetry metrics server to",
+ },
+ )
+ otel_metrics_port: int | None = Field(
+ default=8000,
+ json_schema_extra={
+ "description": "Port for the Prometheus metrics HTTP server"
+ },
+ )
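Defaults sketch for the new OpenTelemetry block (values per the Field defaults above):

    from axolotl.utils.schemas.integrations import OpenTelemetryConfig

    otel = OpenTelemetryConfig(use_otel_metrics=True)
    assert (otel.otel_metrics_host, otel.otel_metrics_port) == ("localhost", 8000)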
diff --git a/src/axolotl/utils/schemas/model.py b/src/axolotl/utils/schemas/model.py
index eb751bfcc..04312eedd 100644
--- a/src/axolotl/utils/schemas/model.py
+++ b/src/axolotl/utils/schemas/model.py
@@ -59,16 +59,21 @@ class ModelInputConfig(BaseModel):
processor_type: str | None = Field(
default=None, json_schema_extra={"description": "transformers processor class"}
)
+ tokenizer_save_jinja_files: bool | None = Field(
+ default=True, # match the default behavior from transformers
+ json_schema_extra={
+ "description": "Whether to save jinja files for tokenizer, transformers default is True"
+ },
+ )
trust_remote_code: bool | None = Field(
default=None,
json_schema_extra={"description": "Trust remote code for untrusted source"},
)
experimental_skip_move_to_device: bool | None = Field(
- default=None,
+ default=True,
json_schema_extra={
- "description": "Don't move the model to the device before sharding. "
- "This is an experimental feature that may be included in the future as the default."
+ "description": "Don't move the model to the device before sharding. Set to `false` to revert to legacy behavior."
},
)
diff --git a/src/axolotl/utils/schemas/peft.py b/src/axolotl/utils/schemas/peft.py
index de29521cb..af22913fd 100644
--- a/src/axolotl/utils/schemas/peft.py
+++ b/src/axolotl/utils/schemas/peft.py
@@ -90,6 +90,16 @@ class LoraConfig(BaseModel):
"description": "How to initialize LoRA weights. Default to True which is MS original implementation."
},
)
+ peft_trainable_token_indices: list[int] | dict[str, list[int]] | None = Field(
+ default=None,
+ json_schema_extra={
+ "description": (
+ "A list of token indices to fine-tune on the `embed_tokens` layer.\n"
+ "Otherwise, a dict mapping an embedding layer name to its trainable token indices.\n"
+ "See https://huggingface.co/docs/peft/v0.17.0/en/developer_guides/lora#efficiently-train-tokens-alongside-lora"
+ )
+ },
+ )
qlora_sharded_model_loading: bool | None = Field(
default=False,
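The new peft_trainable_token_indices field accepts either of the two shapes its description names; a sketch with made-up token ids:

    # list form: indices apply to the default `embed_tokens` layer
    peft_trainable_token_indices = [32000, 32001]
    # dict form: explicit mapping from embedding layer name to indices
    peft_trainable_token_indices = {"embed_tokens": [32000, 32001]}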
diff --git a/src/axolotl/utils/schemas/quantization.py b/src/axolotl/utils/schemas/quantization.py
index 090640c7b..a7c130574 100644
--- a/src/axolotl/utils/schemas/quantization.py
+++ b/src/axolotl/utils/schemas/quantization.py
@@ -6,7 +6,23 @@ from typing import Any
from pydantic import BaseModel, Field, field_validator
-from axolotl.utils.schemas.enums import TorchIntDType
+from axolotl.utils.schemas.enums import TorchAOQuantDType
+
+
+def validate_ao_dtype(v: Any) -> TorchAOQuantDType | None:
+ if v is None:
+ return None
+ if v == "int4":
+ return TorchAOQuantDType.int4
+ if v == "int8":
+ return TorchAOQuantDType.int8
+ if v in ["float8_e4m3fn", "fp8", "float8"]:
+ return TorchAOQuantDType.float8_e4m3fn
+ if v == "nvfp4":
+ return TorchAOQuantDType.nvfp4
+ raise ValueError(
+ f"Invalid dtype: '{v}'. Must be one of: {[e.name for e in TorchAOQuantDType] + ['fp8', 'float8']}"
+ )
class QATConfig(BaseModel):
@@ -14,13 +30,13 @@ class QATConfig(BaseModel):
QAT Config Schema
"""
- activation_dtype: TorchIntDType | None = Field(
+ activation_dtype: TorchAOQuantDType | None = Field(
default=None,
- description='Fake quantization layout to use for activation quantization. Valid options are "int4" and "int8"',
+ description="Fake quantization layout to use for activation quantization.",
)
- weight_dtype: TorchIntDType = Field(
- default=TorchIntDType.int8,
- description='Fake quantization layout to use for weight quantization. Valid options are "int4" and "int8"',
+ weight_dtype: TorchAOQuantDType = Field(
+ default=TorchAOQuantDType.int8,
+ description="Fake quantization layout to use for weight quantization.",
)
quantize_embedding: bool | None = Field(
default=False, description="Quantize embedding"
@@ -35,12 +51,8 @@ class QATConfig(BaseModel):
@field_validator("activation_dtype", "weight_dtype", mode="before")
@classmethod
- def validate_dtype(cls, v: Any) -> TorchIntDType | None:
- if v == "int4":
- return TorchIntDType.int4
- if v == "int8":
- return TorchIntDType.int8
- raise ValueError(f"Invalid dtype: '{v}'. Must be one of: ['int4', 'int8']")
+ def validate_dtype(cls, v: Any) -> TorchAOQuantDType | None:
+ return validate_ao_dtype(v)
class PTQConfig(BaseModel):
@@ -48,13 +60,13 @@ class PTQConfig(BaseModel):
PTQ Config Schema
"""
- weight_dtype: TorchIntDType = Field(
- default=TorchIntDType.int8,
- description="Fake quantization layout to use for weight quantization. Valid options are uintX for X in [1, 2, 3, 4, 5, 6, 7], or int4, or int8",
+ weight_dtype: TorchAOQuantDType = Field(
+ default=TorchAOQuantDType.int8,
+ description="Fake quantization layout to use for weight quantization.",
)
- activation_dtype: TorchIntDType | None = Field(
+ activation_dtype: TorchAOQuantDType | None = Field(
default=None,
- description='Fake quantization layout to use for activation quantization. Valid options are "int4" and "int8"',
+ description="Fake quantization layout to use for activation quantization.",
)
quantize_embedding: bool | None = Field(
default=None, description="Whether to quantize the embedding layer."
@@ -66,9 +78,5 @@ class PTQConfig(BaseModel):
@field_validator("activation_dtype", "weight_dtype", mode="before")
@classmethod
- def validate_dtype(cls, v: Any) -> TorchIntDType | None:
- if v == "int4":
- return TorchIntDType.int4
- if v == "int8":
- return TorchIntDType.int8
- raise ValueError(f"Invalid dtype: '{v}'. Must be one of: ['int4', 'int8']")
+ def validate_dtype(cls, v: Any) -> TorchAOQuantDType | None:
+ return validate_ao_dtype(v)
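Usage sketch for the shared validate_ao_dtype helper (import paths per this diff):

    from axolotl.utils.schemas.enums import TorchAOQuantDType
    from axolotl.utils.schemas.quantization import validate_ao_dtype

    assert validate_ao_dtype(None) is None  # unset stays unset
    assert validate_ao_dtype("float8") is TorchAOQuantDType.float8_e4m3fn
    try:
        validate_ao_dtype("int2")
    except ValueError as err:
        print(err)  # lists the accepted names, including the fp8/float8 aliases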
diff --git a/src/axolotl/utils/schemas/training.py b/src/axolotl/utils/schemas/training.py
index b1788dcaa..8e06e82cb 100644
--- a/src/axolotl/utils/schemas/training.py
+++ b/src/axolotl/utils/schemas/training.py
@@ -96,9 +96,9 @@ class HyperparametersConfig(BaseModel):
"description": "Path to torch distx for optim 'adamw_anyprecision'"
},
)
- lr_scheduler: (SchedulerType | Literal["one_cycle"] | Literal["rex"]) | None = (
- SchedulerType.COSINE
- )
+ lr_scheduler: (
+ SchedulerType | Literal["one_cycle"] | Literal["rex"]
+ ) | None = SchedulerType.COSINE
lr_scheduler_kwargs: dict[str, Any] | None = Field(
default=None,
json_schema_extra={
diff --git a/src/axolotl/utils/schemas/trl.py b/src/axolotl/utils/schemas/trl.py
index 980474e87..d24d6f477 100644
--- a/src/axolotl/utils/schemas/trl.py
+++ b/src/axolotl/utils/schemas/trl.py
@@ -167,3 +167,15 @@ class TRLConfig(BaseModel):
"description": "Whether to exclude truncated completions from loss calculation."
},
)
+ vllm_enable_sleep_mode: bool | None = Field(
+ default=None,
+ json_schema_extra={
+ "description": "Enable sleep mode for vLLM to offload VRAM when idle"
+ },
+ )
+ rollout_func: str | None = Field(
+ default=None,
+ json_schema_extra={
+ "description": "Path to custom rollout function. Must be importable from current dir."
+ },
+ )
diff --git a/src/axolotl/utils/schemas/validation.py b/src/axolotl/utils/schemas/validation.py
index 72991c947..368976831 100644
--- a/src/axolotl/utils/schemas/validation.py
+++ b/src/axolotl/utils/schemas/validation.py
@@ -1,8 +1,7 @@
"""Module with validation methods for config pydantic model."""
-# pylint: disable=too-many-boolean-expressions
-
import json
+import sys
import tempfile
from pathlib import Path
@@ -15,8 +14,6 @@ from transformers.utils.import_utils import is_torch_npu_available
from axolotl.utils.logging import get_logger
from axolotl.utils.schemas.enums import ChatTemplate, RingAttnFunc, RLType
-# pylint: disable=too-many-lines
-
LOG = get_logger(__name__)
SUPPORTED_METRICS = {"sacrebleu", "comet", "ter", "chrf", "perplexity"}
@@ -62,6 +59,20 @@ class DatasetValidationMixin:
raise ValueError("either datasets or pretraining_dataset is required")
return data
+ @model_validator(mode="before")
+ @classmethod
+ def check_pretraining_streaming_deprecation(cls, data):
+ # TODO(djsaunde): remove this check + implement change for 0.13.0 release
+ if data.get("pretraining_dataset") and not data.get("streaming"):
+ LOG.warning(
+ "Setting `pretraining_dataset` without explicitly setting `streaming: "
+ "true` is deprecated. In a future release, streaming will not be "
+ "automatically enabled when using pretraining_dataset. Please "
+ "explicitly set `streaming: true` in your configuration to maintain "
+ "current behavior."
+ )
+ return data
+
@model_validator(mode="before")
@classmethod
def check_push_ds_auth(cls, data):
@@ -342,10 +353,33 @@ class TrainingValidationMixin:
)
return data
+ @model_validator(mode="before")
+ @classmethod
+ def check_multipack_buffer_size(cls, data):
+ if data.get("pretrain_multipack_buffer_size") and not data.get(
+ "streaming_multipack_buffer_size"
+ ):
+ LOG.warning(
+ "`pretrain_multipack_buffer_size` is deprecated in v0.13.0, will be "
+ "removed in v0.14.0. Use `streaming_multipack_buffer_size` instead."
+ )
+ data["streaming_multipack_buffer_size"] = data[
+ "pretrain_multipack_buffer_size"
+ ]
+ del data["pretrain_multipack_buffer_size"]
+ elif data.get("pretrain_multipack_buffer_size") and data.get(
+ "streaming_multipack_buffer_size"
+ ):
+ raise ValueError(
+ "pretrain_multipack_buffer_size is deprecated, use "
+ "streaming_multipack_buffer_size; both are set, please remove the "
+ "deprecated pretrain_multipack_buffer_size setting"
+ )
+ return data
+
@model_validator(mode="after")
def check_fft_possible_bad_config(self):
if (
- # pylint: disable=too-many-boolean-expressions
not (self.bf16 or self.bfloat16)
and (self.fp16 or self.float16)
and not self.adapter
@@ -369,10 +403,10 @@ class TrainingValidationMixin:
"see speed improvements. Please consider setting `torch_compile: "
"true` in your config."
)
+ fsdp_config = data.get("fsdp_config") or {}
if data.get("fp8") and (
- data.get("fsdp_config", {}).get("activation_checkpointing", False) is True
- or data.get("fsdp_config", {}).get("fsdp_activation_checkpointing", False)
- is True
+ fsdp_config.get("activation_checkpointing", False) is True
+ or fsdp_config.get("fsdp_activation_checkpointing", False) is True
):
LOG.warning(
"FP8 + FSDP2 + activation checkpointing may be slower than BF16 "
@@ -459,12 +493,12 @@ class TrainingValidationMixin:
@classmethod
def check_mistral_common_import(cls, tokenizer_use_mistral_common):
if tokenizer_use_mistral_common:
- try:
- import mistral_common # noqa: F401 # pylint:disable=unused-import
- except ImportError as exception:
+ import importlib.util
+
+ if importlib.util.find_spec("mistral_common") is None:
raise ImportError(
"mistral-common is required for mistral models. Please install it with `pip install axolotl` or `pip install -e .`."
- ) from exception
+ )
return tokenizer_use_mistral_common
@@ -684,7 +718,7 @@ class RLValidationMixin:
# TODO: SalmanMohammadi
# Distributed RL with QLoRA + gradient checkpointing
# and use_reentrant = True is broken upstream in TRL
- # pylint: disable=too-many-boolean-expressions
+
if (
data.get("rl")
and data.get("gradient_checkpointing")
@@ -749,15 +783,6 @@ class OptimizationValidationMixin:
return data
- @model_validator(mode="before")
- @classmethod
- def check_torch_compile_deepspeed(cls, data):
- if data.get("deepspeed") and data.get("torch_compile"):
- raise ValueError(
- "torch_compile should be set within your deepspeed config file"
- )
- return data
-
@model_validator(mode="before")
@classmethod
def check_xentropy_patch_conflicts(cls, data):
@@ -782,21 +807,22 @@ class OptimizationValidationMixin:
)
return data
- @model_validator(mode="after")
- def check_fsdp2_base_model_quant_ram_efficient_loading(self):
- fsdp_config = self.fsdp_config if hasattr(self, "fsdp_config") else None
- fsdp_version = self.fsdp_version if hasattr(self, "fsdp_version") else None
- load_in_8bit = self.load_in_8bit if hasattr(self, "load_in_8bit") else None
- load_in_4bit = self.load_in_4bit if hasattr(self, "load_in_4bit") else None
- if fsdp_config and fsdp_version == 2:
- if fsdp_config.get("cpu_ram_efficient_loading") and (
- load_in_8bit or load_in_4bit
- ):
+ @model_validator(mode="before")
+ @classmethod
+ def check_fsdp2_cpu_offload_pin_memory(cls, data):
+ if not (fsdp_config := data.get("fsdp_config")):
+ return data
+
+ if fsdp_config.get("cpu_offload_pin_memory") is False:
+ if str(data.get("fsdp_version")) != "2":
raise ValueError(
- "FSDP2 does not support load_in_8bit or load_in_4bit with cpu_ram_efficient_loading. Please do one of the following: use DeepSpeed, "
- "set fsdp_version to 1, or disable cpu_ram_efficient_loading."
+ "FSDP1 does not support disabling cpu_offload_pin_memory, please set `fsdp_version` to 2"
)
- return self
+ if not fsdp_config.get("offload_params"):
+ raise ValueError(
+ "disabling cpu_offload_pin_memory requires enabling offload_params"
+ )
+ return data
@model_validator(mode="before")
@classmethod
@@ -817,13 +843,13 @@ class OptimizationValidationMixin:
@model_validator(mode="before")
@classmethod
def check_fsdp_version_in_fsdp_config(cls, data):
- if data.get("fsdp_config"):
- if data.get("fsdp_config", {}).get("fsdp_version"):
- LOG.warning(
- "Configuring `fsdp_version` in `fsdp_config` is deprecated. "
- "Please configure `fsdp_version` as a top-level field."
- )
- data["fsdp_version"] = data.get("fsdp_config").pop("fsdp_version")
+ fsdp_config = data.get("fsdp_config") or {}
+ if fsdp_config and fsdp_config.get("fsdp_version"):
+ LOG.warning(
+ "Configuring `fsdp_version` in `fsdp_config` is deprecated. "
+ "Please configure `fsdp_version` as a top-level field."
+ )
+ data["fsdp_version"] = fsdp_config.pop("fsdp_version")
return data
@model_validator(mode="before")
@@ -855,7 +881,7 @@ class OptimizationValidationMixin:
and self.fsdp_config
and self.optimizer
and "8bit" in self.optimizer.value
- and self.fsdp_config["offload_params"]
+ and self.fsdp_config.offload_params
and str(self.fsdp_version) != "2"
):
raise ValueError(
@@ -1077,6 +1103,50 @@ class PretrainingValidationMixin:
data["accelerator_config"]["dispatch_batches"] = False
return data
+ @model_validator(mode="before")
+ @classmethod
+ def check_pretraining_w_val_set_size(cls, data):
+ if data.get("pretraining_dataset") and data.get("val_set_size"):
+ raise ValueError(
+ "val_set_size is not supported with pretraining_dataset. "
+ "Use test_datasets to specify evaluation datasets for pretraining."
+ )
+ return data
+
+ @model_validator(mode="before")
+ @classmethod
+ def check_streaming_w_val_set_size(cls, data):
+ if data.get("streaming") and data.get("val_set_size"):
+ raise ValueError(
+ "val_set_size is not supported with streaming datasets. "
+ "Use test_datasets to specify evaluation datasets when streaming is enabled."
+ )
+ return data
+
+ @model_validator(mode="before")
+ @classmethod
+ def check_streaming_w_max_steps(cls, data):
+ if data.get("streaming") and not data.get("max_steps"):
+ raise ValueError(
+ "max_steps must be set when using streaming datasets. "
+ "Trainer cannot infer dataset length for iterable datasets."
+ )
+ return data
+
+ @model_validator(mode="before")
+ @classmethod
+ def check_streaming_w_multiple_datasets(cls, data):
+ if (
+ data.get("streaming")
+ and data.get("sample_packing")
+ and data.get("datasets")
+ and len(data.get("datasets")) > 1
+ ):
+ raise NotImplementedError(
+ "Sample packing with multiple streaming datasets is not yet supported"
+ )
+ return data
+
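Taken together, the streaming validators above imply a config shape like this sketch ("my-org/my-dataset" is a placeholder):

    cfg = {
        "streaming": True,
        "max_steps": 1000,  # required: Trainer cannot infer iterable dataset length
        "sample_packing": True,
        "datasets": [{"path": "my-org/my-dataset"}],  # >1 dataset + packing raises
    }
    # val_set_size must stay unset when streaming; use test_datasets for evaluation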
class ModelCompatibilityValidationMixin:
"""Validation methods for specific model compatibility."""
@@ -1151,10 +1221,8 @@ class ModelCompatibilityValidationMixin:
@classmethod
def check_gpt_oss_fsdp_loading(cls, data):
if data.get("model_quantization_config", "") == "Mxfp4Config":
- if (
- data.get("fsdp_config", {}).get("cpu_ram_efficient_loading", False)
- is True
- ):
+ fsdp_config = data.get("fsdp_config") or {}
+ if fsdp_config.get("cpu_ram_efficient_loading", False) is True:
raise ValueError(
"FSDP cpu_ram_efficient_loading is not supported for Mxfp4Config model quantization."
)
@@ -1251,12 +1319,21 @@ class ComplexValidationMixin:
try:
import transformers.modeling_flash_attention_utils
+ from transformers.utils import is_flash_attn_greater_or_equal
- # pylint: disable=protected-access
- transformers.modeling_flash_attention_utils._flash_supports_window_size = (
- transformers.modeling_flash_attention_utils._flash_supports_window
+ transformers.modeling_flash_attention_utils._flash_supports_window = (
+ True
)
- import ring_flash_attn # noqa: F401 # pylint:disable=unused-import
+ sys.modules[
+ "transformers.modeling_flash_attention_utils"
+ ]._flash_supports_window = True
+ sys.modules[
+ "transformers.modeling_flash_attention_utils"
+ ]._flash_supports_window_size = True
+ sys.modules[
+ "transformers.modeling_flash_attention_utils"
+ ].is_flash_attn_greater_or_equal = is_flash_attn_greater_or_equal
+ import ring_flash_attn # noqa: F401 # Required after monkey-patching
except ImportError as exception:
raise ImportError(
"context_parallel_size > 1 but ring_flash_attn is not installed. "
@@ -1293,6 +1370,21 @@ class ComplexValidationMixin:
return self
+ @model_validator(mode="after")
+ def hint_gradient_checkpointing_dpo_lora_ddp(self):
+ if (
+ (self.gradient_checkpointing is True or self.gradient_checkpointing is None)
+ and self.capabilities
+ and self.capabilities.get("n_gpu", 1) > 1
+ and self.adapter in ("lora", "qlora")
+ and self.rl == RLType.DPO
+ and not self.fsdp
+ and not self.deepspeed
+ ):
+ LOG.warning(
+ "gradient_checkpointing with DPO + DDP + LoRA is not recommended."
+ )
+ return self
+
class DistributedValidationMixin:
"""validation for distributed training."""
@@ -1321,7 +1413,6 @@ class GRPOVllmValidationMixin:
return self
-# pylint: disable=too-many-ancestors
class ValidationMixin(
DatasetValidationMixin,
AttentionValidationMixin,
diff --git a/src/axolotl/utils/tee.py b/src/axolotl/utils/tee.py
new file mode 100644
index 000000000..7bc8efab0
--- /dev/null
+++ b/src/axolotl/utils/tee.py
@@ -0,0 +1,166 @@
+"""
+Utilities for managing the debug log file and providing a file-only stream for logging
+handlers.
+"""
+
+from __future__ import annotations
+
+import io
+import os
+import sys
+import threading
+from pathlib import Path
+from typing import TextIO, cast
+
+_lock = threading.Lock()
+_file_handle: io.TextIOWrapper | None = None
+_log_path: str | None = None
+_tee_installed: bool = False
+_orig_stdout: TextIO | None = None
+_orig_stderr: TextIO | None = None
+
+
+class _FileOnlyWriter(io.TextIOBase):
+ """A stream-like object that writes only to the tee file.
+
+ Before the file is prepared, writes are dropped (no-op).
+ """
+
+ def write(self, s: str) -> int: # type: ignore[override]
+ with _lock:
+ if _file_handle is not None:
+ _file_handle.write(s)
+ return len(s)
+
+ def flush(self) -> None: # type: ignore[override]
+ with _lock:
+ if _file_handle is not None:
+ try:
+ _file_handle.flush()
+ except Exception:
+ pass
+
+
+file_only_stream: io.TextIOBase = _FileOnlyWriter()
+
+
+class _StreamTee(io.TextIOBase):
+ """A minimal tee that mirrors writes to the debug log file.
+
+ Installed only after the debug log is prepared; no buffering.
+ """
+
+ def __init__(self, stream: io.TextIOBase):
+ self._stream = stream
+
+ def write(self, s: str) -> int: # type: ignore[override]
+ with _lock:
+ n = self._stream.write(s)
+ if _file_handle is not None:
+ _file_handle.write(s)
+ return n
+
+ def flush(self) -> None: # type: ignore[override]
+ with _lock:
+ self._stream.flush()
+ if _file_handle is not None:
+ try:
+ _file_handle.flush()
+ except Exception:
+ pass
+
+ @property
+ def encoding(self): # type: ignore[override]
+ return getattr(self._stream, "encoding", None)
+
+ @property
+ def errors(self): # type: ignore[override]
+ return getattr(self._stream, "errors", None)
+
+ def isatty(self): # type: ignore[override]
+ return getattr(self._stream, "isatty", lambda: False)()
+
+ def fileno(self): # type: ignore[override]
+ if hasattr(self._stream, "fileno"):
+ return self._stream.fileno()
+ raise OSError("Underlying stream has no fileno")
+
+
+def prepare_debug_log(cfg, filename: str = "debug.log") -> str:
+ """
+ Prepare the debug log.
+
+ Creates the output directory, handles append/truncate logic based on cfg, and opens
+ the debug log file for subsequent writes via file-only handlers.
+ """
+ global _file_handle, _log_path, _tee_installed
+
+ with _lock:
+ # If already initialized, reuse existing path
+ if _log_path is not None:
+ return _log_path
+
+ output_dir = cfg.output_dir
+ os.makedirs(output_dir, exist_ok=True)
+
+ log_path = Path(output_dir) / filename
+ append = bool(
+ cfg.get("resume_from_checkpoint") or cfg.get("auto_resume_from_checkpoints")
+ )
+
+ if not append:
+ log_path.unlink(missing_ok=True)
+
+ fh = open(log_path, "a", encoding="utf-8")
+ fh.flush()
+
+ _file_handle = fh
+ _log_path = str(log_path)
+
+ # Install a tee so stdout/stderr are mirrored to the debug file
+ # Allow disabling via env for testing or advanced usage.
+ tee_enabled = os.getenv("AXOLOTL_TEE_STDOUT", "1").lower() not in {
+ "0",
+ "false",
+ "no",
+ }
+ if tee_enabled and not _tee_installed:
+ # Save originals so we can restore later (e.g., tests)
+ global _orig_stdout, _orig_stderr
+ _orig_stdout = sys.stdout
+ _orig_stderr = sys.stderr
+ sys.stdout = _StreamTee(cast(io.TextIOBase, sys.stdout))
+ sys.stderr = _StreamTee(cast(io.TextIOBase, sys.stderr))
+ _tee_installed = True
+
+ return _log_path
+
+
+def close_debug_log() -> None:
+ """Flush and close the debug log and uninstall the stdout/stderr tee.
+
+ Safe to call even if not initialized.
+ """
+ global _file_handle, _log_path, _tee_installed, _orig_stdout, _orig_stderr
+ with _lock:
+ # Restore original stdout/stderr if we installed a tee
+ if _tee_installed:
+ if _orig_stdout is not None:
+ sys.stdout = _orig_stdout
+ if _orig_stderr is not None:
+ sys.stderr = _orig_stderr
+ _tee_installed = False
+ _orig_stdout = None
+ _orig_stderr = None
+
+ # Close the file handle if open
+ if _file_handle is not None:
+ try:
+ _file_handle.flush()
+ _file_handle.close()
+ except Exception:
+ pass
+ finally:
+ _file_handle = None
+ _log_path = None
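Usage sketch for the new tee module (cfg attributes assumed per prepare_debug_log; output path illustrative):

    from axolotl.utils.dict import DictDefault
    from axolotl.utils.tee import close_debug_log, prepare_debug_log

    cfg = DictDefault({"output_dir": "./outputs"})
    log_path = prepare_debug_log(cfg)  # opens ./outputs/debug.log, tees stdout/stderr
    print(f"this line is mirrored to {log_path}")
    close_debug_log()  # restores the original streams and closes the file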
diff --git a/src/axolotl/utils/tokenization.py b/src/axolotl/utils/tokenization.py
index 3526bd5b5..3f44a3429 100644
--- a/src/axolotl/utils/tokenization.py
+++ b/src/axolotl/utils/tokenization.py
@@ -31,7 +31,7 @@ def check_example_labels(example, tokenizer, text_only=False):
# You can compare the input_ids and labels element-wise
# Remember to ignore positions with IGNORE_TOKEN_ID (if you use it) or attention_mask equal to 0
colored_tokens = []
- for _, (input_id, label_id) in enumerate(zip(input_ids, labels)):
+ for _, (input_id, label_id) in enumerate(zip(input_ids, labels, strict=False)):
decoded_input_token = tokenizer.decode(input_id)
# Choose the color based on whether the label has the ignore value or not
color = "red" if label_id == -100 else ("yellow" if label_id == 0 else "green")
diff --git a/src/axolotl/utils/train.py b/src/axolotl/utils/train.py
new file mode 100644
index 000000000..ad3f72be4
--- /dev/null
+++ b/src/axolotl/utils/train.py
@@ -0,0 +1,47 @@
+"""Training utils for checkpoints"""
+
+from pathlib import Path
+
+from axolotl.utils.dict import DictDefault
+from axolotl.utils.logging import get_logger
+
+LOG = get_logger(__name__)
+
+
+def determine_last_checkpoint(cfg: DictDefault, update: bool = True) -> str | None:
+ """
+ Determine the checkpoint to resume from based on configuration.
+
+ Args:
+ cfg: Dictionary mapping `axolotl` config keys to values.
+ update: Whether to update the config with the determined checkpoint
+
+ Returns:
+ Path to the checkpoint to resume from, or `None` if not resuming.
+ """
+ last_checkpoint = None
+ checkpoints = sorted(
+ (
+ p
+ for p in Path(cfg.output_dir).glob("checkpoint-*")
+ if p.name.split("-")[-1].isdigit()
+ ),
+ key=lambda p: int(p.name.split("-")[-1]),
+ )
+ if checkpoints:
+ last_checkpoint = str(checkpoints[-1])
+ if not update:
+ LOG.info(f"Resuming from last checkpoint at {last_checkpoint}")
+ return last_checkpoint
+
+ if (
+ cfg.resume_from_checkpoint is None
+ and cfg.auto_resume_from_checkpoints
+ and last_checkpoint is not None
+ ):
+ cfg.resume_from_checkpoint = last_checkpoint
+ LOG.info(
+ "Using auto-resume functionality to resume from checkpoint at "
+ f"{cfg.resume_from_checkpoint}"
+ )
+ return cfg.resume_from_checkpoint
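A behavioral sketch of the checkpoint selection above (directory names illustrative):

    from axolotl.utils.dict import DictDefault
    from axolotl.utils.train import determine_last_checkpoint

    # with output_dir containing checkpoint-9, checkpoint-90, checkpoint-100,
    # the numeric sort picks checkpoint-100 (a lexicographic sort would not)
    cfg = DictDefault({"output_dir": "./outputs", "auto_resume_from_checkpoints": True})
    resume_path = determine_last_checkpoint(cfg)  # also sets cfg.resume_from_checkpoint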
diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py
index e424cb55a..d97577d86 100644
--- a/src/axolotl/utils/trainer.py
+++ b/src/axolotl/utils/trainer.py
@@ -6,6 +6,7 @@ import os
import random
from contextlib import contextmanager
from functools import partial
+from tempfile import NamedTemporaryFile
from typing import List, Optional
import numpy as np
@@ -15,6 +16,7 @@ from datasets import IterableDataset, disable_caching, enable_caching
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers.utils import is_torch_bf16_gpu_available
+from axolotl.utils.dict import DictDefault
from axolotl.utils.distributed import init_distributed_state, reduce_and_broadcast
from axolotl.utils.environment import check_cuda_p2p_ib_support
from axolotl.utils.logging import get_logger
@@ -276,7 +278,7 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
prior_len = None
filter_map_kwargs = {}
if not isinstance(train_dataset, IterableDataset):
- filter_map_kwargs["num_proc"] = cfg.dataset_processes
+ filter_map_kwargs["num_proc"] = cfg.dataset_num_proc
filter_map_kwargs["load_from_cache_file"] = not cfg.is_preprocess
drop_long_kwargs = {}
@@ -316,7 +318,7 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
if cfg.group_by_length:
train_dataset = train_dataset.map(
add_length,
- num_proc=cfg.dataset_processes,
+ num_proc=cfg.dataset_num_proc,
load_from_cache_file=not cfg.is_preprocess,
desc="Group By Length",
)
@@ -333,7 +335,7 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
)
train_dataset = train_dataset.map(
pose_fn,
- num_proc=cfg.dataset_processes,
+ num_proc=cfg.dataset_num_proc,
load_from_cache_file=not cfg.is_preprocess,
desc="Add position_id column (PoSE)",
)
@@ -342,7 +344,7 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
if eval_dataset:
eval_dataset = eval_dataset.map(
pose_fn,
- num_proc=cfg.dataset_processes,
+ num_proc=cfg.dataset_num_proc,
load_from_cache_file=not cfg.is_preprocess,
desc="Add position_id column (PoSE)",
)
@@ -467,7 +469,7 @@ def calculate_total_num_steps(cfg, train_dataset, update=True):
bin_size=cfg.sample_packing_bin_size,
sequential=cfg.sample_packing_sequentially,
drop_last=True,
- num_processes=cfg.dataset_processes,
+ num_processes=cfg.dataset_num_proc,
mp_start_method=cfg.sample_packing_mp_start_method or "fork",
)
@@ -475,7 +477,9 @@ def calculate_total_num_steps(cfg, train_dataset, update=True):
train_dataset.remove_columns(["length"]),
batch_sampler=sampler,
)
- data_loader_len = len(data_loader) * cfg.micro_batch_size // cfg.batch_size
+ data_loader_len = max(
+ 1, len(data_loader) * cfg.micro_batch_size // cfg.batch_size
+ )
LOG.debug(f"data_loader_len: {data_loader_len}")
# FIXME: is there a bug here somewhere? the total num steps depends
# on the agreed on value for sample_packing_eff_est
@@ -496,7 +500,7 @@ def calculate_total_num_steps(cfg, train_dataset, update=True):
return max(estimates)
sample_packing_actual_eff_all = reduce_and_broadcast(
- lambda: sampler.efficiency(), # pylint: disable=unnecessary-lambda
+ lambda: sampler.efficiency(),
calc_sample_packing_eff_est,
)
sample_packing_eff_est = (
@@ -538,6 +542,13 @@ def setup_deepspeed_env(cfg, stage=None):
)
os.environ["ACCELERATE_USE_DEEPSPEED"] = "true"
+ if isinstance(cfg.deepspeed, DictDefault):
+ with NamedTemporaryFile(
+ mode="w", delete=False, suffix=".json", prefix="deepspeed_config_"
+ ) as temp_file:
+ temp_file.write(json.dumps(cfg.deepspeed.to_dict(), indent=4))
+ # no explicit close needed; the context manager flushes and closes on exit
+ cfg.deepspeed = str(temp_file.name)
os.environ["ACCELERATE_DEEPSPEED_CONFIG_FILE"] = cfg.deepspeed
os.environ["ACCELERATE_GRADIENT_ACCUMULATION_STEPS"] = str(
cfg.gradient_accumulation_steps
@@ -547,12 +558,20 @@ def setup_deepspeed_env(cfg, stage=None):
if stage == 3:
os.environ["ACCELERATE_DEEPSPEED_ZERO3_INIT"] = "true"
+ device_count = torch.cuda.device_count()
+ if device_count == 1:
+ os.environ.setdefault("WORLD_SIZE", "1")
+ os.environ.setdefault("LOCAL_RANK", "0")
+ os.environ.setdefault("MASTER_ADDR", "0.0.0.0") # nosec B104
+ os.environ.setdefault("MASTER_PORT", "29500")
+
# NOTE(djsaunde): The distribued state cannot be initialized prior to the
# ACCELERATE_USE_DEEPSPEED assignment, but it must be initialized some time prior
# to model load.
if (
int(os.environ.get("WORLD_SIZE", "1")) == 1
and os.environ.get("AXOLOTL_IS_PREPROCESS", "0") != "1"
+ and cfg.use_ray is not True
):
os.environ["WORLD_SIZE"] = "1" # force it in case not set
os.environ["LOCAL_RANK"] = "0" # force it in case not set
@@ -586,6 +605,10 @@ def setup_fsdp_envs(cfg):
os.environ["FSDP_USE_ORIG_PARAMS"] = "true"
if cfg.fsdp_config.state_dict_type:
os.environ["FSDP_STATE_DICT_TYPE"] = cfg.fsdp_config.state_dict_type
+ if cfg.fsdp_config.cpu_offload_pin_memory is not None:
+ os.environ["FSDP_CPU_OFFLOAD_PIN_MEMORY"] = str(
+ cfg.fsdp_config.cpu_offload_pin_memory
+ ).lower()
if cfg.fsdp_config.auto_wrap_policy:
os.environ["FSDP_AUTO_WRAP_POLICY"] = cfg.fsdp_config.auto_wrap_policy
if cfg.fsdp_config.transformer_layer_cls_to_wrap:
@@ -618,6 +641,7 @@ def setup_parallelism_envs(cfg):
def prepare_optim_env(cfg):
if not check_cuda_p2p_ib_support():
if os.getenv("NCCL_P2P_DISABLE") is None:
+ LOG.warning("P2P support not detected, setting `NCCL_P2P_DISABLE=1`")
os.environ["NCCL_P2P_DISABLE"] = "1"
# TODO @SalmanMohammadi remove the cfg.fsdp check in 0.12
if cfg.fsdp or cfg.fsdp_config:
@@ -625,11 +649,15 @@ def prepare_optim_env(cfg):
setup_fsdp_envs(cfg)
elif cfg.deepspeed:
stage = None
+ deepspeed_config = None
# check if the cfg.deepspeed is a file
- if os.path.isfile(cfg.deepspeed):
+ if isinstance(cfg.deepspeed, DictDefault):
+ deepspeed_config = cfg.deepspeed
+ elif os.path.isfile(cfg.deepspeed):
# parse with json
with open(cfg.deepspeed, "r", encoding="utf-8") as fin:
deepspeed_config = json.load(fin)
+ if deepspeed_config:
stage = deepspeed_config.get("zero_optimization", {}).get("stage", None)
setup_deepspeed_env(cfg, stage=stage)
@@ -646,15 +674,6 @@ def prepare_optim_env(cfg):
os.environ["ACCELERATE_MIXED_PRECISION"] = "no"
-def prepare_opinionated_env(cfg):
- if cfg.qlora_sharded_model_loading:
- # model loading is forked after the tokenizer
- os.environ["TOKENIZERS_PARALLELISM"] = "false"
- if cfg.sample_packing:
- # multipack parallel packing sampler defaults to using fork
- os.environ["TOKENIZERS_PARALLELISM"] = "false"
-
-
def setup_trainer(
cfg,
train_dataset,
diff --git a/src/setuptools_axolotl_dynamic_dependencies.py b/src/setuptools_axolotl_dynamic_dependencies.py
index 02a5b8083..3bb54cda8 100644
--- a/src/setuptools_axolotl_dynamic_dependencies.py
+++ b/src/setuptools_axolotl_dynamic_dependencies.py
@@ -9,7 +9,6 @@ from importlib.metadata import PackageNotFoundError, version
from setuptools.command.build_py import build_py as _build_py
-# pylint: disable=duplicate-code
def parse_requirements():
_install_requires = []
_dependency_links = []
@@ -34,7 +33,6 @@ def parse_requirements():
try:
xformers_version = [req for req in _install_requires if "xformers" in req][0]
torchao_version = [req for req in _install_requires if "torchao" in req][0]
- autoawq_version = [req for req in _install_requires if "autoawq" in req][0]
if "Darwin" in platform.system():
# don't install xformers on MacOS
@@ -64,7 +62,6 @@ def parse_requirements():
_install_requires.append("xformers==0.0.28.post2")
else:
_install_requires.append("xformers==0.0.28.post3")
- _install_requires.pop(_install_requires.index(autoawq_version))
elif (major, minor) >= (2, 4):
if patch == 0:
_install_requires.pop(_install_requires.index(xformers_version))
diff --git a/tests/cli/test_cli_evaluate.py b/tests/cli/test_cli_evaluate.py
index a191bf957..e8b88625a 100644
--- a/tests/cli/test_cli_evaluate.py
+++ b/tests/cli/test_cli_evaluate.py
@@ -1,7 +1,5 @@
"""Tests for evaluate CLI command."""
-# pylint: disable=duplicate-code
-
from unittest.mock import patch
from axolotl.cli.main import cli
@@ -31,7 +29,6 @@ class TestEvaluateCommand(BaseCliTest):
config_path = tmp_path / "config.yml"
config_path.write_text(valid_test_config)
- # pylint: disable=duplicate-code
with patch("axolotl.cli.evaluate.do_evaluate") as mock_evaluate:
result = cli_runner.invoke(
cli,
diff --git a/tests/cli/test_cli_inference.py b/tests/cli/test_cli_inference.py
index 3394c189d..807dc7fa3 100644
--- a/tests/cli/test_cli_inference.py
+++ b/tests/cli/test_cli_inference.py
@@ -1,7 +1,5 @@
"""pytest tests for axolotl CLI inference command."""
-# pylint: disable=duplicate-code
-
from unittest.mock import patch
from axolotl.cli.main import cli
diff --git a/tests/cli/test_cli_merge_sharded_fsdp_weights.py b/tests/cli/test_cli_merge_sharded_fsdp_weights.py
index 4f6a973ea..de13b28ed 100644
--- a/tests/cli/test_cli_merge_sharded_fsdp_weights.py
+++ b/tests/cli/test_cli_merge_sharded_fsdp_weights.py
@@ -1,7 +1,5 @@
"""pytest tests for axolotl CLI merge_sharded_fsdp_weights command."""
-# pylint: disable=duplicate-code
-
from unittest.mock import patch
from axolotl.cli.main import cli
diff --git a/tests/cli/test_cli_train.py b/tests/cli/test_cli_train.py
index d4d90f57f..1251ab3c0 100644
--- a/tests/cli/test_cli_train.py
+++ b/tests/cli/test_cli_train.py
@@ -1,7 +1,5 @@
"""Tests for train CLI command."""
-# pylint: disable=duplicate-code
-
from unittest.mock import MagicMock, patch
from axolotl.cli.main import cli
diff --git a/tests/cli/test_utils.py b/tests/cli/test_utils.py
index a3e4e9887..431c35c3c 100644
--- a/tests/cli/test_utils.py
+++ b/tests/cli/test_utils.py
@@ -1,7 +1,5 @@
"""pytest tests for axolotl CLI utils."""
-# pylint: disable=redefined-outer-name
-
import json
from unittest.mock import Mock, patch
@@ -25,7 +23,7 @@ MOCK_TREE_RESPONSE = {
def mock_responses():
"""Mock responses for API and file downloads"""
- def mock_get(url, timeout=None): # pylint: disable=unused-argument
+ def mock_get(url, timeout=None):
response = Mock()
if "api.github.com" in url:
response.text = json.dumps(MOCK_TREE_RESPONSE)
@@ -93,21 +91,21 @@ def assert_launcher_args_in_command(
called_cmd = mock_subprocess_call.call_args.args[0]
# Verify launcher
- assert (
- called_cmd[0] == launcher
- ), f"Expected launcher {launcher}, got {called_cmd[0]}"
+ assert called_cmd[0] == launcher, (
+ f"Expected launcher {launcher}, got {called_cmd[0]}"
+ )
# Verify launcher args are present
for arg in expected_launcher_args:
- assert (
- arg in called_cmd
- ), f"Expected launcher arg '{arg}' not found in command: {called_cmd}"
+ assert arg in called_cmd, (
+ f"Expected launcher arg '{arg}' not found in command: {called_cmd}"
+ )
# Verify module is present
assert "-m" in called_cmd, "Expected -m flag for module execution"
- assert (
- command_module in called_cmd
- ), f"Expected module {command_module} not found in command: {called_cmd}"
+ assert command_module in called_cmd, (
+ f"Expected module {command_module} not found in command: {called_cmd}"
+ )
def assert_no_launcher_args_contamination(mock_subprocess_call, launcher: str):
@@ -126,17 +124,17 @@ def assert_no_launcher_args_contamination(mock_subprocess_call, launcher: str):
launch_idx = called_cmd.index("launch")
m_idx = called_cmd.index("-m")
launcher_section = called_cmd[launch_idx + 1 : m_idx]
- assert (
- len(launcher_section) == 0
- ), f"Unexpected launcher args found: {launcher_section}"
+ assert len(launcher_section) == 0, (
+ f"Unexpected launcher args found: {launcher_section}"
+ )
elif launcher == "torchrun":
# For torchrun, launcher args should be between 'torchrun' and '-m'
torchrun_idx = called_cmd.index("torchrun")
m_idx = called_cmd.index("-m")
launcher_section = called_cmd[torchrun_idx + 1 : m_idx]
- assert (
- len(launcher_section) == 0
- ), f"Unexpected launcher args found: {launcher_section}"
+ assert len(launcher_section) == 0, (
+ f"Unexpected launcher args found: {launcher_section}"
+ )
@pytest.fixture
diff --git a/tests/conftest.py b/tests/conftest.py
index 9e1af318d..98847ebad 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -33,10 +33,9 @@ logging.getLogger("filelock").setLevel(logging.CRITICAL)
def retry_on_request_exceptions(max_retries=3, delay=1):
- # pylint: disable=duplicate-code
def decorator(func):
@functools.wraps(func)
- def wrapper(*args, **kwargs): # pylint: disable=inconsistent-return-statements
+ def wrapper(*args, **kwargs):
for attempt in range(max_retries):
try:
return func(*args, **kwargs)
@@ -171,7 +170,7 @@ def download_argilla_distilabel_intel_orca_dpo_dataset():
# @disable_hf_offline
# def dataset_fozzie_alpaca_dpo_dataset(
# download_fozzie_alpaca_dpo_dataset,
-# ): # pylint: disable=unused-argument,redefined-outer-name
+# ):
# return load_dataset("fozziethebeat/alpaca_messages_2k_dpo_test", split="train")
#
#
@@ -179,7 +178,7 @@ def download_argilla_distilabel_intel_orca_dpo_dataset():
# @disable_hf_offline
# def dataset_fozzie_alpaca_dpo_dataset_rev_ea82cff(
# download_fozzie_alpaca_dpo_dataset,
-# ): # pylint: disable=unused-argument,redefined-outer-name
+# ):
# return load_dataset(
# "fozziethebeat/alpaca_messages_2k_dpo_test", split="train", revision="ea82cff"
# )
@@ -359,7 +358,7 @@ def download_llama32_1b_model_fixture():
@enable_hf_offline
def tokenizer_huggyllama(
download_huggyllama_model_fixture,
-): # pylint: disable=unused-argument,redefined-outer-name
+):
tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
tokenizer.pad_token = ""
@@ -370,7 +369,7 @@ def tokenizer_huggyllama(
@enable_hf_offline
def tokenizer_huggyllama_w_special_tokens(
tokenizer_huggyllama,
-): # pylint: disable=redefined-outer-name
+):
tokenizer_huggyllama.add_special_tokens(
{
"bos_token": "",
@@ -386,7 +385,7 @@ def tokenizer_huggyllama_w_special_tokens(
@enable_hf_offline
def tokenizer_llama2_7b(
download_llama2_model_fixture,
-): # pylint: disable=unused-argument,redefined-outer-name
+):
tokenizer = AutoTokenizer.from_pretrained("NousResearch/Llama-2-7b-hf")
return tokenizer
@@ -396,7 +395,7 @@ def tokenizer_llama2_7b(
@enable_hf_offline
def tokenizer_mistral_7b_instruct(
download_mlx_mistral_7b_model_fixture,
-): # pylint: disable=unused-argument,redefined-outer-name
+):
return AutoTokenizer.from_pretrained("casperhansen/mistral-7b-instruct-v0.1-awq")
@@ -442,9 +441,7 @@ def cleanup_monkeypatches():
# original_fa2_forward = LlamaFlashAttention2.forward
original_llama_attn_forward = LlamaAttention.forward
original_llama_forward = LlamaForCausalLM.forward
- original_trainer_inner_training_loop = (
- Trainer._inner_training_loop # pylint: disable=protected-access
- )
+ original_trainer_inner_training_loop = Trainer._inner_training_loop
original_trainer_training_step = Trainer.training_step
# monkey patches can happen inside the tests
yield
@@ -452,9 +449,7 @@ def cleanup_monkeypatches():
# LlamaFlashAttention2.forward = original_fa2_forward
LlamaAttention.forward = original_llama_attn_forward
LlamaForCausalLM.forward = original_llama_forward
- Trainer._inner_training_loop = ( # pylint: disable=protected-access
- original_trainer_inner_training_loop
- )
+ Trainer._inner_training_loop = original_trainer_inner_training_loop
Trainer.training_step = original_trainer_training_step
# Reset other known monkeypatches
@@ -490,7 +485,7 @@ def cleanup_monkeypatches():
@pytest.fixture
def dataset_winglian_tiny_shakespeare(
download_ds_fixture_bundle: Path,
-): # pylint: disable=redefined-outer-name
+):
ds_path = download_ds_fixture_bundle / "winglian__tiny-shakespeare"
return datasets.load_from_disk(ds_path)
@@ -498,7 +493,7 @@ def dataset_winglian_tiny_shakespeare(
@pytest.fixture
def dataset_tatsu_lab_alpaca(
download_ds_fixture_bundle: Path,
-): # pylint: disable=redefined-outer-name
+):
ds_path = download_ds_fixture_bundle / "tatsu-lab__alpaca"
return datasets.load_from_disk(ds_path)["train"]
@@ -506,7 +501,7 @@ def dataset_tatsu_lab_alpaca(
@pytest.fixture
def dataset_mhenrichsen_alpaca_2k_test(
download_ds_fixture_bundle: Path,
-): # pylint: disable=redefined-outer-name
+):
ds_path = download_ds_fixture_bundle / "mhenrichsen__alpaca_2k_test"
return datasets.load_from_disk(ds_path)["train"]
@@ -514,7 +509,7 @@ def dataset_mhenrichsen_alpaca_2k_test(
@pytest.fixture
def dataset_argilla_ultrafeedback_binarized_preferences_cleaned(
download_ds_fixture_bundle: Path,
-): # pylint: disable=redefined-outer-name
+):
ds_path = (
download_ds_fixture_bundle
/ "argilla__ultrafeedback-binarized-preferences-cleaned"
@@ -525,7 +520,7 @@ def dataset_argilla_ultrafeedback_binarized_preferences_cleaned(
@pytest.fixture
def dataset_fozziethebeat_alpaca_messages_2k_dpo_test(
download_ds_fixture_bundle: Path,
-): # pylint: disable=redefined-outer-name
+):
ds_path = download_ds_fixture_bundle / "fozziethebeat__alpaca_messages_2k_dpo_test"
return datasets.load_from_disk(ds_path)["train"]
@@ -533,7 +528,7 @@ def dataset_fozziethebeat_alpaca_messages_2k_dpo_test(
@pytest.fixture
def dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff(
download_ds_fixture_bundle: Path,
-): # pylint: disable=redefined-outer-name
+):
ds_path = (
download_ds_fixture_bundle
/ "fozziethebeat__alpaca_messages_2k_dpo_test__rev_ea82cff"
@@ -557,7 +552,7 @@ def fixture_min_base_cfg():
)
-# # pylint: disable=redefined-outer-name,unused-argument
@pytest.mark.skipif(
os.environ.get("AXOLOTL_IS_CI_CACHE_PRELOAD", "-1") != "1",
reason="Not running in CI cache preload",
diff --git a/tests/constants.py b/tests/constants.py
index e024e6920..cd75bd339 100644
--- a/tests/constants.py
+++ b/tests/constants.py
@@ -3,6 +3,7 @@
This module contains constants and configuration dictionaries used for
datasets and other utilities in the Axolotl project, specifically for testing.
"""
+
# Configuration for Alpaca Messages Dataset
ALPACA_MESSAGES_CONFIG_OG = {
"path": "fozziethebeat/alpaca_messages_2k_dpo_test",
diff --git a/tests/core/test_builders.py b/tests/core/test_builders.py
index fab01a644..199777896 100644
--- a/tests/core/test_builders.py
+++ b/tests/core/test_builders.py
@@ -1,7 +1,5 @@
"""Unit tests for axolotl.core.builders"""
-# pylint: disable=protected-access
-
import sys
from pathlib import Path
from unittest.mock import patch
@@ -330,7 +328,6 @@ def rand_reward_func(prompts, completions) -> list[float]:
)
def test_grpo_training_arguments(self, grpo_cfg, model, tokenizer, tmp_path):
-
rewards_dir = tmp_path / "rewards_test"
self._write_rewards_file(rewards_dir)
@@ -399,10 +396,10 @@ def rand_reward_func(prompts, completions) -> list[float]:
),
("orpo_cfg", None), # don't use fixture for orpo to use smaller split
("kto_cfg", None), # no fixture for kto
- (
- "simpo_cfg",
- "dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff",
- ),
+ # (
+ # "simpo_cfg",
+ # "dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff",
+ # ),
],
)
def test_custom_optimizer_cls_and_kwargs(
@@ -443,7 +440,7 @@ def rand_reward_func(prompts, completions) -> list[float]:
]
else:
raise ValueError(f"Unhandled cfg_string: {cfg_string}")
- cfg["dataset_processes"] = 4
+ cfg["dataset_num_proc"] = 4
if cfg_string == "grpo_cfg":
rewards_dir = tmp_path / "rewards_test"
@@ -477,7 +474,7 @@ def rand_reward_func(prompts, completions) -> list[float]:
assert trainer.optimizer_cls_and_kwargs is not None
- from axolotl.contribs.mit.muon import ( # pylint: disable=no-name-in-module
+ from axolotl.contribs.mit.muon import (
Muon,
MuonOptimizerFactory,
)
@@ -559,7 +556,7 @@ class TestHFCausalTrainerBuilder:
assert trainer.optimizer_cls_and_kwargs is not None
- from axolotl.contribs.mit.muon import ( # pylint: disable=no-name-in-module
+ from axolotl.contribs.mit.muon import (
Muon,
MuonOptimizerFactory,
)
@@ -599,6 +596,6 @@ class TestTrainerClsPlugin:
except TypeError as e:
# Error raised if trainer_cls is None
assert "'tuple' object has no attribute 'config'" not in str(e)
- except Exception: # pylint: disable=broad-exception-caught
+ except Exception:
# Another error happens, so we passed trainer_cls to builder
pass
diff --git a/tests/e2e/integrations/test_cut_cross_entropy.py b/tests/e2e/integrations/test_cut_cross_entropy.py
index 34e6c9644..1ba05077c 100644
--- a/tests/e2e/integrations/test_cut_cross_entropy.py
+++ b/tests/e2e/integrations/test_cut_cross_entropy.py
@@ -12,8 +12,6 @@ from axolotl.utils.dict import DictDefault
from ..utils import check_model_output_exists
-# pylint: disable=duplicate-code
-
@pytest.fixture()
def min_cfg(temp_dir):
@@ -53,7 +51,6 @@ class TestCutCrossEntropyIntegration:
e2e tests for cut_cross_entropy integration with Axolotl
"""
- # pylint: disable=redefined-outer-name
def test_llama_w_cce(self, min_cfg, temp_dir):
cfg = DictDefault(min_cfg)
cfg = validate_config(cfg)
@@ -69,7 +66,6 @@ class TestCutCrossEntropyIntegration:
train(cfg=cfg, dataset_meta=dataset_meta)
check_model_output_exists(temp_dir, cfg)
- # pylint: disable=redefined-outer-name
def test_qwen2_w_cce(self, temp_dir):
cfg = DictDefault(
{
diff --git a/tests/e2e/integrations/test_fp8.py b/tests/e2e/integrations/test_fp8.py
index 0302b7e35..7db63cc4d 100644
--- a/tests/e2e/integrations/test_fp8.py
+++ b/tests/e2e/integrations/test_fp8.py
@@ -18,7 +18,7 @@ class FP8IntegrationTestCase:
@require_torch_2_7_0
def test_fp8_single_gpu_smoke(self, temp_dir):
"""Smoke test for single GPU FP8 + torch.compile training"""
- # pylint: disable=duplicate-code
+
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
@@ -53,7 +53,6 @@ class FP8IntegrationTestCase:
}
)
- # pylint: disable=duplicate-code
cfg = validate_config(cfg)
normalize_config(cfg)
dataset_meta = load_datasets(cfg=cfg)
diff --git a/tests/e2e/integrations/test_hooks.py b/tests/e2e/integrations/test_hooks.py
index 8743efb98..b85505caa 100644
--- a/tests/e2e/integrations/test_hooks.py
+++ b/tests/e2e/integrations/test_hooks.py
@@ -28,85 +28,81 @@ class LogHooksPlugin(BasePlugin):
except FileNotFoundError:
pass
- def post_trainer_create(self, cfg, trainer): # pylint: disable=unused-argument
+ def post_trainer_create(self, cfg, trainer):
with open(
self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8"
) as f:
f.write("post_trainer_create\n")
- def pre_model_load(self, cfg): # pylint: disable=unused-argument
+ def pre_model_load(self, cfg):
with open(
self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8"
) as f:
f.write("pre_model_load\n")
- def post_model_build(self, cfg, model): # pylint: disable=unused-argument
+ def post_model_build(self, cfg, model):
with open(
self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8"
) as f:
f.write("post_model_build\n")
- def pre_lora_load(self, cfg, model): # pylint: disable=unused-argument
+ def pre_lora_load(self, cfg, model):
with open(
self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8"
) as f:
f.write("pre_lora_load\n")
- def post_lora_load(self, cfg, model): # pylint: disable=unused-argument
+ def post_lora_load(self, cfg, model):
with open(
self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8"
) as f:
f.write("post_lora_load\n")
- def post_model_load(self, cfg, model): # pylint: disable=unused-argument
+ def post_model_load(self, cfg, model):
with open(
self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8"
) as f:
f.write("post_model_load\n")
- def create_optimizer(self, cfg, trainer): # pylint: disable=unused-argument
+ def create_optimizer(self, cfg, trainer):
with open(
self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8"
) as f:
f.write("create_optimizer\n")
- def get_trainer_cls(self, cfg): # pylint: disable=unused-argument
+ def get_trainer_cls(self, cfg):
with open(
self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8"
) as f:
f.write("get_trainer_cls\n")
- def create_lr_scheduler(
- self, cfg, trainer, optimizer, num_training_steps
- ): # pylint: disable=unused-argument
+ def create_lr_scheduler(self, cfg, trainer, optimizer, num_training_steps):
with open(
self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8"
) as f:
f.write("create_lr_scheduler\n")
- def add_callbacks_pre_trainer(self, cfg, model): # pylint: disable=unused-argument
+ def add_callbacks_pre_trainer(self, cfg, model):
with open(
self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8"
) as f:
f.write("add_callbacks_pre_trainer\n")
return []
- def add_callbacks_post_trainer(
- self, cfg, trainer
- ): # pylint: disable=unused-argument
+ def add_callbacks_post_trainer(self, cfg, trainer):
with open(
self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8"
) as f:
f.write("add_callbacks_post_trainer\n")
return []
- def post_train(self, cfg, model): # pylint: disable=unused-argument
+ def post_train(self, cfg, model):
with open(
self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8"
) as f:
f.write("post_train\n")
- def post_train_unload(self, cfg): # pylint: disable=unused-argument
+ def post_train_unload(self, cfg):
with open(
self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8"
) as f:
@@ -119,7 +115,6 @@ class TestPluginHooks:
"""
def test_plugin_hooks(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
diff --git a/tests/e2e/integrations/test_kd.py b/tests/e2e/integrations/test_kd.py
index 1ac3b537e..d89044247 100644
--- a/tests/e2e/integrations/test_kd.py
+++ b/tests/e2e/integrations/test_kd.py
@@ -25,7 +25,7 @@ def min_cfg(temp_dir):
"liger_rms_norm": True,
"liger_glu_activation": True,
"torch_compile": True,
- "chat_template": "llama3",
+ "chat_template": "qwen3",
"kd_trainer": True,
"kd_ce_alpha": 0.1,
"kd_alpha": 0.9,
@@ -81,7 +81,7 @@ class TestKnowledgeDistillation:
@require_torch_2_5_1
def test_llama_kd(self, temp_dir, kd_min_cfg):
cfg = DictDefault(kd_min_cfg)
- # pylint: disable=duplicate-code
+
# write cfg to yaml file
Path(temp_dir).mkdir(parents=True, exist_ok=True)
with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
@@ -104,7 +104,6 @@ class TestKnowledgeDistillation:
temp_dir + "/runs", "train/loss", 1.4, "Train Loss (%s) is too high"
)
- @pytest.mark.skip(reason="Chunked KD loss doesn't support PEFT/LoRA")
@pytest.mark.parametrize(
"load_in_8bit",
[True, False],
@@ -120,10 +119,14 @@ class TestKnowledgeDistillation:
"lora_r": 16,
"lora_alpha": 32,
"lora_dropout": 0.0,
+ "lora_modules_to_save": ["embed_tokens", "lm_head"],
+ "lora_mlp_kernel": False,
+ "lora_qkv_kernel": False,
+ "lora_o_kernel": False,
}
| kd_min_cfg
)
- # pylint: disable=duplicate-code
+
# write cfg to yaml file
Path(temp_dir).mkdir(parents=True, exist_ok=True)
with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
diff --git a/tests/e2e/integrations/test_liger.py b/tests/e2e/integrations/test_liger.py
index b1f5befdd..55317151e 100644
--- a/tests/e2e/integrations/test_liger.py
+++ b/tests/e2e/integrations/test_liger.py
@@ -2,6 +2,7 @@
Simple end-to-end test for Liger integration
"""
+import pytest
from axolotl.common.datasets import load_datasets
from axolotl.train import train
from axolotl.utils.config import normalize_config, prepare_plugins, validate_config
@@ -17,7 +18,6 @@ class LigerIntegrationTestCase:
@require_torch_2_4_1
def test_llama_wo_flce(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
@@ -53,7 +53,7 @@ class LigerIntegrationTestCase:
"save_first_step": False,
}
)
- # pylint: disable=duplicate-code
+
cfg = validate_config(cfg)
prepare_plugins(cfg)
normalize_config(cfg)
@@ -63,8 +63,11 @@ class LigerIntegrationTestCase:
check_model_output_exists(temp_dir, cfg)
@require_torch_2_4_1
- def test_llama_w_flce(self, temp_dir):
- # pylint: disable=duplicate-code
+ @pytest.mark.parametrize(
+ "liger_use_token_scaling",
+ [True, False],
+ )
+ def test_llama_w_flce(self, temp_dir, liger_use_token_scaling):
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
@@ -76,6 +79,7 @@ class LigerIntegrationTestCase:
"liger_glu_activation": True,
"liger_cross_entropy": False,
"liger_fused_linear_cross_entropy": True,
+ "liger_use_token_scaling": liger_use_token_scaling,
"sequence_len": 1024,
"val_set_size": 0.05,
"special_tokens": {
@@ -100,7 +104,7 @@ class LigerIntegrationTestCase:
"save_first_step": False,
}
)
- # pylint: disable=duplicate-code
+
cfg = validate_config(cfg)
prepare_plugins(cfg)
normalize_config(cfg)
diff --git a/tests/e2e/kernels/test_geglu.py b/tests/e2e/kernels/test_geglu.py
index 4094a8ce7..78ba74c0e 100644
--- a/tests/e2e/kernels/test_geglu.py
+++ b/tests/e2e/kernels/test_geglu.py
@@ -85,6 +85,6 @@ def test_geglu_inplace_preservation():
assert not torch.equal(gate, gate_copy), "Gate should be modified in-place"
assert not torch.equal(up, up_copy), "Up should be modified in-place"
- assert not torch.equal(
- grad_output, grad_copy
- ), "Grad output should be modified in-place"
+ assert not torch.equal(grad_output, grad_copy), (
+ "Grad output should be modified in-place"
+ )
diff --git a/tests/e2e/kernels/test_lora.py b/tests/e2e/kernels/test_lora.py
index cd6131ff1..9baceb668 100644
--- a/tests/e2e/kernels/test_lora.py
+++ b/tests/e2e/kernels/test_lora.py
@@ -1,7 +1,5 @@
"""Tests for LoRA custom autograd."""
-# pylint: disable=invalid-name,redefined-outer-name
-
import pytest
import torch
from bitsandbytes.functional import QuantState
@@ -333,7 +331,7 @@ def test_lora_qkv(sample_tensors):
X.requires_grad = True
# Test without LoRA adapters
- # pylint: disable=duplicate-code
+
Q1, K1, V1 = LoRA_QKV.apply(
X,
q_weight,
diff --git a/tests/e2e/kernels/test_quantize.py b/tests/e2e/kernels/test_quantize.py
index ea91407ef..60396584c 100644
--- a/tests/e2e/kernels/test_quantize.py
+++ b/tests/e2e/kernels/test_quantize.py
@@ -1,7 +1,5 @@
"""Tests for quantization utility functions."""
-# pylint: disable=invalid-name
-
import torch
from bitsandbytes.functional import QuantState
diff --git a/tests/e2e/kernels/test_swiglu.py b/tests/e2e/kernels/test_swiglu.py
index 60fdafb79..58d5e04a7 100644
--- a/tests/e2e/kernels/test_swiglu.py
+++ b/tests/e2e/kernels/test_swiglu.py
@@ -1,7 +1,5 @@
"""Tests for SwiGLU activation function Triton kernels."""
-# pylint: disable=duplicate-code
-
import torch
import torch.nn.functional as F
@@ -74,6 +72,6 @@ def test_swiglu_inplace_preservation():
assert not torch.equal(gate, gate_copy), "Gate should be modified in-place"
assert not torch.equal(up, up_copy), "Up should be modified in-place"
- assert not torch.equal(
- grad_output, grad_copy
- ), "Grad output should be modified in-place"
+ assert not torch.equal(grad_output, grad_copy), (
+ "Grad output should be modified in-place"
+ )
diff --git a/tests/e2e/multigpu/solo/test_flex.py b/tests/e2e/multigpu/solo/test_flex.py
index cbdf8de96..881d75c25 100644
--- a/tests/e2e/multigpu/solo/test_flex.py
+++ b/tests/e2e/multigpu/solo/test_flex.py
@@ -31,7 +31,6 @@ class TestPackedFlex:
@require_torch_2_6_0
def test_loss_llama(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
diff --git a/tests/e2e/multigpu/solo/test_grpo.py b/tests/e2e/multigpu/solo/test_grpo.py
index 92e0f7040..257a388d0 100644
--- a/tests/e2e/multigpu/solo/test_grpo.py
+++ b/tests/e2e/multigpu/solo/test_grpo.py
@@ -80,7 +80,7 @@ def start_vllm(
cmd_env = env.copy()
cmd_env.update({"VLLM_LOGGING_CONFIG_PATH": vllm_logging_json})
# start `trl vllm-serve` command in the background and capture the process id
- process = subprocess.Popen( # pylint: disable=consider-using-with
+ process = subprocess.Popen(
cmd,
env=cmd_env,
stdout=subprocess.DEVNULL if quiet else subprocess.PIPE,
@@ -144,7 +144,7 @@ def recursive_kill(process: subprocess.Popen):
@pytest.mark.skip(reason="flaky vllm tests in modal")
class TestGRPO:
"""
- Test case for GRPO training using multilpe GPUs
+ Test case for GRPO training using multiple GPUs
"""
def _utils_write_yaml_and_rewards(self, cfg, temp_dir, suffix=""):
diff --git a/tests/e2e/multigpu/test_eval.py b/tests/e2e/multigpu/test_eval.py
index 4f86278ff..504659a3a 100644
--- a/tests/e2e/multigpu/test_eval.py
+++ b/tests/e2e/multigpu/test_eval.py
@@ -21,7 +21,6 @@ class TestMultiGPUEval:
"""
def test_eval_sample_packing(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
@@ -93,7 +92,6 @@ class TestMultiGPUEval:
check_tensorboard(temp_dir + "/runs", "eval/loss", 2.5, "Eval Loss is too high")
def test_eval(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
diff --git a/tests/e2e/multigpu/test_fp8_fsdp2.py b/tests/e2e/multigpu/test_fp8_fsdp2.py
index f7fa29a31..dc369f3de 100644
--- a/tests/e2e/multigpu/test_fp8_fsdp2.py
+++ b/tests/e2e/multigpu/test_fp8_fsdp2.py
@@ -1,7 +1,5 @@
"""Test module for FP8 mixed precision with FSDP2 multi-GPU functionality."""
-# pylint: disable=duplicate-code
-
import os
from pathlib import Path
@@ -28,9 +26,9 @@ def verify_fp8_training_success(temp_dir):
assert len(model_files) > 0, "No model files found - training may have failed"
checkpoint_files = list(output_path.glob("checkpoint-*"))
- assert (
- len(checkpoint_files) > 0
- ), "No checkpoint files found - training may have failed"
+ assert len(checkpoint_files) > 0, (
+ "No checkpoint files found - training may have failed"
+ )
tb_log_path = most_recent_subdir(temp_dir + "/runs")
if tb_log_path:
@@ -42,9 +40,9 @@ def verify_fp8_training_success(temp_dir):
train_loss_df = df[df.tag == "train/train_loss"]
if len(train_loss_df) > 0:
final_loss = train_loss_df.value.values[-1]
- assert not torch.isnan(
- torch.tensor(final_loss)
- ), f"Training loss is NaN: {final_loss}"
+ assert not torch.isnan(torch.tensor(final_loss)), (
+ f"Training loss is NaN: {final_loss}"
+ )
class TestFP8FSDP2:
diff --git a/tests/e2e/multigpu/test_fsdp1.py b/tests/e2e/multigpu/test_fsdp1.py
index fe0badbe2..cb92c80b5 100644
--- a/tests/e2e/multigpu/test_fsdp1.py
+++ b/tests/e2e/multigpu/test_fsdp1.py
@@ -1,7 +1,5 @@
"""Test module for FSDP1 multi-GPU functionality."""
-# pylint: disable=duplicate-code
-
import os
from pathlib import Path
@@ -29,9 +27,9 @@ def verify_training_success(temp_dir):
assert len(model_files) > 0, "No model files found - training may have failed"
checkpoint_files = list(output_path.glob("checkpoint-*"))
- assert (
- len(checkpoint_files) > 0
- ), "No checkpoint files found - training may have failed"
+ assert len(checkpoint_files) > 0, (
+ "No checkpoint files found - training may have failed"
+ )
tb_log_path = most_recent_subdir(temp_dir + "/runs")
if tb_log_path:
@@ -43,9 +41,9 @@ def verify_training_success(temp_dir):
train_loss_df = df[df.tag == "train/train_loss"]
if len(train_loss_df) > 0:
final_loss = train_loss_df.value.values[-1]
- assert not torch.isnan(
- torch.tensor(final_loss)
- ), f"Training loss is NaN: {final_loss}"
+ assert not torch.isnan(torch.tensor(final_loss)), (
+ f"Training loss is NaN: {final_loss}"
+ )
class TestFSDP1:
diff --git a/tests/e2e/multigpu/test_fsdp2.py b/tests/e2e/multigpu/test_fsdp2.py
index 0bb255266..8b7ee710e 100644
--- a/tests/e2e/multigpu/test_fsdp2.py
+++ b/tests/e2e/multigpu/test_fsdp2.py
@@ -1,7 +1,5 @@
"""Test module for FSDP2 multi-GPU functionality."""
-# pylint: disable=duplicate-code
-
import os
from pathlib import Path
@@ -29,9 +27,9 @@ def verify_training_success(temp_dir):
assert len(model_files) > 0, "No model files found - training may have failed"
checkpoint_files = list(output_path.glob("checkpoint-*"))
- assert (
- len(checkpoint_files) > 0
- ), "No checkpoint files found - training may have failed"
+ assert len(checkpoint_files) > 0, (
+ "No checkpoint files found - training may have failed"
+ )
tb_log_path = most_recent_subdir(temp_dir + "/runs")
if tb_log_path:
@@ -43,9 +41,9 @@ def verify_training_success(temp_dir):
train_loss_df = df[df.tag == "train/train_loss"]
if len(train_loss_df) > 0:
final_loss = train_loss_df.value.values[-1]
- assert not torch.isnan(
- torch.tensor(final_loss)
- ), f"Training loss is NaN: {final_loss}"
+ assert not torch.isnan(torch.tensor(final_loss)), (
+ f"Training loss is NaN: {final_loss}"
+ )
class TestFSDP2:
diff --git a/tests/e2e/multigpu/test_gemma3.py b/tests/e2e/multigpu/test_gemma3.py
index 4a7b101a8..51ec68b11 100644
--- a/tests/e2e/multigpu/test_gemma3.py
+++ b/tests/e2e/multigpu/test_gemma3.py
@@ -29,7 +29,6 @@ class TestMultiGPUGemma3:
"""
def test_lora_ddp_packed(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "axolotl-mirrors/gemma-3-4b-pt",
diff --git a/tests/e2e/multigpu/test_llama.py b/tests/e2e/multigpu/test_llama.py
index aab14dcc4..3383e71d1 100644
--- a/tests/e2e/multigpu/test_llama.py
+++ b/tests/e2e/multigpu/test_llama.py
@@ -35,7 +35,6 @@ class TestMultiGPULlama:
"""
def test_lora_ddp(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
@@ -99,7 +98,6 @@ class TestMultiGPULlama:
[1, 2],
)
def test_lora_ddp_packed(self, temp_dir, gradient_accumulation_steps):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
@@ -162,7 +160,6 @@ class TestMultiGPULlama:
)
def test_dpo_lora_ddp(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
@@ -202,7 +199,7 @@ class TestMultiGPULlama:
"max_steps": 2,
"micro_batch_size": 2,
"gradient_accumulation_steps": 2,
- # "gradient_checkpointing": True,
+ "gradient_checkpointing": False,
"output_dir": temp_dir,
"dataset_prepared_path": temp_dir + "/last_run_prepared",
"warmup_steps": 0,
@@ -242,7 +239,6 @@ class TestMultiGPULlama:
)
def test_dpo_qlora_ddp(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
@@ -282,7 +278,7 @@ class TestMultiGPULlama:
"max_steps": 2,
"micro_batch_size": 2,
"gradient_accumulation_steps": 2,
- # "gradient_checkpointing": True,
+ "gradient_checkpointing": False,
"output_dir": temp_dir,
"dataset_prepared_path": temp_dir + "/last_run_prepared",
"warmup_steps": 0,
@@ -326,7 +322,6 @@ class TestMultiGPULlama:
[1, 2],
)
def test_fsdp(self, temp_dir, gradient_accumulation_steps):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
@@ -358,7 +353,6 @@ class TestMultiGPULlama:
"auto_wrap",
],
"fsdp_config": {
- "fsdp_limit_all_gathers": True,
"fsdp_offload_params": False,
"fsdp_sync_module_states": True,
"fsdp_use_orig_params": False,
@@ -402,7 +396,6 @@ class TestMultiGPULlama:
],
)
def test_fsdp_packed(self, temp_dir, fsdp_state_dict_type):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
@@ -437,7 +430,6 @@ class TestMultiGPULlama:
"auto_wrap",
],
"fsdp_config": {
- "fsdp_limit_all_gathers": True,
"fsdp_offload_params": False,
"fsdp_sync_module_states": True,
"fsdp_use_orig_params": False,
@@ -484,7 +476,6 @@ class TestMultiGPULlama:
def test_fsdp2_packed(
self, temp_dir, attention_backend, fsdp_reshard_after_forward
):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
@@ -556,7 +547,6 @@ class TestMultiGPULlama:
)
def test_fsdp_qlora_prequant_packed(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "axolotl-ai-co/SmolLM2-135M-bnb-nf4-bf16",
@@ -602,7 +592,6 @@ class TestMultiGPULlama:
"auto_wrap",
],
"fsdp_config": {
- "fsdp_limit_all_gathers": True,
"fsdp_offload_params": False,
"fsdp_sync_module_states": True,
"fsdp_use_orig_params": False,
@@ -656,7 +645,6 @@ class TestMultiGPULlama:
def test_ds_zero3_packed(
self, temp_dir, gradient_accumulation_steps, deepspeed, qlora
):
- # pylint: disable=duplicate-code
if qlora:
adapter = {
"adapter": "qlora",
@@ -732,7 +720,6 @@ class TestMultiGPULlama:
[True, False],
)
def test_ds_zero2_packed(self, temp_dir, gradient_accumulation_steps, qlora):
- # pylint: disable=duplicate-code
if qlora:
adapter = {
"adapter": "qlora",
@@ -809,7 +796,6 @@ class TestMultiGPULlama:
[True, False],
)
def test_ds_zero1_packed(self, temp_dir, gradient_accumulation_steps, qlora):
- # pylint: disable=duplicate-code
if qlora:
adapter = {
"adapter": "qlora",
@@ -880,7 +866,6 @@ class TestMultiGPULlama:
reason="fix untrained tokens brittle with lots of edge cases in latest transformers"
)
def test_fix_untrained_tokens(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
diff --git a/tests/e2e/multigpu/test_ray.py b/tests/e2e/multigpu/test_ray.py
index 7f1278abf..df41b1444 100644
--- a/tests/e2e/multigpu/test_ray.py
+++ b/tests/e2e/multigpu/test_ray.py
@@ -13,7 +13,6 @@ from axolotl.utils.dict import DictDefault
from tests.e2e.utils import (
check_tensorboard,
require_torch_2_7_0,
- require_torch_lt_2_6_0,
)
AXOLOTL_ROOT = Path(__file__).parent.parent.parent.parent
@@ -24,9 +23,8 @@ class TestMultiGPURay:
Test cases for AnyScale Ray post training
"""
- @require_torch_lt_2_6_0
+ @require_torch_2_7_0
def test_lora_ddp(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
@@ -84,13 +82,12 @@ class TestMultiGPURay:
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
)
- @require_torch_lt_2_6_0
+ @require_torch_2_7_0
@pytest.mark.parametrize(
"gradient_accumulation_steps",
[1, 2],
)
def test_ds_zero2_packed(self, temp_dir, gradient_accumulation_steps):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
@@ -150,7 +147,6 @@ class TestMultiGPURay:
[1, 2],
)
def test_sft_fsdp2_packed(self, temp_dir, gradient_accumulation_steps):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
diff --git a/tests/e2e/multigpu/test_tp.py b/tests/e2e/multigpu/test_tp.py
index 87a1c6339..9891a0906 100644
--- a/tests/e2e/multigpu/test_tp.py
+++ b/tests/e2e/multigpu/test_tp.py
@@ -19,7 +19,6 @@ class TestTensorParallel:
)
@require_torch_2_7_0
def test_fft_sft(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "Qwen/Qwen2.5-0.5B",
diff --git a/tests/e2e/patched/lora_kernels/test_lora_kernel_patching.py b/tests/e2e/patched/lora_kernels/test_lora_kernel_patching.py
index b4dc5de54..73f883858 100644
--- a/tests/e2e/patched/lora_kernels/test_lora_kernel_patching.py
+++ b/tests/e2e/patched/lora_kernels/test_lora_kernel_patching.py
@@ -1,7 +1,5 @@
"""Integration tests for LoRA activation and attention kernels."""
-# pylint: disable=redefined-outer-name
-
from pathlib import Path
import pytest
@@ -88,7 +86,7 @@ def test_attention_patching_integration(model_name, attention_cls):
cfg = DictDefault({"base_model": model_name})
# Store the original implementation
- original_forward = getattr(attention_cls, "forward")
+ original_forward = attention_cls.forward
# Apply patch
patch_self_attn_lora(cfg)
@@ -104,7 +102,7 @@ def test_attention_patching_integration(model_name, attention_cls):
assert hasattr(attention_cls, "_original_forward")
# Clean up
- setattr(attention_cls, "forward", original_forward)
+ attention_cls.forward = original_forward
delattr(attention_cls, "_original_forward")
@@ -162,7 +160,7 @@ def test_geglu_model_integration():
"""Test GeGLU activation with Gemma model."""
model = AutoModelForCausalLM.from_pretrained(
"trl-internal-testing/tiny-Gemma2ForCausalLM",
- torch_dtype=torch.float16,
+ dtype=torch.float16,
device_map="cuda:0",
)
peft_config = get_peft_config(
@@ -379,9 +377,9 @@ def test_model_architecture(model_config):
# Verify correct activation function
layer = patched_model.model.model.layers[0]
- assert (
- layer.mlp.forward.__func__ is model_config["expected_activation"]
- ), f"Wrong activation for {model_config['name']}"
+ assert layer.mlp.forward.__func__ is model_config["expected_activation"], (
+ f"Wrong activation for {model_config['name']}"
+ )
# Test forward pass
inputs = get_test_inputs(model)
@@ -390,12 +388,11 @@ def test_model_architecture(model_config):
patched_output = patched_model(inputs).logits
# Check outputs match
- assert torch.allclose(
- original_output, patched_output, rtol=1e-4
- ), f"Outputs don't match for {model_config['name']}"
+ assert torch.allclose(original_output, patched_output, rtol=1e-4), (
+ f"Outputs don't match for {model_config['name']}"
+ )
-# pylint: disable=duplicate-code
def test_kernel_training_integration(temp_dir):
"""Test model loading with kernel patches enabled."""
from axolotl.cli.utils import load_model_and_tokenizer
@@ -563,15 +560,13 @@ def test_kernel_training_integration_dropout_non_zero(temp_dir):
model_loader = ModelLoader(cfg, tokenizer)
# Apply patch
- model_loader.patch_manager._apply_self_attention_lora_patch() # pylint: disable=protected-access
+ model_loader.patch_manager._apply_self_attention_lora_patch()
# Verify patch was not applied
assert attention_cls.forward == original_forward_method
# Apply apply_lora_kernel_patches
- model_loader.patch_manager._apply_lora_kernel_patch( # pylint: disable=protected-access
- model
- )
+ model_loader.patch_manager._apply_lora_kernel_patch(model)
# Verify patch was not applied
layers = get_layers(model)
diff --git a/tests/e2e/patched/test_4d_multipack_llama.py b/tests/e2e/patched/test_4d_multipack_llama.py
index 1824443e7..ef28cc406 100644
--- a/tests/e2e/patched/test_4d_multipack_llama.py
+++ b/tests/e2e/patched/test_4d_multipack_llama.py
@@ -19,7 +19,6 @@ class Test4dMultipackLlama(unittest.TestCase):
@with_temp_dir
def test_sdp_lora_packing(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
@@ -67,7 +66,6 @@ class Test4dMultipackLlama(unittest.TestCase):
@with_temp_dir
def test_torch_lora_packing(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
diff --git a/tests/e2e/patched/test_activation_checkpointing.py b/tests/e2e/patched/test_activation_checkpointing.py
index 06e3de274..e8006c162 100644
--- a/tests/e2e/patched/test_activation_checkpointing.py
+++ b/tests/e2e/patched/test_activation_checkpointing.py
@@ -32,10 +32,9 @@ class TestActivationCheckpointing:
def test_activation_checkpointing_offload(
self,
temp_dir,
- fix_checkpoint_after_test, # pylint: disable=unused-argument,redefined-outer-name
+ fix_checkpoint_after_test,
gradient_checkpointing,
):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
@@ -70,7 +69,7 @@ class TestActivationCheckpointing:
"save_safetensors": True,
"gradient_checkpointing": gradient_checkpointing,
"save_first_step": False,
- "dataset_processes": 4,
+ "dataset_num_proc": 4,
}
)
diff --git a/tests/e2e/patched/test_cli_integrations.py b/tests/e2e/patched/test_cli_integrations.py
index 6c908faf1..6eba92689 100644
--- a/tests/e2e/patched/test_cli_integrations.py
+++ b/tests/e2e/patched/test_cli_integrations.py
@@ -10,7 +10,6 @@ from axolotl.cli.config import load_cfg
from axolotl.utils.dict import DictDefault
-# pylint: disable=duplicate-code
class TestPluginArgs:
"""
test class for plugin args loaded from the config file
diff --git a/tests/e2e/patched/test_fa_xentropy.py b/tests/e2e/patched/test_fa_xentropy.py
index 38099b220..9f4699854 100644
--- a/tests/e2e/patched/test_fa_xentropy.py
+++ b/tests/e2e/patched/test_fa_xentropy.py
@@ -23,7 +23,6 @@ class TestFAXentropyLlama:
[1, 4],
)
def test_lora_packing_fa_cross_entropy(self, temp_dir, gradient_accumulation_steps):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
diff --git a/tests/e2e/patched/test_falcon_samplepack.py b/tests/e2e/patched/test_falcon_samplepack.py
index ef31b11c7..cc5091403 100644
--- a/tests/e2e/patched/test_falcon_samplepack.py
+++ b/tests/e2e/patched/test_falcon_samplepack.py
@@ -22,7 +22,6 @@ class TestFalconPatched(unittest.TestCase):
@pytest.mark.skip(reason="no tiny models for testing with safetensors")
@with_temp_dir
def test_qlora(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "illuin/tiny-random-FalconForCausalLM",
@@ -71,7 +70,6 @@ class TestFalconPatched(unittest.TestCase):
@pytest.mark.skip(reason="no tiny models for testing with safetensors")
@with_temp_dir
def test_ft(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "illuin/tiny-random-FalconForCausalLM",
diff --git a/tests/e2e/patched/test_flattening.py b/tests/e2e/patched/test_flattening.py
index fdaab558d..2c247d406 100644
--- a/tests/e2e/patched/test_flattening.py
+++ b/tests/e2e/patched/test_flattening.py
@@ -23,7 +23,6 @@ class TestFAFlattening:
[1, 4],
)
def test_lora_packing_flattening(self, temp_dir, gradient_accumulation_steps):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
diff --git a/tests/e2e/patched/test_fsdp2_qlora.py b/tests/e2e/patched/test_fsdp2_qlora.py
index 9dd053ad8..de9c929e1 100644
--- a/tests/e2e/patched/test_fsdp2_qlora.py
+++ b/tests/e2e/patched/test_fsdp2_qlora.py
@@ -1,131 +1,31 @@
-"""Integration tests for FSDP Params4bit patches."""
+"""Integration tests for FSDP2 Params4bit patches."""
-from unittest.mock import Mock, patch
-
-import bitsandbytes as bnb
import pytest
-import torch
from torch.distributed.fsdp._fully_shard._fsdp_param import FSDPParam
-from axolotl.monkeypatch.fsdp2_qlora import (
- apply_bnb_torch_function_patch,
- patched_torch_function,
-)
-
-
-@pytest.fixture
-def mock_params4bit():
- """Create a mock Params4bit instance with test attributes."""
- mock_instance = Mock()
- mock_instance.requires_grad = True
- mock_instance.quant_state = "test_state"
- mock_instance.blocksize = 128
- mock_instance.compress_statistics = True
- mock_instance.quant_type = "fp4"
- mock_instance.quant_storage = "test_storage"
- mock_instance.module = "test_module"
- mock_instance.bnb_quantized = True
- return mock_instance
-
-
-class TestBnbTorchFunctionPatch:
- """Test the Params4bit.__torch_function__ patch."""
-
- def test_apply_patch(self):
- """Test that the patch can be applied."""
- with patch("bitsandbytes.nn.modules.Params4bit") as mock_cls:
- apply_bnb_torch_function_patch()
- assert hasattr(mock_cls, "__torch_function__")
- assert isinstance(mock_cls.__torch_function__, classmethod)
-
- # pylint: disable=redefined-outer-name
- def test_torch_chunk_preserves_attributes(self, mock_params4bit):
- """Test that torch.chunk preserves Params4bit attributes."""
- mock_cls = Mock()
- chunks = (torch.tensor([1, 2]), torch.tensor([3, 4]))
-
- with patch("torch.nn.Parameter.__torch_function__", return_value=chunks):
- result = patched_torch_function(
- mock_cls,
- torch.chunk,
- (type(mock_params4bit),),
- args=(mock_params4bit, 2),
- )
-
- assert isinstance(result, tuple)
- assert len(result) == 2
-
- # Check that Params4bit constructor was called with preserved attributes
- assert mock_cls.call_count == 2
- for call in mock_cls.call_args_list:
- kwargs = call[1]
- assert kwargs["requires_grad"] == mock_params4bit.requires_grad
- assert kwargs["quant_state"] == mock_params4bit.quant_state
- assert kwargs["blocksize"] == mock_params4bit.blocksize
-
- # pylint: disable=redefined-outer-name
- def test_other_functions_fallback(self, mock_params4bit):
- """Test that non-chunk/split functions use Parameter fallback."""
- mock_cls = Mock()
- fallback_result = torch.tensor([5, 6, 7])
-
- with patch(
- "torch.nn.Parameter.__torch_function__", return_value=fallback_result
- ) as mock_fallback:
- result = patched_torch_function(
- mock_cls, torch.add, (type(mock_params4bit),), args=(mock_params4bit, 1)
- )
-
- # Should call Parameter.__torch_function__ and return its result
- mock_fallback.assert_called_once()
- assert result is fallback_result
- mock_cls.assert_not_called()
-
class TestFSDPPatchIntegration:
"""Test FSDP patch integration."""
@pytest.mark.integration
- def test_all_patches_together(self):
+ def test_fsdp2_init_patches(self):
"""Test that all patches can be applied together."""
from axolotl.monkeypatch.fsdp2_qlora import (
apply_init_sharded_param_patch,
apply_init_unsharded_param_patch,
)
- # Store original methods before patching
- original_torch_function = getattr(
- bnb.nn.modules.Params4bit, "__torch_function__", None
- )
-
- # pylint: disable=protected-access
original_init_sharded = FSDPParam._init_sharded_param
original_init_unsharded = FSDPParam.init_unsharded_param
# Apply patches
- apply_bnb_torch_function_patch()
apply_init_sharded_param_patch()
apply_init_unsharded_param_patch()
- # Verify patches were applied
- current_torch_function = getattr(
- bnb.nn.modules.Params4bit, "__torch_function__", None
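+ # Verify the FSDPParam init methods were patched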
+ assert FSDPParam._init_sharded_param != original_init_sharded, (
+ "_init_sharded_param was not patched"
+ )
+ assert FSDPParam.init_unsharded_param != original_init_unsharded, (
+ "init_unsharded_param was not patched"
)
- if original_torch_function is not None:
- assert (
- current_torch_function != original_torch_function
- ), "Params4bit.__torch_function__ was not patched"
- else:
- assert (
- current_torch_function is not None
- ), "Params4bit.__torch_function__ was not added"
-
- # Check that FSDP methods were patched
- assert (
- # pylint: disable=protected-access
- FSDPParam._init_sharded_param
- != original_init_sharded
- ), "_init_sharded_param was not patched"
- assert (
- FSDPParam.init_unsharded_param != original_init_unsharded
- ), "init_unsharded_param was not patched"
diff --git a/tests/e2e/patched/test_fused_llama.py b/tests/e2e/patched/test_fused_llama.py
index f0c4f155f..f0c5df18a 100644
--- a/tests/e2e/patched/test_fused_llama.py
+++ b/tests/e2e/patched/test_fused_llama.py
@@ -23,7 +23,6 @@ class TestFusedLlama(unittest.TestCase):
@with_temp_dir
def test_fft_packing(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
diff --git a/tests/e2e/patched/test_llama_s2_attention.py b/tests/e2e/patched/test_llama_s2_attention.py
index ba5556a59..0dd748945 100644
--- a/tests/e2e/patched/test_llama_s2_attention.py
+++ b/tests/e2e/patched/test_llama_s2_attention.py
@@ -22,7 +22,6 @@ class TestLlamaShiftedSparseAttention(unittest.TestCase):
@with_temp_dir
def test_lora_s2_attn(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
@@ -71,7 +70,6 @@ class TestLlamaShiftedSparseAttention(unittest.TestCase):
@with_temp_dir
def test_fft_s2_attn(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
diff --git a/tests/e2e/patched/test_lora_llama_multipack.py b/tests/e2e/patched/test_lora_llama_multipack.py
index fdf6adbc6..1833c750b 100644
--- a/tests/e2e/patched/test_lora_llama_multipack.py
+++ b/tests/e2e/patched/test_lora_llama_multipack.py
@@ -22,7 +22,6 @@ class TestLoraLlama(unittest.TestCase):
@with_temp_dir
def test_lora_packing(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
@@ -73,7 +72,6 @@ class TestLoraLlama(unittest.TestCase):
@pytest.mark.skipif(not is_auto_gptq_available(), reason="auto-gptq not available")
@with_temp_dir
def test_lora_gptq_packed(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "lilmeaty/SmolLM2-135M-Instruct-GPTQ",
diff --git a/tests/e2e/patched/test_mistral_samplepack.py b/tests/e2e/patched/test_mistral_samplepack.py
index bea0f9c68..e03941b07 100644
--- a/tests/e2e/patched/test_mistral_samplepack.py
+++ b/tests/e2e/patched/test_mistral_samplepack.py
@@ -20,7 +20,6 @@ class TestMistral(unittest.TestCase):
@require_torch_2_6_0
@with_temp_dir
def test_lora_packing(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "trl-internal-testing/tiny-MistralForCausalLM-0.2",
@@ -68,7 +67,6 @@ class TestMistral(unittest.TestCase):
@with_temp_dir
def test_ft_packing(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "trl-internal-testing/tiny-MistralForCausalLM-0.2",
diff --git a/tests/e2e/patched/test_mixtral_samplepack.py b/tests/e2e/patched/test_mixtral_samplepack.py
index 09e427abd..3517ff3db 100644
--- a/tests/e2e/patched/test_mixtral_samplepack.py
+++ b/tests/e2e/patched/test_mixtral_samplepack.py
@@ -19,7 +19,6 @@ class TestMixtral(unittest.TestCase):
@with_temp_dir
def test_qlora(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "hf-internal-testing/Mixtral-tiny",
@@ -64,7 +63,6 @@ class TestMixtral(unittest.TestCase):
@with_temp_dir
def test_ft(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "hf-internal-testing/Mixtral-tiny",
diff --git a/tests/e2e/patched/test_model_patches.py b/tests/e2e/patched/test_model_patches.py
index b90be23e4..aaaaf5fe2 100644
--- a/tests/e2e/patched/test_model_patches.py
+++ b/tests/e2e/patched/test_model_patches.py
@@ -89,5 +89,5 @@ class TestModelPatches(unittest.TestCase):
assert (
"torch.jit"
- in transformers.modeling_flash_attention_utils._get_unpad_data.__module__ # pylint: disable=protected-access
+ in transformers.modeling_flash_attention_utils._get_unpad_data.__module__
)
diff --git a/tests/e2e/patched/test_peft_embeddings.py b/tests/e2e/patched/test_peft_embeddings.py
index 4769319ae..374ef97d8 100644
--- a/tests/e2e/patched/test_peft_embeddings.py
+++ b/tests/e2e/patched/test_peft_embeddings.py
@@ -15,7 +15,6 @@ class TestLlamaPeftEmbeddings:
"""
def test_peft_embeddings_upcast(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
diff --git a/tests/e2e/patched/test_phi_multipack.py b/tests/e2e/patched/test_phi_multipack.py
index 1f0ddd630..77b2d99e5 100644
--- a/tests/e2e/patched/test_phi_multipack.py
+++ b/tests/e2e/patched/test_phi_multipack.py
@@ -19,7 +19,6 @@ class TestPhiMultipack(unittest.TestCase):
@with_temp_dir
def test_ft_packed(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "microsoft/phi-1_5",
@@ -67,7 +66,6 @@ class TestPhiMultipack(unittest.TestCase):
@with_temp_dir
def test_qlora_packed(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "microsoft/phi-1_5",
diff --git a/tests/e2e/patched/test_resume.py b/tests/e2e/patched/test_resume.py
index 54b8245ee..747b79dc7 100644
--- a/tests/e2e/patched/test_resume.py
+++ b/tests/e2e/patched/test_resume.py
@@ -22,7 +22,6 @@ class TestResumeLlama:
@require_torch_2_6_0
def test_resume_lora_packed(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
diff --git a/tests/e2e/patched/test_unsloth_qlora.py b/tests/e2e/patched/test_unsloth_qlora.py
index 2c8ee4eb0..bf00e8a5f 100644
--- a/tests/e2e/patched/test_unsloth_qlora.py
+++ b/tests/e2e/patched/test_unsloth_qlora.py
@@ -12,7 +12,6 @@ from axolotl.utils.dict import DictDefault
from ..utils import check_model_output_exists, check_tensorboard
-# pylint: disable=duplicate-code
@pytest.mark.skip(
reason="Unsloth integration will be broken going into latest transformers"
)
diff --git a/tests/e2e/solo/test_flex.py b/tests/e2e/solo/test_flex.py
index 76364fc0e..abe8fb69a 100644
--- a/tests/e2e/solo/test_flex.py
+++ b/tests/e2e/solo/test_flex.py
@@ -22,7 +22,6 @@ class TestPackedFlex(unittest.TestCase):
@require_torch_2_6_0
@with_temp_dir
def test_loss_llama(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
diff --git a/tests/e2e/solo/test_relora_llama.py b/tests/e2e/solo/test_relora_llama.py
index b399b4680..be77684ba 100644
--- a/tests/e2e/solo/test_relora_llama.py
+++ b/tests/e2e/solo/test_relora_llama.py
@@ -20,7 +20,6 @@ class TestReLoraLlama(unittest.TestCase):
@with_temp_dir
def test_relora(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
@@ -76,9 +75,9 @@ class TestReLoraLlama(unittest.TestCase):
train(cfg=cfg, dataset_meta=dataset_meta)
check_model_output_exists(Path(temp_dir) / "checkpoint-100/adapter", cfg)
- assert (
- Path(temp_dir) / "checkpoint-100/relora/model.safetensors"
- ).exists(), "Relora model checkpoint not found"
+ assert (Path(temp_dir) / "checkpoint-100/relora/model.safetensors").exists(), (
+ "Relora model checkpoint not found"
+ )
check_tensorboard(
temp_dir + "/runs", "train/grad_norm", 0.2, "grad_norm is too high"
diff --git a/tests/e2e/test_activation_offloading.py b/tests/e2e/test_activation_offloading.py
index 06c5c0656..9df85ab31 100644
--- a/tests/e2e/test_activation_offloading.py
+++ b/tests/e2e/test_activation_offloading.py
@@ -11,8 +11,6 @@ from axolotl.utils.dict import DictDefault
from .utils import check_model_output_exists
-# pylint: disable=duplicate-code
-
class TestActivationOffloading:
"""
@@ -28,7 +26,6 @@ class TestActivationOffloading:
temp_dir,
adapter,
):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
diff --git a/tests/e2e/test_deepseekv3.py b/tests/e2e/test_deepseekv3.py
index e4a47fb0a..e11be8265 100644
--- a/tests/e2e/test_deepseekv3.py
+++ b/tests/e2e/test_deepseekv3.py
@@ -25,7 +25,6 @@ class TestDeepseekV3:
[True, False],
)
def test_lora_deepseekv3(self, temp_dir, sample_packing):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "axolotl-ai-co/DeepSeek-V3-11M",
@@ -83,7 +82,6 @@ class TestDeepseekV3:
[True, False],
)
def test_fft_deepseekv3(self, temp_dir, sample_packing):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "axolotl-ai-co/DeepSeek-V3-11M",
diff --git a/tests/e2e/test_diffusion.py b/tests/e2e/test_diffusion.py
new file mode 100644
index 000000000..cc3d8070b
--- /dev/null
+++ b/tests/e2e/test_diffusion.py
@@ -0,0 +1,141 @@
+"""E2E smoke test for diffusion training plugin."""
+
+from axolotl.common.datasets import load_datasets
+from axolotl.train import train
+from axolotl.utils.config import normalize_config, validate_config
+from axolotl.utils.dict import DictDefault
+
+from tests.e2e.utils import check_model_output_exists
+
+
+class TestDiffusion:
+ """Test case for diffusion training plugin."""
+
+ def test_diffusion_smoke_test(self, temp_dir):
+ """
+ Smoke test for diffusion training to ensure the plugin loads and trains without
+ error.
+ """
+ cfg = DictDefault(
+ {
+ "base_model": "HuggingFaceTB/SmolLM2-135M",
+ "tokenizer_type": "AutoTokenizer",
+ "trust_remote_code": True,
+ "sequence_len": 256,
+ "val_set_size": 0.1,
+ "special_tokens": {
+ "pad_token": "<|endoftext|>",
+ },
+ "datasets": [
+ {
+ "path": "mhenrichsen/alpaca_2k_test",
+ "type": "alpaca",
+ },
+ ],
+ "num_epochs": 1,
+ "max_steps": 3,
+ "micro_batch_size": 1,
+ "gradient_accumulation_steps": 1,
+ "output_dir": temp_dir,
+ "learning_rate": 0.0001,
+ "optimizer": "adamw_torch",
+ "lr_scheduler": "cosine",
+ "bf16": True,
+ "save_safetensors": True,
+ "save_first_step": False,
+ "logging_steps": 1,
+ "eval_steps": 3,
+ # Diffusion-specific config
+ "plugins": ["axolotl.integrations.diffusion.DiffusionPlugin"],
+ "diffusion": {
+ # sample generation
+ "generate_samples": True,
+ "generation_interval": 1,
+ "num_generation_samples": 1,
+ "generation_steps": 2,
+ "generation_max_length": 32,
+ "generation_temperature": 0.0,
+ # training-specific
+ "mask_token_id": 16,
+ "eps": 1e-3,
+ "importance_weighting": False,
+ },
+ }
+ )
+
+ cfg = validate_config(cfg)
+ normalize_config(cfg)
+ dataset_meta = load_datasets(cfg=cfg)
+
+ train(cfg=cfg, dataset_meta=dataset_meta)
+ check_model_output_exists(temp_dir, cfg)
+
+ def test_diffusion_sft_labels(self, temp_dir):
+ """Test that diffusion training properly handles SFT data with labels."""
+ cfg = DictDefault(
+ {
+ "base_model": "HuggingFaceTB/SmolLM2-135M",
+ "tokenizer_type": "AutoTokenizer",
+ "trust_remote_code": True,
+ "sequence_len": 256,
+ "val_set_size": 0.1,
+ "special_tokens": {
+ "pad_token": "<|endoftext|>",
+ },
+ "datasets": [
+ {
+ "path": "mhenrichsen/alpaca_2k_test",
+ "type": "alpaca",
+ },
+ ],
+ "num_epochs": 1,
+ "max_steps": 3,
+ "micro_batch_size": 1,
+ "gradient_accumulation_steps": 1,
+ "output_dir": temp_dir,
+ "learning_rate": 0.0001,
+ "optimizer": "adamw_torch",
+ "lr_scheduler": "cosine",
+ "bf16": True,
+ "save_safetensors": True,
+ "save_first_step": False,
+ "logging_steps": 1,
+ "eval_steps": 2,
+ # Diffusion-specific config
+ "plugins": ["axolotl.integrations.diffusion.DiffusionPlugin"],
+ "diffusion": {
+ # sample generation
+ "generate_samples": True,
+ "generation_interval": 1,
+ "num_generation_samples": 1,
+ "generation_steps": 2,
+ "generation_max_length": 32,
+ "generation_temperature": 0.0,
+ # training-specific
+ "mask_token_id": 16,
+ "eps": 1e-3,
+ "importance_weighting": True,
+ },
+ # Ensure we have proper SFT labels
+ "train_on_inputs": False,
+ }
+ )
+
+ cfg = validate_config(cfg)
+ normalize_config(cfg)
+ dataset_meta = load_datasets(cfg=cfg)
+
+ # Verify that the dataset has labels
+ sample = dataset_meta.train_dataset[0]
+ assert "labels" in sample, "SFT dataset should have labels"
+
+ # Check that some labels are -100 (prompt tokens)
+ labels = sample["labels"]
+ if hasattr(labels, "tolist"):
+ labels = labels.tolist()
+ assert -100 in labels, "SFT dataset should have -100 labels for prompt tokens"
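+ # (-100 is the standard ignore_index, so prompt tokens are excluded from the loss)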
+
+ train(cfg=cfg, dataset_meta=dataset_meta)
+ check_model_output_exists(temp_dir, cfg)
diff --git a/tests/e2e/test_dpo.py b/tests/e2e/test_dpo.py
index a1df69535..8f577ef47 100644
--- a/tests/e2e/test_dpo.py
+++ b/tests/e2e/test_dpo.py
@@ -21,7 +21,6 @@ class TestDPOLlamaLora(unittest.TestCase):
@with_temp_dir
def test_dpo_lora(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
@@ -70,7 +69,6 @@ class TestDPOLlamaLora(unittest.TestCase):
@with_temp_dir
def test_dpo_nll_lora(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
@@ -120,7 +118,6 @@ class TestDPOLlamaLora(unittest.TestCase):
@with_temp_dir
def test_dpo_use_weighting(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
@@ -171,7 +168,6 @@ class TestDPOLlamaLora(unittest.TestCase):
@pytest.mark.skip("kto_pair no longer supported in trl")
@with_temp_dir
def test_kto_pair_lora(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
@@ -220,7 +216,6 @@ class TestDPOLlamaLora(unittest.TestCase):
@with_temp_dir
def test_ipo_lora(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
@@ -269,7 +264,6 @@ class TestDPOLlamaLora(unittest.TestCase):
@with_temp_dir
def test_orpo_lora(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
@@ -322,7 +316,6 @@ class TestDPOLlamaLora(unittest.TestCase):
@pytest.mark.skip(reason="Fix the implementation")
@with_temp_dir
def test_kto_lora(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
diff --git a/tests/e2e/test_embeddings_lr.py b/tests/e2e/test_embeddings_lr.py
index e4a06ad14..633e449ef 100644
--- a/tests/e2e/test_embeddings_lr.py
+++ b/tests/e2e/test_embeddings_lr.py
@@ -19,7 +19,6 @@ class TestEmbeddingsLrScale(unittest.TestCase):
@with_temp_dir
def test_train_w_embedding_lr_scale(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
@@ -65,7 +64,6 @@ class TestEmbeddingsLrScale(unittest.TestCase):
@with_temp_dir
def test_train_w_embedding_lr(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
diff --git a/tests/e2e/test_evaluate.py b/tests/e2e/test_evaluate.py
index 977497e5e..3b0ab1450 100644
--- a/tests/e2e/test_evaluate.py
+++ b/tests/e2e/test_evaluate.py
@@ -13,7 +13,6 @@ class TestE2eEvaluate:
"""Test cases for evaluate CLI"""
def test_evaluate(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
diff --git a/tests/e2e/test_falcon.py b/tests/e2e/test_falcon.py
index 5be6efcf6..1a363fe6a 100644
--- a/tests/e2e/test_falcon.py
+++ b/tests/e2e/test_falcon.py
@@ -22,7 +22,6 @@ class TestFalcon(unittest.TestCase):
@pytest.mark.skip(reason="no tiny models for testing with safetensors")
@with_temp_dir
def test_lora(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "illuin/tiny-random-FalconForCausalLM",
@@ -74,7 +73,6 @@ class TestFalcon(unittest.TestCase):
@pytest.mark.skip(reason="no tiny models for testing with safetensors")
@with_temp_dir
def test_lora_added_vocab(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "illuin/tiny-random-FalconForCausalLM",
@@ -130,7 +128,6 @@ class TestFalcon(unittest.TestCase):
@pytest.mark.skip(reason="no tiny models for testing with safetensors")
@with_temp_dir
def test_ft(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "illuin/tiny-random-FalconForCausalLM",
diff --git a/tests/e2e/test_gemma2.py b/tests/e2e/test_gemma2.py
index c0eba72a7..9e9f1a9cc 100644
--- a/tests/e2e/test_gemma2.py
+++ b/tests/e2e/test_gemma2.py
@@ -22,7 +22,6 @@ class TestGemma2:
[True, False],
)
def test_lora_gemma2(self, temp_dir, sample_packing):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "axolotl-ai-co/gemma-2-33M",
@@ -78,7 +77,6 @@ class TestGemma2:
[True, False],
)
def test_fft_gemma2(self, temp_dir, sample_packing):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "axolotl-ai-co/gemma-2-33M",
diff --git a/tests/e2e/test_gemma3_text.py b/tests/e2e/test_gemma3_text.py
index ef38d028d..6cd999242 100644
--- a/tests/e2e/test_gemma3_text.py
+++ b/tests/e2e/test_gemma3_text.py
@@ -22,7 +22,6 @@ class TestGemma3Text:
[True, False],
)
def test_lora_gemma3_text(self, temp_dir, sample_packing):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "axolotl-ai-co/gemma-3-34M",
@@ -78,7 +77,6 @@ class TestGemma3Text:
[True, False],
)
def test_fft_gemma3_text(self, temp_dir, sample_packing):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "axolotl-ai-co/gemma-3-34M",
diff --git a/tests/e2e/test_imports.py b/tests/e2e/test_imports.py
index 050e4dfb3..4c01e50be 100644
--- a/tests/e2e/test_imports.py
+++ b/tests/e2e/test_imports.py
@@ -11,11 +11,7 @@ class TestImports(unittest.TestCase):
"""
def test_import_causal_trainer(self):
- from axolotl.core.builders import ( # pylint: disable=unused-import # noqa: F401
- HFCausalTrainerBuilder,
- )
+ pass
def test_import_rl_trainer(self):
- from axolotl.core.builders import ( # pylint: disable=unused-import # noqa: F401
- HFRLTrainerBuilder,
- )
+ pass
diff --git a/tests/e2e/test_llama.py b/tests/e2e/test_llama.py
index 1e6df0be9..de085cbe2 100644
--- a/tests/e2e/test_llama.py
+++ b/tests/e2e/test_llama.py
@@ -16,7 +16,6 @@ class TestLlama:
"""
def test_fft_trust_remote_code(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
@@ -57,7 +56,6 @@ class TestLlama:
check_model_output_exists(temp_dir, cfg)
def test_fix_untrained_tokens(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
@@ -105,7 +103,6 @@ class TestLlama:
check_model_output_exists(temp_dir, cfg)
def test_fix_untrained_tokens_already_trained(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
@@ -150,7 +147,6 @@ class TestLlama:
check_model_output_exists(temp_dir, cfg)
def test_batch_flattening(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
diff --git a/tests/e2e/test_llama_pretrain.py b/tests/e2e/test_llama_pretrain.py
index bd5502300..f0daa9dd6 100644
--- a/tests/e2e/test_llama_pretrain.py
+++ b/tests/e2e/test_llama_pretrain.py
@@ -22,7 +22,6 @@ class TestPretrainLlama:
],
)
def test_pretrain(self, temp_dir, sample_packing, pretrain_multipack_attn):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
@@ -30,7 +29,7 @@ class TestPretrainLlama:
"sequence_len": 1024,
"sample_packing": sample_packing,
"pretrain_multipack_attn": pretrain_multipack_attn,
- "dataset_processes": 1,
+ "dataset_num_proc": 1,
"special_tokens": {
"pad_token": "<|endoftext|>",
},
diff --git a/tests/e2e/test_llama_vision.py b/tests/e2e/test_llama_vision.py
index 760759bca..0cc927f76 100644
--- a/tests/e2e/test_llama_vision.py
+++ b/tests/e2e/test_llama_vision.py
@@ -19,7 +19,6 @@ class TestLlamaVision(unittest.TestCase):
@with_temp_dir
def test_lora_llama_vision_text_only_dataset(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "axolotl-ai-co/Llama-3.2-39M-Vision",
@@ -67,7 +66,6 @@ class TestLlamaVision(unittest.TestCase):
@with_temp_dir
def test_lora_llama_vision_multimodal_dataset(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "axolotl-ai-co/Llama-3.2-39M-Vision",
diff --git a/tests/e2e/test_load_model.py b/tests/e2e/test_load_model.py
index 8fcffeb11..7c5389a58 100644
--- a/tests/e2e/test_load_model.py
+++ b/tests/e2e/test_load_model.py
@@ -56,13 +56,11 @@ class TestLoadModelUtils:
"context_parallel_size": 1,
}
)
- self.model_loader = ( # pylint: disable=attribute-defined-outside-init
- ModelLoader(
- cfg=self.cfg,
- tokenizer="",
- inference=False,
- reference_model=True,
- )
+ self.model_loader = ModelLoader(
+ cfg=self.cfg,
+ tokenizer="",
+ inference=False,
+ reference_model=True,
)
@pytest.mark.parametrize("embedding_modules", ["embed_tokens", "lm_head"])
@@ -74,7 +72,7 @@ class TestLoadModelUtils:
self, temp_dir, embedding_modules, dist_dtype, before_kbit_train_or_finetune
):
self.cfg.output_dir = temp_dir
- self.model_loader.tokenizer = load_tokenizer(self.cfg) # pylint: disable=all
+ self.model_loader.tokenizer = load_tokenizer(self.cfg)
self.model_loader.load()
self.model_loader._convert_embedding_modules_dtype(
embedding_modules, dist_dtype, before_kbit_train_or_finetune
diff --git a/tests/e2e/test_lora_llama.py b/tests/e2e/test_lora_llama.py
index 7e0ff46cf..b6ee393df 100644
--- a/tests/e2e/test_lora_llama.py
+++ b/tests/e2e/test_lora_llama.py
@@ -19,7 +19,6 @@ class TestLoraLlama(unittest.TestCase):
@with_temp_dir
def test_lora(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
diff --git a/tests/e2e/test_mamba.py b/tests/e2e/test_mamba.py
index 73d3bdc26..67935377d 100644
--- a/tests/e2e/test_mamba.py
+++ b/tests/e2e/test_mamba.py
@@ -22,7 +22,6 @@ class TestMamba(unittest.TestCase):
@with_temp_dir
def test_fft(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "state-spaces/mamba-130m",
diff --git a/tests/e2e/test_mistral.py b/tests/e2e/test_mistral.py
index f47f794e0..08b3b05af 100644
--- a/tests/e2e/test_mistral.py
+++ b/tests/e2e/test_mistral.py
@@ -21,7 +21,6 @@ class TestMistral(unittest.TestCase):
@with_temp_dir
def test_lora(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "trl-internal-testing/tiny-MistralForCausalLM-0.2",
@@ -68,7 +67,6 @@ class TestMistral(unittest.TestCase):
@with_temp_dir
def test_ft(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "trl-internal-testing/tiny-MistralForCausalLM-0.2",
diff --git a/tests/e2e/test_mixtral.py b/tests/e2e/test_mixtral.py
index 3fe2bf70f..c46cf906d 100644
--- a/tests/e2e/test_mixtral.py
+++ b/tests/e2e/test_mixtral.py
@@ -22,7 +22,6 @@ class TestMixtral(unittest.TestCase):
@with_temp_dir
def test_qlora_w_fa2(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "hf-internal-testing/Mixtral-tiny",
@@ -78,7 +77,6 @@ class TestMixtral(unittest.TestCase):
@with_temp_dir
def test_qlora_wo_fa2(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "hf-internal-testing/Mixtral-tiny",
@@ -134,7 +132,6 @@ class TestMixtral(unittest.TestCase):
@with_temp_dir
def test_16bit_lora_w_fa2(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "hf-internal-testing/Mixtral-tiny",
@@ -193,7 +190,6 @@ class TestMixtral(unittest.TestCase):
@with_temp_dir
def test_16bit_lora_wo_fa2(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "hf-internal-testing/Mixtral-tiny",
@@ -252,7 +248,6 @@ class TestMixtral(unittest.TestCase):
@with_temp_dir
def test_ft(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "hf-internal-testing/Mixtral-tiny",
diff --git a/tests/e2e/test_optimizers.py b/tests/e2e/test_optimizers.py
index 987d86041..dbea92a5b 100644
--- a/tests/e2e/test_optimizers.py
+++ b/tests/e2e/test_optimizers.py
@@ -25,7 +25,6 @@ class TestCustomOptimizers(unittest.TestCase):
@with_temp_dir
def test_optimi_adamw(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
@@ -71,7 +70,6 @@ class TestCustomOptimizers(unittest.TestCase):
@with_temp_dir
@require_torch_2_5_1
def test_adopt_adamw(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
@@ -117,7 +115,6 @@ class TestCustomOptimizers(unittest.TestCase):
@with_temp_dir
@require_torch_2_5_1
def test_muon(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
@@ -164,7 +161,6 @@ class TestCustomOptimizers(unittest.TestCase):
@with_temp_dir
@require_torch_2_7_0
def test_dion(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
@@ -206,7 +202,6 @@ class TestCustomOptimizers(unittest.TestCase):
@with_temp_dir
def test_fft_schedule_free_adamw(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
@@ -234,7 +229,6 @@ class TestCustomOptimizers(unittest.TestCase):
"save_first_step": False,
}
)
- # pylint: disable=duplicate-code
cfg = validate_config(cfg)
normalize_config(cfg)
@@ -246,7 +240,6 @@ class TestCustomOptimizers(unittest.TestCase):
@with_temp_dir
@require_torch_2_6_0
def test_came_pytorch(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "JackFram/llama-68m",
diff --git a/tests/e2e/test_packing_loss.py b/tests/e2e/test_packing_loss.py
index aec9d95f8..7cb979ce6 100644
--- a/tests/e2e/test_packing_loss.py
+++ b/tests/e2e/test_packing_loss.py
@@ -21,7 +21,6 @@ class TestPackedLlama(unittest.TestCase):
@with_temp_dir
def test_loss_packed(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
diff --git a/tests/e2e/test_phi.py b/tests/e2e/test_phi.py
index ab3a63674..ae2210249 100644
--- a/tests/e2e/test_phi.py
+++ b/tests/e2e/test_phi.py
@@ -19,7 +19,6 @@ class TestPhi(unittest.TestCase):
@with_temp_dir
def test_phi_ft(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "microsoft/phi-1_5",
@@ -65,7 +64,6 @@ class TestPhi(unittest.TestCase):
@with_temp_dir
def test_phi_qlora(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "microsoft/phi-1_5",
diff --git a/tests/e2e/test_preprocess.py b/tests/e2e/test_preprocess.py
index 25f42e832..8f15cbe55 100644
--- a/tests/e2e/test_preprocess.py
+++ b/tests/e2e/test_preprocess.py
@@ -14,8 +14,8 @@ class TestPreprocess:
"""test cases for preprocess"""
def test_w_deepspeed(self, temp_dir):
- """make sure preproces doesn't choke when using deepspeed in the config"""
- # pylint: disable=duplicate-code
+ """make sure preprocess doesn't choke when using deepspeed in the config"""
+
cfg = DictDefault(
{
"base_model": "Qwen/Qwen2.5-0.5B",
diff --git a/tests/e2e/test_process_reward_model_smollm2.py b/tests/e2e/test_process_reward_model_smollm2.py
index bd9eec48b..9d83aabbc 100644
--- a/tests/e2e/test_process_reward_model_smollm2.py
+++ b/tests/e2e/test_process_reward_model_smollm2.py
@@ -19,7 +19,6 @@ class TestProcessRewardSmolLM2(unittest.TestCase):
@with_temp_dir
def test_prm(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
diff --git a/tests/e2e/test_qat.py b/tests/e2e/test_qat.py
index 139ae155a..2f8398ef7 100644
--- a/tests/e2e/test_qat.py
+++ b/tests/e2e/test_qat.py
@@ -18,7 +18,6 @@ class TestQATLlama:
"""
def test_qat(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
@@ -44,7 +43,7 @@ class TestQATLlama:
"qat": {
"quantize_embedding": True,
"activation_dtype": "int8",
- "weight_dtype": "int8",
+ "weight_dtype": "int4",
"group_size": 8,
},
"num_epochs": 1,
@@ -68,7 +67,6 @@ class TestQATLlama:
check_model_output_exists(Path(temp_dir) / "checkpoint-5", cfg)
def test_qat_dpo(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
@@ -113,7 +111,7 @@ class TestQATLlama:
"qat": {
"quantize_embedding": True,
"activation_dtype": "int8",
- "weight_dtype": "int8",
+ "weight_dtype": "int4",
"group_size": 8,
},
"save_first_step": False,
diff --git a/tests/e2e/test_quantization.py b/tests/e2e/test_quantization.py
index 500b7e556..706279c6c 100644
--- a/tests/e2e/test_quantization.py
+++ b/tests/e2e/test_quantization.py
@@ -5,42 +5,41 @@ Tests for axolotl.utils.quantization
import pytest
import torch
from torch import nn
-from torchao.dtypes.affine_quantized_tensor import AffineQuantizedTensor
-from torchao.quantization.granularity import PerAxis, PerGroup
-from torchao.quantization.linear_activation_quantized_tensor import (
- LinearActivationQuantizedTensor,
-)
+from torchao.quantization import LinearActivationQuantizedTensor
from torchao.quantization.qat.embedding import FakeQuantizedEmbedding
from torchao.quantization.qat.linear import FakeQuantizedLinear
from torchao.quantization.quant_api import (
- Int4DynamicActivationInt4WeightConfig,
- Int4WeightOnlyConfig,
- Int8DynamicActivationInt8WeightConfig,
- Int8WeightOnlyConfig,
- UIntXWeightOnlyConfig,
+ Float8DynamicActivationFloat8WeightConfig,
+ Float8DynamicActivationInt4WeightConfig,
+ Int8DynamicActivationInt4WeightConfig,
)
+from torchao.quantization.quantize_.workflows.int4.int4_tensor import Int4Tensor
from transformers import AutoModelForCausalLM
from transformers.trainer_callback import TrainerState
from axolotl.utils.callbacks.qat import QATCallback
from axolotl.utils.quantization import (
- convert_qat_model_for_ptq,
- get_ptq_config,
+ convert_qat_model,
+ get_quantization_config,
prepare_model_for_qat,
- quantize_model_for_ptq,
+ quantize_model,
)
-from axolotl.utils.schemas.enums import TorchIntDType
+from axolotl.utils.schemas.enums import TorchAOQuantDType
from axolotl.utils.schemas.quantization import QATConfig
-from tests.e2e.utils import require_torch_2_6_0
+from tests.e2e.utils import (
+ require_torch_2_8_0,
+ requires_cuda_ge_8_9,
+ requires_sm_ge_100,
+)
@pytest.fixture()
def model():
dummy_model = AutoModelForCausalLM.from_pretrained(
- "HuggingFaceTB/SmolLM2-135M",
- device_map="cuda",
- torch_dtype=torch.bfloat16,
+ "Qwen/Qwen2-0.5B",
+ device_map="auto",
+ dtype=torch.bfloat16,
)
with torch.device(dummy_model.device):
dummy_model.model.embed_tokens = torch.nn.Embedding(
@@ -48,45 +47,57 @@ def model():
dummy_model.model.embed_tokens.weight.shape[1],
dtype=dummy_model.model.embed_tokens.weight.dtype,
)
- return dummy_model
+ yield dummy_model
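+ # drop the reference so the model can be garbage collected between parametrized runs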
+ del dummy_model
ptq_config_test_cases = [
- # weight_dtype, activation_dtype, group_size, expected_type, expected_params
+ # weight_dtype, activation_dtype, group_size, expected_type
(
- TorchIntDType.uint4,
+ TorchAOQuantDType.int4,
+ TorchAOQuantDType.int8,
None,
- None,
- UIntXWeightOnlyConfig,
- {"dtype": torch.uint4, "group_size": None},
- ),
- (TorchIntDType.int8, None, 32, Int8WeightOnlyConfig, {"group_size": 32}),
- (TorchIntDType.int4, None, 4, Int4WeightOnlyConfig, {"group_size": 4}),
- (
- TorchIntDType.int4,
- TorchIntDType.int4,
- None,
- Int4DynamicActivationInt4WeightConfig,
- {},
+ Int8DynamicActivationInt4WeightConfig,
),
(
- TorchIntDType.int8,
- TorchIntDType.int8,
+ TorchAOQuantDType.float8_e4m3fn,
+ TorchAOQuantDType.float8_e4m3fn,
None,
- Int8DynamicActivationInt8WeightConfig,
- {},
+ Float8DynamicActivationFloat8WeightConfig,
+ ),
+ (
+ TorchAOQuantDType.int4,
+ TorchAOQuantDType.float8_e4m3fn,
+ None,
+ Float8DynamicActivationInt4WeightConfig,
),
]
ptq_test_cases = [
- # weight_dtype, activation_dtype, group_size, quantize_embedding, expected_exception
- (TorchIntDType.int8, None, 8, False, None),
- (TorchIntDType.int4, None, 4, True, None),
- (TorchIntDType.uint4, None, 8, False, None),
- (TorchIntDType.int4, TorchIntDType.int4, 8, False, None),
- (TorchIntDType.int8, TorchIntDType.int8, 8, True, None),
- (TorchIntDType.int8, None, None, False, ValueError),
- (TorchIntDType.int4, None, None, False, ValueError),
+ # weight_dtype, activation_dtype, group_size, quantize_embedding, expected_exception, expected_tensor_class
+ (TorchAOQuantDType.int4, None, 4, True, None, Int4Tensor),
+ (
+ TorchAOQuantDType.int4,
+ TorchAOQuantDType.int8,
+ 8,
+ False,
+ None,
+ LinearActivationQuantizedTensor,
+ ),
+ # (
+ # TorchAOQuantDType.int4,
+ # TorchAOQuantDType.float8_e4m3fn,
+ # None,
+ # False,
+ # None,
+ # Int4Tensor,
+ # ),
+ (TorchAOQuantDType.int4, None, None, False, None, Int4Tensor),
+ # Deprecated configs
+ (TorchAOQuantDType.int8, None, 8, False, ValueError, None),
+ (TorchAOQuantDType.int4, TorchAOQuantDType.int4, 8, False, ValueError, None),
+ (TorchAOQuantDType.int8, TorchAOQuantDType.int8, 8, True, ValueError, None),
]
@@ -96,44 +106,132 @@ class TestQuantization:
"""
@pytest.mark.parametrize(
- "weight_dtype,activation_dtype,group_size,expected_type,expected_params",
+ "weight_dtype,activation_dtype,group_size,expected_type",
ptq_config_test_cases,
)
- @require_torch_2_6_0
+ @requires_cuda_ge_8_9
+ @require_torch_2_8_0
def test_get_ptq_config(
- self, weight_dtype, activation_dtype, group_size, expected_type, expected_params
+ self, weight_dtype, activation_dtype, group_size, expected_type
):
- config = get_ptq_config(weight_dtype, activation_dtype, group_size)
-
+ config = get_quantization_config(weight_dtype, activation_dtype, group_size)
assert isinstance(config, expected_type)
- for param_name, param_value in expected_params.items():
- if isinstance(param_value, (PerAxis, PerGroup)):
- if isinstance(param_value, PerAxis):
- assert isinstance(getattr(config, param_name), PerAxis)
- assert getattr(config, param_name).axis == param_value.axis
- else:
- assert isinstance(getattr(config, param_name), PerGroup)
- assert (
- getattr(config, param_name).group_size == param_value.group_size
- )
- else:
- assert getattr(config, param_name) == param_value
+ @requires_cuda_ge_8_9
+ @require_torch_2_8_0
+ def test_get_ptq_config_int4_weight_only(self):
+ from torchao.quantization.quant_api import Int4WeightOnlyConfig
+
+ config = get_quantization_config(TorchAOQuantDType.int4, None, 4)
+ assert isinstance(config, Int4WeightOnlyConfig)
@pytest.mark.parametrize(
- "weight_dtype", [TorchIntDType.int8, TorchIntDType.int4, TorchIntDType.uint4]
+ "weight_dtype,activation_dtype,group_size,quantize_embedding,expected_exception,expected_tensor_class",
+ ptq_test_cases,
)
+ @requires_cuda_ge_8_9
+ @require_torch_2_8_0
+ def test_quantize_model_for_ptq(
+ self,
+ model,
+ weight_dtype,
+ activation_dtype,
+ group_size,
+ quantize_embedding,
+ expected_exception,
+ expected_tensor_class,
+ ):
+ if expected_exception:
+ with pytest.raises(expected_exception):
+ quantize_model(
+ model,
+ weight_dtype,
+ group_size,
+ activation_dtype,
+ quantize_embedding,
+ )
+ else:
+ quantize_model(
+ model, weight_dtype, group_size, activation_dtype, quantize_embedding
+ )
+ if quantize_embedding:
+ assert isinstance(
+ model.model.embed_tokens.weight, expected_tensor_class
+ ), "Embedding weight should be quantized"
+ for child in list(model.children()):
+ if isinstance(child, torch.nn.Linear):
+ assert isinstance(child.weight, expected_tensor_class)
+
+ @require_torch_2_8_0
+ @requires_sm_ge_100
+ def test_quantize_model_for_ptq_fp8(
+ self,
+ model,
+ ):
+ from torchao.quantization.quantize_.workflows.float8.float8_tensor import (
+ Float8Tensor,
+ QuantizeTensorToFloat8Kwargs,
+ )
+
+ quantize_model(
+ model,
+ TorchAOQuantDType.float8_e4m3fn,
+ None,
+ TorchAOQuantDType.float8_e4m3fn,
+ )
+ for child in list(model.children()):
+ if isinstance(child, torch.nn.Linear):
+ assert isinstance(child.weight, Float8Tensor)
+ assert child.weight.act_quant_kwargs is not None and isinstance(
+ child.weight.act_quant_kwargs, QuantizeTensorToFloat8Kwargs
+ )
+
+ @require_torch_2_8_0
+ @requires_sm_ge_100
+ def test_quantize_model_for_ptq_nvfp4(
+ self,
+ model,
+ ):
+ from torchao.prototype.mx_formats.nvfp4_tensor import (
+ NVFP4Tensor,
+ QuantizeTensorToNVFP4Kwargs,
+ )
+
+ quantize_model(model, TorchAOQuantDType.nvfp4, 16, TorchAOQuantDType.nvfp4)
+ for child in list(model.children()):
+ if isinstance(child, torch.nn.Linear):
+ assert isinstance(child.weight, NVFP4Tensor)
+ assert child.weight.act_quant_kwargs is not None and isinstance(
+ child.weight.act_quant_kwargs, QuantizeTensorToNVFP4Kwargs
+ )
+
@pytest.mark.parametrize(
- "activation_dtype", [None, TorchIntDType.int4, TorchIntDType.int8]
+ "weight_dtype,activation_dtype,group_size,quantize_embedding",
+ [
+ (TorchAOQuantDType.int4, None, 8, False),
+ (TorchAOQuantDType.int4, None, 16, True),
+ (TorchAOQuantDType.int4, TorchAOQuantDType.int8, 8, False),
+ (TorchAOQuantDType.int4, TorchAOQuantDType.int8, 16, True),
+ (
+ TorchAOQuantDType.float8_e4m3fn,
+ TorchAOQuantDType.float8_e4m3fn,
+ None,
+ False,
+ ),
+ (TorchAOQuantDType.int4, TorchAOQuantDType.float8_e4m3fn, None, True),
+ ],
)
- @pytest.mark.parametrize("group_size", [4, 8])
- @pytest.mark.parametrize("quantize_embedding", [False, True])
- @require_torch_2_6_0
+ @require_torch_2_8_0
+ @requires_cuda_ge_8_9
def test_prepare_model_for_qat(
self, model, weight_dtype, activation_dtype, group_size, quantize_embedding
- ): # pylint: disable=redefined-outer-name
+ ):
prepare_model_for_qat(
- model, weight_dtype, group_size, activation_dtype, quantize_embedding
+ model,
+ weight_dtype,
+ group_size,
+ activation_dtype,
+ quantize_embedding,
)
if quantize_embedding:
assert isinstance(model.model.embed_tokens, FakeQuantizedEmbedding)
@@ -142,17 +240,19 @@ class TestQuantization:
model.model.embed_tokens.weight_fake_quantizer.config.dtype
== weight_dtype.value
)
- assert (
- model.model.embed_tokens.weight_fake_quantizer.config.group_size
- == group_size
- )
+ if group_size:
+ assert (
+ model.model.embed_tokens.weight_fake_quantizer.config.group_size
+ == group_size
+ )
for child in list(model.children()):
if isinstance(child, torch.nn.Linear):
assert isinstance(child, FakeQuantizedLinear)
assert hasattr(child, "weight_fake_quantizer")
assert child.weight_fake_quantizer.config.dtype == weight_dtype.value
- assert child.weight_fake_quantizer.config.group_size == group_size
+ if group_size:
+ assert child.weight_fake_quantizer.config.group_size == group_size
if activation_dtype:
assert hasattr(child, "activation_fake_quantizer")
assert (
@@ -162,47 +262,40 @@ class TestQuantization:
else:
assert child.activation_fake_quantizer is None
- @pytest.mark.parametrize(
- "weight_dtype,activation_dtype,group_size,quantize_embedding,expected_exception",
- ptq_test_cases,
- )
- @require_torch_2_6_0
- def test_quantize_model_for_ptq(
- self,
- model,
- weight_dtype,
- activation_dtype,
- group_size,
- quantize_embedding,
- expected_exception,
- ): # pylint: disable=redefined-outer-name
- if expected_exception:
- with pytest.raises(expected_exception):
- quantize_model_for_ptq(
- model,
- weight_dtype,
- group_size,
- activation_dtype,
- quantize_embedding,
- )
- else:
- quantize_model_for_ptq(
- model, weight_dtype, group_size, activation_dtype, quantize_embedding
- )
- if quantize_embedding:
- assert isinstance(
- model.model.embed_tokens.weight, AffineQuantizedTensor
- ), "Embedding weight should be quantized"
- for child in list(model.children()):
- if isinstance(child, torch.nn.Linear):
- if activation_dtype:
- assert isinstance(
- child.weight, LinearActivationQuantizedTensor
- ), "Linear weight should be quantized with activation quantization"
- else:
- assert isinstance(
- child.weight, AffineQuantizedTensor
- ), "Linear weight should be quantized without activation quantization"
+ @require_torch_2_8_0
+ @requires_cuda_ge_8_9
+ def test_convert_qat_model(self, model):
+ config = QATConfig(
+ weight_dtype="int4",
+ activation_dtype="int8",
+ group_size=8,
+ quantize_embedding=True,
+ )
+
+ # quantize model for qat
+ prepare_model_for_qat(
+ model,
+ config.weight_dtype,
+ config.group_size,
+ config.activation_dtype,
+ config.quantize_embedding,
+ )
+
+ assert isinstance(model.model.embed_tokens, FakeQuantizedEmbedding)
+ assert isinstance(model.lm_head, FakeQuantizedLinear)
+
+ # apply conversion
+ convert_qat_model(
+ model,
+ config.quantize_embedding,
+ )
+ # ensure modules have been swapped out
+ assert not isinstance(model.model.embed_tokens, FakeQuantizedEmbedding)
+ assert not isinstance(model.lm_head, FakeQuantizedLinear)
+
+ # ensure weights have been quantized
+ assert isinstance(model.model.embed_tokens.weight, nn.Parameter)
+ assert isinstance(model.lm_head.weight, nn.Parameter)
class TestQuantizationCallback:
@@ -216,12 +309,10 @@ class TestQuantizationCallback:
global_step=0,
)
- @require_torch_2_6_0
- def test_qat_callback_fake_quant_after_n_steps(
- self, model, trainer_state
- ): # pylint: disable=redefined-outer-name
+ @require_torch_2_8_0
+ def test_qat_callback_fake_quant_after_n_steps(self, model, trainer_state):
cfg = QATConfig(
- weight_dtype="int8",
+ weight_dtype="int4",
activation_dtype="int8",
group_size=8,
quantize_embedding=True,
@@ -268,12 +359,10 @@ class TestQuantizationCallback:
assert model.model.embed_tokens.weight_fake_quantizer.enabled
assert model.lm_head.weight_fake_quantizer.enabled
- @require_torch_2_6_0
- def test_qat_callback_fake_quant_after_n_steps_is_none(
- self, model, trainer_state
- ): # pylint: disable=redefined-outer-name
+ @require_torch_2_8_0
+ def test_qat_callback_fake_quant_after_n_steps_is_none(self, model, trainer_state):
cfg = QATConfig(
- weight_dtype="int8",
+ weight_dtype="int4",
activation_dtype="int8",
group_size=8,
quantize_embedding=True,
@@ -306,45 +395,3 @@ class TestQuantizationCallback:
# quantization should be enabled from the get-go
assert model.model.embed_tokens.weight_fake_quantizer.enabled
assert model.lm_head.weight_fake_quantizer.enabled
-
-
-class TestConvertQATModelForPTQ:
- """
- Test convert_qat_model_for_ptq
- """
-
- @require_torch_2_6_0
- def test_convert_qat_model_for_ptq(
- self, model
- ): # pylint: disable=redefined-outer-name
- config = QATConfig(
- weight_dtype="int8",
- activation_dtype="int8",
- group_size=8,
- quantize_embedding=True,
- )
-
- # quantize model for qat
- prepare_model_for_qat(
- model,
- config.weight_dtype,
- config.group_size,
- config.activation_dtype,
- config.quantize_embedding,
- )
-
- assert isinstance(model.model.embed_tokens, FakeQuantizedEmbedding)
- assert isinstance(model.lm_head, FakeQuantizedLinear)
-
- # apply conversion
- convert_qat_model_for_ptq(
- model,
- quantize_embedding=config.quantize_embedding,
- )
- # ensure modules have been swapped out
- assert not isinstance(model.model.embed_tokens, FakeQuantizedEmbedding)
- assert not isinstance(model.lm_head, FakeQuantizedLinear)
-
- # ensure weights have been quantized
- assert isinstance(model.model.embed_tokens.weight, nn.Parameter)
- assert isinstance(model.lm_head.weight, nn.Parameter)
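The quantization test rewrite above tracks an API rename in axolotl.utils.quantization: get_ptq_config becomes get_quantization_config and quantize_model_for_ptq becomes quantize_model. A short usage sketch with the call signatures taken directly from the tests; the model choice is illustrative:

import torch
from transformers import AutoModelForCausalLM

from axolotl.utils.quantization import get_quantization_config, quantize_model
from axolotl.utils.schemas.enums import TorchAOQuantDType

model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2-0.5B", device_map="auto", dtype=torch.bfloat16
)

# Build a torchao config: int4 weights with int8 dynamic activations.
config = get_quantization_config(TorchAOQuantDType.int4, TorchAOQuantDType.int8, None)

# Or quantize in place; argument order per the tests is
# (model, weight_dtype, group_size, activation_dtype, quantize_embedding).
quantize_model(model, TorchAOQuantDType.int4, None, TorchAOQuantDType.int8, False)

Per the updated test cases, int8-weight and int4-activation combinations now raise ValueError, so only the configurations listed in ptq_config_test_cases should be considered supported.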
diff --git a/tests/e2e/test_qwen.py b/tests/e2e/test_qwen.py
index 59267d14d..1c75d817b 100644
--- a/tests/e2e/test_qwen.py
+++ b/tests/e2e/test_qwen.py
@@ -19,7 +19,6 @@ class TestE2eQwen:
@pytest.mark.parametrize("base_model", ["Qwen/Qwen2-0.5B", "Qwen/Qwen2.5-0.5B"])
def test_dpo(self, base_model, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": base_model,
diff --git a/tests/e2e/test_reward_model_smollm2.py b/tests/e2e/test_reward_model_smollm2.py
index 82513f99f..cc768b173 100644
--- a/tests/e2e/test_reward_model_smollm2.py
+++ b/tests/e2e/test_reward_model_smollm2.py
@@ -19,7 +19,6 @@ class TestRewardModelLoraSmolLM2(unittest.TestCase):
@with_temp_dir
def test_rm_lora(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
diff --git a/tests/e2e/test_save_first_step.py b/tests/e2e/test_save_first_step.py
index 5bbd2302b..ce2d3f145 100644
--- a/tests/e2e/test_save_first_step.py
+++ b/tests/e2e/test_save_first_step.py
@@ -20,7 +20,6 @@ class TestSaveFirstStepCallback(unittest.TestCase):
@with_temp_dir
def test_save_first_step(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
@@ -61,7 +60,6 @@ class TestSaveFirstStepCallback(unittest.TestCase):
@with_temp_dir
def test_no_save_first_step(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
diff --git a/tests/e2e/test_schedulers.py b/tests/e2e/test_schedulers.py
index 8f7a13aee..5b9c56288 100644
--- a/tests/e2e/test_schedulers.py
+++ b/tests/e2e/test_schedulers.py
@@ -19,7 +19,6 @@ class TestCustomSchedulers(unittest.TestCase):
@with_temp_dir
def test_rex_scheduler(self, temp_dir):
- # pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
diff --git a/tests/e2e/test_streaming.py b/tests/e2e/test_streaming.py
new file mode 100644
index 000000000..5dccf00dd
--- /dev/null
+++ b/tests/e2e/test_streaming.py
@@ -0,0 +1,73 @@
+"""E2E tests for streaming dataset functionality"""
+
+# pylint: disable=duplicate-code
+
+import pytest
+
+from axolotl.common.datasets import load_datasets
+from axolotl.train import train
+from axolotl.utils.config import normalize_config, validate_config
+from axolotl.utils.dict import DictDefault
+
+from .utils import check_model_output_exists, check_tensorboard
+
+
+class TestStreamingDatasets:
+ """Test case for streaming datasets"""
+
+ @pytest.mark.parametrize(
+ "sample_packing",
+ [True, False],
+ )
+ def test_streaming_dataset(self, temp_dir, sample_packing):
+ """Test streaming datasets"""
+
+ cfg = DictDefault(
+ {
+ "base_model": "HuggingFaceTB/SmolLM2-135M",
+ "flash_attention": True,
+ "sequence_len": 1024,
+ "sample_packing": sample_packing,
+ "pretrain_multipack_attn": sample_packing,
+ "streaming_multipack_buffer_size": 10000,
+ "dataset_processes": 1,
+ "special_tokens": {
+ "pad_token": "<|endoftext|>",
+ },
+ "datasets": [
+ {
+ "path": "mhenrichsen/alpaca_2k_test",
+ "type": "alpaca",
+ },
+ ],
+ # Streaming config
+ "streaming": True,
+ "max_steps": 3,
+ "micro_batch_size": 1,
+ "gradient_accumulation_steps": 1,
+ "val_set_size": 0.0,
+ "output_dir": temp_dir,
+ "learning_rate": 0.00001,
+ "optimizer": "adamw_torch_fused",
+ "lr_scheduler": "cosine",
+ "save_safetensors": True,
+ "bf16": "auto",
+ "use_tensorboard": True,
+ "save_first_step": False,
+ }
+ )
+
+ cfg = validate_config(cfg)
+ normalize_config(cfg)
+ dataset_meta = load_datasets(cfg=cfg)
+
+ train(cfg=cfg, dataset_meta=dataset_meta)
+ check_model_output_exists(temp_dir, cfg)
+
+ # Verify training ran by checking that the final train loss is below a threshold

+ check_tensorboard(
+ temp_dir + "/runs",
+ "train/train_loss",
+ 3.0,
+ "Train Loss (%s) is too high",
+ )
diff --git a/tests/e2e/test_tokenizer.py b/tests/e2e/test_tokenizer.py
new file mode 100644
index 000000000..a65c17ac3
--- /dev/null
+++ b/tests/e2e/test_tokenizer.py
@@ -0,0 +1,63 @@
+"""
+e2e test for saving the tokenizer
+"""
+
+from unittest.mock import patch
+
+from axolotl.common.datasets import load_datasets
+from axolotl.train import train
+from axolotl.utils.config import normalize_config, validate_config
+from axolotl.utils.dict import DictDefault
+
+from tests.e2e.utils import check_model_output_exists
+
+
+def test_tokenizer_no_save_jinja_files(temp_dir):
+ # pylint: disable=duplicate-code
+ cfg = DictDefault(
+ {
+ "base_model": "HuggingFaceTB/SmolLM2-135M",
+ "tokenizer_type": "AutoTokenizer",
+ "sequence_len": 1024,
+ "load_in_8bit": True,
+ "adapter": "lora",
+ "lora_r": 8,
+ "lora_alpha": 16,
+ "lora_dropout": 0.05,
+ "lora_target_linear": True,
+ "val_set_size": 0.02,
+ "special_tokens": {
+ "pad_token": "<|endoftext|>",
+ },
+ "chat_template": "chatml",
+ "datasets": [
+ {
+ "path": "mhenrichsen/alpaca_2k_test",
+ "type": "alpaca",
+ },
+ ],
+ "num_epochs": 1,
+ "micro_batch_size": 2,
+ "gradient_accumulation_steps": 1,
+ "output_dir": temp_dir,
+ "learning_rate": 0.00001,
+ "optimizer": "adamw_torch_fused",
+ "lr_scheduler": "cosine",
+ "max_steps": 5,
+ "save_first_step": False,
+ "fp16": False,
+ "tokenizer_save_jinja_files": False,
+ }
+ )
+
+ cfg = validate_config(cfg)
+ normalize_config(cfg)
+ dataset_meta = load_datasets(cfg=cfg)
+
+ with patch("axolotl.train.execute_training"):
+ train(cfg=cfg, dataset_meta=dataset_meta)
+
+ check_model_output_exists(temp_dir, cfg)
+ with open(f"{temp_dir}/tokenizer_config.json", "r", encoding="utf-8") as f:
+ tokenizer_config = f.read()
+ assert "chat_template" in tokenizer_config
diff --git a/tests/e2e/utils.py b/tests/e2e/utils.py
index 5931fe148..a2dd8bc5e 100644
--- a/tests/e2e/utils.py
+++ b/tests/e2e/utils.py
@@ -2,6 +2,7 @@
helper utils for tests
"""
+import importlib.util
import os
import shutil
import tempfile
@@ -89,6 +90,18 @@ def require_torch_2_7_0(test_case):
return unittest.skipUnless(is_min_2_7_0(), "test requires torch>=2.7.0")(test_case)
+def require_torch_2_8_0(test_case):
+ """
+ Decorator marking a test that requires torch >= 2.8.0
+ """
+
+ def is_min_2_8_0():
+ torch_version = version.parse(torch.__version__)
+ return torch_version >= version.parse("2.8.0")
+
+ return unittest.skipUnless(is_min_2_8_0(), "test requires torch>=2.8.0")(test_case)
+
+
def require_torch_lt_2_6_0(test_case):
"""
Decorator marking a test that requires torch < 2.6.0
@@ -107,12 +120,7 @@ def require_vllm(test_case):
"""
def is_vllm_installed():
- try:
- import vllm # pylint: disable=unused-import # noqa: F401
-
- return True
- except ImportError:
- return False
+ return importlib.util.find_spec("vllm") is not None
return unittest.skipUnless(
is_vllm_installed(), "test requires vllm to be installed"
@@ -125,18 +133,31 @@ def require_llmcompressor(test_case):
"""
def is_llmcompressor_installed():
- try:
- import llmcompressor # pylint: disable=unused-import # noqa: F401
-
- return True
- except ImportError:
- return False
+ return importlib.util.find_spec("llmcompressor") is not None
return unittest.skipUnless(
is_llmcompressor_installed(), "test requires llmcompressor to be installed"
)(test_case)
+def requires_sm_ge_100(test_case):
+ is_sm_ge_100 = (
+ torch.cuda.is_available()
+ and torch.version.cuda
+ and torch.cuda.get_device_capability() >= (10, 0)
+ )
+ return unittest.skipUnless(is_sm_ge_100, "test requires sm>=100")(test_case)
+
+
+def requires_cuda_ge_8_9(test_case):
+ is_cuda_ge_8_9 = (
+ torch.cuda.is_available()
+ and torch.version.cuda
+ and torch.cuda.get_device_capability() >= (8, 9)
+ )
+ return unittest.skipUnless(is_cuda_ge_8_9, "test requires cuda>=8.9")(test_case)
+
+
def is_hopper():
compute_capability = torch.cuda.get_device_capability()
return compute_capability == (9, 0)
@@ -147,7 +168,11 @@ def require_hopper(test_case):
def check_tensorboard(
- temp_run_dir: str, tag: str, lt_val: float, assertion_err: str
+ temp_run_dir: str,
+ tag: str,
+ lt_val: float,
+ assertion_err: str,
+ rtol: float = 0.02,
) -> None:
"""
helper function to parse and check tensorboard logs
@@ -155,8 +180,9 @@ def check_tensorboard(
tb_log_path = most_recent_subdir(temp_run_dir)
event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0])
reader = SummaryReader(event_file)
- df = reader.scalars # pylint: disable=invalid-name
- df = df[(df.tag == tag)] # pylint: disable=invalid-name
+ df = reader.scalars
+ df = df[(df.tag == tag)]
+ lt_val = (1 + rtol) * lt_val
if "%s" in assertion_err:
assert df.value.values[-1] < lt_val, assertion_err % df.value.values[-1]
else:
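The new requires_cuda_ge_8_9 and requires_sm_ge_100 helpers gate tests on CUDA compute capability rather than torch or toolkit version, and check_tensorboard now loosens its threshold multiplicatively: lt_val=3.0 with the default rtol=0.02 effectively asserts the final value is below 3.06. A standalone sketch of the capability check using only public torch APIs; the capability-to-feature mapping in the comments is an assumption about why these thresholds were chosen:

import torch

def has_capability(major: int, minor: int) -> bool:
    # True only on a CUDA build with an attached device of at least this
    # compute capability; mirrors the skipUnless guards above.
    return (
        torch.cuda.is_available()
        and torch.version.cuda is not None
        and torch.cuda.get_device_capability() >= (major, minor)
    )

has_capability(8, 9)   # cc >= 8.9: Ada/Hopper-era fp8 kernels
has_capability(10, 0)  # cc >= 10.0: Blackwell-era paths such as nvfp4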
diff --git a/tests/hf_offline_utils.py b/tests/hf_offline_utils.py
index 385e61f18..0e4a2f067 100644
--- a/tests/hf_offline_utils.py
+++ b/tests/hf_offline_utils.py
@@ -20,7 +20,7 @@ def reload_modules(hf_hub_offline):
importlib.reload(huggingface_hub.constants)
huggingface_hub.constants.HF_HUB_OFFLINE = hf_hub_offline
importlib.reload(datasets.config)
- setattr(datasets.config, "HF_HUB_OFFLINE", hf_hub_offline)
+ datasets.config.HF_HUB_OFFLINE = hf_hub_offline
reset_sessions()
diff --git a/tests/integrations/test_diffusion.py b/tests/integrations/test_diffusion.py
new file mode 100644
index 000000000..141d8d150
--- /dev/null
+++ b/tests/integrations/test_diffusion.py
@@ -0,0 +1,274 @@
+"""Tests for diffusion trainer integration."""
+
+# pylint: disable=redefined-outer-name,protected-access
+
+from unittest.mock import Mock
+
+import pytest
+import torch
+
+from axolotl.integrations.diffusion import DiffusionTrainer
+from axolotl.integrations.diffusion.utils import create_bidirectional_attention_mask
+from axolotl.utils.dict import DictDefault
+
+
+@pytest.fixture
+def mock_tokenizer():
+ """Create a mock tokenizer."""
+ tokenizer = Mock()
+ tokenizer.bos_token_id = 1
+ tokenizer.eos_token_id = 2
+ tokenizer.pad_token_id = 0
+ return tokenizer
+
+
+@pytest.fixture
+def diffusion_config():
+ """Create a diffusion config."""
+ return DictDefault(
+ {
+ "diffusion": {
+ "mask_token_id": 32000,
+ "eps": 1e-3,
+ "importance_weighting": False,
+ },
+ "sample_packing": False,
+ }
+ )
+
+
+@pytest.fixture
+def diffusion_trainer_instance(mock_tokenizer, diffusion_config):
+ """Create a diffusion trainer instance for testing methods directly."""
+ # Create a minimal trainer instance just for testing methods
+ trainer = object.__new__(DiffusionTrainer) # Bypass __init__
+ trainer.cfg = diffusion_config
+ trainer._special_token_ids = {0, 1, 2} # pad, bos, eos
+ trainer.processing_class = mock_tokenizer
+ trainer.store_metrics = Mock() # Mock metrics storage
+ return trainer
+
+
+class TestDiffusionTrainer:
+ """Test the DiffusionTrainer class."""
+
+ def test_forward_process_basic(self, diffusion_trainer_instance):
+ """Test basic forward process without labels."""
+ input_ids = torch.tensor([[1, 10, 20, 30, 2]], dtype=torch.long)
+
+ noisy_batch, masked_indices, p_mask = (
+ diffusion_trainer_instance._forward_process(input_ids, eps=0.1)
+ )
+
+ # Check shapes
+ assert noisy_batch.shape == input_ids.shape
+ assert masked_indices.shape == input_ids.shape
+ assert p_mask.shape == input_ids.shape
+
+ # Check that special tokens are not masked
+ special_token_positions = (input_ids == 1) | (input_ids == 2) | (input_ids == 0)
+ assert not masked_indices[special_token_positions].any()
+
+ # Check that mask token is applied
+ mask_token_id = diffusion_trainer_instance.cfg.diffusion.mask_token_id
+ masked_positions = masked_indices
+ if masked_positions.any():
+ assert (noisy_batch[masked_positions] == mask_token_id).all()
+
+ def test_forward_process_with_labels(self, diffusion_trainer_instance):
+ """Test forward process with SFT labels."""
+ input_ids = torch.tensor([[1, 10, 20, 30, 2]], dtype=torch.long)
+ labels = torch.tensor([[-100, -100, 20, 30, 2]], dtype=torch.long)
+
+ noisy_batch, masked_indices, p_mask = (
+ diffusion_trainer_instance._forward_process(
+ input_ids, labels=labels, eps=0.1
+ )
+ )
+
+ # Check shapes
+ assert noisy_batch.shape == input_ids.shape
+ assert masked_indices.shape == input_ids.shape
+ assert p_mask.shape == input_ids.shape
+
+ # Check that only answer tokens can be masked (where labels != -100)
+ non_answer_mask = labels == -100
+
+ # No masking should occur on non-answer tokens
+ assert not masked_indices[non_answer_mask].any()
+
+ # p_mask should be the same for all positions (sampled timestep),
+ # but masking is only applied to answer tokens
+ assert p_mask.shape == input_ids.shape
+ # Verify that masked_indices respects the answer mask
+ assert not masked_indices[non_answer_mask].any()
+
+ def test_forward_process_with_attention_mask(self, diffusion_trainer_instance):
+ """Test forward process with attention mask."""
+ input_ids = torch.tensor([[1, 10, 20, 0]], dtype=torch.long)
+ attention_mask = torch.tensor([[1, 1, 1, 0]], dtype=torch.long)
+
+ _, masked_indices, p_mask = diffusion_trainer_instance._forward_process(
+ input_ids, attention_mask=attention_mask, eps=0.1
+ )
+
+ # Check that padding tokens are not masked
+ padding_positions = attention_mask == 0
+ assert not masked_indices[padding_positions].any()
+ assert (p_mask[padding_positions] == 0).all()
+
+ def test_bidirectional_attention_mask_no_packing(self, diffusion_trainer_instance):
+ """Test bidirectional attention mask without sample packing."""
+ input_ids = torch.tensor([[1, 10, 20, 2]], dtype=torch.long)
+
+ mask = create_bidirectional_attention_mask(input_ids)
+
+ # Should be all-to-all attention
+ expected_shape = (1, 1, 4, 4)
+ assert mask.shape == expected_shape
+ assert mask.all()
+
+ def test_bidirectional_attention_mask_with_packing(
+ self, diffusion_trainer_instance
+ ):
+ """Test bidirectional attention mask with sample packing."""
+ diffusion_trainer_instance.cfg.sample_packing = True
+ input_ids = torch.tensor([[1, 10, 20, 30, 40, 2]], dtype=torch.long)
+ # Sample IDs: first sample (1), second sample (2)
+ attention_mask = torch.tensor([[1, 1, 1, 2, 2, 2]], dtype=torch.long)
+
+ mask = create_bidirectional_attention_mask(
+ input_ids, attention_mask, sample_packing=True
+ )
+
+ # Check that tokens within same sample can attend to each other
+ # but not across samples
+ assert mask[0, 0, 0, 1].item() # First sample tokens can attend to each other
+ assert mask[0, 0, 1, 2].item()
+ assert not mask[0, 0, 0, 3].item() # Can't attend across samples
+ assert not mask[0, 0, 2, 4].item()
+ assert mask[0, 0, 3, 4].item() # Second sample tokens can attend to each other
+
+ def test_compute_loss_basic(self, diffusion_trainer_instance):
+ """Test basic loss computation."""
+ # Mock model that returns logits
+ mock_model = Mock()
+ mock_outputs = Mock()
+ vocab_size = 1000
+ seq_len = 5
+ mock_outputs.logits = torch.randn(1, seq_len, vocab_size, requires_grad=True)
+ mock_model.return_value = mock_outputs
+ mock_model.training = True
+
+ input_ids = torch.tensor([[1, 10, 20, 30, 2]], dtype=torch.long)
+
+ loss, outputs = diffusion_trainer_instance._compute_diffusion_loss(
+ mock_model, input_ids
+ )
+
+ # Check that loss is computed
+ assert isinstance(loss, torch.Tensor)
+ assert loss.requires_grad
+ assert outputs == mock_outputs
+
+ # Check that metrics were stored
+ diffusion_trainer_instance.store_metrics.assert_called_once()
+
+ def test_compute_loss_sft(self, diffusion_trainer_instance):
+ """Test loss computation with SFT labels."""
+ # Mock model
+ mock_model = Mock()
+ mock_outputs = Mock()
+ vocab_size = 1000
+ seq_len = 5
+ mock_outputs.logits = torch.randn(1, seq_len, vocab_size, requires_grad=True)
+ mock_model.return_value = mock_outputs
+ mock_model.training = True
+ diffusion_trainer_instance.cfg.datasets = Mock()
+
+ input_ids = torch.tensor([[1, 10, 20, 30, 2]], dtype=torch.long)
+ labels = torch.tensor([[-100, -100, 20, 30, 2]], dtype=torch.long)
+
+ loss, _ = diffusion_trainer_instance._compute_diffusion_loss(
+ mock_model, input_ids, labels=labels
+ )
+
+ # Check that loss is computed
+ assert isinstance(loss, torch.Tensor)
+ assert loss.requires_grad
+
+ # Check that SFT metrics were added
+ call_args = diffusion_trainer_instance.store_metrics.call_args[0][0]
+ assert "answer_ratio" in call_args
+ assert "avg_answer_length" in call_args
+
+ def test_compute_loss_no_masked_tokens(self, diffusion_trainer_instance):
+ """Test loss computation when no tokens are masked."""
+ # Mock model
+ mock_model = Mock()
+ mock_outputs = Mock()
+ vocab_size = 1000
+ seq_len = 3
+ mock_outputs.logits = torch.randn(1, seq_len, vocab_size)
+ mock_model.return_value = mock_outputs
+ mock_model.training = True
+
+ # Only special tokens (which won't be masked)
+ input_ids = torch.tensor([[1, 0, 2]], dtype=torch.long)
+
+ loss, _ = diffusion_trainer_instance._compute_diffusion_loss(
+ mock_model, input_ids
+ )
+
+ # Loss should be zero when no tokens are masked
+ assert loss.item() == 0.0
+ assert loss.requires_grad
+
+ def test_cache_special_token_ids(self, mock_tokenizer):
+ """Test caching of special token IDs."""
+ trainer = object.__new__(DiffusionTrainer)
+ trainer.processing_class = mock_tokenizer
+ trainer._cache_special_token_ids()
+ assert trainer._special_token_ids == {0, 1, 2}
+
+ def test_cache_special_token_ids_no_tokenizer(self):
+ """Test caching when no tokenizer is available."""
+ trainer = object.__new__(DiffusionTrainer)
+ trainer.processing_class = None
+ trainer._cache_special_token_ids()
+
+ assert trainer._special_token_ids == set()
+
+ def test_main_compute_loss_interface(self, diffusion_trainer_instance):
+ """Test the main compute_loss interface."""
+ # Mock model
+ mock_model = Mock()
+ mock_outputs = Mock()
+ mock_outputs.logits = torch.randn(1, 5, 1000)
+ mock_model.return_value = mock_outputs
+ mock_model.training = True
+
+ inputs = {
+ "input_ids": torch.tensor([[1, 10, 20, 30, 2]], dtype=torch.long),
+ "attention_mask": torch.tensor([[1, 1, 1, 1, 1]], dtype=torch.long),
+ "labels": torch.tensor([[-100, -100, 20, 30, 2]], dtype=torch.long),
+ }
+
+ # Test without return_outputs
+ loss = diffusion_trainer_instance.compute_loss(mock_model, inputs)
+ assert isinstance(loss, torch.Tensor)
+
+ # Test with return_outputs
+ loss, outputs = diffusion_trainer_instance.compute_loss(
+ mock_model, inputs, return_outputs=True
+ )
+ assert isinstance(loss, torch.Tensor)
+ assert outputs == mock_outputs
+
+ def test_missing_input_ids_raises_error(self, diffusion_trainer_instance):
+ """Test that missing input_ids raises ValueError."""
+ mock_model = Mock()
+ inputs = {"attention_mask": torch.tensor([[1, 1, 1]])}
+
+ with pytest.raises(ValueError, match="input_ids is required"):
+ diffusion_trainer_instance.compute_loss(mock_model, inputs)
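For orientation, the forward process these diffusion-trainer tests probe can be sketched as independent token masking: sample a per-sequence masking probability from U(eps, 1), then mask only eligible tokens (never special tokens; only answer tokens when SFT labels are present). This is a simplified reconstruction inferred from the assertions above, not the trainer's actual implementation, which additionally zeroes p_mask on padding and supports importance weighting:

import torch

def forward_process_sketch(input_ids, mask_token_id, special_ids, eps=1e-3, labels=None):
    bsz, seqlen = input_ids.shape
    # One masking probability per sequence, sampled from U(eps, 1).
    p_mask = (eps + (1 - eps) * torch.rand(bsz, 1)).expand(bsz, seqlen)

    # Special tokens are never masked; with SFT labels, only answer tokens are.
    eligible = ~torch.isin(input_ids, torch.tensor(sorted(special_ids)))
    if labels is not None:
        eligible = eligible & (labels != -100)

    masked = (torch.rand(bsz, seqlen) < p_mask) & eligible
    noisy = torch.where(masked, torch.full_like(input_ids, mask_token_id), input_ids)
    return noisy, masked, p_mask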
diff --git a/tests/integrations/test_diffusion_callback.py b/tests/integrations/test_diffusion_callback.py
new file mode 100644
index 000000000..3e8785fe0
--- /dev/null
+++ b/tests/integrations/test_diffusion_callback.py
@@ -0,0 +1,92 @@
+"""Tests for diffusion generation callback dataloader selection and triggering."""
+
+from types import SimpleNamespace
+from unittest.mock import Mock
+
+import pytest
+
+from axolotl.integrations.diffusion import DiffusionGenerationCallback
+
+
+class DummyTrainer:
+ """Minimal trainer double with required attributes/methods for the callback."""
+
+ def __init__(self, use_eval: bool):
+ # Config used by callback
+ self.cfg = SimpleNamespace(
+ diffusion=SimpleNamespace(
+ generation_interval=1,
+ num_generation_samples=1,
+ generation_max_length=32,
+ generation_steps=4,
+ generation_temperature=0.0,
+ mask_token_id=16,
+ ),
+ use_wandb=False,
+ )
+
+ # Model/tokenizer are passed through to generate_samples; not used here
+ self.model = Mock()
+ self.processing_class = Mock()
+
+ # Datasets and loaders
+ self.eval_dataset = object() if use_eval else None
+ self._train_loader = object()
+ self._eval_loader = object()
+
+ # State for world process check
+ self.state = SimpleNamespace(is_world_process_zero=True)
+
+ # Track which loader was requested
+ self.requested: list[str] = []
+
+ def get_train_dataloader(self):
+ self.requested.append("train")
+ return self._train_loader
+
+ def get_eval_dataloader(self):
+ self.requested.append("eval")
+ return self._eval_loader
+
+
+@pytest.mark.parametrize("use_eval", [False, True])
+def test_callback_uses_correct_dataloader(monkeypatch, use_eval):
+ trainer = DummyTrainer(use_eval=use_eval)
+ callback = DiffusionGenerationCallback(trainer)
+
+ captured = {}
+
+ # Patch generate_samples in the callback module's namespace
+ def fake_generate_samples(**kwargs):
+ captured["dataloader"] = kwargs.get("dataloader")
+ # Return one dummy sample to exercise logging path
+ return [
+ {
+ "original": "o",
+ "masked": "m",
+ "generated": "g",
+ "mask_ratio": 0.5,
+ "masked_tokens": 1,
+ "total_tokens": 2,
+ }
+ ]
+
+ monkeypatch.setattr(
+ "axolotl.integrations.diffusion.callbacks.generate_samples",
+ fake_generate_samples,
+ )
+
+ # Trigger at step 1 (interval=1)
+ args = SimpleNamespace()
+ state = SimpleNamespace(global_step=1)
+ control = SimpleNamespace()
+
+ callback.on_step_end(args=args, state=state, control=control)
+
+ # Assert the expected dataloader path was used
+ if use_eval:
+ assert trainer.requested[0] == "eval"
+ assert captured["dataloader"] is trainer._eval_loader
+ else:
+ assert trainer.requested[0] == "train"
+ assert captured["dataloader"] is trainer._train_loader
diff --git a/tests/integrations/test_liger.py b/tests/integrations/test_liger.py
index 5c4bd1028..6865306c9 100644
--- a/tests/integrations/test_liger.py
+++ b/tests/integrations/test_liger.py
@@ -10,7 +10,6 @@ from axolotl.utils.config import prepare_plugins, validate_config
from axolotl.utils.dict import DictDefault
-# pylint: disable=duplicate-code
@pytest.fixture(name="minimal_liger_cfg")
def fixture_cfg():
return DictDefault(
@@ -30,7 +29,6 @@ def fixture_cfg():
)
-# pylint: disable=too-many-public-methods
class TestValidation:
"""
Test the validation module for liger
@@ -77,3 +75,19 @@ class TestValidation:
):
prepare_plugins(test_cfg)
validate_config(test_cfg)
+
+ def test_use_token_scaling_require_flce(self, minimal_liger_cfg):
+ test_cfg = DictDefault(
+ {
+ "liger_fused_linear_cross_entropy": False,
+ "liger_use_token_scaling": True,
+ }
+ | minimal_liger_cfg
+ )
+
+ with pytest.raises(
+ ValueError,
+ match=r"`liger_use_token_scaling: true` requires `liger_fused_linear_cross_entropy` enabled.",
+ ):
+ prepare_plugins(test_cfg)
+ validate_config(test_cfg)
diff --git a/tests/monkeypatch/test_mistral_tokenizer_patch.py b/tests/monkeypatch/test_mistral_tokenizer_patch.py
new file mode 100644
index 000000000..cb82c0890
--- /dev/null
+++ b/tests/monkeypatch/test_mistral_tokenizer_patch.py
@@ -0,0 +1,35 @@
+"""Integration tests for MistralCommonTokenizer patches."""
+
+import pytest
+
+
+class TestMistralTokenizerPatchIntegration:
+ """Test MistralCommonTokenizer patch integration."""
+
+ @pytest.mark.integration
+ def test_mistral_tokenizer_image_patch(self):
+ """Test that MistralCommonTokenizer image patch can be applied."""
+ try:
+ from transformers.tokenization_mistral_common import MistralCommonTokenizer
+ except ImportError:
+ pytest.skip("MistralCommonTokenizer not available")
+
+ from axolotl.monkeypatch.models.mistral3.mistral_common_tokenizer import (
+ apply_mistral_tokenizer_image_patch,
+ )
+
+ # Store original method
+ original_apply_chat_template = MistralCommonTokenizer.apply_chat_template
+
+ # Apply patch
+ apply_mistral_tokenizer_image_patch()
+
+ # Verify patch was applied
+ assert (
+ MistralCommonTokenizer.apply_chat_template != original_apply_chat_template
+ ), "apply_chat_template was not patched"
+
+ # Verify the method is still callable
+ assert callable(MistralCommonTokenizer.apply_chat_template), (
+ "Patched method is not callable"
+ )
diff --git a/tests/monkeypatch/test_pixtral_flash_attention_patch.py b/tests/monkeypatch/test_pixtral_flash_attention_patch.py
new file mode 100644
index 000000000..285fde41e
--- /dev/null
+++ b/tests/monkeypatch/test_pixtral_flash_attention_patch.py
@@ -0,0 +1,77 @@
+"""Integration tests for Pixtral Flash Attention patches."""
+
+import pytest
+import torch
+
+
+class TestPixtralFlashAttentionPatchIntegration:
+ """Test Pixtral Flash Attention patch integration."""
+
+ @pytest.mark.integration
+ def test_pixtral_flash_attention_patch(self):
+ """Test that Pixtral Flash Attention patch can be applied and works correctly."""
+ try:
+ from transformers import modeling_flash_attention_utils
+ except ImportError:
+ pytest.skip("Flash Attention utils not available")
+
+ from axolotl.monkeypatch.models.pixtral.modeling_flash_attention_utils import (
+ apply_patch_is_packed_sequence,
+ )
+
+ # Store original method
+ original_is_packed_sequence = modeling_flash_attention_utils._is_packed_sequence
+
+ # Apply patch and get unpatch function
+ unpatch_fn = apply_patch_is_packed_sequence()
+
+ # Verify patch was applied
+ assert (
+ modeling_flash_attention_utils._is_packed_sequence
+ != original_is_packed_sequence
+ ), "_is_packed_sequence was not patched"
+
+ # Test the patched function with 1D position_ids
+ patched_fn = modeling_flash_attention_utils._is_packed_sequence
+
+ # Test 1D position_ids 1 sequence
+ position_ids_1d = torch.tensor([0, 1, 2, 3])
+ result = patched_fn(position_ids_1d, batch_size=1)
+ assert isinstance(result, bool), "Function should return a boolean"
+ assert result is False, "1D sequential position_ids should not be packed"
+
+ # Test 1D packed 2 sequences
+ position_ids_1d_packed = torch.tensor([0, 1, 2, 0, 1, 2])
+ result = patched_fn(position_ids_1d_packed, batch_size=1)
+ assert isinstance(result, bool), "Function should return a boolean"
+ assert result is True, "1D packed position_ids should be detected as packed"
+
+ # Test 2D packed 2 sequences
+ position_ids_2d_packed = torch.tensor([[0, 1, 2, 3, 0, 1]])
+ result = patched_fn(position_ids_2d_packed, batch_size=1)
+ assert isinstance(result, bool), "Function should return a boolean"
+ assert result is True, "2D packed position_ids should be detected as packed"
+
+ # Test 2D 1 sequence
+ position_ids_2d_normal = torch.tensor([[0, 1, 2, 3, 4, 5]])
+ result = patched_fn(position_ids_2d_normal, batch_size=1)
+ assert isinstance(result, bool), "Function should return a boolean"
+ assert result is False, "2D sequential position_ids should not be packed"
+
+ # Test 2D batch size 2
+ position_ids_2d_normal = torch.tensor([[0, 1, 2, 3, 4, 5, 6, 7, 8]])
+ result = patched_fn(position_ids_2d_normal, batch_size=2)
+ assert isinstance(result, bool), "Function should return a boolean"
+ assert result is False, "2D position_ids batch 2 should not be packed"
+
+ # Test None case
+ result = patched_fn(None, batch_size=1)
+ assert isinstance(result, bool), "Function should return a boolean"
+ assert result is False, "None position_ids should return False"
+
+ # Test unpatch function
+ unpatch_fn()
+ assert (
+ modeling_flash_attention_utils._is_packed_sequence
+ == original_is_packed_sequence
+ ), "unpatch function did not restore original method"
diff --git a/tests/monkeypatch/test_qwen3_next_modeling_patch.py b/tests/monkeypatch/test_qwen3_next_modeling_patch.py
new file mode 100644
index 000000000..91d9fc1cf
--- /dev/null
+++ b/tests/monkeypatch/test_qwen3_next_modeling_patch.py
@@ -0,0 +1,111 @@
+"""Integration tests for Qwen3 Next modeling patches."""
+
+import pytest
+import torch
+
+# Skip entire module if qwen3_next not available
+qwen3_next = pytest.importorskip("transformers.models.qwen3_next.modeling_qwen3_next")
+
+
+class TestQwen3NextModelingPatchIntegration:
+ """Test Qwen3 Next modeling patch integration."""
+
+ @pytest.mark.integration
+ def test_qwen3_next_decoder_layer_patch(self):
+ """Test that Qwen3Next decoder layer patch can be applied."""
+ from axolotl.monkeypatch.models.qwen3_next.modeling import (
+ patch_qwen3_next_decoder_layer,
+ )
+
+ # Store original method
+ original_forward = qwen3_next.Qwen3NextDecoderLayer.forward
+
+ # Apply patch and get unpatch function
+ unpatch_fn = patch_qwen3_next_decoder_layer()
+
+ # Verify patch was applied
+ assert qwen3_next.Qwen3NextDecoderLayer.forward != original_forward, (
+ "decoder layer forward method was not patched"
+ )
+
+ # Verify the method is still callable
+ assert callable(qwen3_next.Qwen3NextDecoderLayer.forward), (
+ "Patched method is not callable"
+ )
+
+ # Test unpatch function
+ if unpatch_fn:
+ unpatch_fn()
+ assert qwen3_next.Qwen3NextDecoderLayer.forward == original_forward, (
+ "unpatch function did not restore original method"
+ )
+
+ @pytest.mark.integration
+ def test_qwen3_next_gateddelta_layer_patch(self):
+ """Test that Qwen3Next GatedDeltaNet patch can be applied."""
+ from axolotl.monkeypatch.models.qwen3_next.modeling import (
+ patch_qwen3_next_gateddelta_layer,
+ )
+
+ # Store original method
+ original_forward = qwen3_next.Qwen3NextGatedDeltaNet.forward
+
+ # Apply patch and get unpatch function
+ unpatch_fn = patch_qwen3_next_gateddelta_layer()
+
+ # Verify patch was applied
+ assert qwen3_next.Qwen3NextGatedDeltaNet.forward != original_forward, (
+ "GatedDeltaNet forward method was not patched"
+ )
+
+ # Verify the method is still callable
+ assert callable(qwen3_next.Qwen3NextGatedDeltaNet.forward), (
+ "Patched method is not callable"
+ )
+
+ # Test unpatch function
+ if unpatch_fn:
+ unpatch_fn()
+ assert qwen3_next.Qwen3NextGatedDeltaNet.forward == original_forward, (
+ "unpatch function did not restore original method"
+ )
+
+ @pytest.mark.integration
+ def test_qwen3_next_imports_patch(self):
+ """Test that Qwen3Next imports patch can be applied without errors."""
+ from axolotl.monkeypatch.models.qwen3_next.modeling import (
+ patch_qwen3_next_imports,
+ )
+
+ # Apply patch - should not raise any exceptions even if modules unavailable
+ unpatch_fn = patch_qwen3_next_imports()
+
+ # Test that unpatch function is returned (or None if skipped)
+ assert unpatch_fn is None or callable(unpatch_fn), (
+ "patch_qwen3_next_imports should return None or callable unpatch function"
+ )
+
+ @pytest.mark.integration
+ def test_qwen3_next_modeling_packing_patch(self):
+ """Test that all Qwen3Next modeling patches can be applied together."""
+ from axolotl.monkeypatch.models.qwen3_next.modeling import (
+ patch_qwen3_next_modeling_packing,
+ )
+
+ # This should not raise any exceptions
+ patch_qwen3_next_modeling_packing()
+
+
+@pytest.mark.integration
+def test_get_cu_seqlens_utility():
+ """Test the get_cu_seqlens utility function."""
+ from axolotl.monkeypatch.models.qwen3_next.modeling import get_cu_seqlens
+
+ # Test with simple position_ids
+ position_ids = torch.tensor([[0, 1, 2, 0, 1]])
+ cu_seqlens = get_cu_seqlens(position_ids)
+ assert cu_seqlens.dtype == torch.int32, "Should be int32 dtype"
+
+ # Should return tensor with start positions and total length
+ expected = torch.tensor([0, 3, 5], dtype=torch.int32)
+ assert torch.equal(cu_seqlens, expected), f"Expected {expected}, got {cu_seqlens}"
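From the single case asserted above, get_cu_seqlens appears to derive cumulative sequence lengths from position-id resets: each 0 starts a new sequence, and the result is the int32 boundary vector that flash-attention-style kernels expect. A hedged reconstruction; the shipped utility may handle batched or non-zero-start inputs differently:

import torch

def cu_seqlens_sketch(position_ids: torch.Tensor) -> torch.Tensor:
    flat = position_ids.flatten()
    starts = torch.nonzero(flat == 0).flatten()          # each reset begins a sequence
    ends = torch.cat([starts[1:], torch.tensor([flat.numel()])])
    return torch.cat([torch.tensor([0]), ends]).to(torch.int32)

# cu_seqlens_sketch(torch.tensor([[0, 1, 2, 0, 1]])) -> tensor([0, 3, 5], dtype=torch.int32)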
diff --git a/tests/monkeypatch/test_trainer_context_parallel_patch.py b/tests/monkeypatch/test_trainer_context_parallel_patch.py
new file mode 100644
index 000000000..84c883e91
--- /dev/null
+++ b/tests/monkeypatch/test_trainer_context_parallel_patch.py
@@ -0,0 +1,66 @@
+"""Tests for the HF Trainer context parallel patch."""
+
+import pytest
+from transformers import Trainer
+
+from axolotl.monkeypatch.transformers.trainer_context_parallel import (
+ GUARD_PATTERN,
+ PATCHED_GUARD,
+ patch_prepare_context_parallel_inputs,
+)
+
+
+@pytest.fixture
+def restore_trainer_prepare_method():
+ """Ensure Trainer._prepare_context_parallel_inputs is restored after a test."""
+ original_method = getattr(
+ Trainer,
+ "_original_prepare_context_parallel_inputs",
+ Trainer._prepare_context_parallel_inputs,
+ )
+ patched_attr_present = hasattr(
+ Trainer, "_axolotl_prepare_context_parallel_inputs_patched"
+ )
+
+ yield
+
+ Trainer._prepare_context_parallel_inputs = original_method
+ if patched_attr_present:
+ delattr(Trainer, "_axolotl_prepare_context_parallel_inputs_patched")
+ if hasattr(Trainer, "_original_prepare_context_parallel_inputs"):
+ delattr(Trainer, "_original_prepare_context_parallel_inputs")
+ if hasattr(Trainer, "_axolotl_prepare_context_parallel_inputs_source"):
+ delattr(Trainer, "_axolotl_prepare_context_parallel_inputs_source")
+
+
+def test_patch_attention_guard(restore_trainer_prepare_method):
+ """Patch should swap the guard to allow sdpa or flash attention."""
+ # Ensure we start from the unpatched method
+ if hasattr(Trainer, "_original_prepare_context_parallel_inputs"):
+ Trainer._prepare_context_parallel_inputs = (
+ Trainer._original_prepare_context_parallel_inputs
+ )
+ delattr(Trainer, "_original_prepare_context_parallel_inputs")
+ if hasattr(Trainer, "_axolotl_prepare_context_parallel_inputs_patched"):
+ delattr(Trainer, "_axolotl_prepare_context_parallel_inputs_patched")
+
+ patch_prepare_context_parallel_inputs()
+
+ patched_method = Trainer._prepare_context_parallel_inputs
+ assert patched_method is not None
+ assert getattr(Trainer, "_axolotl_prepare_context_parallel_inputs_patched", False)
+
+ source = Trainer._axolotl_prepare_context_parallel_inputs_source
+ assert GUARD_PATTERN not in source
+ assert PATCHED_GUARD in source
+
+
+def test_patch_is_idempotent(restore_trainer_prepare_method):
+ """Calling the patch twice should leave the same patched function in place."""
+ patch_prepare_context_parallel_inputs()
+ first_patched = Trainer._prepare_context_parallel_inputs
+
+ patch_prepare_context_parallel_inputs()
+ second_patched = Trainer._prepare_context_parallel_inputs
+
+ assert first_patched is second_patched
diff --git a/tests/monkeypatch/test_trainer_loss_calc.py b/tests/monkeypatch/test_trainer_loss_calc.py
index de3e92621..c72cb621b 100644
--- a/tests/monkeypatch/test_trainer_loss_calc.py
+++ b/tests/monkeypatch/test_trainer_loss_calc.py
@@ -3,7 +3,6 @@
import unittest
from axolotl.monkeypatch.transformers.trainer_loss_calc import (
- check_evaluation_loop_is_fsdp2_patchable,
check_evaluation_loop_is_patchable,
check_maybe_log_save_evaluate_is_patchable,
)
@@ -20,7 +19,6 @@ class TestTrainerLossCalc(unittest.TestCase):
the patched code changes upstream.
"""
assert check_evaluation_loop_is_patchable()
- assert check_evaluation_loop_is_fsdp2_patchable()
assert check_maybe_log_save_evaluate_is_patchable()
diff --git a/tests/monkeypatch/test_voxtral_modeling_patch.py b/tests/monkeypatch/test_voxtral_modeling_patch.py
new file mode 100644
index 000000000..878bbc185
--- /dev/null
+++ b/tests/monkeypatch/test_voxtral_modeling_patch.py
@@ -0,0 +1,43 @@
+"""Integration tests for Voxtral modeling patches."""
+
+import pytest
+
+
+class TestVoxtralModelingPatchIntegration:
+ """Test Voxtral modeling patch integration."""
+
+ @pytest.mark.integration
+ def test_voxtral_conditional_generation_patch(self):
+ """Test that Voxtral conditional generation patch can be applied."""
+ try:
+ from transformers.models.voxtral.modeling_voxtral import (
+ VoxtralForConditionalGeneration,
+ )
+ except ImportError:
+ pytest.skip("VoxtralForConditionalGeneration not available")
+
+ from axolotl.monkeypatch.models.voxtral.modeling import (
+ patch_voxtral_conditional_generation_forward,
+ )
+
+ # Store original method
+ original_forward = VoxtralForConditionalGeneration.forward
+
+ # Apply patch and get unpatch function
+ unpatch_fn = patch_voxtral_conditional_generation_forward()
+
+ # Verify patch was applied
+ assert VoxtralForConditionalGeneration.forward != original_forward, (
+ "forward method was not patched"
+ )
+
+ # Verify the method is still callable
+ assert callable(VoxtralForConditionalGeneration.forward), (
+ "Patched method is not callable"
+ )
+
+ # Test unpatch function
+ unpatch_fn()
+ assert VoxtralForConditionalGeneration.forward == original_forward, (
+ "unpatch function did not restore original method"
+ )
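The voxtral, pixtral, and qwen3-next tests above all exercise the same contract: a patch function saves the original attribute, installs a replacement, and returns a callable that restores it. A generic sketch of that contract; the function and argument names here are placeholders, not axolotl APIs:

def apply_patch(cls, attr, replacement):
    original = getattr(cls, attr)
    setattr(cls, attr, replacement)

    def unpatch():
        # Restore the exact original so patch/unpatch round-trips cleanly.
        setattr(cls, attr, original)

    return unpatch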
diff --git a/tests/patched/test_validation.py b/tests/patched/test_validation.py
index 677512d3d..21299ed98 100644
--- a/tests/patched/test_validation.py
+++ b/tests/patched/test_validation.py
@@ -1,4 +1,3 @@
-# pylint: disable=too-many-lines
"""Module for testing the validation module"""
import os
@@ -49,7 +48,6 @@ class BaseValidation:
self._caplog = caplog
-# pylint: disable=too-many-public-methods
class TestValidation(BaseValidation):
"""
Test the validation module
@@ -241,7 +239,7 @@ class TestValidation(BaseValidation):
def test_lr_as_float(self, minimal_cfg):
cfg = (
- DictDefault( # pylint: disable=unsupported-binary-operation
+ DictDefault(
{
"learning_rate": "5e-5",
}
@@ -303,7 +301,7 @@ class TestValidation(BaseValidation):
)
cfg = (
- DictDefault( # pylint: disable=unsupported-binary-operation
+ DictDefault(
{
"load_in_8bit": True,
}
@@ -315,7 +313,7 @@ class TestValidation(BaseValidation):
validate_config(cfg)
cfg = (
- DictDefault( # pylint: disable=unsupported-binary-operation
+ DictDefault(
{
"gptq": True,
}
@@ -327,7 +325,7 @@ class TestValidation(BaseValidation):
validate_config(cfg)
cfg = (
- DictDefault( # pylint: disable=unsupported-binary-operation
+ DictDefault(
{
"load_in_4bit": False,
}
@@ -339,7 +337,7 @@ class TestValidation(BaseValidation):
validate_config(cfg)
cfg = (
- DictDefault( # pylint: disable=unsupported-binary-operation
+ DictDefault(
{
"load_in_4bit": True,
}
@@ -361,7 +359,7 @@ class TestValidation(BaseValidation):
)
cfg = (
- DictDefault( # pylint: disable=unsupported-binary-operation
+ DictDefault(
{
"load_in_8bit": True,
}
@@ -373,7 +371,7 @@ class TestValidation(BaseValidation):
validate_config(cfg)
cfg = (
- DictDefault( # pylint: disable=unsupported-binary-operation
+ DictDefault(
{
"gptq": True,
}
@@ -385,7 +383,7 @@ class TestValidation(BaseValidation):
validate_config(cfg)
cfg = (
- DictDefault( # pylint: disable=unsupported-binary-operation
+ DictDefault(
{
"load_in_4bit": True,
}
diff --git a/tests/prompt_strategies/conftest.py b/tests/prompt_strategies/conftest.py
index 7f942e0ef..0af7b3e93 100644
--- a/tests/prompt_strategies/conftest.py
+++ b/tests/prompt_strategies/conftest.py
@@ -30,7 +30,6 @@ def fixture_assistant_dataset():
@pytest.fixture(name="sharegpt_dataset")
def fixture_sharegpt_dataset():
- # pylint: disable=duplicate-code
return Dataset.from_list(
[
{
@@ -47,7 +46,6 @@ def fixture_sharegpt_dataset():
@pytest.fixture(name="basic_dataset")
def fixture_basic_dataset():
- # pylint: disable=duplicate-code
return Dataset.from_list(
[
{
@@ -65,7 +63,6 @@ def fixture_basic_dataset():
@pytest.fixture(name="toolcalling_dataset")
def fixture_toolcalling_dataset():
- # pylint: disable=duplicate-code
return Dataset.from_list(
[
{
@@ -112,7 +109,7 @@ def fixture_toolcalling_dataset():
@enable_hf_offline
def fixture_llama3_tokenizer(
download_llama3_8b_instruct_model_fixture,
-): # pylint: disable=unused-argument,redefined-outer-name
+):
tokenizer = AutoTokenizer.from_pretrained("NousResearch/Meta-Llama-3-8B-Instruct")
return tokenizer
@@ -129,7 +126,7 @@ def fixture_smollm2_tokenizer():
@enable_hf_offline
def fixture_mistralv03_tokenizer(
download_mlx_mistral_7b_model_fixture,
-): # pylint: disable=unused-argument,redefined-outer-name
+):
tokenizer = AutoTokenizer.from_pretrained(
"mlx-community/Mistral-7B-Instruct-v0.3-4bit"
)
@@ -180,6 +177,15 @@ def fixture_devstral_1_1_tokenizer():
return tokenizer
+@pytest.fixture(name="qwen3_tokenizer")
+def qwen3_tokenizer_fixture(
+ download_qwen3_half_billion_model,
+): # pylint: disable=unused-argument,redefined-outer-name
+ tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")
+
+ return tokenizer
+
+
@pytest.fixture(name="mistralv03_tokenizer_chat_template_jinja")
def fixture_mistralv03_chat_template_jinja_w_system() -> str:
return '{%- if messages[0]["role"] == "system" %}\n {%- set system_message = messages[0]["content"] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n{%- set user_messages = loop_messages | selectattr("role", "equalto", "user") | list %}\n\n{#- This block checks for alternating user/assistant messages, skipping tool calling messages #}\n{%- set ns = namespace() %}\n{%- set ns.index = 0 %}\n{%- for message in loop_messages %}\n {%- if not (message.role == "tool" or message.role == "tool_results" or (message.tool_calls is defined and message.tool_calls is not none)) %}\n {%- if (message["role"] == "user") != (ns.index % 2 == 0) %}\n {{- raise_exception("After the optional system message, conversation roles must alternate user/assistant/user/assistant/...") }}\n {%- endif %}\n {%- set ns.index = ns.index + 1 %}\n {%- endif %}\n{%- endfor %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n {%- if message["role"] == "user" %}\n {%- if tools is not none and (message == user_messages[-1]) %}\n {{- "[AVAILABLE_TOOLS] [" }}\n {%- for tool in tools %}\n {%- set tool = tool.function %}\n {{- \'{"type": "function", "function": {\' }}\n {%- for key, val in tool.items() if key != "return" %}\n {%- if val is string %}\n {{- \'"\' + key + \'": "\' + val + \'"\' }}\n {%- else %}\n {{- \'"\' + key + \'": \' + val|tojson }}\n {%- endif %}\n {%- if not loop.last %}\n {{- ", " }}\n {%- endif %}\n {%- endfor %}\n {{- "}}" }}\n {%- if not loop.last %}\n {{- ", " }}\n {%- else %}\n {{- "]" }}\n {%- endif %}\n {%- endfor %}\n {{- "[/AVAILABLE_TOOLS]" }}\n {%- endif %}\n {%- if loop.first and system_message is defined %}\n {{- "[INST] " + system_message + "\\n\\n" + message["content"] + "[/INST]" }}\n {%- else %}\n {{- "[INST] " + message["content"] + "[/INST]" }}\n {%- endif %}\n {%- elif message.tool_calls is defined and message.tool_calls is not none %}\n {{- "[TOOL_CALLS] [" }}\n {%- for tool_call in message.tool_calls %}\n {%- set out = tool_call.function|tojson %}\n {{- out[:-1] }}\n {%- if not tool_call.id is defined or tool_call.id|length != 9 %}\n {{- raise_exception("Tool call IDs should be alphanumeric strings with length 9!") }}\n {%- endif %}\n {{- \', "id": "\' + tool_call.id + \'"}\' }}\n {%- if not loop.last %}\n {{- ", " }}\n {%- else %}\n {{- "]" + eos_token }}\n {%- endif %}\n {%- endfor %}\n {%- elif message["role"] == "assistant" %}\n {{- " " + message["content"]|trim + eos_token}}\n {%- elif message["role"] == "tool_results" or message["role"] == "tool" %}\n {%- if message.content is defined and message.content.content is defined %}\n {%- set content = message.content.content %}\n {%- else %}\n {%- set content = message.content %}\n {%- endif %}\n {{- \'[TOOL_RESULTS] {"content": \' + content|string + ", " }}\n {%- if not message.tool_call_id is defined or message.tool_call_id|length != 9 %}\n {{- raise_exception("Tool call IDs should be alphanumeric strings with length 9!") }}\n {%- endif %}\n {{- \'"call_id": "\' + message.tool_call_id + \'"}[/TOOL_RESULTS]\' }}\n {%- else %}\n {{- raise_exception("Only user and assistant roles are supported, with the exception of an initial optional system message!") }}\n {%- endif %}\n{%- endfor %}\n'
diff --git a/tests/prompt_strategies/messages/test_chat.py b/tests/prompt_strategies/messages/test_chat.py
index a4c2ae67f..f083232a8 100644
--- a/tests/prompt_strategies/messages/test_chat.py
+++ b/tests/prompt_strategies/messages/test_chat.py
@@ -2,7 +2,6 @@
tests for chat_template prompt strategy
"""
-# pylint: disable=duplicate-code
import unittest
from axolotl.prompt_strategies.messages.chat import load
@@ -53,9 +52,9 @@ class TestMessagesChatLlama3:
# fmt: on
LOG.debug(f"Expected input_ids: {expected_input_ids}")
LOG.debug(f"Actual input_ids: {input_ids}")
- assert (
- input_ids == expected_input_ids
- ), f"Input IDs mismatch: {input_ids} != {expected_input_ids}"
+ assert input_ids == expected_input_ids, (
+ f"Input IDs mismatch: {input_ids} != {expected_input_ids}"
+ )
if __name__ == "__main__":
diff --git a/tests/prompt_strategies/test_alpaca.py b/tests/prompt_strategies/test_alpaca.py
index 78f783747..b96ebce19 100644
--- a/tests/prompt_strategies/test_alpaca.py
+++ b/tests/prompt_strategies/test_alpaca.py
@@ -30,7 +30,6 @@ def fixture_alpaca_dataset():
@pytest.fixture(name="tokenizer")
@enable_hf_offline
def fixture_tokenizer():
- # pylint: disable=all
tokenizer = AutoTokenizer.from_pretrained(
"casperhansen/mistral-7b-instruct-v0.1-awq"
)
diff --git a/tests/prompt_strategies/test_chat_template_ds_schema_unification.py b/tests/prompt_strategies/test_chat_template_ds_schema_unification.py
index 502efae4b..4f4e32208 100644
--- a/tests/prompt_strategies/test_chat_template_ds_schema_unification.py
+++ b/tests/prompt_strategies/test_chat_template_ds_schema_unification.py
@@ -6,7 +6,6 @@ import json
import pytest
from datasets import Dataset
-from transformers import AutoTokenizer
from axolotl.prompt_strategies.chat_template import StrategyLoader
from axolotl.utils.dict import DictDefault
@@ -18,22 +17,11 @@ def fixture_messages_w_tools():
{"messages":[{"role":"user","content":"move to (0, 1)"},{"role":"assistant","content":"","tool_calls":[{"function":{"name":"move","arguments":{"x":0,"y":1}}}]}],"tools":[{"type":"function","function":{"name":"move","description":"Move to a given location measured in meters","parameters":{"type":"object","properties":{"x":{"type":"number","description":"The x coordinate of the location, negative values are to the left, positive values are to the right"},"y":{"type":"number","description":"The y coordinate of the location, negative values are backward, positive values are forward"}},"required":["x","y"]}}},{"type":"function","function":{"name":"turn","description":"Turn the robot to a given direction","parameters":{"type":"object","properties":{"theta":{"type":"integer","description":"The angle to turn to, in degrees, positive values are counter-clockwise, negative values are clockwise"}},"required":["theta"]}}},{"type":"function","function":{"name":"invalid_prompt","description":"call when the user's prompt is invalid","parameters":{"type":"object","properties":{"message":{"type":"string","description":"why the prompt is invalid"}},"required":["message"]}}}],"add_generation_prompt":false}
{"messages":[{"role":"user","content":"turn 270 degree"},{"role":"assistant","content":"","tool_calls":[{"function":{"name":"turn","arguments":{"theta": 270}}}]}],"tools":[{"type":"function","function":{"name":"move","description":"Move to a given location measured in meters","parameters":{"type":"object","properties":{"x":{"type":"number","description":"The x coordinate of the location, negative values are to the left, positive values are to the right"},"y":{"type":"number","description":"The y coordinate of the location, negative values are backward, positive values are forward"}},"required":["x","y"]}}},{"type":"function","function":{"name":"turn","description":"Turn the robot to a given direction","parameters":{"type":"object","properties":{"theta":{"type":"integer","description":"The angle to turn to, in degrees, positive values are counter-clockwise, negative values are clockwise"}},"required":["theta"]}}},{"type":"function","function":{"name":"invalid_prompt","description":"call when the user's prompt is invalid","parameters":{"type":"object","properties":{"message":{"type":"string","description":"why the prompt is invalid"}},"required":["message"]}}}],"add_generation_prompt":false}
{"messages":[{"role":"user","content":"jump high"},{"role":"assistant","content":"","tool_calls":[{"function":{"name":"invalid_prompt","arguments":{"message": "jump is not a valid action"}}}]}],"tools":[{"type":"function","function":{"name":"move","description":"Move to a given location measured in meters","parameters":{"type":"object","properties":{"x":{"type":"number","description":"The x coordinate of the location, negative values are to the left, positive values are to the right"},"y":{"type":"number","description":"The y coordinate of the location, negative values are backward, positive values are forward"}},"required":["x","y"]}}},{"type":"function","function":{"name":"turn","description":"Turn the robot to a given direction","parameters":{"type":"object","properties":{"theta":{"type":"integer","description":"The angle to turn to, in degrees, positive values are counter-clockwise, negative values are clockwise"}},"required":["theta"]}}},{"type":"function","function":{"name":"invalid_prompt","description":"call when the user's prompt is invalid","parameters":{"type":"object","properties":{"message":{"type":"string","description":"why the prompt is invalid"}},"required":["message"]}}}],"add_generation_prompt":false}
- """.strip().split(
- "\n"
- )
+ """.strip().split("\n")
rows = [json.loads(row) for row in jsons]
return Dataset.from_list(rows)
-@pytest.fixture(name="qwen3_tokenizer")
-def qwen3_tokenizer_fixture(
- download_qwen3_half_billion_model,
-): # pylint: disable=unused-argument
- tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")
-
- return tokenizer
-
-
@pytest.fixture(name="qwen3_prompt_strategy")
def qwen3_chat_template_strategy(qwen3_tokenizer):
cfg = DictDefault(
diff --git a/tests/prompt_strategies/test_chat_templates.py b/tests/prompt_strategies/test_chat_templates.py
index 371ccf616..90e0e274b 100644
--- a/tests/prompt_strategies/test_chat_templates.py
+++ b/tests/prompt_strategies/test_chat_templates.py
@@ -67,9 +67,9 @@ class TestAssistantChatTemplateLlama3:
# fmt: on
LOG.debug(f"Expected input_ids: {expected_input_ids}")
LOG.debug(f"Actual input_ids: {input_ids}")
- assert (
- input_ids == expected_input_ids
- ), f"Input IDs mismatch: {input_ids} != {expected_input_ids}"
+ assert input_ids == expected_input_ids, (
+ f"Input IDs mismatch: {input_ids} != {expected_input_ids}"
+ )
def test_llama3(self, llama3_tokenizer, assistant_dataset):
LOG.info("Testing llama-3 with assistant dataset")
@@ -109,9 +109,9 @@ class TestAssistantChatTemplateLlama3:
# fmt: on
LOG.debug(f"Expected input_ids: {expected_input_ids}")
LOG.debug(f"Actual input_ids: {input_ids}")
- assert (
- input_ids == expected_input_ids
- ), f"Input IDs mismatch: {input_ids} != {expected_input_ids}"
+ assert input_ids == expected_input_ids, (
+ f"Input IDs mismatch: {input_ids} != {expected_input_ids}"
+ )
def test_phi35(self, phi35_tokenizer, assistant_dataset):
LOG.info("Testing phi-3.5 with assistant dataset")
@@ -161,15 +161,15 @@ class TestAssistantChatTemplateLlama3:
# fmt: on
LOG.debug(f"Expected input_ids: {expected_input_ids}")
LOG.debug(f"Actual input_ids: {input_ids}")
- assert (
- input_ids == expected_input_ids
- ), f"Input IDs mismatch: {input_ids} != {expected_input_ids}"
+ assert input_ids == expected_input_ids, (
+ f"Input IDs mismatch: {input_ids} != {expected_input_ids}"
+ )
LOG.debug(f"Expected labels : {expected_labels}")
LOG.debug(f"Actual labels : {labels}")
- assert (
- labels == expected_labels
- ), f"Input IDs mismatch: {labels} != {expected_labels}"
+ assert labels == expected_labels, (
+                f"Labels mismatch: {labels} != {expected_labels}"
+ )
def test_llama3_with_training_data(self, llama3_tokenizer, assistant_dataset):
LOG.info("Testing llama-3 with assistant dataset including training data")
@@ -234,7 +234,7 @@ class TestSharegptChatTemplateLlama3:
def test_llama3_assistant(self, llama3_tokenizer, sharegpt_dataset):
LOG.info("Testing ShareGPT style datasets with llama-3 assistant prompts")
- # pylint: disable=duplicate-code
+
strategy = ChatTemplateStrategy(
ChatTemplatePrompter(
llama3_tokenizer,
@@ -285,16 +285,16 @@ class TestSharegptChatTemplateLlama3:
LOG.debug(f"Expected labels: {expected_labels}")
LOG.debug(f"Actual labels: {labels}")
- assert (
- input_ids == expected_input_ids
- ), f"Input IDs mismatch: {input_ids} != {expected_input_ids}"
- assert (
- labels == expected_labels
- ), f"Labels mismatch: {labels} != {expected_labels}"
+ assert input_ids == expected_input_ids, (
+ f"Input IDs mismatch: {input_ids} != {expected_input_ids}"
+ )
+ assert labels == expected_labels, (
+ f"Labels mismatch: {labels} != {expected_labels}"
+ )
def test_llama3_human(self, llama3_tokenizer, sharegpt_dataset):
LOG.info("Testing ShareGPT style datasets with llama-3 human prompts")
- # pylint: disable=duplicate-code
+
strategy = ChatTemplateStrategy(
ChatTemplatePrompter(
llama3_tokenizer,
@@ -345,16 +345,16 @@ class TestSharegptChatTemplateLlama3:
LOG.debug(f"Expected labels: {expected_labels}")
LOG.debug(f"Actual labels: {labels}")
- assert (
- input_ids == expected_input_ids
- ), f"Input IDs mismatch: {input_ids} != {expected_input_ids}"
- assert (
- labels == expected_labels
- ), f"Labels mismatch: {labels} != {expected_labels}"
+ assert input_ids == expected_input_ids, (
+ f"Input IDs mismatch: {input_ids} != {expected_input_ids}"
+ )
+ assert labels == expected_labels, (
+ f"Labels mismatch: {labels} != {expected_labels}"
+ )
def test_llama3_system_human(self, llama3_tokenizer, basic_dataset):
LOG.info("Testing ShareGPT style datasets with llama-3 system/human prompts")
- # pylint: disable=duplicate-code
+
strategy = ChatTemplateStrategy(
ChatTemplatePrompter(
llama3_tokenizer,
@@ -409,12 +409,12 @@ class TestSharegptChatTemplateLlama3:
LOG.debug(f"Expected labels: {expected_labels}")
LOG.debug(f"Actual labels: {labels}")
- assert (
- input_ids == expected_input_ids
- ), f"Input IDs mismatch: {input_ids} != {expected_input_ids}"
- assert (
- labels == expected_labels
- ), f"Labels mismatch: {labels} != {expected_labels}"
+ assert input_ids == expected_input_ids, (
+ f"Input IDs mismatch: {input_ids} != {expected_input_ids}"
+ )
+ assert labels == expected_labels, (
+ f"Labels mismatch: {labels} != {expected_labels}"
+ )
class TestAssistantToolCallingChatTemplateLlama32Vision:
@@ -481,13 +481,13 @@ class TestAssistantToolCallingChatTemplateLlama32Vision:
]
# fmt: on
- assert (
- input_ids == expected_input_ids
- ), f"Input IDs mismatch: {input_ids} != {expected_input_ids}"
+ assert input_ids == expected_input_ids, (
+ f"Input IDs mismatch: {input_ids} != {expected_input_ids}"
+ )
- assert (
- labels == expected_labels
- ), f"Labels mismatch: {labels} != {expected_labels}"
+ assert labels == expected_labels, (
+ f"Labels mismatch: {labels} != {expected_labels}"
+ )
def test_llama32vision_train_on_tools(
self, llama3_tokenizer, toolcalling_dataset, llama3_2_vision_chat_template_jinja
@@ -495,7 +495,6 @@ class TestAssistantToolCallingChatTemplateLlama32Vision:
LOG.info(
"Testing assistant style datasets with tool_calling with llama-32 chat template, training on tools"
)
- # pylint: disable=duplicate-code
strategy = ChatTemplateStrategy(
ChatTemplatePrompter(
@@ -549,13 +548,13 @@ class TestAssistantToolCallingChatTemplateLlama32Vision:
]
# fmt: on
- assert (
- input_ids == expected_input_ids
- ), f"Input IDs mismatch: {input_ids} != {expected_input_ids}"
+ assert input_ids == expected_input_ids, (
+ f"Input IDs mismatch: {input_ids} != {expected_input_ids}"
+ )
- assert (
- labels == expected_labels
- ), f"Labels mismatch: {labels} != {expected_labels}"
+ assert labels == expected_labels, (
+ f"Labels mismatch: {labels} != {expected_labels}"
+ )
if __name__ == "__main__":
diff --git a/tests/prompt_strategies/test_chat_templates_advanced.py b/tests/prompt_strategies/test_chat_templates_advanced.py
index f847cab4a..fd39a4305 100644
--- a/tests/prompt_strategies/test_chat_templates_advanced.py
+++ b/tests/prompt_strategies/test_chat_templates_advanced.py
@@ -2,8 +2,6 @@
tests for chat_template prompt strategy
"""
-# pylint: disable=too-many-lines
-
from copy import deepcopy
import pytest
@@ -96,9 +94,9 @@ class TestChatTemplateConfigurations:
and turn.get("from") in ["system", "context"]
and ("mistral" in tokenizer.name_or_path.lower())
):
- assert (
- start_idx == -1 and end_idx == -1
- ), "Expected system message to be skipped"
+ assert start_idx == -1 and end_idx == -1, (
+ "Expected system message to be skipped"
+ )
return True
return False
@@ -155,7 +153,9 @@ class TestChatTemplateConfigurations:
assert all(
label != IGNORE_TOKEN_ID for label in labels[start_idx:end_idx]
- ), f"Expected labels for input '{response}' to be ignored, but got {labels[start_idx:end_idx]}"
+ ), (
+            f"Expected labels for input '{response}' to be set, but got {labels[start_idx:end_idx]}"
+ )
LOG.debug("Full labels: %s", labels)
LOG.debug("Full input_ids: %s", input_ids)
@@ -215,11 +215,15 @@ class TestChatTemplateConfigurations:
if is_assistant:
assert all(
label != IGNORE_TOKEN_ID for label in labels[start_idx:end_idx]
- ), f"Expected labels for assistant response '{response}' to be set, but got {labels[start_idx:end_idx]}"
+ ), (
+ f"Expected labels for assistant response '{response}' to be set, but got {labels[start_idx:end_idx]}"
+ )
else:
assert all(
label == IGNORE_TOKEN_ID for label in labels[start_idx:end_idx]
- ), f"Expected labels for human input '{response}' to be IGNORE_TOKEN_ID, but got {labels[start_idx:end_idx]}"
+ ), (
+ f"Expected labels for human input '{response}' to be IGNORE_TOKEN_ID, but got {labels[start_idx:end_idx]}"
+ )
def test_roles_to_train_human_assistant_only(
self,
@@ -276,11 +280,15 @@ class TestChatTemplateConfigurations:
if should_be_labelled:
assert all(
label != IGNORE_TOKEN_ID for label in labels[start_idx:end_idx]
- ), f"Expected labels for assistant response '{response}' to be set, but got {labels[start_idx:end_idx]}"
+ ), (
+ f"Expected labels for assistant response '{response}' to be set, but got {labels[start_idx:end_idx]}"
+ )
else:
assert all(
label == IGNORE_TOKEN_ID for label in labels[start_idx:end_idx]
- ), f"Expected labels for human input '{response}' to be IGNORE_TOKEN_ID, but got {labels[start_idx:end_idx]}"
+ ), (
+ f"Expected labels for human input '{response}' to be IGNORE_TOKEN_ID, but got {labels[start_idx:end_idx]}"
+ )
def test_roles_to_train_all(
self,
@@ -327,13 +335,15 @@ class TestChatTemplateConfigurations:
continue
decoded_response = tokenizer.decode(input_ids[start_idx:end_idx])
- assert (
- response in decoded_response
- ), f"Response {response} not found in index {start_idx}:{end_idx} decoded:{decoded_response}"
+ assert response in decoded_response, (
+ f"Response {response} not found in index {start_idx}:{end_idx} decoded:{decoded_response}"
+ )
assert all(
label != IGNORE_TOKEN_ID for label in labels[start_idx:end_idx]
- ), f"Expected labels for response '{response}' to be set, but got {labels[start_idx:end_idx]}"
+ ), (
+ f"Expected labels for response '{response}' to be set, but got {labels[start_idx:end_idx]}"
+ )
def test_empty_roles_to_train(
self,
@@ -371,9 +381,9 @@ class TestChatTemplateConfigurations:
# Verify that no labels are set when roles_to_train is empty
LOG.debug("Full labels: %s", labels)
- assert all(
- label == IGNORE_TOKEN_ID for label in labels
- ), "Expected all labels to be IGNORE_TOKEN_ID when roles_to_train is empty"
+ assert all(label == IGNORE_TOKEN_ID for label in labels), (
+ "Expected all labels to be IGNORE_TOKEN_ID when roles_to_train is empty"
+ )
def test_train_on_eos_all(
self,
@@ -417,9 +427,9 @@ class TestChatTemplateConfigurations:
assert len(eos_indices) > 0, "Expected at least one EOS token in the input"
for eos_idx in eos_indices:
- assert (
- labels[eos_idx] != IGNORE_TOKEN_ID
- ), f"Expected EOS token at index {eos_idx} to be labeled"
+ assert labels[eos_idx] != IGNORE_TOKEN_ID, (
+ f"Expected EOS token at index {eos_idx} to be labeled"
+ )
def test_train_on_eos_turn(
self,
@@ -477,9 +487,9 @@ class TestChatTemplateConfigurations:
while eos_idx < len(input_ids) and input_ids[eos_idx] != eos_token_id:
eos_idx += 1
- assert eos_idx < len(
- input_ids
- ), f"Could not find EOS token after '{response}'"
+ assert eos_idx < len(input_ids), (
+ f"Could not find EOS token after '{response}'"
+ )
LOG.debug(
f"Turn {i}: role={turn['from']}, content='{turn['value']}', start_idx={start_idx}, end_idx={end_idx}, eos_idx={eos_idx}"
@@ -492,13 +502,13 @@ class TestChatTemplateConfigurations:
# Verify EOS token labeling based on role
is_assistant = turn["from"] == "assistant"
if is_assistant:
- assert (
- labels[eos_idx] != IGNORE_TOKEN_ID
- ), f"Expected EOS token after assistant response '{response}' to be labeled"
+ assert labels[eos_idx] != IGNORE_TOKEN_ID, (
+ f"Expected EOS token after assistant response '{response}' to be labeled"
+ )
else:
- assert (
- labels[eos_idx] == IGNORE_TOKEN_ID
- ), f"Expected EOS token after non-assistant input '{response}' to not be labeled"
+ assert labels[eos_idx] == IGNORE_TOKEN_ID, (
+ f"Expected EOS token after non-assistant input '{response}' to not be labeled"
+ )
def test_train_on_eos_last(
self,
@@ -545,12 +555,12 @@ class TestChatTemplateConfigurations:
# Check that only the last EOS token is labeled
for idx in eos_indices[:-1]:
- assert (
- labels[idx] == IGNORE_TOKEN_ID
- ), f"Expected EOS token at index {idx} to not be labeled"
- assert (
- labels[last_eos_idx] != IGNORE_TOKEN_ID
- ), f"Expected last EOS token at index {last_eos_idx} to be labeled"
+ assert labels[idx] == IGNORE_TOKEN_ID, (
+ f"Expected EOS token at index {idx} to not be labeled"
+ )
+ assert labels[last_eos_idx] != IGNORE_TOKEN_ID, (
+ f"Expected last EOS token at index {last_eos_idx} to be labeled"
+ )
def test_train_on_eos_none(
self,
@@ -594,9 +604,9 @@ class TestChatTemplateConfigurations:
assert len(eos_indices) > 0, "Expected at least one EOS token in the input"
for eos_idx in eos_indices:
- assert (
- labels[eos_idx] == IGNORE_TOKEN_ID
- ), f"Expected EOS token at index {eos_idx} to not be labeled"
+ assert labels[eos_idx] == IGNORE_TOKEN_ID, (
+ f"Expected EOS token at index {eos_idx} to not be labeled"
+ )
def test_drop_system_message(
self,
@@ -634,9 +644,9 @@ class TestChatTemplateConfigurations:
# Check if system message is not present in input_ids
system_message = "You are an AI assistant."
decoded_message = tokenizer.decode(input_ids)
- assert (
- system_message not in decoded_message
- ), "Expected system message to be dropped"
+ assert system_message not in decoded_message, (
+ "Expected system message to be dropped"
+ )
def test_custom_roles(
self,
@@ -711,7 +721,9 @@ class TestChatTemplateConfigurations:
else:
assert all(
label == IGNORE_TOKEN_ID for label in labels[start_idx:end_idx]
- ), f"Expected labels for non-AI message '{response}' to be IGNORE_TOKEN_ID"
+ ), (
+ f"Expected labels for non-AI message '{response}' to be IGNORE_TOKEN_ID"
+ )
def test_message_field_training(
self,
@@ -776,13 +788,13 @@ class TestChatTemplateConfigurations:
def verify_labels(labels_span, should_train, context_message):
"""Helper to verify if a span of labels matches expected training state"""
if should_train:
- assert all(
- label != IGNORE_TOKEN_ID for label in labels_span
- ), f"Expected all labels for {context_message} to be set, but got {labels_span}"
+ assert all(label != IGNORE_TOKEN_ID for label in labels_span), (
+ f"Expected all labels for {context_message} to be set, but got {labels_span}"
+ )
else:
- assert all(
- label == IGNORE_TOKEN_ID for label in labels_span
- ), f"Expected all labels for {context_message} to be {IGNORE_TOKEN_ID}, but got {labels_span}"
+ assert all(label == IGNORE_TOKEN_ID for label in labels_span), (
+ f"Expected all labels for {context_message} to be {IGNORE_TOKEN_ID}, but got {labels_span}"
+ )
# Process all turns and verify labeling
for i, turn in enumerate(modified_dataset[0]["messages"]):
@@ -861,9 +873,9 @@ class TestChatTemplateConfigurations:
actual_labels = labels[
start_idx : start_idx + len(token_offsets_masked)
]
- assert (
- actual_labels == expected_labels
- ), f"Labels mismatch for turn: {turn['value']}\nExpected: {expected_labels}\nActual: {actual_labels}"
+ assert actual_labels == expected_labels, (
+ f"Labels mismatch for turn: {turn['value']}\nExpected: {expected_labels}\nActual: {actual_labels}"
+ )
# Verify each detail section
for detail in adjusted_train_details:
@@ -958,7 +970,7 @@ class TestChatTemplateConfigurations:
chat_template,
chat_template_jinja,
eos_token,
- basic_dataset, # pylint: disable=unused-argument
+ basic_dataset,
request,
):
"""Test that an error is raised when eot_tokens contains eos_token and train_on_eot/train_on_eos conflict"""
@@ -1005,7 +1017,7 @@ class TestChatTemplateConfigurations:
chat_template,
chat_template_jinja,
eos_token,
- basic_dataset, # pylint: disable=unused-argument
+ basic_dataset,
request,
):
"""Test that eot_tokens inherits from eos_token when not specified"""
@@ -1032,12 +1044,12 @@ class TestChatTemplateConfigurations:
)
# In backward compatibility mode, eot_tokens should be derived from eos_token
- assert strategy.eot_tokens == [
- tokenizer.eos_token
- ], f"Expected eot_tokens to inherit from eos_token, got {strategy.eot_tokens}"
- assert (
- strategy.train_on_eot == "turn"
- ), f"Expected train_on_eot to inherit from train_on_eos, got {strategy.train_on_eot}"
+ assert strategy.eot_tokens == [tokenizer.eos_token], (
+ f"Expected eot_tokens to inherit from eos_token, got {strategy.eot_tokens}"
+ )
+ assert strategy.train_on_eot == "turn", (
+ f"Expected train_on_eot to inherit from train_on_eos, got {strategy.train_on_eot}"
+ )
def test_token_not_in_template(
self,
@@ -1091,7 +1103,7 @@ class TestChatTemplateConfigurations:
tokenizer,
chat_template,
chat_template_jinja,
- eos_token, # pylint: disable=unused-argument
+ eos_token,
basic_dataset,
request,
):
@@ -1157,13 +1169,13 @@ class TestChatTemplateConfigurations:
)
if is_after_assistant:
- assert (
- labels[eot_idx] != IGNORE_TOKEN_ID
- ), f"Expected EOT token after assistant turn at index {eot_idx} to be labeled"
+ assert labels[eot_idx] != IGNORE_TOKEN_ID, (
+ f"Expected EOT token after assistant turn at index {eot_idx} to be labeled"
+ )
else:
- assert (
- labels[eot_idx] == IGNORE_TOKEN_ID
- ), f"Expected EOT token not after assistant turn at index {eot_idx} to not be labeled"
+ assert labels[eot_idx] == IGNORE_TOKEN_ID, (
+ f"Expected EOT token not after assistant turn at index {eot_idx} to not be labeled"
+ )
def test_multiple_train_on_eot_settings(
self,
@@ -1224,9 +1236,9 @@ class TestChatTemplateConfigurations:
i for i, token_id in enumerate(input_ids) if token_id == eos_token_id
]
- assert (
- len(eos_indices) > 0
- ), "Expected at least one EOS/EOT token in the input"
+ assert len(eos_indices) > 0, (
+ "Expected at least one EOS/EOT token in the input"
+ )
# Check labeling for each EOS/EOT token
for idx, eos_idx in enumerate(eos_indices):
@@ -1252,13 +1264,13 @@ class TestChatTemplateConfigurations:
)
if expected_label:
- assert (
- labels[eos_idx] == IGNORE_TOKEN_ID
- ), f"Expected EOT token at index {eos_idx} to not be labeled with train_on_eot='{setting}'"
+ assert labels[eos_idx] == IGNORE_TOKEN_ID, (
+ f"Expected EOT token at index {eos_idx} to not be labeled with train_on_eot='{setting}'"
+ )
else:
- assert (
- labels[eos_idx] != IGNORE_TOKEN_ID
- ), f"Expected EOT token at index {eos_idx} to be labeled with train_on_eot='{setting}'"
+ assert labels[eos_idx] != IGNORE_TOKEN_ID, (
+ f"Expected EOT token at index {eos_idx} to be labeled with train_on_eot='{setting}'"
+ )
class TestChatTemplateToolCalling:
@@ -1378,29 +1390,27 @@ class TestChatTemplateToolCalling:
decoded_conversation = tokenizer.decode(input_ids)
# Verify tool calling structure is present in the decoded conversation
- assert (
- '"type": "function",' in decoded_conversation
- ), "Tool type function should be in conversation"
- assert (
- '"name": "multiples",' in decoded_conversation
- ), "Tool function name should be in conversation"
+ assert '"type": "function",' in decoded_conversation, (
+ "Tool type function should be in conversation"
+ )
+ assert '"name": "multiples",' in decoded_conversation, (
+ "Tool function name should be in conversation"
+ )
assert (
'<|python_start|><|python_end|>{"name": "multiples", "parameters": {"number": 5, "limit": 20}}<|eot|>'
in decoded_conversation
), "Assistant tool call should be in conversation"
- assert (
- "<|header_start|>ipython<|header_end|>" in decoded_conversation
- ), "IPython header should be in conversation"
- assert (
- '"5,10,15"' in decoded_conversation
- ), "Tool response should be in conversation"
+ assert "<|header_start|>ipython<|header_end|>" in decoded_conversation, (
+ "IPython header should be in conversation"
+ )
+ assert '"5,10,15"' in decoded_conversation, (
+ "Tool response should be in conversation"
+ )
# Get conversation turns to verify labeling
turns = strategy.get_conversation_thread(tool_calling_dataset[0])
- tools = strategy._get_tools( # pylint: disable=protected-access
- tool_calling_dataset[0]
- )
+ tools = strategy._get_tools(tool_calling_dataset[0])
# Check that assistant responses are properly labeled
for i, turn in enumerate(tool_calling_dataset[0]["messages"]):
@@ -1409,12 +1419,12 @@ class TestChatTemplateToolCalling:
turns=turns, turn_idx=i, tools=tools
)
- assert (
- start_idx != -1 and end_idx != -1
- ), f"Assistant turn {i} should be found"
+ assert start_idx != -1 and end_idx != -1, (
+ f"Assistant turn {i} should be found"
+ )
# Verify that assistant responses have proper labels
turn_labels = labels[start_idx:end_idx]
- assert all(
- label != IGNORE_TOKEN_ID for label in turn_labels
- ), f"Assistant turn {i} should be unmasked"
+ assert all(label != IGNORE_TOKEN_ID for label in turn_labels), (
+ f"Assistant turn {i} should be unmasked"
+ )
diff --git a/tests/prompt_strategies/test_chat_templates_mistral.py b/tests/prompt_strategies/test_chat_templates_mistral.py
index a5b31a771..85aa72111 100644
--- a/tests/prompt_strategies/test_chat_templates_mistral.py
+++ b/tests/prompt_strategies/test_chat_templates_mistral.py
@@ -28,7 +28,7 @@ def test_mistral_chat_template(
request: pytest.FixtureRequest,
):
"""Test chat template with the Magistral/Devstral tokenizer"""
- # pylint: disable=duplicate-code
+
from axolotl.prompt_strategies.chat_template import MistralPrompter, MistralStrategy
tokenizer: HFMistralTokenizer = request.getfixturevalue(tokenizer_str)
diff --git a/tests/prompt_strategies/test_chat_templates_thinking.py b/tests/prompt_strategies/test_chat_templates_thinking.py
index e807111aa..054012e00 100644
--- a/tests/prompt_strategies/test_chat_templates_thinking.py
+++ b/tests/prompt_strategies/test_chat_templates_thinking.py
@@ -4,7 +4,6 @@ Tests for splitting reasoning/thinking from content into separate field
import pytest
from datasets import Dataset
-from transformers import AutoTokenizer
from axolotl.prompt_strategies.chat_template import (
load,
@@ -56,22 +55,12 @@ def messages_w_reasoning_fixture():
)
-@pytest.fixture(name="qwen3_tokenizer")
-def qwen3_tokenizer_fixture(
- download_qwen3_half_billion_model,
-): # pylint: disable=unused-argument
- tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")
-
- return tokenizer
-
-
class TestSplitThinking:
"""
test class to make sure datasets with reasoning content conforms to the chat_template strategy
"""
def test_splits_think(self, messages_w_reasoning, qwen3_tokenizer):
- # pylint: disable=duplicate-code
strategy = load(
qwen3_tokenizer,
DictDefault(
@@ -130,6 +119,6 @@ class TestSplitThinking:
198, # \n
]
# fmt: on
- assert (
- input_ids == expected_input_ids
- ), f"Input IDs mismatch: {input_ids} != {expected_input_ids}"
+ assert input_ids == expected_input_ids, (
+ f"Input IDs mismatch: {input_ids} != {expected_input_ids}"
+ )
diff --git a/tests/prompt_strategies/test_chat_templates_tool_call_string_arguments.py b/tests/prompt_strategies/test_chat_templates_tool_call_string_arguments.py
new file mode 100644
index 000000000..7de21b940
--- /dev/null
+++ b/tests/prompt_strategies/test_chat_templates_tool_call_string_arguments.py
@@ -0,0 +1,220 @@
+"""
+Tests for handling JSON tool-call content
+"""
+
+import json
+
+import pytest
+from datasets import Dataset
+
+from axolotl.prompt_strategies.chat_template import (
+ load,
+)
+from axolotl.utils.dict import DictDefault
+
+
+@pytest.fixture(name="qwen3_instruct_prompt_strategy")
+def qwen3_instruct_chat_template_strategy(qwen3_tokenizer):
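+    # The first DictDefault below carries trainer-level settings (train_on_inputs,
+    # sequence_len); the second is the per-dataset config read by the chat-template prompter.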
+ strategy = load(
+ qwen3_tokenizer,
+ DictDefault(
+ {
+ "train_on_inputs": False,
+ "sequence_len": 512,
+ }
+ ),
+ DictDefault(
+ {
+ "chat_template": "qwen3",
+ "message_field_role": "role",
+ "message_field_content": "content",
+ "message_property_mappings": {
+ "role": "role",
+ "content": "content",
+ },
+ "roles": {
+ "user": ["user"],
+ "assistant": ["assistant"],
+ "system": ["system"],
+ },
+ "field_messages": "messages",
+ }
+ ),
+ )
+ return strategy
+
+
+class TestQwen3IdenticalConversationArgs:
+ """
+    Test that Qwen3 tool-call rendering is identical for JSON-string and dict arguments
+ """
+
+ @pytest.fixture(name="conversation_dict_args_dataset")
+ def fixture_conversation_dict_args_dataset(self):
+ """
+        Provides a dataset with a conversation whose tool-call arguments are a dict.
+ """
+ user_content = "What is the weather in Boston?"
+ function_name = "get_current_weather"
+ arguments_dict = {"location": "Boston, MA", "unit": "celsius"}
+
+ data = [
+ {
+ "messages": [
+ {"role": "user", "content": user_content},
+ {
+ "role": "assistant",
+ "content": "",
+ "tool_calls": [
+ {
+ "function": {
+ "name": function_name,
+                                    "arguments": arguments_dict,  # dict form
+ }
+ }
+ ],
+ },
+ ],
+ }
+ ]
+ return Dataset.from_list(data)
+
+ @pytest.fixture(name="conversation_str_args_dataset")
+ def fixture_conversation_str_args_dataset(self):
+ """
+        Provides a dataset with a conversation whose tool-call arguments are a JSON string.
+ """
+ user_content = "What is the weather in Boston?"
+ function_name = "get_current_weather"
+ arguments_dict = {"location": "Boston, MA", "unit": "celsius"}
+ arguments_str = json.dumps(arguments_dict)
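+        # e.g. '{"location": "Boston, MA", "unit": "celsius"}'; the strategy is
+        # expected to render this identically to the dict form above.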
+
+ data = [
+ {
+ "messages": [
+ {"role": "user", "content": user_content},
+ {
+ "role": "assistant",
+ "content": "",
+ "tool_calls": [
+ {
+ "function": {
+ "name": function_name,
+                                    "arguments": arguments_str,  # str form (JSON string)
+ }
+ }
+ ],
+ },
+ ],
+ }
+ ]
+ return Dataset.from_list(data)
+
+ @pytest.fixture(name="conversation_mixed_time_types_dataset")
+ def fixture_conversation_mixed_time_types_dataset(self):
+ """
+        Provides a dataset where the 'time' field has different types across tool calls.
+ """
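+        # `arguments` stays a JSON string here since an Arrow-backed Dataset generally
+        # cannot unify dict arguments whose "time" field mixes str and int types.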
+ data = [
+ {
+ "messages": [
+ {
+ "role": "user",
+ "content": "Get weather information at different times",
+ },
+ {
+ "role": "assistant",
+ "content": "",
+ "tool_calls": [
+ {
+ "function": {
+ "name": "func1",
+ "arguments": json.dumps(
+ {"time": "2025-08-01"}
+ ), # string type
+ }
+ },
+ {
+ "function": {
+ "name": "func2",
+ "arguments": json.dumps(
+ {"time": 1690876800}
+ ), # number type
+ }
+ },
+ ],
+ },
+ ],
+ }
+ ]
+ return Dataset.from_list(data)
+
+ def test_dict_and_str_args_produce_identical_output(
+ self,
+ conversation_dict_args_dataset,
+ conversation_str_args_dataset,
+ qwen3_instruct_prompt_strategy,
+ qwen3_tokenizer,
+ ):
+ """
+ Tests that after tokenization and decoding, the outputs for both
+ dict and string `arguments` are exactly the same.
+ """
+ processed_dict_args = conversation_dict_args_dataset.map(
+ qwen3_instruct_prompt_strategy.tokenize_prompt,
+ batched=True,
+ remove_columns=["messages"],
+ )
+
+ processed_str_args = conversation_str_args_dataset.map(
+ qwen3_instruct_prompt_strategy.tokenize_prompt,
+ batched=True,
+ remove_columns=["messages"],
+ )
+
+ decoded_prompt_from_dict = qwen3_tokenizer.decode(
+ processed_dict_args[0]["input_ids"]
+ )
+
+ decoded_prompt_from_str = qwen3_tokenizer.decode(
+ processed_str_args[0]["input_ids"]
+ )
+
+ assert decoded_prompt_from_dict == decoded_prompt_from_str, (
+ f"Dict format output:\n{decoded_prompt_from_dict}\n"
+ f"String format output:\n{decoded_prompt_from_str}"
+ )
+
+ assert (
+ processed_dict_args[0]["input_ids"] == processed_str_args[0]["input_ids"]
+ ), "The tokenized input_ids should be identical for dict and str arguments"
+
+ def test_str_args_with_mixed_time_types_no_error(
+ self,
+ conversation_mixed_time_types_dataset,
+ qwen3_instruct_prompt_strategy,
+ qwen3_tokenizer,
+ ):
+ """
+        Tests that when the 'time' field has different types (string vs number)
+        across tool calls, string-form arguments do not cause errors.
+ """
+ processed = conversation_mixed_time_types_dataset.map(
+ qwen3_instruct_prompt_strategy.tokenize_prompt,
+ batched=True,
+ remove_columns=["messages"],
+ )
+
+ assert len(processed) == 1
+ assert "input_ids" in processed[0]
+ assert len(processed[0]["input_ids"]) > 0
+
+ decoded = qwen3_tokenizer.decode(processed[0]["input_ids"])
+ assert "2025-08-01" in decoded, "String time value should be present"
+ assert "1690876800" in decoded, "Number time value should be present"
diff --git a/tests/prompt_strategies/test_dpo_chat_templates.py b/tests/prompt_strategies/test_dpo_chat_templates.py
index e5f30a6c4..b5c121726 100644
--- a/tests/prompt_strategies/test_dpo_chat_templates.py
+++ b/tests/prompt_strategies/test_dpo_chat_templates.py
@@ -8,7 +8,7 @@ import pytest
from datasets import Dataset
from transformers import AutoTokenizer
-from axolotl.prompt_strategies.dpo.chat_template import default
+from axolotl.prompt_strategies.dpo.chat_template import argilla_chat, default
from axolotl.utils.dict import DictDefault
from tests.hf_offline_utils import enable_hf_offline
@@ -16,7 +16,6 @@ from tests.hf_offline_utils import enable_hf_offline
@pytest.fixture(name="assistant_dataset")
def fixture_assistant_dataset():
- # pylint: disable=duplicate-code
return Dataset.from_list(
[
{
@@ -49,7 +48,6 @@ def fixture_assistant_dataset():
@pytest.fixture(name="custom_assistant_dataset")
def fixture_custom_assistant_dataset():
- # pylint: disable=duplicate-code
return Dataset.from_list(
[
{
@@ -80,6 +78,36 @@ def fixture_custom_assistant_dataset():
)
+@pytest.fixture(name="argilla_chat_dataset")
+def fixture_argilla_chat_dataset():
+ return Dataset.from_list(
+ [
+ {
+ "chosen": [
+ {
+ "role": "user",
+ "content": "hello",
+ },
+ {
+ "role": "assistant",
+ "content": "goodbye",
+ },
+ ],
+ "rejected": [
+ {
+ "role": "user",
+ "content": "hello",
+ },
+ {
+ "role": "assistant",
+ "content": "party on",
+ },
+ ],
+ }
+ ]
+ )
+
+
@pytest.fixture(name="phi3_tokenizer")
@enable_hf_offline
def fixture_phi3_tokenizer():
@@ -102,7 +130,6 @@ class TestAssistantDPOChatTemplateLlama3:
"""
def test_llama3_defaults(self, llama3_tokenizer, assistant_dataset):
- # pylint: disable=duplicate-code
transform_fn, _ = default(
DictDefault(
{
@@ -127,7 +154,6 @@ class TestAssistantDPOChatTemplateLlama3:
assert result["rejected"] == "party on<|eot_id|>"
def test_llama3_configured(self, llama3_tokenizer, custom_assistant_dataset):
- # pylint: disable=duplicate-code
transform_fn, _ = default(
DictDefault(
{
@@ -168,7 +194,6 @@ class TestAssistantDPOChatTemplatePhi3:
"""
def test_phi3_defaults(self, phi3_tokenizer, assistant_dataset):
- # pylint: disable=duplicate-code
transform_fn, _ = default(
DictDefault(
{
@@ -198,7 +223,6 @@ class TestAssistantDPOChatTemplateGemma:
"""
def test_gemma_defaults(self, gemma_tokenizer, assistant_dataset):
- # pylint: disable=duplicate-code
transform_fn, _ = default(
DictDefault(
{
@@ -222,5 +246,51 @@ class TestAssistantDPOChatTemplateGemma:
assert result["rejected"] == "party on