Compare commits

..

15 Commits

Author SHA1 Message Date
Wing Lian
31799bdcc0 more parity across tests and docker images for packaging/setuptools 2025-03-21 08:56:01 -04:00
Wing Lian
25455ac25f make sure packaging version is consistent 2025-03-21 08:27:17 -04:00
Wing Lian
edea25bd58 comment out license for validation for now 2025-03-21 08:20:28 -04:00
Wing Lian
42e32223c9 try rolling back packaging and setuptools versions 2025-03-21 08:12:07 -04:00
Wing Lian
6e0fed0ce7 use license instead of license-file 2025-03-21 07:25:09 -04:00
Wing Lian
5ece44b4a8 try with reversion of packaging/setuptools/wheel install 2025-03-21 07:19:12 -04:00
Wing Lian
e7532c9b0c make sure ninja is installed 2025-03-21 06:57:06 -04:00
Wing Lian
2518a9b2a2 multiline fix 2025-03-20 20:51:16 -04:00
Wing Lian
faeae323cb install deepspeed by itself 2025-03-20 20:04:39 -04:00
Wing Lian
bb683644c3 deepspeed binary fixes hopefully 2025-03-20 19:52:07 -04:00
Wing Lian
7009a48398 bump deepspeed and set no binary 2025-03-20 14:01:01 -04:00
Wing Lian
ee529e2354 use nightly 2025-03-20 11:24:30 -04:00
Wing Lian
b2976e64ec add 12.8.1 cuda to the base matrix 2025-03-20 11:24:30 -04:00
Wing Lian
38df5a36ea bump HF versions except for trl (#2427) 2025-03-20 10:22:05 -04:00
Wing Lian
4d92a68a96 use default torch fused adamw optimizer as default as adamw_hf is deprecated (#2425)
* use default torch fused adamw optimizer as default as adamw_hf is deprecated

* make sure to have latest packaging installed

* bump packaging in requirements.txt too
2025-03-19 23:58:33 -04:00
19 changed files with 96 additions and 56 deletions

View File

@@ -40,6 +40,12 @@ jobs:
python_version: "3.11" python_version: "3.11"
pytorch: 2.6.0 pytorch: 2.6.0
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
- cuda: "128"
cuda_version: 12.8.1
cudnn_version: ""
python_version: "3.11"
pytorch: nightly
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
steps: steps:
- name: Checkout - name: Checkout
uses: actions/checkout@v4 uses: actions/checkout@v4
@@ -61,7 +67,7 @@ jobs:
uses: docker/build-push-action@v4 uses: docker/build-push-action@v4
with: with:
context: . context: .
file: ./docker/Dockerfile-base file: ${{ matrix.pytorch == 'nightly' && './docker/Dockerfile-base-nightly' || './docker/Dockerfile-base' }}
push: ${{ github.event_name != 'pull_request' }} push: ${{ github.event_name != 'pull_request' }}
tags: ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }} tags: ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
labels: ${{ steps.metadata.outputs.labels }} labels: ${{ steps.metadata.outputs.labels }}

View File

@@ -40,7 +40,7 @@ jobs:
- name: Install dependencies - name: Install dependencies
run: | run: |
pip3 install wheel packaging pip3 install wheel packaging==23.2
pip3 install --no-build-isolation -e . pip3 install --no-build-isolation -e .
pip3 install -r requirements-dev.txt -r requirements-tests.txt pip3 install -r requirements-dev.txt -r requirements-tests.txt

View File

@@ -42,7 +42,7 @@ jobs:
- name: upgrade pip - name: upgrade pip
run: | run: |
pip3 install --upgrade pip pip3 install --upgrade pip
pip3 install --upgrade packaging setuptools wheel pip3 install --upgrade packaging==23.2 setuptools==75.8.0 wheel
- name: Install PyTorch - name: Install PyTorch
run: | run: |
@@ -59,7 +59,7 @@ jobs:
- name: Install dependencies - name: Install dependencies
run: | run: |
pip3 install --upgrade pip pip3 install --upgrade pip
pip3 install --upgrade packaging pip3 install --upgrade packaging==23.2
pip3 install --no-build-isolation -U -e . pip3 install --no-build-isolation -U -e .
python scripts/unsloth_install.py | sh python scripts/unsloth_install.py | sh
python scripts/cutcrossentropy_install.py | sh python scripts/cutcrossentropy_install.py | sh

View File

@@ -74,7 +74,7 @@ jobs:
- name: upgrade pip - name: upgrade pip
run: | run: |
pip3 install --upgrade pip pip3 install --upgrade pip
pip3 install --upgrade packaging setuptools wheel pip3 install --upgrade packaging==23.2 setuptools==75.8.0 wheel
- name: Install PyTorch - name: Install PyTorch
run: | run: |
@@ -147,7 +147,7 @@ jobs:
- name: upgrade pip - name: upgrade pip
run: | run: |
pip3 install --upgrade pip pip3 install --upgrade pip
pip3 install --upgrade packaging setuptools setuptools_scm build wheel pip3 install --upgrade packaging==23.2 setuptools==75.8.0 setuptools_scm build wheel
- name: Install PyTorch - name: Install PyTorch
run: | run: |

View File

@@ -22,8 +22,8 @@ repos:
rev: 6.1.0 rev: 6.1.0
hooks: hooks:
- id: flake8 - id: flake8
- repo: https://github.com/PyCQA/pylint - repo: https://github.com/pylint-dev/pylint
rev: v3.3.0 rev: c8c96d20cde3552a79858c7456bb1483bf83d633
hooks: hooks:
- id: pylint - id: pylint
- repo: https://github.com/pre-commit/mirrors-mypy - repo: https://github.com/pre-commit/mirrors-mypy

View File

@@ -55,7 +55,7 @@ Features:
### Installation ### Installation
```bash ```bash
pip3 install -U packaging setuptools wheel ninja pip3 install -U packaging==23.2 setuptools==75.8.0 wheel ninja
pip3 install --no-build-isolation axolotl[flash-attn,deepspeed] pip3 install --no-build-isolation axolotl[flash-attn,deepspeed]
# Download example axolotl configs, deepspeed configs # Download example axolotl configs, deepspeed configs

View File

@@ -31,6 +31,7 @@ RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \ sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
fi fi
RUN pip install packaging==23.2 setuptools==75.8.0
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \ RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
pip install --no-build-isolation -e .[deepspeed,flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \ pip install --no-build-isolation -e .[deepspeed,flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
else \ else \

View File

@@ -28,7 +28,7 @@ ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
WORKDIR /workspace WORKDIR /workspace
RUN python3 -m pip install --upgrade pip && pip3 install packaging && \ RUN python3 -m pip install --upgrade pip && pip3 install -U packaging==23.2 setuptools==75.8.0 wheel && \
python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} --extra-index-url https://download.pytorch.org/whl/cu$CUDA && \ python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} --extra-index-url https://download.pytorch.org/whl/cu$CUDA && \
python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \ python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \
python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main"

View File

@@ -0,0 +1,39 @@
# Base image for PyTorch nightly builds: an NVIDIA CUDA devel image with a
# Miniconda-managed Python environment, the nightly torch wheel, and the
# causal-conv1d / mamba-ssm packages built from their main branches.
ARG CUDA_VERSION="12.8.1"
# NOTE(review): the base-image workflow passes an empty cudnn version for the
# CUDA 12.8.1 entry, which resolves to the unversioned `-cudnn-devel-` tag.
# The default is kept in step with that so a plain `docker build` uses the same
# tag as CI -- confirm against the published nvidia/cuda tags.
ARG CUDNN_VERSION=""
ARG UBUNTU_VERSION="22.04"
# NOTE(review): declared before FROM and not re-declared afterwards, so it is
# not visible to the build stage below -- confirm whether it is still needed.
ARG MAX_JOBS=4

FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION AS base-builder

# Put the Miniconda installer's bin directory on PATH so `conda` resolves below.
ENV PATH="/root/miniconda3/bin:${PATH}"

ARG PYTHON_VERSION="3.11"
ARG PYTORCH_VERSION="nightly"
ARG CUDA="128"
ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"

# Persist the build args as environment variables in the resulting image.
ENV PYTHON_VERSION=$PYTHON_VERSION
ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST

# System build dependencies, then Miniconda and a dedicated env for the
# requested Python version.
RUN apt-get update \
    && apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev pkg-config && rm -rf /var/lib/apt/lists/* \
    && wget \
    https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
    && mkdir /root/.conda \
    && bash Miniconda3-latest-Linux-x86_64.sh -b \
    && rm -f Miniconda3-latest-Linux-x86_64.sh \
    && conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}"

# Prefer the conda env's python/pip over the Miniconda base install.
ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"

WORKDIR /workspace

# Pin packaging/setuptools to the same versions used by Dockerfile-base and
# the CI workflows so the nightly image builds with an identical toolchain.
RUN python3 -m pip install --upgrade pip && pip3 install -U packaging==23.2 setuptools==75.8.0 wheel && \
    python3 -m pip install --no-cache-dir -U torch --extra-index-url https://download.pytorch.org/whl/nightly/cu$CUDA && \
    python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \
    python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main"

# The base image ships with `pydantic==1.8.2` which is not working, so a newer
# 1.x release is installed explicitly.
RUN git lfs install --skip-repo && \
    pip3 install awscli && \
    pip3 install -U --no-cache-dir pydantic==1.10.10

View File

@@ -55,7 +55,7 @@ tf32: true
gradient_checkpointing: true gradient_checkpointing: true
gradient_checkpointing_kwargs: gradient_checkpointing_kwargs:
use_reentrant: false use_reentrant: true
early_stopping_patience: early_stopping_patience:
resume_from_checkpoint: resume_from_checkpoint:
local_rank: local_rank:

View File

@@ -1,5 +1,5 @@
[build-system] [build-system]
requires = ["setuptools>=64", "wheel", "setuptools_scm>=8"] requires = ["setuptools>=64", "wheel", "setuptools_scm>=8", "packaging==23.2"]
build-backend = "setuptools.build_meta" build-backend = "setuptools.build_meta"
[project] [project]
@@ -8,6 +8,7 @@ dynamic = ["version", "dependencies", "optional-dependencies"]
description = "LLM Trainer" description = "LLM Trainer"
readme = "README.md" readme = "README.md"
requires-python = ">=3.10" requires-python = ">=3.10"
# license = "Apache-2.0"
[project.scripts] [project.scripts]
axolotl = "axolotl.cli.main:main" axolotl = "axolotl.cli.main:main"

View File

@@ -1,7 +1,7 @@
--extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/ --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
# START section of dependencies that don't install on Darwin/MacOS # START section of dependencies that don't install on Darwin/MacOS
bitsandbytes==0.45.2 bitsandbytes==0.45.3
triton>=3.0.0 triton>=3.0.0
mamba-ssm==1.2.0.post1 mamba-ssm==1.2.0.post1
flash-attn==2.7.4.post1 flash-attn==2.7.4.post1
@@ -12,12 +12,12 @@ liger-kernel==0.5.3
packaging==23.2 packaging==23.2
peft==0.14.0 peft==0.15.0
transformers==4.49.0 transformers==4.49.0
tokenizers>=0.21.0 tokenizers>=0.21.1
accelerate==1.3.0 accelerate==1.5.2
datasets==3.2.0 datasets==3.4.1
deepspeed==0.16.1 deepspeed==0.16.4
trl==0.15.1 trl==0.15.1
optimum==1.16.2 optimum==1.16.2

View File

@@ -17,12 +17,12 @@ if v < V("2.4.0"):
cce_spec = importlib.util.find_spec("cut_cross_entropy") cce_spec = importlib.util.find_spec("cut_cross_entropy")
UNINSTALL_PREFIX = "" uninstall_prefix = ""
if cce_spec: if cce_spec:
if not importlib.util.find_spec("cut_cross_entropy.transformers"): if not importlib.util.find_spec("cut_cross_entropy.transformers"):
UNINSTALL_PREFIX = "pip uninstall -y cut-cross-entropy && " uninstall_prefix = "pip uninstall -y cut-cross-entropy && "
print( print(
UNINSTALL_PREFIX uninstall_prefix
+ 'pip install "cut-cross-entropy[transformers] @ git+https://github.com/apple/ml-cross-entropy.git@24fbe4b5dab9a6c250a014573613c1890190536c"' + 'pip install "cut-cross-entropy[transformers] @ git+https://github.com/apple/ml-cross-entropy.git@24fbe4b5dab9a6c250a014573613c1890190536c"'
) )

View File

@@ -128,7 +128,7 @@ setup(
"flash-attn==2.7.4.post1", "flash-attn==2.7.4.post1",
], ],
"deepspeed": [ "deepspeed": [
"deepspeed==0.16.1", "deepspeed==0.16.4",
"deepspeed-kernels", "deepspeed-kernels",
], ],
"mamba-ssm": [ "mamba-ssm": [

View File

@@ -507,7 +507,7 @@ class HyperparametersConfig(BaseModel):
weight_decay: Optional[float] = 0.0 weight_decay: Optional[float] = 0.0
optimizer: Optional[ optimizer: Optional[
Union[OptimizerNames, CustomSupportedOptimizers] Union[OptimizerNames, CustomSupportedOptimizers]
] = OptimizerNames.ADAMW_HF ] = OptimizerNames.ADAMW_TORCH_FUSED
optim_args: Optional[Union[str, Dict[str, Any]]] = Field( optim_args: Optional[Union[str, Dict[str, Any]]] = Field(
default=None, default=None,
json_schema_extra={"description": "Optional arguments to supply to optimizer."}, json_schema_extra={"description": "Optional arguments to supply to optimizer."},
@@ -1679,30 +1679,6 @@ class AxolotlInputConfig(
return data return data
@model_validator(mode="before")
@classmethod
def check_rl_config_gradient_checkpointing(cls, data):
    """Reject reentrant gradient checkpointing for multi-GPU RL with QLoRA.

    Raises ``ValueError`` only when every one of these holds: ``rl`` is set,
    ``gradient_checkpointing`` is enabled with a truthy ``use_reentrant`` in
    ``gradient_checkpointing_kwargs``, ``load_in_4bit`` is set, ``adapter`` is
    ``"qlora"``, and ``capabilities.n_gpu`` is greater than 1. In every other
    case ``data`` is returned unchanged.
    """
    # TODO: SalmanMohammadi
    # Distributed RL with QLoRA + gradient checkpointing
    # and use_reentrant = True is broken upstream in TRL
    if not (data.get("rl") and data.get("gradient_checkpointing")):
        return data

    checkpointing_kwargs = data.get("gradient_checkpointing_kwargs")
    if not (checkpointing_kwargs and checkpointing_kwargs.get("use_reentrant")):
        return data

    if not (data.get("load_in_4bit") and data.get("adapter") == "qlora"):
        return data

    capabilities = data.get("capabilities")
    if capabilities and capabilities.get("n_gpu", 1) > 1:
        raise ValueError(
            "The `use_reentrant: True` implementation of gradient checkpointing "
            "is not supported for distributed RL training with QLoRA. Please set "
            "`use_reentrant: False` in `gradient_checkpointing_kwargs`."
        )
    return data
@model_validator(mode="before") @model_validator(mode="before")
@classmethod @classmethod
def check_kto_config(cls, data): def check_kto_config(cls, data):
@@ -1713,6 +1689,15 @@ class AxolotlInputConfig(
if data.get("remove_unused_columns") is not False: if data.get("remove_unused_columns") is not False:
raise ValueError("Set `remove_unused_columns: False` when using kto") raise ValueError("Set `remove_unused_columns: False` when using kto")
if data.get("gradient_checkpointing") and not (
data.get("gradient_checkpointing_kwargs")
and isinstance(data.get("gradient_checkpointing_kwargs"), dict)
and data["gradient_checkpointing_kwargs"].get("use_reentrant")
):
raise ValueError(
"Set `gradient_checkpointing_kwargs: {use_reentrant: true}` for when kto is enabled"
)
return data return data

View File

@@ -2,6 +2,7 @@
import functools import functools
import logging import logging
import os
from pathlib import Path from pathlib import Path
from typing import List, Optional, Tuple, Union from typing import List, Optional, Tuple, Union
@@ -344,6 +345,7 @@ def load_tokenized_prepared_datasets(
) )
ds_from_iter.save_to_disk(str(prepared_ds_path)) ds_from_iter.save_to_disk(str(prepared_ds_path))
else: else:
os.makedirs(prepared_ds_path, exist_ok=True)
dataset.save_to_disk(str(prepared_ds_path)) dataset.save_to_disk(str(prepared_ds_path))
if cfg.push_dataset_to_hub: if cfg.push_dataset_to_hub:
LOG.info( LOG.info(

View File

@@ -108,6 +108,12 @@ def download_arcee_ai_distilabel_intel_orca_dpo_pairs_dataset():
) )
@pytest.fixture(scope="session", autouse=True)
def download_tiny_shakespeare_dataset():
    """Session-scoped, auto-used fixture that downloads the tiny-shakespeare dataset."""
    snapshot_download_w_retry(
        "Trelis/tiny-shakespeare",
        repo_type="dataset",
    )
@pytest.fixture @pytest.fixture
def temp_dir(): def temp_dir():
# Create a temporary directory # Create a temporary directory

View File

@@ -40,8 +40,8 @@ class TestReLoraLlama(unittest.TestCase):
"lora_alpha": 16, "lora_alpha": 16,
"lora_dropout": 0.05, "lora_dropout": 0.05,
"lora_target_modules": ["q_proj", "v_proj"], "lora_target_modules": ["q_proj", "v_proj"],
"relora_steps": 100, "relora_steps": 50,
"relora_warmup_steps": 20, "relora_warmup_steps": 10,
"relora_anneal_steps": 10, "relora_anneal_steps": 10,
"relora_prune_ratio": 0.9, "relora_prune_ratio": 0.9,
"relora_cpu_offload": True, "relora_cpu_offload": True,
@@ -60,9 +60,9 @@ class TestReLoraLlama(unittest.TestCase):
"message_field_content": "value", "message_field_content": "value",
}, },
], ],
"warmup_steps": 20, "warmup_steps": 10,
"num_epochs": 2, "num_epochs": 2,
"max_steps": 205, # at least 2x relora_steps "max_steps": 105, # at least 2x relora_steps
"micro_batch_size": 2, "micro_batch_size": 2,
"gradient_accumulation_steps": 1, "gradient_accumulation_steps": 1,
"output_dir": temp_dir, "output_dir": temp_dir,

View File

@@ -7,13 +7,13 @@ import tempfile
import unittest import unittest
from pathlib import Path from pathlib import Path
from conftest import snapshot_download_w_retry
from constants import ( from constants import (
ALPACA_MESSAGES_CONFIG_OG, ALPACA_MESSAGES_CONFIG_OG,
ALPACA_MESSAGES_CONFIG_REVISION, ALPACA_MESSAGES_CONFIG_REVISION,
SPECIAL_TOKENS, SPECIAL_TOKENS,
) )
from datasets import Dataset from datasets import Dataset
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer from transformers import AutoTokenizer
from axolotl.utils.data import load_tokenized_prepared_datasets from axolotl.utils.data import load_tokenized_prepared_datasets
@@ -69,7 +69,7 @@ class TestDatasetPreparation(unittest.TestCase):
with tempfile.TemporaryDirectory() as tmp_dir: with tempfile.TemporaryDirectory() as tmp_dir:
tmp_ds_path = Path(tmp_dir) / "mhenrichsen/alpaca_2k_test" tmp_ds_path = Path(tmp_dir) / "mhenrichsen/alpaca_2k_test"
tmp_ds_path.mkdir(parents=True, exist_ok=True) tmp_ds_path.mkdir(parents=True, exist_ok=True)
snapshot_download( snapshot_download_w_retry(
repo_id="mhenrichsen/alpaca_2k_test", repo_id="mhenrichsen/alpaca_2k_test",
repo_type="dataset", repo_type="dataset",
local_dir=tmp_ds_path, local_dir=tmp_ds_path,
@@ -81,7 +81,7 @@ class TestDatasetPreparation(unittest.TestCase):
# how to load it. # how to load it.
cfg = DictDefault( cfg = DictDefault(
{ {
"tokenizer_config": "huggyllama/llama-7b", "tokenizer_config": "HuggingFaceTB/SmolLM2-135M",
"sequence_len": 1024, "sequence_len": 1024,
"datasets": [ "datasets": [
{ {
@@ -339,7 +339,7 @@ class TestDatasetPreparation(unittest.TestCase):
with tempfile.TemporaryDirectory() as tmp_dir: with tempfile.TemporaryDirectory() as tmp_dir:
tmp_ds_path = Path(tmp_dir) / "mhenrichsen/alpaca_2k_test" tmp_ds_path = Path(tmp_dir) / "mhenrichsen/alpaca_2k_test"
tmp_ds_path.mkdir(parents=True, exist_ok=True) tmp_ds_path.mkdir(parents=True, exist_ok=True)
snapshot_download( snapshot_download_w_retry(
repo_id="mhenrichsen/alpaca_2k_test", repo_id="mhenrichsen/alpaca_2k_test",
repo_type="dataset", repo_type="dataset",
local_dir=tmp_ds_path, local_dir=tmp_ds_path,
@@ -381,7 +381,7 @@ class TestDatasetPreparation(unittest.TestCase):
with tempfile.TemporaryDirectory() as tmp_dir: with tempfile.TemporaryDirectory() as tmp_dir:
tmp_ds_path = Path(tmp_dir) / "mhenrichsen/alpaca_2k_test" tmp_ds_path = Path(tmp_dir) / "mhenrichsen/alpaca_2k_test"
tmp_ds_path.mkdir(parents=True, exist_ok=True) tmp_ds_path.mkdir(parents=True, exist_ok=True)
snapshot_download( snapshot_download_w_retry(
repo_id="mhenrichsen/alpaca_2k_test", repo_id="mhenrichsen/alpaca_2k_test",
repo_type="dataset", repo_type="dataset",
local_dir=tmp_ds_path, local_dir=tmp_ds_path,