more parity across tests and docker images for packaging/setuptools

make sure packaging version is consistent
comment out license for validation for now
2025-03-21 08:56:01 -04:00 · 2025-03-21 08:27:17 -04:00 · 2025-03-21 08:20:28 -04:00 · 2025-03-21 08:12:07 -04:00 · 2025-03-21 07:25:09 -04:00 · 2025-03-21 07:19:12 -04:00
19 changed files with 96 additions and 56 deletions
--- a/.github/workflows/base.yml
+++ b/.github/workflows/base.yml
@@ -40,6 +40,12 @@ jobs:
            python_version: "3.11"
            pytorch: 2.6.0
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
+          - cuda: "128"
+            cuda_version: 12.8.1
+            cudnn_version: ""
+            python_version: "3.11"
+            pytorch: nightly
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
    steps:
      - name: Checkout
        uses: actions/checkout@v4
@@ -61,7 +67,7 @@ jobs:
        uses: docker/build-push-action@v4
        with:
          context: .
-          file: ./docker/Dockerfile-base
+          file: ${{ matrix.pytorch == 'nightly' && './docker/Dockerfile-base-nightly' || './docker/Dockerfile-base' }}
          push: ${{ github.event_name != 'pull_request' }}
          tags: ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
          labels: ${{ steps.metadata.outputs.labels }}
--- a/.github/workflows/pypi.yml
+++ b/.github/workflows/pypi.yml
@@ -40,7 +40,7 @@ jobs:

      - name: Install dependencies
        run: |
-          pip3 install wheel packaging
+          pip3 install wheel packaging==23.2
          pip3 install --no-build-isolation -e .
          pip3 install -r requirements-dev.txt -r requirements-tests.txt

--- a/.github/workflows/tests-nightly.yml
+++ b/.github/workflows/tests-nightly.yml
@@ -42,7 +42,7 @@ jobs:
      - name: upgrade pip
        run: |
          pip3 install --upgrade pip
-          pip3 install --upgrade packaging setuptools wheel
+          pip3 install --upgrade packaging==23.2 setuptools==75.8.0 wheel

      - name: Install PyTorch
        run: |
@@ -59,7 +59,7 @@ jobs:
      - name: Install dependencies
        run: |
          pip3 install --upgrade pip
-          pip3 install --upgrade packaging
+          pip3 install --upgrade packaging==23.2
          pip3 install --no-build-isolation -U -e .
          python scripts/unsloth_install.py | sh
          python scripts/cutcrossentropy_install.py | sh
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -74,7 +74,7 @@ jobs:
      - name: upgrade pip
        run: |
          pip3 install --upgrade pip
-          pip3 install --upgrade packaging setuptools wheel
+          pip3 install --upgrade packaging==23.2 setuptools==75.8.0 wheel

      - name: Install PyTorch
        run: |
@@ -147,7 +147,7 @@ jobs:
      - name: upgrade pip
        run: |
          pip3 install --upgrade pip
-          pip3 install --upgrade packaging setuptools setuptools_scm build wheel
+          pip3 install --upgrade packaging==23.2 setuptools==75.8.0 setuptools_scm build wheel

      - name: Install PyTorch
        run: |
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -22,8 +22,8 @@ repos:
    rev: 6.1.0
    hooks:
    - id: flake8
--   repo: https://github.com/PyCQA/pylint
-    rev: v3.3.0
+-   repo: https://github.com/pylint-dev/pylint
+    rev: c8c96d20cde3552a79858c7456bb1483bf83d633
    hooks:
    - id: pylint
 -   repo: https://github.com/pre-commit/mirrors-mypy
--- a/README.md
+++ b/README.md
@@ -55,7 +55,7 @@ Features:
 ### Installation

 ```bash
-pip3 install -U packaging setuptools wheel ninja
+pip3 install -U packaging==23.2 setuptools==75.8.0 wheel ninja
 pip3 install --no-build-isolation axolotl[flash-attn,deepspeed]

 # Download example axolotl configs, deepspeed configs
--- a/cicd/Dockerfile.jinja
+++ b/cicd/Dockerfile.jinja
@@ -31,6 +31,7 @@ RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
        sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
    fi

+RUN pip install packaging==23.2 setuptools==75.8.0
 RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
        pip install --no-build-isolation -e .[deepspeed,flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
    else \
--- a/docker/Dockerfile-base
+++ b/docker/Dockerfile-base
@@ -28,7 +28,7 @@ ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"

 WORKDIR /workspace

-RUN python3 -m pip install --upgrade pip && pip3 install packaging && \
+RUN python3 -m pip install --upgrade pip && pip3 install -U packaging==23.2 setuptools==75.8.0 wheel && \
    python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} --extra-index-url https://download.pytorch.org/whl/cu$CUDA && \
    python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \
    python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main"
--- a/docker/Dockerfile-base-nightly
+++ b/docker/Dockerfile-base-nightly
@@ -0,0 +1,39 @@
+ARG CUDA_VERSION="12.8.1"
+ARG CUDNN_VERSION=""
+ARG UBUNTU_VERSION="22.04"
+ARG MAX_JOBS=4
+# Base image for PyTorch nightly builds: CUDA devel image + miniconda env + nightly torch wheels.
+FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION AS base-builder
+
+ENV PATH="/root/miniconda3/bin:${PATH}"
+
+ARG PYTHON_VERSION="3.11"
+ARG PYTORCH_VERSION="nightly"
+ARG CUDA="128"
+ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
+
+ENV PYTHON_VERSION=$PYTHON_VERSION
+ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
+# System build tools, then miniconda with a dedicated "py<version>" environment.
+RUN apt-get update \
+    && apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev pkg-config && rm -rf /var/lib/apt/lists/* \
+    && wget \
+    https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
+    && mkdir /root/.conda \
+    && bash Miniconda3-latest-Linux-x86_64.sh -b \
+    && rm -f Miniconda3-latest-Linux-x86_64.sh \
+    && conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}"
+
+ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
+
+WORKDIR /workspace
+# Build-tool pins match docker/Dockerfile-base; --pre is required because nightly wheels are pre-releases pip skips by default.
+RUN python3 -m pip install --upgrade pip && pip3 install -U packaging==23.2 setuptools==75.8.0 wheel && \
+    python3 -m pip install --no-cache-dir -U --pre torch --index-url https://download.pytorch.org/whl/nightly/cu$CUDA && \
+    python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \
+    python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main"
+
+RUN git lfs install --skip-repo && \
+    pip3 install awscli && \
+    # The base image ships with `pydantic==1.8.2` which is not working
+    pip3 install -U --no-cache-dir pydantic==1.10.10
--- a/examples/llama-3/qlora-1b-kto.yaml
+++ b/examples/llama-3/qlora-1b-kto.yaml
@@ -55,7 +55,7 @@ tf32: true

 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
-  use_reentrant: false
+  use_reentrant: true
 early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,5 +1,5 @@
 [build-system]
-requires = ["setuptools>=64", "wheel", "setuptools_scm>=8"]
+requires = ["setuptools>=64", "wheel", "setuptools_scm>=8", "packaging==23.2"]
 build-backend = "setuptools.build_meta"

 [project]
@@ -8,6 +8,7 @@ dynamic = ["version", "dependencies", "optional-dependencies"]
 description = "LLM Trainer"
 readme = "README.md"
 requires-python = ">=3.10"
+# license = "Apache-2.0"

 [project.scripts]
 axolotl = "axolotl.cli.main:main"
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/

 # START section of dependencies that don't install on Darwin/MacOS
-bitsandbytes==0.45.2
+bitsandbytes==0.45.3
 triton>=3.0.0
 mamba-ssm==1.2.0.post1
 flash-attn==2.7.4.post1
@@ -12,12 +12,12 @@ liger-kernel==0.5.3

 packaging==23.2

-peft==0.14.0
+peft==0.15.0
 transformers==4.49.0
-tokenizers>=0.21.0
-accelerate==1.3.0
-datasets==3.2.0
-deepspeed==0.16.1
+tokenizers>=0.21.1
+accelerate==1.5.2
+datasets==3.4.1
+deepspeed==0.16.4
 trl==0.15.1

 optimum==1.16.2
--- a/scripts/cutcrossentropy_install.py
+++ b/scripts/cutcrossentropy_install.py
@@ -17,12 +17,12 @@ if v < V("2.4.0"):

 cce_spec = importlib.util.find_spec("cut_cross_entropy")

-UNINSTALL_PREFIX = ""
+uninstall_prefix = ""
 if cce_spec:
    if not importlib.util.find_spec("cut_cross_entropy.transformers"):
-        UNINSTALL_PREFIX = "pip uninstall -y cut-cross-entropy && "
+        uninstall_prefix = "pip uninstall -y cut-cross-entropy && "

 print(
-    UNINSTALL_PREFIX
+    uninstall_prefix
    + 'pip install "cut-cross-entropy[transformers] @ git+https://github.com/apple/ml-cross-entropy.git@24fbe4b5dab9a6c250a014573613c1890190536c"'
 )
--- a/setup.py
+++ b/setup.py
@@ -128,7 +128,7 @@ setup(
            "flash-attn==2.7.4.post1",
        ],
        "deepspeed": [
-            "deepspeed==0.16.1",
+            "deepspeed==0.16.4",
            "deepspeed-kernels",
        ],
        "mamba-ssm": [
--- a/src/axolotl/utils/config/models/input/v0_4_1/init.py
+++ b/src/axolotl/utils/config/models/input/v0_4_1/init.py
@@ -507,7 +507,7 @@ class HyperparametersConfig(BaseModel):
    weight_decay: Optional[float] = 0.0
    optimizer: Optional[
        Union[OptimizerNames, CustomSupportedOptimizers]
-    ] = OptimizerNames.ADAMW_HF
+    ] = OptimizerNames.ADAMW_TORCH_FUSED
    optim_args: Optional[Union[str, Dict[str, Any]]] = Field(
        default=None,
        json_schema_extra={"description": "Optional arguments to supply to optimizer."},
@@ -1679,30 +1679,6 @@ class AxolotlInputConfig(

        return data

-    @model_validator(mode="before")
-    @classmethod
-    def check_rl_config_gradient_checkpointing(cls, data):
-        # TODO: SalmanMohammadi
-        # Distributed RL with QLoRA + gradient checkpointing
-        # and use_reentrant = True is broken upstream in TRL
-        # pylint: disable=too-many-boolean-expressions
-        if (
-            data.get("rl")
-            and data.get("gradient_checkpointing")
-            and data.get("gradient_checkpointing_kwargs")
-            and data.get("gradient_checkpointing_kwargs").get("use_reentrant")
-            and data.get("load_in_4bit")
-            and data.get("adapter") == "qlora"
-            and data.get("capabilities")
-            and data.get("capabilities").get("n_gpu", 1) > 1
-        ):
-            raise ValueError(
-                "The `use_reentrant: True` implementation of gradient checkpointing "
-                "is not supported for distributed RL training with QLoRA. Please set "
-                "`use_reentrant: False` in `gradient_checkpointing_kwargs`."
-            )
-        return data
-
    @model_validator(mode="before")
    @classmethod
    def check_kto_config(cls, data):
@@ -1713,6 +1689,15 @@ class AxolotlInputConfig(
            if data.get("remove_unused_columns") is not False:
                raise ValueError("Set `remove_unused_columns: False` when using kto")

+            if data.get("gradient_checkpointing") and not (
+                data.get("gradient_checkpointing_kwargs")
+                and isinstance(data.get("gradient_checkpointing_kwargs"), dict)
+                and data["gradient_checkpointing_kwargs"].get("use_reentrant")
+            ):
+                raise ValueError(
+                    "Set `gradient_checkpointing_kwargs: {use_reentrant: true}` for when kto is enabled"
+                )
+
        return data


--- a/src/axolotl/utils/data/sft.py
+++ b/src/axolotl/utils/data/sft.py
@@ -2,6 +2,7 @@

 import functools
 import logging
+import os
 from pathlib import Path
 from typing import List, Optional, Tuple, Union

@@ -344,6 +345,7 @@ def load_tokenized_prepared_datasets(
                )
                ds_from_iter.save_to_disk(str(prepared_ds_path))
            else:
+                os.makedirs(prepared_ds_path, exist_ok=True)
                dataset.save_to_disk(str(prepared_ds_path))
            if cfg.push_dataset_to_hub:
                LOG.info(
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -108,6 +108,12 @@ def download_arcee_ai_distilabel_intel_orca_dpo_pairs_dataset():
    )


+@pytest.fixture(scope="session", autouse=True)
+def download_tiny_shakespeare_dataset():
+    # download the dataset
+    snapshot_download_w_retry("Trelis/tiny-shakespeare", repo_type="dataset")
+
+
 @pytest.fixture
 def temp_dir():
    # Create a temporary directory
--- a/tests/e2e/solo/test_relora_llama.py
+++ b/tests/e2e/solo/test_relora_llama.py
@@ -40,8 +40,8 @@ class TestReLoraLlama(unittest.TestCase):
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_modules": ["q_proj", "v_proj"],
-                "relora_steps": 100,
-                "relora_warmup_steps": 20,
+                "relora_steps": 50,
+                "relora_warmup_steps": 10,
                "relora_anneal_steps": 10,
                "relora_prune_ratio": 0.9,
                "relora_cpu_offload": True,
@@ -60,9 +60,9 @@ class TestReLoraLlama(unittest.TestCase):
                        "message_field_content": "value",
                    },
                ],
-                "warmup_steps": 20,
+                "warmup_steps": 10,
                "num_epochs": 2,
-                "max_steps": 205,  # at least 2x relora_steps
+                "max_steps": 105,  # at least 2x relora_steps
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -7,13 +7,13 @@ import tempfile
 import unittest
 from pathlib import Path

+from conftest import snapshot_download_w_retry
 from constants import (
    ALPACA_MESSAGES_CONFIG_OG,
    ALPACA_MESSAGES_CONFIG_REVISION,
    SPECIAL_TOKENS,
 )
 from datasets import Dataset
-from huggingface_hub import snapshot_download
 from transformers import AutoTokenizer

 from axolotl.utils.data import load_tokenized_prepared_datasets
@@ -69,7 +69,7 @@ class TestDatasetPreparation(unittest.TestCase):
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_ds_path = Path(tmp_dir) / "mhenrichsen/alpaca_2k_test"
            tmp_ds_path.mkdir(parents=True, exist_ok=True)
-            snapshot_download(
+            snapshot_download_w_retry(
                repo_id="mhenrichsen/alpaca_2k_test",
                repo_type="dataset",
                local_dir=tmp_ds_path,
@@ -81,7 +81,7 @@ class TestDatasetPreparation(unittest.TestCase):
            # how to load it.
            cfg = DictDefault(
                {
-                    "tokenizer_config": "huggyllama/llama-7b",
+                    "tokenizer_config": "HuggingFaceTB/SmolLM2-135M",
                    "sequence_len": 1024,
                    "datasets": [
                        {
@@ -339,7 +339,7 @@ class TestDatasetPreparation(unittest.TestCase):
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_ds_path = Path(tmp_dir) / "mhenrichsen/alpaca_2k_test"
            tmp_ds_path.mkdir(parents=True, exist_ok=True)
-            snapshot_download(
+            snapshot_download_w_retry(
                repo_id="mhenrichsen/alpaca_2k_test",
                repo_type="dataset",
                local_dir=tmp_ds_path,
@@ -381,7 +381,7 @@ class TestDatasetPreparation(unittest.TestCase):
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_ds_path = Path(tmp_dir) / "mhenrichsen/alpaca_2k_test"
            tmp_ds_path.mkdir(parents=True, exist_ok=True)
-            snapshot_download(
+            snapshot_download_w_retry(
                repo_id="mhenrichsen/alpaca_2k_test",
                repo_type="dataset",
                local_dir=tmp_ds_path,
Author	SHA1	Message	Date
Wing Lian	31799bdcc0	more parity across tests and docker images for packaging/setuptools	2025-03-21 08:56:01 -04:00
Wing Lian	25455ac25f	make sure packaging version is consistent	2025-03-21 08:27:17 -04:00
Wing Lian	edea25bd58	comment out license for validation for now	2025-03-21 08:20:28 -04:00
Wing Lian	42e32223c9	try rolling back packaging and setuptools versions	2025-03-21 08:12:07 -04:00
Wing Lian	6e0fed0ce7	use license instead of license-file	2025-03-21 07:25:09 -04:00
Wing Lian	5ece44b4a8	try with reversion of packaging/setuptools/wheel install	2025-03-21 07:19:12 -04:00
Wing Lian	e7532c9b0c	make sure ninja is installed	2025-03-21 06:57:06 -04:00
Wing Lian	2518a9b2a2	multiline fix	2025-03-20 20:51:16 -04:00
Wing Lian	faeae323cb	install deepspeed by itself	2025-03-20 20:04:39 -04:00
Wing Lian	bb683644c3	deepspeed binary fixes hopefully	2025-03-20 19:52:07 -04:00
Wing Lian	7009a48398	bump deepspeed and set no binary	2025-03-20 14:01:01 -04:00
Wing Lian	ee529e2354	use nightly	2025-03-20 11:24:30 -04:00
Wing Lian	b2976e64ec	add 12.8.1 cuda to the base matrix	2025-03-20 11:24:30 -04:00
Wing Lian	38df5a36ea	bump HF versions except for trl (#2427)	2025-03-20 10:22:05 -04:00
Wing Lian	4d92a68a96	use default torch fused adamw optimizer as default as adamw_hf is deprecated (#2425) * use default torch fused adamw optimizer as default as adamw_hf is deprecated * make sure to have latest packaging installed * bump packaging in requirements.txt too	2025-03-19 23:58:33 -04:00