From 17d715c2b35ffd4153e16cecc72b1592f556ae31 Mon Sep 17 00:00:00 2001
From: Wing Lian <wing@axolotl.ai>
Date: Wed, 7 May 2025 15:06:07 -0400
Subject: [PATCH] swap tinymodels that have safetensors for some ci tests
 (#2641)

---
 .github/workflows/tests-nightly.yml           | 87 +++++++++++++++++++
 requirements.txt                              |  1 +
 src/axolotl/train.py                          |  5 +-
 .../utils/gradient_checkpointing/__init__.py  | 21 +++++
 tests/e2e/multigpu/test_llama.py              |  2 +-
 .../lora_kernels/test_lora_kernel_patching.py | 10 ++-
 tests/e2e/patched/test_falcon_samplepack.py   |  4 +
 tests/e2e/patched/test_mistral_samplepack.py  |  4 +-
 tests/e2e/patched/test_model_patches.py       |  2 +-
 tests/e2e/patched/test_resume.py              |  4 +-
 tests/e2e/test_evaluate.py                    |  7 +-
 tests/e2e/test_falcon.py                      |  5 ++
 tests/e2e/test_mistral.py                     |  4 +-
 tests/test_datasets.py                        |  1 -
 14 files changed, 137 insertions(+), 20 deletions(-)

diff --git a/.github/workflows/tests-nightly.yml b/.github/workflows/tests-nightly.yml
index 23eb25f56..539f7f71b 100644
--- a/.github/workflows/tests-nightly.yml
+++ b/.github/workflows/tests-nightly.yml
@@ -18,9 +18,96 @@ jobs:
         env:
           SKIP: no-commit-to-branch
 
+  preload-cache: # populates the HF hub cache; the pytest job below lists this job in its needs:
+    name: Preload HF cache
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        python_version: ["3.11"]
+        pytorch_version: ["2.6.0"]
+    timeout-minutes: 20
+
+    env:
+      AXOLOTL_IS_CI_CACHE_PRELOAD: "1" # NOTE(review): presumably read by the test fixtures - confirm
+
+    steps:
+      - name: Check out repository code
+        uses: actions/checkout@v4
+
+      - name: Restore HF cache
+        id: hf-cache-restore
+        uses: actions/cache/restore@v4
+        with:
+          path: |
+            /home/runner/.cache/huggingface/hub/datasets--*
+            /home/runner/.cache/huggingface/hub/models--*
+          key: ${{ runner.os }}-hf-hub-cache-v2 # static key: only the runner OS and a manual -v2 suffix
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python_version }}
+          cache: 'pip' # caching pip dependencies
+
+      - name: upgrade pip
+        run: |
+          pip3 install --upgrade pip
+          pip3 install --upgrade packaging==23.2 setuptools==75.8.0 wheel
+
+      - name: Install PyTorch
+        run: |
+          pip3 install torch==${{ matrix.pytorch_version }}
+
+      - name: Install dependencies
+        run: |
+          pip3 show torch
+          pip3 install --no-build-isolation -U -e .
+          python scripts/unsloth_install.py | sh
+          python scripts/cutcrossentropy_install.py | sh
+          pip3 install -r requirements-dev.txt -r requirements-tests.txt
+
+      - name: Make sure PyTorch version wasn't clobbered
+        run: |
+          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
+
+      - name: Ensure axolotl CLI was installed
+        run: |
+          axolotl --help
+
+      - name: Pre-Download dataset fixture
+        run: |
+          huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
+
+      - name: Run tests # only tests/conftest.py is passed to pytest here
+        run: |
+          pytest -v tests/conftest.py
+
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v5
+        with:
+          token: ${{ secrets.CODECOV_TOKEN }}
+          files: ./coverage.xml # NOTE(review): no step in this job passes --cov to pytest - confirm this file is produced
+          flags: unittests,pytorch-${{ matrix.pytorch_version }}
+          fail_ci_if_error: false
+
+      - name: cleanup pip cache
+        run: |
+          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
+
+      - name: Save HF cache
+        id: hf-cache
+        uses: actions/cache/save@v4
+        with:
+          path: |
+            /home/runner/.cache/huggingface/hub/datasets--*
+            /home/runner/.cache/huggingface/hub/models--*
+          key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }} # reuse the key computed by the restore step
+
   pytest:
     name: PyTest
     runs-on: ubuntu-latest
+    needs: [preload-cache]
     strategy:
       fail-fast: false
       max-parallel: 2
diff --git a/requirements.txt b/requirements.txt
index dc495bedd..4ae82dd49 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,6 +11,7 @@ liger-kernel==0.5.9
 
 packaging==23.2
 
+huggingface_hub==0.31.0
 peft==0.15.2
 transformers==4.51.3
 tokenizers>=0.21.1
diff --git a/src/axolotl/train.py b/src/axolotl/train.py
index e58eddbff..68efc0b77 100644
--- a/src/axolotl/train.py
+++ b/src/axolotl/train.py
@@ -2,6 +2,7 @@
 
 import importlib
 import inspect
+import logging
 import os
 import signal
 import sys
@@ -12,7 +13,6 @@ from typing import Any, Dict
 
 import torch
 import transformers.modelcard
-from accelerate.logging import get_logger
 from accelerate.utils import save_fsdp_model
 from datasets import Dataset
 from huggingface_hub.errors import OfflineModeIsEnabled
@@ -42,7 +42,7 @@ try:
 except ImportError:
     BetterTransformer = None
 
-LOG = get_logger(__name__)
+LOG = logging.getLogger(__name__)
 
 
 def setup_model_and_tokenizer(
@@ -63,7 +63,6 @@ def setup_model_and_tokenizer(
     # Load tokenizer
     LOG.debug(
         f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}",
-        main_process_only=True,
     )
     tokenizer = load_tokenizer(cfg)
 
diff --git a/src/axolotl/utils/gradient_checkpointing/__init__.py b/src/axolotl/utils/gradient_checkpointing/__init__.py
index 0da5c83a2..f84f76d80 100644
--- a/src/axolotl/utils/gradient_checkpointing/__init__.py
+++ b/src/axolotl/utils/gradient_checkpointing/__init__.py
@@ -1,15 +1,36 @@
 """custom checkpointing utils"""
 
+import importlib
 from functools import partial
 
+from packaging import version
+
 from axolotl.utils.gradient_checkpointing.unsloth import (
     Unsloth_Offloaded_Gradient_Checkpointer,
 )
 
+transformers_version = version.parse(importlib.metadata.version("transformers"))
+if transformers_version > version.parse("4.51.3"):
+    from transformers.modeling_layers import GradientCheckpointingLayer
+
+    def uses_gc_layers(decoder_layer):
+        return isinstance(decoder_layer.func.__self__, GradientCheckpointingLayer)
+
+else:
+
+    def uses_gc_layers(_):
+        return False
+
 
 def hf_grad_checkpoint_offload_wrapper(
     decoder_layer, *args, use_reentrant=None
 ):  # pylint: disable=unused-argument
+    if uses_gc_layers(decoder_layer):
+        return Unsloth_Offloaded_Gradient_Checkpointer.apply(
+            decoder_layer,
+            *args,
+        )
+
     return Unsloth_Offloaded_Gradient_Checkpointer.apply(
         (
             decoder_layer.func.__self__
diff --git a/tests/e2e/multigpu/test_llama.py b/tests/e2e/multigpu/test_llama.py
index 1ff795bd6..38e6e741a 100644
--- a/tests/e2e/multigpu/test_llama.py
+++ b/tests/e2e/multigpu/test_llama.py
@@ -479,7 +479,7 @@ class TestMultiGPULlama:
                 "sample_packing": True,
                 "pad_to_sequence_len": True,
                 "sequence_len": 2048,
-                "val_set_size": 0.05,
+                "val_set_size": 0.1,
                 "special_tokens": {
                     "pad_token": "<|endoftext|>",
                 },
diff --git a/tests/e2e/patched/lora_kernels/test_lora_kernel_patching.py b/tests/e2e/patched/lora_kernels/test_lora_kernel_patching.py
index f3e59b373..f6b7ee9b9 100644
--- a/tests/e2e/patched/lora_kernels/test_lora_kernel_patching.py
+++ b/tests/e2e/patched/lora_kernels/test_lora_kernel_patching.py
@@ -29,12 +29,12 @@ from axolotl.utils.dict import DictDefault
 
 MODEL_CONFIGS = [
     {
-        "name": "openaccess-ai-collective/tiny-mistral",
+        "name": "trl-internal-testing/tiny-MistralForCausalLM-0.2",
         "expected_activation": apply_lora_mlp_swiglu,
         "dtype": torch.float16,
     },
     {
-        "name": "Qwen/Qwen2-7B",
+        "name": "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5",
         "expected_activation": apply_lora_mlp_swiglu,
         "dtype": torch.float16,
     },
@@ -44,7 +44,7 @@ MODEL_CONFIGS = [
         "dtype": torch.float32,
     },
     {
-        "name": "mhenrichsen/gemma-2b",
+        "name": "trl-internal-testing/tiny-Gemma2ForCausalLM",
         "expected_activation": apply_lora_mlp_geglu,
         "dtype": torch.float16,
     },
@@ -156,7 +156,9 @@ def test_swiglu_mlp_integration(small_llama_model):
 def test_geglu_model_integration():
     """Test GeGLU activation with Gemma model."""
     model = AutoModelForCausalLM.from_pretrained(
-        "mhenrichsen/gemma-2b", torch_dtype=torch.float16, device_map="cuda:0"
+        "trl-internal-testing/tiny-Gemma2ForCausalLM",
+        torch_dtype=torch.float16,
+        device_map="cuda:0",
     )
     peft_config = get_peft_config(
         {
diff --git a/tests/e2e/patched/test_falcon_samplepack.py b/tests/e2e/patched/test_falcon_samplepack.py
index 0034169af..667b62ffb 100644
--- a/tests/e2e/patched/test_falcon_samplepack.py
+++ b/tests/e2e/patched/test_falcon_samplepack.py
@@ -6,6 +6,8 @@ import logging
 import os
 import unittest
 
+import pytest
+
 from axolotl.cli.args import TrainerCliArgs
 from axolotl.common.datasets import load_datasets
 from axolotl.train import train
@@ -23,6 +25,7 @@ class TestFalconPatched(unittest.TestCase):
     Test case for Falcon models
     """
 
+    @pytest.mark.skip(reason="no tiny models for testing with safetensors")
     @with_temp_dir
     def test_qlora(self, temp_dir):
         # pylint: disable=duplicate-code
@@ -71,6 +74,7 @@ class TestFalconPatched(unittest.TestCase):
         train(cfg=cfg, dataset_meta=dataset_meta)
         check_model_output_exists(temp_dir, cfg)
 
+    @pytest.mark.skip(reason="no tiny models for testing with safetensors")
     @with_temp_dir
     def test_ft(self, temp_dir):
         # pylint: disable=duplicate-code
diff --git a/tests/e2e/patched/test_mistral_samplepack.py b/tests/e2e/patched/test_mistral_samplepack.py
index 3bc0fcfbc..ccfeb3d63 100644
--- a/tests/e2e/patched/test_mistral_samplepack.py
+++ b/tests/e2e/patched/test_mistral_samplepack.py
@@ -28,7 +28,7 @@ class TestMistral(unittest.TestCase):
         # pylint: disable=duplicate-code
         cfg = DictDefault(
             {
-                "base_model": "openaccess-ai-collective/tiny-mistral",
+                "base_model": "trl-internal-testing/tiny-MistralForCausalLM-0.2",
                 "flash_attention": True,
                 "sample_packing": True,
                 "sequence_len": 1024,
@@ -76,7 +76,7 @@ class TestMistral(unittest.TestCase):
         # pylint: disable=duplicate-code
         cfg = DictDefault(
             {
-                "base_model": "openaccess-ai-collective/tiny-mistral",
+                "base_model": "trl-internal-testing/tiny-MistralForCausalLM-0.2",
                 "flash_attention": True,
                 "sample_packing": True,
                 "sequence_len": 1024,
diff --git a/tests/e2e/patched/test_model_patches.py b/tests/e2e/patched/test_model_patches.py
index 8a75db52e..26090e697 100644
--- a/tests/e2e/patched/test_model_patches.py
+++ b/tests/e2e/patched/test_model_patches.py
@@ -56,7 +56,7 @@ class TestModelPatches(unittest.TestCase):
     def test_mistral_multipack(self, temp_dir):
         cfg = DictDefault(
             {
-                "base_model": "openaccess-ai-collective/tiny-mistral",
+                "base_model": "trl-internal-testing/tiny-MistralForCausalLM-0.2",
                 "flash_attention": True,
                 "sample_packing": True,
                 "sequence_len": 2048,
diff --git a/tests/e2e/patched/test_resume.py b/tests/e2e/patched/test_resume.py
index a84759bae..61e4a0e03 100644
--- a/tests/e2e/patched/test_resume.py
+++ b/tests/e2e/patched/test_resume.py
@@ -15,7 +15,7 @@ from axolotl.train import train
 from axolotl.utils.config import normalize_config, validate_config
 from axolotl.utils.dict import DictDefault
 
-from ..utils import check_model_output_exists, most_recent_subdir
+from ..utils import check_model_output_exists, most_recent_subdir, require_torch_2_6_0
 
 LOG = logging.getLogger("axolotl.tests.e2e")
 os.environ["WANDB_DISABLED"] = "true"
@@ -26,6 +26,7 @@ class TestResumeLlama:
     Test case for resuming training of llama models
     """
 
+    @require_torch_2_6_0
     def test_resume_lora_packed(self, temp_dir):
         # pylint: disable=duplicate-code
         cfg = DictDefault(
@@ -62,6 +63,7 @@ class TestResumeLlama:
                 "save_total_limit": 5,
                 "max_steps": 15,
                 "use_tensorboard": True,
+                "save_safetensors": True,
             }
         )
         if is_torch_bf16_gpu_available():
diff --git a/tests/e2e/test_evaluate.py b/tests/e2e/test_evaluate.py
index b2d7d02ca..0278113b7 100644
--- a/tests/e2e/test_evaluate.py
+++ b/tests/e2e/test_evaluate.py
@@ -19,14 +19,11 @@ class TestE2eEvaluate:
         # pylint: disable=duplicate-code
         cfg = DictDefault(
             {
-                "base_model": "JackFram/llama-68m",
-                "tokenizer_type": "LlamaTokenizer",
+                "base_model": "HuggingFaceTB/SmolLM2-135M",
                 "sequence_len": 1024,
                 "val_set_size": 0.02,
                 "special_tokens": {
-                    "unk_token": "<unk>",
-                    "bos_token": "<s>",
-                    "eos_token": "</s>",
+                    "pad_token": "<|endoftext|>",
                 },
                 "datasets": [
                     {
diff --git a/tests/e2e/test_falcon.py b/tests/e2e/test_falcon.py
index a1641a997..24afab0b3 100644
--- a/tests/e2e/test_falcon.py
+++ b/tests/e2e/test_falcon.py
@@ -6,6 +6,8 @@ import logging
 import os
 import unittest
 
+import pytest
+
 from axolotl.cli.args import TrainerCliArgs
 from axolotl.common.datasets import load_datasets
 from axolotl.train import train
@@ -23,6 +25,7 @@ class TestFalcon(unittest.TestCase):
     Test case for falcon
     """
 
+    @pytest.mark.skip(reason="no tiny models for testing with safetensors")
     @with_temp_dir
     def test_lora(self, temp_dir):
         # pylint: disable=duplicate-code
@@ -74,6 +77,7 @@ class TestFalcon(unittest.TestCase):
         train(cfg=cfg, dataset_meta=dataset_meta)
         check_model_output_exists(temp_dir, cfg)
 
+    @pytest.mark.skip(reason="no tiny models for testing with safetensors")
     @with_temp_dir
     def test_lora_added_vocab(self, temp_dir):
         # pylint: disable=duplicate-code
@@ -129,6 +133,7 @@ class TestFalcon(unittest.TestCase):
         train(cfg=cfg, dataset_meta=dataset_meta)
         check_model_output_exists(temp_dir, cfg)
 
+    @pytest.mark.skip(reason="no tiny models for testing with safetensors")
     @with_temp_dir
     def test_ft(self, temp_dir):
         # pylint: disable=duplicate-code
diff --git a/tests/e2e/test_mistral.py b/tests/e2e/test_mistral.py
index 740fa6eed..ba8cf2896 100644
--- a/tests/e2e/test_mistral.py
+++ b/tests/e2e/test_mistral.py
@@ -30,7 +30,7 @@ class TestMistral(unittest.TestCase):
         # pylint: disable=duplicate-code
         cfg = DictDefault(
             {
-                "base_model": "openaccess-ai-collective/tiny-mistral",
+                "base_model": "trl-internal-testing/tiny-MistralForCausalLM-0.2",
                 "flash_attention": True,
                 "sequence_len": 1024,
                 "load_in_8bit": True,
@@ -77,7 +77,7 @@ class TestMistral(unittest.TestCase):
         # pylint: disable=duplicate-code
         cfg = DictDefault(
             {
-                "base_model": "openaccess-ai-collective/tiny-mistral",
+                "base_model": "trl-internal-testing/tiny-MistralForCausalLM-0.2",
                 "flash_attention": True,
                 "sequence_len": 1024,
                 "val_set_size": 0.02,
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index ded82869f..88d196ad1 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -414,7 +414,6 @@ class TestDatasetPreparation:
             snapshot_path = snapshot_download(
                 repo_id="mhenrichsen/alpaca_2k_test",
                 repo_type="dataset",
-                local_dir=tmp_ds_path,
             )
             shutil.copytree(snapshot_path, tmp_ds_path, dirs_exist_ok=True)