batch upgrade dependencies (#3299)

* upgrade dependencies
* don't use reset sessions
* downgrade transformers, upgrade other deps
* upgrade bnb to 0.49.0
* restore s3 cache
* explicit use local files w hub
* decompress and strip top level dir
* use 2 levels for strip components
* try to preserve permissions for symlinks
* use updated tar
* fix #3293 for distributed
* downgrade bnb
* fast fail after 4
* fix total tokens device
* patch accelerate CP/SP (#3309)

---------

Co-authored-by: salman <salman.mohammadi@outlook.com>
2025-12-30 09:02:49 -05:00
parent 66a3de3629
commit 11c0b5b256
9 changed files with 66 additions and 26 deletions
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -66,12 +66,13 @@ jobs:
      - name: Check out repository code
        uses: actions/checkout@v4
-#      - name: Restore Cache from S3
+      - name: Restore Cache from S3
-#        id: hf-cache-restore-s3
+        id: hf-cache-restore-s3
-#        run: |
+        run: |
-#          mkdir -p ~/.cache/huggingface/hub
+          mkdir -p ~/.cache/huggingface/hub
-#          curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xf - -C ~/.cache/huggingface/hub/  --use-compress-program unzstd
+          curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xpf - -C ~/.cache/huggingface/hub/  --use-compress-program unzstd --strip-components=1
-#
+          ls -ltr ~/.cache/huggingface/hub/
      - name: Setup Python
        uses: actions/setup-python@v5
        with:
@@ -111,6 +112,9 @@ jobs:
        run: |
          huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
      - name: Show HF cache
        run: hf cache scan
      - name: Run tests
        run: |
          df -h
@@ -122,6 +126,9 @@ jobs:
          df -h
          pytest -v --durations=10 tests/cli/ --cov=axolotl --cov-append --cov-report=xml
      - name: Show HF cache
        run: hf cache scan
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v5
        with:
@@ -149,12 +156,13 @@ jobs:
      - name: Check out repository code
        uses: actions/checkout@v4
-#      - name: Restore Cache from S3
+      - name: Restore Cache from S3
-#        id: hf-cache-restore-s3
+        id: hf-cache-restore-s3
-#        run: |
+        run: |
-#          mkdir -p ~/.cache/huggingface/hub
+          mkdir -p ~/.cache/huggingface/hub
-#          curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xf - -C ~/.cache/huggingface/hub/  --use-compress-program unzstd
+          curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xpf - -C ~/.cache/huggingface/hub/  --use-compress-program unzstd --strip-components=1
-#
+          ls -ltr ~/.cache/huggingface/hub/
      - name: Setup Python
        uses: actions/setup-python@v5
        with:
@@ -200,6 +208,9 @@ jobs:
          pytest -v --durations=10 tests/monkeypatch/ --cov=axolotl --cov-append --cov-report=xml
          pytest -v --durations=10 tests/cli/
      - name: Show HF cache
        run: hf cache scan
  gate-skip-e2e:
    needs: [pre-commit, pytest, pytest-sdist]
    runs-on: ubuntu-latest
--- a/cicd/multigpu.sh
+++ b/cicd/multigpu.sh
@@ -2,7 +2,7 @@
 set -e
 # Only run two tests at a time to avoid OOM on GPU (with coverage collection)
-pytest -v --durations=10 -n2 \
+pytest -v --durations=10 -n2 --maxfail=4 \
  --ignore=/workspace/axolotl/tests/e2e/multigpu/solo/ \
  --ignore=/workspace/axolotl/tests/e2e/multigpu/patched/ \
  /workspace/axolotl/tests/e2e/multigpu/ \
--- a/requirements.txt
+++ b/requirements.txt
@@ -14,12 +14,12 @@ huggingface_hub>=0.36.0
 peft>=0.18.0
 tokenizers>=0.22.1
 transformers==4.57.1
-accelerate==1.11.0
+accelerate==1.12.0
-datasets==4.4.1
+datasets==4.4.2
-deepspeed>=0.17.0
+deepspeed>=0.18.3
-trl==0.25.0
+trl==0.25.1
 hf_xet==1.2.0
-kernels>=0.9.0
+kernels==0.11.5
 trackio>=0.13.0
 typing_extensions>=4.14.0
--- a/setup.py
+++ b/setup.py
@@ -156,7 +156,7 @@ extras_require = {
        "came_pytorch==0.1.3",
    ],
    "ray": [
-        "ray[train]",
+        "ray[train]>=2.52.1",
    ],
    "vllm": [
        "vllm==0.10.0",
--- a/src/axolotl/core/trainers/base.py
+++ b/src/axolotl/core/trainers/base.py
@@ -356,6 +356,7 @@ class AxolotlTrainer(
            inputs_key = "labels" if "labels" in inputs else "input_ids"
            trainable_tokens = (inputs[inputs_key] != -100).sum()
            total_tokens = inputs[inputs_key].numel()
            total_tokens = torch.tensor(total_tokens, device=inputs[inputs_key].device)
            if is_distributed():
                torch.distributed.all_reduce(
@@ -375,9 +376,7 @@ class AxolotlTrainer(
            self.state.tokens["trainable"] = (
                self.state.tokens["trainable"] + trainable_tokens.detach().cpu()
            )
-            self.state.tokens["total"] = (
+            self.state.tokens["total"] = self.state.tokens["total"] + total_tokens.cpu()
-                self.state.tokens["total"] + torch.as_tensor(total_tokens).cpu()
-            )
            # Store per-step trainable tokens for throughput calculation
            self.state.tokens["trainable_tokens"] = trainable_tokens.detach().cpu()
--- a/src/axolotl/monkeypatch/accelerate/parallelism_config.py
+++ b/src/axolotl/monkeypatch/accelerate/parallelism_config.py
@@ -75,3 +75,33 @@ def patch_parallelism_config():
    ParallelismConfig._validate_accelerator = _validate_accelerator
    AcceleratorState.is_fsdp2 = property(patched_is_fsdp2)
 def patch_prepare_cp():
    """Replace ``Accelerator._prepare_cp`` with the local ``patched_prepare_cp``.

    The replacement is assigned on the ``Accelerator`` class, not on a single
    instance.
    """
    import functools
    import torch
    from accelerate import Accelerator
    def patched_prepare_cp(self, *args):
        """Configure context parallelism on ``self`` and return ``args`` as given.

        When ``self.parallelism_config.cp_backend`` is ``"deepspeed"`` nothing is
        configured and ``args`` is returned immediately. Otherwise the rotate
        method is set, a context-parallel context factory is stored on
        ``self._cp_context``, and hooks are attached to every
        ``torch.nn.Module`` found in ``args``.
        """
        # Early exit: no torch context-parallel setup is done here for the
        # "deepspeed" backend.
        if self.parallelism_config.cp_backend == "deepspeed":
            return args
        # These imports sit after the early return, so they are only executed
        # on the non-deepspeed path. NOTE(review): two of them are
        # underscore-prefixed (private) names -- confirm they exist in the
        # pinned accelerate/torch versions.
        from accelerate.big_modeling import _attach_context_parallel_hooks
        from torch.distributed.tensor.experimental import context_parallel
        from torch.distributed.tensor.experimental._attention import set_rotate_method
        cp_comm_strategy = self.parallelism_config.cp_handler.cp_comm_strategy
        set_rotate_method(cp_comm_strategy)
        # Pre-bind the "cp" sub-mesh; the remaining arguments of
        # context_parallel are supplied by whoever calls self._cp_context later.
        self._cp_context = functools.partial(
            context_parallel, mesh=self.torch_device_mesh["cp"]
        )
        # Only module arguments receive hooks; all other arguments are left
        # untouched.
        for arg in args:
            if isinstance(arg, torch.nn.Module):
                _attach_context_parallel_hooks(arg)
        return args
    # Install the override on the class.
    Accelerator._prepare_cp = patched_prepare_cp
--- a/src/axolotl/utils/trainer.py
+++ b/src/axolotl/utils/trainer.py
@@ -645,6 +645,9 @@ def setup_parallelism_envs(cfg):
        set_accelerate_parallelism_config = True
        os.environ["PARALLELISM_CONFIG_CP_SIZE"] = str(cfg.context_parallel_size)
        os.environ["ACCELERATE_ALLOW_CP_STANDALONE"] = "true"
        from axolotl.monkeypatch.accelerate.parallelism_config import patch_prepare_cp
        patch_prepare_cp()
    if set_accelerate_parallelism_config:
        os.environ["ACCELERATE_USE_PARALLELISM_CONFIG"] = "true"
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -62,7 +62,7 @@ def snapshot_download_w_retry(*args, **kwargs):
    """
    with hf_offline_context(True):
        try:
-            return snapshot_download(*args, **kwargs)
+            return snapshot_download(*args, local_files_only=True, **kwargs)
        except LocalEntryNotFoundError:
            pass
    with hf_offline_context(False):
--- a/tests/hf_offline_utils.py
+++ b/tests/hf_offline_utils.py
@@ -6,8 +6,6 @@ import os
 from contextlib import contextmanager
 from functools import wraps
-from huggingface_hub.utils import reset_sessions
 def reload_modules(hf_hub_offline):
    # Force reload of the modules that check this variable
@@ -21,7 +19,6 @@ def reload_modules(hf_hub_offline):
    huggingface_hub.constants.HF_HUB_OFFLINE = hf_hub_offline
    importlib.reload(datasets.config)
    datasets.config.HF_HUB_OFFLINE = hf_hub_offline
-    reset_sessions()
 def enable_hf_offline(test_func):