batch upgrade dependencies (#3299)

* upgrade dependencies

* don't use reset sessions

* downgrade transformers, upgrade other deps

* upgrade bnb to 0.49.0

* restore s3 cache

* explicit use local files w hub

* decompress and strip top level dir

* use 2 levels for strip components

* try to preserve permissions for symlinks

* use updated tar

* fix #3293 for distributed

* downgrade bnb

* fast fail after 4

* fix total tokens device

* patch accelerate CP/SP (#3309)

---------

Co-authored-by: salman <salman.mohammadi@outlook.com>
This commit is contained in:
Wing Lian
2025-12-30 09:02:49 -05:00
committed by GitHub
parent 66a3de3629
commit 11c0b5b256
9 changed files with 66 additions and 26 deletions

View File

@@ -66,12 +66,13 @@ jobs:
- name: Check out repository code
uses: actions/checkout@v4
# - name: Restore Cache from S3
# id: hf-cache-restore-s3
# run: |
# mkdir -p ~/.cache/huggingface/hub
# curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xf - -C ~/.cache/huggingface/hub/ --use-compress-program unzstd
#
- name: Restore Cache from S3
id: hf-cache-restore-s3
run: |
mkdir -p ~/.cache/huggingface/hub
curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xpf - -C ~/.cache/huggingface/hub/ --use-compress-program unzstd --strip-components=1
ls -ltr ~/.cache/huggingface/hub/
- name: Setup Python
uses: actions/setup-python@v5
with:
@@ -111,6 +112,9 @@ jobs:
run: |
huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
- name: Show HF cache
run: hf cache scan
- name: Run tests
run: |
df -h
@@ -122,6 +126,9 @@ jobs:
df -h
pytest -v --durations=10 tests/cli/ --cov=axolotl --cov-append --cov-report=xml
- name: Show HF cache
run: hf cache scan
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v5
with:
@@ -149,12 +156,13 @@ jobs:
- name: Check out repository code
uses: actions/checkout@v4
# - name: Restore Cache from S3
# id: hf-cache-restore-s3
# run: |
# mkdir -p ~/.cache/huggingface/hub
# curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xf - -C ~/.cache/huggingface/hub/ --use-compress-program unzstd
#
- name: Restore Cache from S3
id: hf-cache-restore-s3
run: |
mkdir -p ~/.cache/huggingface/hub
curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xpf - -C ~/.cache/huggingface/hub/ --use-compress-program unzstd --strip-components=1
ls -ltr ~/.cache/huggingface/hub/
- name: Setup Python
uses: actions/setup-python@v5
with:
@@ -200,6 +208,9 @@ jobs:
pytest -v --durations=10 tests/monkeypatch/ --cov=axolotl --cov-append --cov-report=xml
pytest -v --durations=10 tests/cli/
- name: Show HF cache
run: hf cache scan
gate-skip-e2e:
needs: [pre-commit, pytest, pytest-sdist]
runs-on: ubuntu-latest

View File

@@ -2,7 +2,7 @@
set -e
# Only run two tests at a time to avoid OOM on GPU (with coverage collection)
pytest -v --durations=10 -n2 \
pytest -v --durations=10 -n2 --maxfail=4 \
--ignore=/workspace/axolotl/tests/e2e/multigpu/solo/ \
--ignore=/workspace/axolotl/tests/e2e/multigpu/patched/ \
/workspace/axolotl/tests/e2e/multigpu/ \

View File

@@ -14,12 +14,12 @@ huggingface_hub>=0.36.0
peft>=0.18.0
tokenizers>=0.22.1
transformers==4.57.1
accelerate==1.11.0
datasets==4.4.1
deepspeed>=0.17.0
trl==0.25.0
accelerate==1.12.0
datasets==4.4.2
deepspeed>=0.18.3
trl==0.25.1
hf_xet==1.2.0
kernels>=0.9.0
kernels==0.11.5
trackio>=0.13.0
typing_extensions>=4.14.0

View File

@@ -156,7 +156,7 @@ extras_require = {
"came_pytorch==0.1.3",
],
"ray": [
"ray[train]",
"ray[train]>=2.52.1",
],
"vllm": [
"vllm==0.10.0",

View File

@@ -356,6 +356,7 @@ class AxolotlTrainer(
inputs_key = "labels" if "labels" in inputs else "input_ids"
trainable_tokens = (inputs[inputs_key] != -100).sum()
total_tokens = inputs[inputs_key].numel()
total_tokens = torch.tensor(total_tokens, device=inputs[inputs_key].device)
if is_distributed():
torch.distributed.all_reduce(
@@ -375,9 +376,7 @@ class AxolotlTrainer(
self.state.tokens["trainable"] = (
self.state.tokens["trainable"] + trainable_tokens.detach().cpu()
)
self.state.tokens["total"] = (
self.state.tokens["total"] + torch.as_tensor(total_tokens).cpu()
)
self.state.tokens["total"] = self.state.tokens["total"] + total_tokens.cpu()
# Store per-step trainable tokens for throughput calculation
self.state.tokens["trainable_tokens"] = trainable_tokens.detach().cpu()

View File

@@ -75,3 +75,33 @@ def patch_parallelism_config():
ParallelismConfig._validate_accelerator = _validate_accelerator
AcceleratorState.is_fsdp2 = property(patched_is_fsdp2)
def patch_prepare_cp():
    """Monkeypatch ``Accelerator._prepare_cp`` so that torch-native
    context-parallel setup is skipped when the DeepSpeed CP backend is
    selected, and otherwise mirrors accelerate's CP preparation.
    """
    import functools

    import torch
    from accelerate import Accelerator

    def patched_prepare_cp(self, *args):
        # DeepSpeed drives context parallelism itself; return the args
        # untouched and install none of the torch CP machinery.
        if self.parallelism_config.cp_backend == "deepspeed":
            return args

        from accelerate.big_modeling import _attach_context_parallel_hooks
        from torch.distributed.tensor.experimental import context_parallel
        from torch.distributed.tensor.experimental._attention import set_rotate_method

        # Configure how attention shards rotate between CP ranks.
        # NOTE(review): assumes ``cp_handler`` exposes ``cp_comm_strategy`` —
        # confirm against the pinned accelerate version.
        cp_comm_strategy = self.parallelism_config.cp_handler.cp_comm_strategy
        set_rotate_method(cp_comm_strategy)

        # Context manager entered later (by accelerate) to run under torch's
        # experimental context-parallel attention on the "cp" submesh.
        self._cp_context = functools.partial(
            context_parallel, mesh=self.torch_device_mesh["cp"]
        )

        # Attach accelerate's CP hooks to every module being prepared.
        for arg in args:
            if isinstance(arg, torch.nn.Module):
                _attach_context_parallel_hooks(arg)

        return args

    # Patch at the class level so every Accelerator instance picks up
    # the DeepSpeed-aware behavior.
    Accelerator._prepare_cp = patched_prepare_cp

View File

@@ -645,6 +645,9 @@ def setup_parallelism_envs(cfg):
set_accelerate_parallelism_config = True
os.environ["PARALLELISM_CONFIG_CP_SIZE"] = str(cfg.context_parallel_size)
os.environ["ACCELERATE_ALLOW_CP_STANDALONE"] = "true"
from axolotl.monkeypatch.accelerate.parallelism_config import patch_prepare_cp
patch_prepare_cp()
if set_accelerate_parallelism_config:
os.environ["ACCELERATE_USE_PARALLELISM_CONFIG"] = "true"

View File

@@ -62,7 +62,7 @@ def snapshot_download_w_retry(*args, **kwargs):
"""
with hf_offline_context(True):
try:
return snapshot_download(*args, **kwargs)
return snapshot_download(*args, local_files_only=True, **kwargs)
except LocalEntryNotFoundError:
pass
with hf_offline_context(False):

View File

@@ -6,8 +6,6 @@ import os
from contextlib import contextmanager
from functools import wraps
from huggingface_hub.utils import reset_sessions
def reload_modules(hf_hub_offline):
# Force reload of the modules that check this variable
@@ -21,7 +19,6 @@ def reload_modules(hf_hub_offline):
huggingface_hub.constants.HF_HUB_OFFLINE = hf_hub_offline
importlib.reload(datasets.config)
datasets.config.HF_HUB_OFFLINE = hf_hub_offline
reset_sessions()
def enable_hf_offline(test_func):