batch upgrade dependencies (#3299)
* upgrade dependencies
* don't use reset sessions
* downgrade transformers, upgrade other deps
* upgrade bnb to 0.49.0
* restore s3 cache
* explicit use local files w hub
* decompress and strip top level dir
* use 2 levels for strip components
* try to preserve permissions for symlinks
* use updated tar
* fix #3293 for distributed
* downgrade bnb
* fast fail after 4
* fix total tokens device
* patch accelerate CP/SP (#3309)

---------

Co-authored-by: salman <salman.mohammadi@outlook.com>
This commit is contained in:
35
.github/workflows/tests.yml
vendored
35
.github/workflows/tests.yml
vendored
@@ -66,12 +66,13 @@ jobs:
|
|||||||
- name: Check out repository code
|
- name: Check out repository code
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
# - name: Restore Cache from S3
|
- name: Restore Cache from S3
|
||||||
# id: hf-cache-restore-s3
|
id: hf-cache-restore-s3
|
||||||
# run: |
|
run: |
|
||||||
# mkdir -p ~/.cache/huggingface/hub
|
mkdir -p ~/.cache/huggingface/hub
|
||||||
# curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xf - -C ~/.cache/huggingface/hub/ --use-compress-program unzstd
|
curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xpf - -C ~/.cache/huggingface/hub/ --use-compress-program unzstd --strip-components=1
|
||||||
#
|
ls -ltr ~/.cache/huggingface/hub/
|
||||||
|
|
||||||
- name: Setup Python
|
- name: Setup Python
|
||||||
uses: actions/setup-python@v5
|
uses: actions/setup-python@v5
|
||||||
with:
|
with:
|
||||||
@@ -111,6 +112,9 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
|
huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
|
||||||
|
|
||||||
|
- name: Show HF cache
|
||||||
|
run: hf cache scan
|
||||||
|
|
||||||
- name: Run tests
|
- name: Run tests
|
||||||
run: |
|
run: |
|
||||||
df -h
|
df -h
|
||||||
@@ -122,6 +126,9 @@ jobs:
|
|||||||
df -h
|
df -h
|
||||||
pytest -v --durations=10 tests/cli/ --cov=axolotl --cov-append --cov-report=xml
|
pytest -v --durations=10 tests/cli/ --cov=axolotl --cov-append --cov-report=xml
|
||||||
|
|
||||||
|
- name: Show HF cache
|
||||||
|
run: hf cache scan
|
||||||
|
|
||||||
- name: Upload coverage to Codecov
|
- name: Upload coverage to Codecov
|
||||||
uses: codecov/codecov-action@v5
|
uses: codecov/codecov-action@v5
|
||||||
with:
|
with:
|
||||||
@@ -149,12 +156,13 @@ jobs:
|
|||||||
- name: Check out repository code
|
- name: Check out repository code
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
# - name: Restore Cache from S3
|
- name: Restore Cache from S3
|
||||||
# id: hf-cache-restore-s3
|
id: hf-cache-restore-s3
|
||||||
# run: |
|
run: |
|
||||||
# mkdir -p ~/.cache/huggingface/hub
|
mkdir -p ~/.cache/huggingface/hub
|
||||||
# curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xf - -C ~/.cache/huggingface/hub/ --use-compress-program unzstd
|
curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xpf - -C ~/.cache/huggingface/hub/ --use-compress-program unzstd --strip-components=1
|
||||||
#
|
ls -ltr ~/.cache/huggingface/hub/
|
||||||
|
|
||||||
- name: Setup Python
|
- name: Setup Python
|
||||||
uses: actions/setup-python@v5
|
uses: actions/setup-python@v5
|
||||||
with:
|
with:
|
||||||
@@ -200,6 +208,9 @@ jobs:
|
|||||||
pytest -v --durations=10 tests/monkeypatch/ --cov=axolotl --cov-append --cov-report=xml
|
pytest -v --durations=10 tests/monkeypatch/ --cov=axolotl --cov-append --cov-report=xml
|
||||||
pytest -v --durations=10 tests/cli/
|
pytest -v --durations=10 tests/cli/
|
||||||
|
|
||||||
|
- name: Show HF cache
|
||||||
|
run: hf cache scan
|
||||||
|
|
||||||
gate-skip-e2e:
|
gate-skip-e2e:
|
||||||
needs: [pre-commit, pytest, pytest-sdist]
|
needs: [pre-commit, pytest, pytest-sdist]
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
set -e
|
set -e
|
||||||
|
|
||||||
# Only run two tests at a time to avoid OOM on GPU (with coverage collection)
|
# Only run two tests at a time to avoid OOM on GPU (with coverage collection)
|
||||||
pytest -v --durations=10 -n2 \
|
pytest -v --durations=10 -n2 --maxfail=4 \
|
||||||
--ignore=/workspace/axolotl/tests/e2e/multigpu/solo/ \
|
--ignore=/workspace/axolotl/tests/e2e/multigpu/solo/ \
|
||||||
--ignore=/workspace/axolotl/tests/e2e/multigpu/patched/ \
|
--ignore=/workspace/axolotl/tests/e2e/multigpu/patched/ \
|
||||||
/workspace/axolotl/tests/e2e/multigpu/ \
|
/workspace/axolotl/tests/e2e/multigpu/ \
|
||||||
|
|||||||
@@ -14,12 +14,12 @@ huggingface_hub>=0.36.0
|
|||||||
peft>=0.18.0
|
peft>=0.18.0
|
||||||
tokenizers>=0.22.1
|
tokenizers>=0.22.1
|
||||||
transformers==4.57.1
|
transformers==4.57.1
|
||||||
accelerate==1.11.0
|
accelerate==1.12.0
|
||||||
datasets==4.4.1
|
datasets==4.4.2
|
||||||
deepspeed>=0.17.0
|
deepspeed>=0.18.3
|
||||||
trl==0.25.0
|
trl==0.25.1
|
||||||
hf_xet==1.2.0
|
hf_xet==1.2.0
|
||||||
kernels>=0.9.0
|
kernels==0.11.5
|
||||||
trackio>=0.13.0
|
trackio>=0.13.0
|
||||||
typing_extensions>=4.14.0
|
typing_extensions>=4.14.0
|
||||||
|
|
||||||
|
|||||||
2
setup.py
2
setup.py
@@ -156,7 +156,7 @@ extras_require = {
|
|||||||
"came_pytorch==0.1.3",
|
"came_pytorch==0.1.3",
|
||||||
],
|
],
|
||||||
"ray": [
|
"ray": [
|
||||||
"ray[train]",
|
"ray[train]>=2.52.1",
|
||||||
],
|
],
|
||||||
"vllm": [
|
"vllm": [
|
||||||
"vllm==0.10.0",
|
"vllm==0.10.0",
|
||||||
|
|||||||
@@ -356,6 +356,7 @@ class AxolotlTrainer(
|
|||||||
inputs_key = "labels" if "labels" in inputs else "input_ids"
|
inputs_key = "labels" if "labels" in inputs else "input_ids"
|
||||||
trainable_tokens = (inputs[inputs_key] != -100).sum()
|
trainable_tokens = (inputs[inputs_key] != -100).sum()
|
||||||
total_tokens = inputs[inputs_key].numel()
|
total_tokens = inputs[inputs_key].numel()
|
||||||
|
total_tokens = torch.tensor(total_tokens, device=inputs[inputs_key].device)
|
||||||
|
|
||||||
if is_distributed():
|
if is_distributed():
|
||||||
torch.distributed.all_reduce(
|
torch.distributed.all_reduce(
|
||||||
@@ -375,9 +376,7 @@ class AxolotlTrainer(
|
|||||||
self.state.tokens["trainable"] = (
|
self.state.tokens["trainable"] = (
|
||||||
self.state.tokens["trainable"] + trainable_tokens.detach().cpu()
|
self.state.tokens["trainable"] + trainable_tokens.detach().cpu()
|
||||||
)
|
)
|
||||||
self.state.tokens["total"] = (
|
self.state.tokens["total"] = self.state.tokens["total"] + total_tokens.cpu()
|
||||||
self.state.tokens["total"] + torch.as_tensor(total_tokens).cpu()
|
|
||||||
)
|
|
||||||
# Store per-step trainable tokens for throughput calculation
|
# Store per-step trainable tokens for throughput calculation
|
||||||
self.state.tokens["trainable_tokens"] = trainable_tokens.detach().cpu()
|
self.state.tokens["trainable_tokens"] = trainable_tokens.detach().cpu()
|
||||||
|
|
||||||
|
|||||||
@@ -75,3 +75,33 @@ def patch_parallelism_config():
|
|||||||
|
|
||||||
ParallelismConfig._validate_accelerator = _validate_accelerator
|
ParallelismConfig._validate_accelerator = _validate_accelerator
|
||||||
AcceleratorState.is_fsdp2 = property(patched_is_fsdp2)
|
AcceleratorState.is_fsdp2 = property(patched_is_fsdp2)
|
||||||
|
|
||||||
|
|
||||||
|
def patch_prepare_cp():
    """Monkeypatch ``Accelerator._prepare_cp`` with ``patched_prepare_cp``.

    The replacement returns its arguments untouched when the parallelism
    config reports a ``"deepspeed"`` context-parallel backend; otherwise it
    configures the torch context-parallel context on the accelerator and
    attaches accelerate's context-parallel hooks to every ``torch.nn.Module``
    passed in.
    """
    import functools

    import torch
    from accelerate import Accelerator

    def patched_prepare_cp(self, *args):
        """Prepare ``args`` for context parallelism and return them unchanged.

        Args:
            self: The ``Accelerator`` instance this function is bound to.
            *args: Objects being prepared; only ``torch.nn.Module`` instances
                get hooks attached.

        Returns:
            The same ``args`` tuple that was passed in.
        """
        # The DeepSpeed backend needs none of the torch-side setup below.
        if self.parallelism_config.cp_backend == "deepspeed":
            return args

        # Deferred so the torch experimental modules are only imported on the
        # non-DeepSpeed path.
        from accelerate.big_modeling import _attach_context_parallel_hooks
        from torch.distributed.tensor.experimental import context_parallel
        from torch.distributed.tensor.experimental._attention import set_rotate_method

        set_rotate_method(self.parallelism_config.cp_handler.cp_comm_strategy)

        # Bind the "cp" sub-mesh once; the partial is stored for later use.
        cp_mesh = self.torch_device_mesh["cp"]
        self._cp_context = functools.partial(context_parallel, mesh=cp_mesh)

        modules = (item for item in args if isinstance(item, torch.nn.Module))
        for module in modules:
            _attach_context_parallel_hooks(module)

        return args

    Accelerator._prepare_cp = patched_prepare_cp
|
||||||
|
|||||||
@@ -645,6 +645,9 @@ def setup_parallelism_envs(cfg):
|
|||||||
set_accelerate_parallelism_config = True
|
set_accelerate_parallelism_config = True
|
||||||
os.environ["PARALLELISM_CONFIG_CP_SIZE"] = str(cfg.context_parallel_size)
|
os.environ["PARALLELISM_CONFIG_CP_SIZE"] = str(cfg.context_parallel_size)
|
||||||
os.environ["ACCELERATE_ALLOW_CP_STANDALONE"] = "true"
|
os.environ["ACCELERATE_ALLOW_CP_STANDALONE"] = "true"
|
||||||
|
from axolotl.monkeypatch.accelerate.parallelism_config import patch_prepare_cp
|
||||||
|
|
||||||
|
patch_prepare_cp()
|
||||||
if set_accelerate_parallelism_config:
|
if set_accelerate_parallelism_config:
|
||||||
os.environ["ACCELERATE_USE_PARALLELISM_CONFIG"] = "true"
|
os.environ["ACCELERATE_USE_PARALLELISM_CONFIG"] = "true"
|
||||||
|
|
||||||
|
|||||||
@@ -62,7 +62,7 @@ def snapshot_download_w_retry(*args, **kwargs):
|
|||||||
"""
|
"""
|
||||||
with hf_offline_context(True):
|
with hf_offline_context(True):
|
||||||
try:
|
try:
|
||||||
return snapshot_download(*args, **kwargs)
|
return snapshot_download(*args, local_files_only=True, **kwargs)
|
||||||
except LocalEntryNotFoundError:
|
except LocalEntryNotFoundError:
|
||||||
pass
|
pass
|
||||||
with hf_offline_context(False):
|
with hf_offline_context(False):
|
||||||
|
|||||||
@@ -6,8 +6,6 @@ import os
|
|||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
from functools import wraps
|
from functools import wraps
|
||||||
|
|
||||||
from huggingface_hub.utils import reset_sessions
|
|
||||||
|
|
||||||
|
|
||||||
def reload_modules(hf_hub_offline):
|
def reload_modules(hf_hub_offline):
|
||||||
# Force reload of the modules that check this variable
|
# Force reload of the modules that check this variable
|
||||||
@@ -21,7 +19,6 @@ def reload_modules(hf_hub_offline):
|
|||||||
huggingface_hub.constants.HF_HUB_OFFLINE = hf_hub_offline
|
huggingface_hub.constants.HF_HUB_OFFLINE = hf_hub_offline
|
||||||
importlib.reload(datasets.config)
|
importlib.reload(datasets.config)
|
||||||
datasets.config.HF_HUB_OFFLINE = hf_hub_offline
|
datasets.config.HF_HUB_OFFLINE = hf_hub_offline
|
||||||
reset_sessions()
|
|
||||||
|
|
||||||
|
|
||||||
def enable_hf_offline(test_func):
|
def enable_hf_offline(test_func):
|
||||||
|
|||||||
Reference in New Issue
Block a user