Compare commits
11 Commits
feat/phi_3
...
shared-pre
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b79996bdc4 | ||
|
|
68368de7ed | ||
|
|
a94c4a014b | ||
|
|
0102ca5943 | ||
|
|
97e8c01a70 | ||
|
|
5c4705b185 | ||
|
|
47a88da330 | ||
|
|
07ab737a55 | ||
|
|
c40da3b5eb | ||
|
|
a5946ff1f0 | ||
|
|
70ca1b2291 |
6
.github/workflows/base.yml
vendored
6
.github/workflows/base.yml
vendored
@@ -5,11 +5,13 @@ on:
|
||||
branches:
|
||||
- "main"
|
||||
paths:
|
||||
- 'Dockerfile-base'
|
||||
- 'docker/Dockerfile-base'
|
||||
- 'docker/Dockerfile-uv-base'
|
||||
- '.github/workflows/base.yml'
|
||||
pull_request:
|
||||
paths:
|
||||
- 'Dockerfile-base'
|
||||
- 'docker/Dockerfile-base'
|
||||
- 'docker/Dockerfile-uv-base'
|
||||
- '.github/workflows/base.yml'
|
||||
workflow_dispatch:
|
||||
|
||||
|
||||
115
.github/workflows/tests-nightly.yml
vendored
115
.github/workflows/tests-nightly.yml
vendored
@@ -18,96 +18,9 @@ jobs:
|
||||
env:
|
||||
SKIP: no-commit-to-branch
|
||||
|
||||
preload-cache:
|
||||
name: Preload HF cache
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
python_version: ["3.11"]
|
||||
pytorch_version: ["2.6.0"]
|
||||
timeout-minutes: 20
|
||||
|
||||
env:
|
||||
AXOLOTL_IS_CI_CACHE_PRELOAD: "1"
|
||||
|
||||
steps:
|
||||
- name: Check out repository code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Restore HF cache
|
||||
id: hf-cache-restore
|
||||
uses: actions/cache/restore@v4
|
||||
with:
|
||||
path: |
|
||||
/home/runner/.cache/huggingface/hub/datasets--*
|
||||
/home/runner/.cache/huggingface/hub/models--*
|
||||
key: ${{ runner.os }}-hf-hub-cache-v2
|
||||
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: ${{ matrix.python_version }}
|
||||
cache: 'pip' # caching pip dependencies
|
||||
|
||||
- name: upgrade pip
|
||||
run: |
|
||||
pip3 install --upgrade pip
|
||||
pip3 install --upgrade packaging==23.2 setuptools==75.8.0 wheel
|
||||
|
||||
- name: Install PyTorch
|
||||
run: |
|
||||
pip3 install torch==${{ matrix.pytorch_version }}
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
pip3 show torch
|
||||
pip3 install --no-build-isolation -U -e .
|
||||
python scripts/unsloth_install.py | sh
|
||||
python scripts/cutcrossentropy_install.py | sh
|
||||
pip3 install -r requirements-dev.txt -r requirements-tests.txt
|
||||
|
||||
- name: Make sure PyTorch version wasn't clobbered
|
||||
run: |
|
||||
python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
|
||||
|
||||
- name: Ensure axolotl CLI was installed
|
||||
run: |
|
||||
axolotl --help
|
||||
|
||||
- name: Pre-Download dataset fixture
|
||||
run: |
|
||||
huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
|
||||
|
||||
- name: Run tests
|
||||
run: |
|
||||
pytest -v tests/conftest.py
|
||||
|
||||
- name: Upload coverage to Codecov
|
||||
uses: codecov/codecov-action@v5
|
||||
with:
|
||||
token: ${{ secrets.CODECOV_TOKEN }}
|
||||
files: ./coverage.xml
|
||||
flags: unittests,pytorch-${{ matrix.pytorch_version }}
|
||||
fail_ci_if_error: false
|
||||
|
||||
- name: cleanup pip cache
|
||||
run: |
|
||||
find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
|
||||
|
||||
- name: Save HF cache
|
||||
id: hf-cache
|
||||
uses: actions/cache/save@v4
|
||||
with:
|
||||
path: |
|
||||
/home/runner/.cache/huggingface/hub/datasets--*
|
||||
/home/runner/.cache/huggingface/hub/models--*
|
||||
key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
|
||||
|
||||
pytest:
|
||||
name: PyTest
|
||||
runs-on: ubuntu-latest
|
||||
needs: [preload-cache]
|
||||
strategy:
|
||||
fail-fast: false
|
||||
max-parallel: 2
|
||||
@@ -120,14 +33,11 @@ jobs:
|
||||
- name: Check out repository code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Restore HF cache
|
||||
id: hf-cache-restore
|
||||
uses: actions/cache/restore@v4
|
||||
with:
|
||||
path: |
|
||||
/home/runner/.cache/huggingface/hub/datasets--*
|
||||
/home/runner/.cache/huggingface/hub/models--*
|
||||
key: ${{ runner.os }}-hf-hub-cache-v2
|
||||
- name: Restore Cache from S3
|
||||
id: hf-cache-restore-s3
|
||||
run: |
|
||||
mkdir -p /home/runner/.cache/huggingface/hub
|
||||
curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xf - -C /home/runner/.cache/huggingface/hub/ --use-compress-program unzstd
|
||||
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v5
|
||||
@@ -168,10 +78,6 @@ jobs:
|
||||
run: |
|
||||
axolotl --help
|
||||
|
||||
- name: Pre-Download dataset fixture
|
||||
run: |
|
||||
huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
|
||||
|
||||
- name: Run tests
|
||||
run: |
|
||||
pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/
|
||||
@@ -193,15 +99,8 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- cuda: 124
|
||||
cuda_version: 12.4.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.5.1
|
||||
num_gpus: 1
|
||||
axolotl_extras:
|
||||
nightly_build: "true"
|
||||
- cuda: 124
|
||||
cuda_version: 12.4.1
|
||||
- cuda: 126
|
||||
cuda_version: 12.6.3
|
||||
python_version: "3.11"
|
||||
pytorch: 2.6.0
|
||||
num_gpus: 1
|
||||
|
||||
@@ -37,3 +37,7 @@ RUN git lfs install --skip-repo && \
|
||||
pip3 install awscli && \
|
||||
# The base image ships with `pydantic==1.8.2` which is not working
|
||||
pip3 install -U --no-cache-dir pydantic==1.10.10
|
||||
|
||||
RUN if [ "$PYTORCH_VERSION" = "2.6.0" ] && [ "$CUDA" = "124" ] ; then \
|
||||
FLASH_ATTENTION_FORCE_BUILD="TRUE" pip3 install --no-build-isolation flash-attn==2.8.0.post2; \
|
||||
fi
|
||||
|
||||
@@ -219,7 +219,9 @@ class TrainerBuilderBase(abc.ABC):
|
||||
if self.cfg.bf16 == "full":
|
||||
training_args_kwargs["bf16_full_eval"] = True
|
||||
else:
|
||||
training_args_kwargs["bf16"] = self.cfg.bf16 or self.cfg.bfloat16
|
||||
bf16 = self.cfg.bf16 or self.cfg.bfloat16
|
||||
bf16 = bf16 if bf16 is not None else False
|
||||
training_args_kwargs["bf16"] = bf16
|
||||
|
||||
def _configure_scheduler(self, training_args_kwargs: dict):
|
||||
if self.cfg.lr_scheduler in ["one_cycle", "rex"]:
|
||||
|
||||
@@ -10,7 +10,7 @@ import shutil
|
||||
import sys
|
||||
import tempfile
|
||||
import time
|
||||
from pathlib import Path, PosixPath
|
||||
from pathlib import Path
|
||||
from typing import Generator
|
||||
|
||||
import datasets
|
||||
@@ -423,9 +423,13 @@ def temp_dir() -> Generator[str, None, None]:
|
||||
shutil.rmtree(_temp_dir)
|
||||
|
||||
|
||||
@pytest.fixture(scope="function", autouse=True)
|
||||
def unique_triton_cache_dir(temp_dir: str | PosixPath) -> None:
|
||||
os.environ["TRITON_CACHE_DIR"] = str(temp_dir) + "/.triton/cache"
|
||||
@pytest.fixture(scope="module")
|
||||
def module_temp_dir() -> Generator[str, None, None]:
|
||||
# Create a temporary directory
|
||||
_temp_dir = tempfile.mkdtemp()
|
||||
yield _temp_dir
|
||||
# Clean up the directory after the test
|
||||
shutil.rmtree(_temp_dir)
|
||||
|
||||
|
||||
@pytest.fixture(scope="function", autouse=True)
|
||||
|
||||
@@ -2,6 +2,8 @@
|
||||
E2E tests for multigpu lora tinyllama
|
||||
"""
|
||||
|
||||
# pylint: disable=redefined-outer-name
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
@@ -25,6 +27,60 @@ def download_model():
|
||||
snapshot_download("HuggingFaceTB/SmolLM2-135M")
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def sft_base_cfg():
|
||||
cfg = DictDefault(
|
||||
base_model="HuggingFaceTB/SmolLM2-135M",
|
||||
tokenizer_config="HuggingFaceTB/SmolLM2-135M", # this has to be manually set since we haven't done validation
|
||||
sequence_len=1024,
|
||||
special_tokens={
|
||||
"pad_token": "<|endoftext|>",
|
||||
},
|
||||
datasets=[
|
||||
{
|
||||
"path": "tatsu-lab/alpaca",
|
||||
"type": "alpaca",
|
||||
"split": "train[:10%]",
|
||||
},
|
||||
],
|
||||
val_set_size=0.1,
|
||||
sample_packing=True,
|
||||
flash_attention=True,
|
||||
learning_rate=0.00001,
|
||||
optimizer="adamw_8bit",
|
||||
seed=42,
|
||||
# these need to be set since we aren't running schema validation
|
||||
micro_batch_size=2,
|
||||
gradient_accumulation_steps=1,
|
||||
)
|
||||
|
||||
return cfg
|
||||
|
||||
|
||||
@pytest.fixture(scope="module", name="sft_prepared_dataset_alpaca_cfg")
|
||||
def sft_prepared_dataset_alpaca_cfg(module_temp_dir, sft_base_cfg):
|
||||
dataset_prepared_path = module_temp_dir + "/last_run_prepared"
|
||||
cfg = sft_base_cfg | DictDefault(
|
||||
dataset_prepared_path=dataset_prepared_path,
|
||||
)
|
||||
|
||||
Path(module_temp_dir).mkdir(parents=True, exist_ok=True)
|
||||
with open(Path(module_temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
|
||||
fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))
|
||||
|
||||
execute_subprocess_async(
|
||||
[
|
||||
"axolotl",
|
||||
"preprocess",
|
||||
str(Path(module_temp_dir) / "config.yaml"),
|
||||
]
|
||||
)
|
||||
|
||||
# unset flash attention since we have some flex attention tests too
|
||||
cfg.flash_attention = None
|
||||
return cfg
|
||||
|
||||
|
||||
def transformers_version_eq(required_version):
|
||||
return version.parse(transformers.__version__) == version.parse(required_version)
|
||||
|
||||
@@ -97,45 +153,36 @@ class TestMultiGPULlama:
|
||||
"gradient_accumulation_steps",
|
||||
[1, 2],
|
||||
)
|
||||
def test_lora_ddp_packed(self, temp_dir, gradient_accumulation_steps):
|
||||
def test_lora_ddp_packed(
|
||||
self, temp_dir, sft_prepared_dataset_alpaca_cfg, gradient_accumulation_steps
|
||||
):
|
||||
# pylint: disable=duplicate-code
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"base_model": "HuggingFaceTB/SmolLM2-135M",
|
||||
"sequence_len": 2048,
|
||||
"sample_packing": True,
|
||||
"eval_sample_packing": False,
|
||||
"pad_to_sequence_len": True,
|
||||
"adapter": "lora",
|
||||
"lora_r": 8,
|
||||
"lora_alpha": 16,
|
||||
"lora_dropout": 0.05,
|
||||
"lora_target_linear": True,
|
||||
"val_set_size": 0.05,
|
||||
"special_tokens": {
|
||||
"pad_token": "<|endoftext|>",
|
||||
},
|
||||
"datasets": [
|
||||
{
|
||||
"path": "tatsu-lab/alpaca",
|
||||
"type": "alpaca",
|
||||
"split": "train[:20%]",
|
||||
},
|
||||
],
|
||||
"num_epochs": 1,
|
||||
"max_steps": 2,
|
||||
"micro_batch_size": 1,
|
||||
"gradient_accumulation_steps": gradient_accumulation_steps,
|
||||
# "gradient_checkpointing": True,
|
||||
"output_dir": temp_dir,
|
||||
"dataset_prepared_path": temp_dir + "/last_run_prepared",
|
||||
"learning_rate": 0.00001,
|
||||
"optimizer": "adamw_8bit",
|
||||
"lr_scheduler": "cosine",
|
||||
"flash_attention": True,
|
||||
"use_tensorboard": True,
|
||||
"bf16": True,
|
||||
}
|
||||
cfg = (
|
||||
DictDefault(
|
||||
{
|
||||
"eval_sample_packing": False,
|
||||
"pad_to_sequence_len": True,
|
||||
"adapter": "lora",
|
||||
"lora_r": 8,
|
||||
"lora_alpha": 16,
|
||||
"lora_dropout": 0.05,
|
||||
"lora_target_linear": True,
|
||||
"val_set_size": 0.05,
|
||||
"num_epochs": 1,
|
||||
"max_steps": 2,
|
||||
"micro_batch_size": 1,
|
||||
"gradient_accumulation_steps": gradient_accumulation_steps,
|
||||
# "gradient_checkpointing": True,
|
||||
"output_dir": temp_dir,
|
||||
"learning_rate": 0.00001,
|
||||
"optimizer": "adamw_8bit",
|
||||
"lr_scheduler": "cosine",
|
||||
"flash_attention": True,
|
||||
"use_tensorboard": True,
|
||||
"bf16": True,
|
||||
}
|
||||
)
|
||||
| sft_prepared_dataset_alpaca_cfg
|
||||
)
|
||||
|
||||
# write cfg to yaml file
|
||||
@@ -385,59 +432,50 @@ class TestMultiGPULlama:
|
||||
)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
|
||||
temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
|
||||
)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"fsdp_state_dict_type",
|
||||
["FULL_STATE_DICT", "SHARDED_STATE_DICT"],
|
||||
)
|
||||
def test_fsdp_packed(self, temp_dir, fsdp_state_dict_type):
|
||||
def test_fsdp_packed(
|
||||
self, temp_dir, sft_prepared_dataset_alpaca_cfg, fsdp_state_dict_type
|
||||
):
|
||||
# pylint: disable=duplicate-code
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"base_model": "HuggingFaceTB/SmolLM2-135M",
|
||||
"sample_packing": True,
|
||||
"pad_to_sequence_len": True,
|
||||
"sequence_len": 1024,
|
||||
"val_set_size": 0.05,
|
||||
"special_tokens": {
|
||||
"pad_token": "<|endoftext|>",
|
||||
},
|
||||
"datasets": [
|
||||
{
|
||||
"path": "tatsu-lab/alpaca",
|
||||
"type": "alpaca",
|
||||
"split": "train[:10%]",
|
||||
cfg = (
|
||||
DictDefault(
|
||||
{
|
||||
"pad_to_sequence_len": True,
|
||||
"num_epochs": 1,
|
||||
"max_steps": 2,
|
||||
"micro_batch_size": 2,
|
||||
"gradient_accumulation_steps": 2,
|
||||
# "gradient_checkpointing": True,
|
||||
"output_dir": temp_dir,
|
||||
"dataset_prepared_path": temp_dir + "/last_run_prepared",
|
||||
"learning_rate": 0.00001,
|
||||
"optimizer": "adamw_torch_fused",
|
||||
"lr_scheduler": "cosine",
|
||||
"flash_attention": True,
|
||||
"fsdp": [
|
||||
"full_shard",
|
||||
"auto_wrap",
|
||||
],
|
||||
"fsdp_config": {
|
||||
"fsdp_limit_all_gathers": True,
|
||||
"fsdp_offload_params": False,
|
||||
"fsdp_sync_module_states": True,
|
||||
"fsdp_use_orig_params": False,
|
||||
"fsdp_cpu_ram_efficient_loading": False,
|
||||
"fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
|
||||
"fsdp_state_dict_type": fsdp_state_dict_type,
|
||||
"fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
|
||||
},
|
||||
],
|
||||
"num_epochs": 1,
|
||||
"max_steps": 2,
|
||||
"micro_batch_size": 2,
|
||||
"gradient_accumulation_steps": 2,
|
||||
# "gradient_checkpointing": True,
|
||||
"output_dir": temp_dir,
|
||||
"dataset_prepared_path": temp_dir + "/last_run_prepared",
|
||||
"learning_rate": 0.00001,
|
||||
"optimizer": "adamw_torch_fused",
|
||||
"lr_scheduler": "cosine",
|
||||
"flash_attention": True,
|
||||
"fsdp": [
|
||||
"full_shard",
|
||||
"auto_wrap",
|
||||
],
|
||||
"fsdp_config": {
|
||||
"fsdp_limit_all_gathers": True,
|
||||
"fsdp_offload_params": False,
|
||||
"fsdp_sync_module_states": True,
|
||||
"fsdp_use_orig_params": False,
|
||||
"fsdp_cpu_ram_efficient_loading": False,
|
||||
"fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
|
||||
"fsdp_state_dict_type": fsdp_state_dict_type,
|
||||
"fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
|
||||
},
|
||||
"use_tensorboard": True,
|
||||
}
|
||||
"use_tensorboard": True,
|
||||
}
|
||||
)
|
||||
| sft_prepared_dataset_alpaca_cfg
|
||||
)
|
||||
|
||||
# write cfg to yaml file
|
||||
@@ -458,7 +496,7 @@ class TestMultiGPULlama:
|
||||
)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
|
||||
temp_dir + "/runs", "train/train_loss", 2.4, "Train Loss (%s) is too high"
|
||||
)
|
||||
|
||||
@require_torch_2_6_0
|
||||
@@ -471,51 +509,43 @@ class TestMultiGPULlama:
|
||||
[True, False],
|
||||
)
|
||||
def test_fsdp2_packed(
|
||||
self, temp_dir, attention_backend, fsdp_reshard_after_forward
|
||||
self,
|
||||
temp_dir,
|
||||
sft_prepared_dataset_alpaca_cfg,
|
||||
attention_backend,
|
||||
fsdp_reshard_after_forward,
|
||||
):
|
||||
# pylint: disable=duplicate-code
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"base_model": "HuggingFaceTB/SmolLM2-135M",
|
||||
"sample_packing": True,
|
||||
"pad_to_sequence_len": True,
|
||||
"sequence_len": 2048,
|
||||
"val_set_size": 0.1,
|
||||
"special_tokens": {
|
||||
"pad_token": "<|endoftext|>",
|
||||
},
|
||||
"datasets": [
|
||||
{
|
||||
"path": "tatsu-lab/alpaca",
|
||||
"type": "alpaca",
|
||||
"split": "train[:10%]",
|
||||
cfg = (
|
||||
DictDefault(
|
||||
{
|
||||
"pad_to_sequence_len": True,
|
||||
"num_epochs": 1,
|
||||
"max_steps": 2,
|
||||
"micro_batch_size": 4,
|
||||
"gradient_accumulation_steps": 2,
|
||||
"gradient_checkpointing": True,
|
||||
"output_dir": temp_dir,
|
||||
"learning_rate": 0.00001,
|
||||
"optimizer": "adamw_torch_8bit",
|
||||
"lr_scheduler": "cosine",
|
||||
"fsdp": [
|
||||
"auto_wrap",
|
||||
],
|
||||
"fsdp_config": {
|
||||
"fsdp_version": 2,
|
||||
# "fsdp_forward_prefetch": True, # not yet implemented in accelerate
|
||||
"fsdp_offload_params": False,
|
||||
"fsdp_cpu_ram_efficient_loading": False,
|
||||
"fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
|
||||
"fsdp_state_dict_type": "SHARDED_STATE_DICT",
|
||||
"fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
|
||||
"fsdp_reshard_after_forward": fsdp_reshard_after_forward,
|
||||
},
|
||||
],
|
||||
"num_epochs": 1,
|
||||
"max_steps": 2,
|
||||
"micro_batch_size": 4,
|
||||
"gradient_accumulation_steps": 2,
|
||||
"gradient_checkpointing": True,
|
||||
"output_dir": temp_dir,
|
||||
"dataset_prepared_path": temp_dir + "/last_run_prepared",
|
||||
"learning_rate": 0.00001,
|
||||
"optimizer": "adamw_torch_8bit",
|
||||
"lr_scheduler": "cosine",
|
||||
"fsdp": [
|
||||
"auto_wrap",
|
||||
],
|
||||
"fsdp_config": {
|
||||
"fsdp_version": 2,
|
||||
# "fsdp_forward_prefetch": True, # not yet implemented in accelerate
|
||||
"fsdp_offload_params": False,
|
||||
"fsdp_cpu_ram_efficient_loading": False,
|
||||
"fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
|
||||
"fsdp_state_dict_type": "SHARDED_STATE_DICT",
|
||||
"fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
|
||||
"fsdp_reshard_after_forward": fsdp_reshard_after_forward,
|
||||
},
|
||||
"use_tensorboard": True,
|
||||
}
|
||||
"use_tensorboard": True,
|
||||
}
|
||||
)
|
||||
| sft_prepared_dataset_alpaca_cfg
|
||||
)
|
||||
if attention_backend == "flash":
|
||||
cfg.flash_attention = True
|
||||
@@ -543,64 +573,55 @@ class TestMultiGPULlama:
|
||||
temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss (%s) is too high"
|
||||
)
|
||||
|
||||
def test_fsdp_qlora_prequant_packed(self, temp_dir):
|
||||
def test_fsdp_qlora_prequant_packed(
|
||||
self, temp_dir, sft_prepared_dataset_alpaca_cfg
|
||||
):
|
||||
# pylint: disable=duplicate-code
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"base_model": "axolotl-ai-co/SmolLM2-135M-bnb-nf4-bf16",
|
||||
"adapter": "qlora",
|
||||
"mean_resizing_embeddings": True,
|
||||
"load_in_4bit": True,
|
||||
"lora_r": 8,
|
||||
"lora_alpha": 16,
|
||||
"lora_dropout": 0.05,
|
||||
"lora_target_linear": True,
|
||||
# "lora_modules_to_save": [
|
||||
# "embed_tokens",
|
||||
# "lm_head",
|
||||
# ],
|
||||
"sample_packing": True,
|
||||
"eval_sample_packing": False,
|
||||
"pad_to_sequence_len": True,
|
||||
"sequence_len": 1024,
|
||||
"val_set_size": 0.01,
|
||||
"special_tokens": {
|
||||
"pad_token": "<|endoftext|>",
|
||||
},
|
||||
"datasets": [
|
||||
{
|
||||
"path": "tatsu-lab/alpaca",
|
||||
"type": "alpaca",
|
||||
"split": "train[:10%]",
|
||||
cfg = (
|
||||
DictDefault(
|
||||
{
|
||||
"base_model": "axolotl-ai-co/SmolLM2-135M-bnb-nf4-bf16",
|
||||
"adapter": "qlora",
|
||||
"mean_resizing_embeddings": True,
|
||||
"load_in_4bit": True,
|
||||
"lora_r": 8,
|
||||
"lora_alpha": 16,
|
||||
"lora_dropout": 0.05,
|
||||
"lora_target_linear": True,
|
||||
# "lora_modules_to_save": [
|
||||
# "embed_tokens",
|
||||
# "lm_head",
|
||||
# ],
|
||||
"eval_sample_packing": False,
|
||||
"pad_to_sequence_len": True,
|
||||
"num_epochs": 1,
|
||||
"max_steps": 2,
|
||||
"micro_batch_size": 2,
|
||||
"gradient_accumulation_steps": 2,
|
||||
# "gradient_checkpointing": True,
|
||||
"output_dir": temp_dir,
|
||||
"learning_rate": 0.00001,
|
||||
"optimizer": "adamw_torch_fused",
|
||||
"lr_scheduler": "cosine",
|
||||
"flash_attention": True,
|
||||
"fsdp": [
|
||||
"full_shard",
|
||||
"auto_wrap",
|
||||
],
|
||||
"fsdp_config": {
|
||||
"fsdp_limit_all_gathers": True,
|
||||
"fsdp_offload_params": False,
|
||||
"fsdp_sync_module_states": True,
|
||||
"fsdp_use_orig_params": False,
|
||||
"fsdp_cpu_ram_efficient_loading": True,
|
||||
"fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
|
||||
"fsdp_state_dict_type": "SHARDED_STATE_DICT",
|
||||
"fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
|
||||
},
|
||||
],
|
||||
"num_epochs": 1,
|
||||
"max_steps": 2,
|
||||
"micro_batch_size": 2,
|
||||
"gradient_accumulation_steps": 2,
|
||||
# "gradient_checkpointing": True,
|
||||
"output_dir": temp_dir,
|
||||
"dataset_prepared_path": temp_dir + "/last_run_prepared",
|
||||
"learning_rate": 0.00001,
|
||||
"optimizer": "adamw_torch_fused",
|
||||
"lr_scheduler": "cosine",
|
||||
"flash_attention": True,
|
||||
"fsdp": [
|
||||
"full_shard",
|
||||
"auto_wrap",
|
||||
],
|
||||
"fsdp_config": {
|
||||
"fsdp_limit_all_gathers": True,
|
||||
"fsdp_offload_params": False,
|
||||
"fsdp_sync_module_states": True,
|
||||
"fsdp_use_orig_params": False,
|
||||
"fsdp_cpu_ram_efficient_loading": True,
|
||||
"fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
|
||||
"fsdp_state_dict_type": "SHARDED_STATE_DICT",
|
||||
"fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
|
||||
},
|
||||
"use_tensorboard": True,
|
||||
}
|
||||
"use_tensorboard": True,
|
||||
}
|
||||
)
|
||||
| sft_prepared_dataset_alpaca_cfg
|
||||
)
|
||||
|
||||
# write cfg to yaml file
|
||||
@@ -641,7 +662,12 @@ class TestMultiGPULlama:
|
||||
[True, False],
|
||||
)
|
||||
def test_ds_zero3_packed(
|
||||
self, temp_dir, gradient_accumulation_steps, deepspeed, qlora
|
||||
self,
|
||||
temp_dir,
|
||||
sft_prepared_dataset_alpaca_cfg,
|
||||
gradient_accumulation_steps,
|
||||
deepspeed,
|
||||
qlora,
|
||||
):
|
||||
# pylint: disable=duplicate-code
|
||||
if qlora:
|
||||
@@ -655,37 +681,25 @@ class TestMultiGPULlama:
|
||||
}
|
||||
else:
|
||||
adapter = {}
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"base_model": "HuggingFaceTB/SmolLM2-135M",
|
||||
"sample_packing": True,
|
||||
"pad_to_sequence_len": True,
|
||||
"sequence_len": 1024,
|
||||
"val_set_size": 0.05,
|
||||
"special_tokens": {
|
||||
"pad_token": "<|endoftext|>",
|
||||
},
|
||||
"datasets": [
|
||||
{
|
||||
"path": "tatsu-lab/alpaca",
|
||||
"type": "alpaca",
|
||||
"split": "train[:10%]",
|
||||
},
|
||||
],
|
||||
"num_epochs": 1,
|
||||
"max_steps": 2,
|
||||
"micro_batch_size": 1,
|
||||
"gradient_accumulation_steps": gradient_accumulation_steps,
|
||||
"output_dir": temp_dir,
|
||||
"dataset_prepared_path": temp_dir + "/last_run_prepared",
|
||||
"learning_rate": 0.00001,
|
||||
"optimizer": "adamw_torch_fused",
|
||||
"lr_scheduler": "cosine",
|
||||
"flash_attention": True,
|
||||
"deepspeed": str(AXOLOTL_ROOT / deepspeed),
|
||||
"use_tensorboard": True,
|
||||
**adapter,
|
||||
}
|
||||
cfg = (
|
||||
DictDefault(
|
||||
{
|
||||
"pad_to_sequence_len": True,
|
||||
"num_epochs": 1,
|
||||
"max_steps": 2,
|
||||
"micro_batch_size": 1,
|
||||
"gradient_accumulation_steps": gradient_accumulation_steps,
|
||||
"output_dir": temp_dir,
|
||||
"learning_rate": 0.00001,
|
||||
"optimizer": "adamw_torch_fused",
|
||||
"lr_scheduler": "cosine",
|
||||
"flash_attention": True,
|
||||
"deepspeed": str(AXOLOTL_ROOT / deepspeed),
|
||||
"use_tensorboard": True,
|
||||
**adapter,
|
||||
}
|
||||
)
|
||||
| sft_prepared_dataset_alpaca_cfg
|
||||
)
|
||||
|
||||
# write cfg to yaml file
|
||||
@@ -706,7 +720,7 @@ class TestMultiGPULlama:
|
||||
)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.4, "Train Loss (%s) is too high"
|
||||
temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
|
||||
)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@@ -717,7 +731,13 @@ class TestMultiGPULlama:
|
||||
"qlora",
|
||||
[True, False],
|
||||
)
|
||||
def test_ds_zero2_packed(self, temp_dir, gradient_accumulation_steps, qlora):
|
||||
def test_ds_zero2_packed(
|
||||
self,
|
||||
temp_dir,
|
||||
sft_prepared_dataset_alpaca_cfg,
|
||||
gradient_accumulation_steps,
|
||||
qlora,
|
||||
):
|
||||
# pylint: disable=duplicate-code
|
||||
if qlora:
|
||||
adapter = {
|
||||
@@ -730,37 +750,25 @@ class TestMultiGPULlama:
|
||||
}
|
||||
else:
|
||||
adapter = {}
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"base_model": "HuggingFaceTB/SmolLM2-135M",
|
||||
"sample_packing": True,
|
||||
"pad_to_sequence_len": True,
|
||||
"sequence_len": 1024,
|
||||
"val_set_size": 0.01,
|
||||
"special_tokens": {
|
||||
"pad_token": "<|endoftext|>",
|
||||
},
|
||||
"datasets": [
|
||||
{
|
||||
"path": "tatsu-lab/alpaca",
|
||||
"type": "alpaca",
|
||||
"split": "train[:10%]",
|
||||
},
|
||||
],
|
||||
"num_epochs": 1,
|
||||
"max_steps": 2,
|
||||
"micro_batch_size": 1,
|
||||
"gradient_accumulation_steps": gradient_accumulation_steps,
|
||||
"output_dir": temp_dir,
|
||||
"dataset_prepared_path": temp_dir + "/last_run_prepared",
|
||||
"learning_rate": 0.00001,
|
||||
"optimizer": "adamw_torch_fused",
|
||||
"lr_scheduler": "cosine",
|
||||
"flash_attention": True,
|
||||
"deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero2.json"),
|
||||
"use_tensorboard": True,
|
||||
**adapter,
|
||||
}
|
||||
cfg = (
|
||||
DictDefault(
|
||||
{
|
||||
"pad_to_sequence_len": True,
|
||||
"num_epochs": 1,
|
||||
"max_steps": 2,
|
||||
"micro_batch_size": 1,
|
||||
"gradient_accumulation_steps": gradient_accumulation_steps,
|
||||
"output_dir": temp_dir,
|
||||
"learning_rate": 0.00001,
|
||||
"optimizer": "adamw_torch_fused",
|
||||
"lr_scheduler": "cosine",
|
||||
"flash_attention": True,
|
||||
"deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero2.json"),
|
||||
"use_tensorboard": True,
|
||||
**adapter,
|
||||
}
|
||||
)
|
||||
| sft_prepared_dataset_alpaca_cfg
|
||||
)
|
||||
|
||||
# write cfg to yaml file
|
||||
@@ -781,7 +789,7 @@ class TestMultiGPULlama:
|
||||
)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
|
||||
temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
|
||||
)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@@ -792,7 +800,13 @@ class TestMultiGPULlama:
|
||||
"qlora",
|
||||
[True, False],
|
||||
)
|
||||
def test_ds_zero1_packed(self, temp_dir, gradient_accumulation_steps, qlora):
|
||||
def test_ds_zero1_packed(
|
||||
self,
|
||||
temp_dir,
|
||||
sft_prepared_dataset_alpaca_cfg,
|
||||
gradient_accumulation_steps,
|
||||
qlora,
|
||||
):
|
||||
# pylint: disable=duplicate-code
|
||||
if qlora:
|
||||
adapter = {
|
||||
@@ -805,37 +819,25 @@ class TestMultiGPULlama:
|
||||
}
|
||||
else:
|
||||
adapter = {}
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"base_model": "HuggingFaceTB/SmolLM2-135M",
|
||||
"sample_packing": True,
|
||||
"pad_to_sequence_len": True,
|
||||
"sequence_len": 1024,
|
||||
"val_set_size": 0.01,
|
||||
"special_tokens": {
|
||||
"pad_token": "<|endoftext|>",
|
||||
},
|
||||
"datasets": [
|
||||
{
|
||||
"path": "tatsu-lab/alpaca",
|
||||
"type": "alpaca",
|
||||
"split": "train[:10%]",
|
||||
},
|
||||
],
|
||||
"num_epochs": 1,
|
||||
"max_steps": 2,
|
||||
"micro_batch_size": 1,
|
||||
"gradient_accumulation_steps": gradient_accumulation_steps,
|
||||
"output_dir": temp_dir,
|
||||
"dataset_prepared_path": temp_dir + "/last_run_prepared",
|
||||
"learning_rate": 0.00001,
|
||||
"optimizer": "adamw_torch_fused",
|
||||
"lr_scheduler": "cosine",
|
||||
"flash_attention": True,
|
||||
"deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"),
|
||||
"use_tensorboard": True,
|
||||
**adapter,
|
||||
}
|
||||
cfg = (
|
||||
DictDefault(
|
||||
{
|
||||
"pad_to_sequence_len": True,
|
||||
"num_epochs": 1,
|
||||
"max_steps": 2,
|
||||
"micro_batch_size": 1,
|
||||
"gradient_accumulation_steps": gradient_accumulation_steps,
|
||||
"output_dir": temp_dir,
|
||||
"learning_rate": 0.00001,
|
||||
"optimizer": "adamw_torch_fused",
|
||||
"lr_scheduler": "cosine",
|
||||
"flash_attention": True,
|
||||
"deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"),
|
||||
"use_tensorboard": True,
|
||||
**adapter,
|
||||
}
|
||||
)
|
||||
| sft_prepared_dataset_alpaca_cfg
|
||||
)
|
||||
|
||||
# write cfg to yaml file
|
||||
@@ -856,7 +858,7 @@ class TestMultiGPULlama:
|
||||
)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
|
||||
temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
|
||||
)
|
||||
|
||||
@pytest.mark.skip(
|
||||
|
||||
Reference in New Issue
Block a user