Compare commits

...

11 Commits

Author SHA1 Message Date
Wing Lian
b79996bdc4 tweak loss 2025-07-06 19:42:43 -04:00
Wing Lian
68368de7ed add seed for stable reproducibility 2025-07-06 19:29:51 -04:00
Wing Lian
a94c4a014b tweak acceptable loss from changed hyperparams 2025-07-06 19:25:26 -04:00
Wing Lian
0102ca5943 fix cfg merge 2025-07-06 19:11:46 -04:00
Wing Lian
97e8c01a70 tweak losses 2025-07-06 18:55:16 -04:00
Wing Lian
5c4705b185 unset fa 2025-07-06 13:27:55 -04:00
Wing Lian
47a88da330 set mbsz and revert non-packed test 2025-07-06 12:27:25 -04:00
Wing Lian
07ab737a55 set tokenizer_config in fixture 2025-07-06 12:24:21 -04:00
Wing Lian
c40da3b5eb use shared fixture for preprocessed alpaca dataset 2025-07-06 11:44:31 -04:00
Wing Lian
a5946ff1f0 build fa2 from source for base image with torch2.6 and cu124 (#2867) 2025-07-05 09:21:18 -04:00
Wing Lian
70ca1b2291 fix nightlies to use correct cache (#2848) [skip ci]
* fix nightlies to use correct cache

* fix for handling None for bf16
2025-07-03 12:21:39 -04:00
6 changed files with 309 additions and 396 deletions

View File

@@ -5,11 +5,13 @@ on:
branches: branches:
- "main" - "main"
paths: paths:
- 'Dockerfile-base' - 'docker/Dockerfile-base'
- 'docker/Dockerfile-uv-base'
- '.github/workflows/base.yml' - '.github/workflows/base.yml'
pull_request: pull_request:
paths: paths:
- 'Dockerfile-base' - 'docker/Dockerfile-base'
- 'docker/Dockerfile-uv-base'
- '.github/workflows/base.yml' - '.github/workflows/base.yml'
workflow_dispatch: workflow_dispatch:

View File

@@ -18,96 +18,9 @@ jobs:
env: env:
SKIP: no-commit-to-branch SKIP: no-commit-to-branch
preload-cache:
name: Preload HF cache
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python_version: ["3.11"]
pytorch_version: ["2.6.0"]
timeout-minutes: 20
env:
AXOLOTL_IS_CI_CACHE_PRELOAD: "1"
steps:
- name: Check out repository code
uses: actions/checkout@v4
- name: Restore HF cache
id: hf-cache-restore
uses: actions/cache/restore@v4
with:
path: |
/home/runner/.cache/huggingface/hub/datasets--*
/home/runner/.cache/huggingface/hub/models--*
key: ${{ runner.os }}-hf-hub-cache-v2
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python_version }}
cache: 'pip' # caching pip dependencies
- name: upgrade pip
run: |
pip3 install --upgrade pip
pip3 install --upgrade packaging==23.2 setuptools==75.8.0 wheel
- name: Install PyTorch
run: |
pip3 install torch==${{ matrix.pytorch_version }}
- name: Install dependencies
run: |
pip3 show torch
pip3 install --no-build-isolation -U -e .
python scripts/unsloth_install.py | sh
python scripts/cutcrossentropy_install.py | sh
pip3 install -r requirements-dev.txt -r requirements-tests.txt
- name: Make sure PyTorch version wasn't clobbered
run: |
python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
- name: Ensure axolotl CLI was installed
run: |
axolotl --help
- name: Pre-Download dataset fixture
run: |
huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
- name: Run tests
run: |
pytest -v tests/conftest.py
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v5
with:
token: ${{ secrets.CODECOV_TOKEN }}
files: ./coverage.xml
flags: unittests,pytorch-${{ matrix.pytorch_version }}
fail_ci_if_error: false
- name: cleanup pip cache
run: |
find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
- name: Save HF cache
id: hf-cache
uses: actions/cache/save@v4
with:
path: |
/home/runner/.cache/huggingface/hub/datasets--*
/home/runner/.cache/huggingface/hub/models--*
key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
pytest: pytest:
name: PyTest name: PyTest
runs-on: ubuntu-latest runs-on: ubuntu-latest
needs: [preload-cache]
strategy: strategy:
fail-fast: false fail-fast: false
max-parallel: 2 max-parallel: 2
@@ -120,14 +33,11 @@ jobs:
- name: Check out repository code - name: Check out repository code
uses: actions/checkout@v4 uses: actions/checkout@v4
- name: Restore HF cache - name: Restore Cache from S3
id: hf-cache-restore id: hf-cache-restore-s3
uses: actions/cache/restore@v4 run: |
with: mkdir -p /home/runner/.cache/huggingface/hub
path: | curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xf - -C /home/runner/.cache/huggingface/hub/ --use-compress-program unzstd
/home/runner/.cache/huggingface/hub/datasets--*
/home/runner/.cache/huggingface/hub/models--*
key: ${{ runner.os }}-hf-hub-cache-v2
- name: Setup Python - name: Setup Python
uses: actions/setup-python@v5 uses: actions/setup-python@v5
@@ -168,10 +78,6 @@ jobs:
run: | run: |
axolotl --help axolotl --help
- name: Pre-Download dataset fixture
run: |
huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
- name: Run tests - name: Run tests
run: | run: |
pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/ pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/
@@ -193,15 +99,8 @@ jobs:
fail-fast: false fail-fast: false
matrix: matrix:
include: include:
- cuda: 124 - cuda: 126
cuda_version: 12.4.1 cuda_version: 12.6.3
python_version: "3.11"
pytorch: 2.5.1
num_gpus: 1
axolotl_extras:
nightly_build: "true"
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11" python_version: "3.11"
pytorch: 2.6.0 pytorch: 2.6.0
num_gpus: 1 num_gpus: 1

View File

@@ -37,3 +37,7 @@ RUN git lfs install --skip-repo && \
pip3 install awscli && \ pip3 install awscli && \
# The base image ships with `pydantic==1.8.2` which is not working # The base image ships with `pydantic==1.8.2` which is not working
pip3 install -U --no-cache-dir pydantic==1.10.10 pip3 install -U --no-cache-dir pydantic==1.10.10
RUN if [ "$PYTORCH_VERSION" = "2.6.0" ] && [ "$CUDA" = "124" ] ; then \
FLASH_ATTENTION_FORCE_BUILD="TRUE" pip3 install --no-build-isolation flash-attn==2.8.0.post2; \
fi

View File

@@ -219,7 +219,9 @@ class TrainerBuilderBase(abc.ABC):
if self.cfg.bf16 == "full": if self.cfg.bf16 == "full":
training_args_kwargs["bf16_full_eval"] = True training_args_kwargs["bf16_full_eval"] = True
else: else:
training_args_kwargs["bf16"] = self.cfg.bf16 or self.cfg.bfloat16 bf16 = self.cfg.bf16 or self.cfg.bfloat16
bf16 = bf16 if bf16 is not None else False
training_args_kwargs["bf16"] = bf16
def _configure_scheduler(self, training_args_kwargs: dict): def _configure_scheduler(self, training_args_kwargs: dict):
if self.cfg.lr_scheduler in ["one_cycle", "rex"]: if self.cfg.lr_scheduler in ["one_cycle", "rex"]:

View File

@@ -10,7 +10,7 @@ import shutil
import sys import sys
import tempfile import tempfile
import time import time
from pathlib import Path, PosixPath from pathlib import Path
from typing import Generator from typing import Generator
import datasets import datasets
@@ -423,9 +423,13 @@ def temp_dir() -> Generator[str, None, None]:
shutil.rmtree(_temp_dir) shutil.rmtree(_temp_dir)
@pytest.fixture(scope="function", autouse=True) @pytest.fixture(scope="module")
def unique_triton_cache_dir(temp_dir: str | PosixPath) -> None: def module_temp_dir() -> Generator[str, None, None]:
os.environ["TRITON_CACHE_DIR"] = str(temp_dir) + "/.triton/cache" # Create a temporary directory
_temp_dir = tempfile.mkdtemp()
yield _temp_dir
# Clean up the directory after the test
shutil.rmtree(_temp_dir)
@pytest.fixture(scope="function", autouse=True) @pytest.fixture(scope="function", autouse=True)

View File

@@ -2,6 +2,8 @@
E2E tests for multigpu lora tinyllama E2E tests for multigpu lora tinyllama
""" """
# pylint: disable=redefined-outer-name
from pathlib import Path from pathlib import Path
import pytest import pytest
@@ -25,6 +27,60 @@ def download_model():
snapshot_download("HuggingFaceTB/SmolLM2-135M") snapshot_download("HuggingFaceTB/SmolLM2-135M")
@pytest.fixture(scope="module")
def sft_base_cfg():
cfg = DictDefault(
base_model="HuggingFaceTB/SmolLM2-135M",
tokenizer_config="HuggingFaceTB/SmolLM2-135M", # this has to be manually set since we haven't done validation
sequence_len=1024,
special_tokens={
"pad_token": "<|endoftext|>",
},
datasets=[
{
"path": "tatsu-lab/alpaca",
"type": "alpaca",
"split": "train[:10%]",
},
],
val_set_size=0.1,
sample_packing=True,
flash_attention=True,
learning_rate=0.00001,
optimizer="adamw_8bit",
seed=42,
# these need to be set since we aren't running schema validation
micro_batch_size=2,
gradient_accumulation_steps=1,
)
return cfg
@pytest.fixture(scope="module", name="sft_prepared_dataset_alpaca_cfg")
def sft_prepared_dataset_alpaca_cfg(module_temp_dir, sft_base_cfg):
dataset_prepared_path = module_temp_dir + "/last_run_prepared"
cfg = sft_base_cfg | DictDefault(
dataset_prepared_path=dataset_prepared_path,
)
Path(module_temp_dir).mkdir(parents=True, exist_ok=True)
with open(Path(module_temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))
execute_subprocess_async(
[
"axolotl",
"preprocess",
str(Path(module_temp_dir) / "config.yaml"),
]
)
# unset flash attention since we have some flex attention tests too
cfg.flash_attention = None
return cfg
def transformers_version_eq(required_version): def transformers_version_eq(required_version):
return version.parse(transformers.__version__) == version.parse(required_version) return version.parse(transformers.__version__) == version.parse(required_version)
@@ -97,45 +153,36 @@ class TestMultiGPULlama:
"gradient_accumulation_steps", "gradient_accumulation_steps",
[1, 2], [1, 2],
) )
def test_lora_ddp_packed(self, temp_dir, gradient_accumulation_steps): def test_lora_ddp_packed(
self, temp_dir, sft_prepared_dataset_alpaca_cfg, gradient_accumulation_steps
):
# pylint: disable=duplicate-code # pylint: disable=duplicate-code
cfg = DictDefault( cfg = (
{ DictDefault(
"base_model": "HuggingFaceTB/SmolLM2-135M", {
"sequence_len": 2048, "eval_sample_packing": False,
"sample_packing": True, "pad_to_sequence_len": True,
"eval_sample_packing": False, "adapter": "lora",
"pad_to_sequence_len": True, "lora_r": 8,
"adapter": "lora", "lora_alpha": 16,
"lora_r": 8, "lora_dropout": 0.05,
"lora_alpha": 16, "lora_target_linear": True,
"lora_dropout": 0.05, "val_set_size": 0.05,
"lora_target_linear": True, "num_epochs": 1,
"val_set_size": 0.05, "max_steps": 2,
"special_tokens": { "micro_batch_size": 1,
"pad_token": "<|endoftext|>", "gradient_accumulation_steps": gradient_accumulation_steps,
}, # "gradient_checkpointing": True,
"datasets": [ "output_dir": temp_dir,
{ "learning_rate": 0.00001,
"path": "tatsu-lab/alpaca", "optimizer": "adamw_8bit",
"type": "alpaca", "lr_scheduler": "cosine",
"split": "train[:20%]", "flash_attention": True,
}, "use_tensorboard": True,
], "bf16": True,
"num_epochs": 1, }
"max_steps": 2, )
"micro_batch_size": 1, | sft_prepared_dataset_alpaca_cfg
"gradient_accumulation_steps": gradient_accumulation_steps,
# "gradient_checkpointing": True,
"output_dir": temp_dir,
"dataset_prepared_path": temp_dir + "/last_run_prepared",
"learning_rate": 0.00001,
"optimizer": "adamw_8bit",
"lr_scheduler": "cosine",
"flash_attention": True,
"use_tensorboard": True,
"bf16": True,
}
) )
# write cfg to yaml file # write cfg to yaml file
@@ -385,59 +432,50 @@ class TestMultiGPULlama:
) )
check_tensorboard( check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high" temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
) )
@pytest.mark.parametrize( @pytest.mark.parametrize(
"fsdp_state_dict_type", "fsdp_state_dict_type",
["FULL_STATE_DICT", "SHARDED_STATE_DICT"], ["FULL_STATE_DICT", "SHARDED_STATE_DICT"],
) )
def test_fsdp_packed(self, temp_dir, fsdp_state_dict_type): def test_fsdp_packed(
self, temp_dir, sft_prepared_dataset_alpaca_cfg, fsdp_state_dict_type
):
# pylint: disable=duplicate-code # pylint: disable=duplicate-code
cfg = DictDefault( cfg = (
{ DictDefault(
"base_model": "HuggingFaceTB/SmolLM2-135M", {
"sample_packing": True, "pad_to_sequence_len": True,
"pad_to_sequence_len": True, "num_epochs": 1,
"sequence_len": 1024, "max_steps": 2,
"val_set_size": 0.05, "micro_batch_size": 2,
"special_tokens": { "gradient_accumulation_steps": 2,
"pad_token": "<|endoftext|>", # "gradient_checkpointing": True,
}, "output_dir": temp_dir,
"datasets": [ "dataset_prepared_path": temp_dir + "/last_run_prepared",
{ "learning_rate": 0.00001,
"path": "tatsu-lab/alpaca", "optimizer": "adamw_torch_fused",
"type": "alpaca", "lr_scheduler": "cosine",
"split": "train[:10%]", "flash_attention": True,
"fsdp": [
"full_shard",
"auto_wrap",
],
"fsdp_config": {
"fsdp_limit_all_gathers": True,
"fsdp_offload_params": False,
"fsdp_sync_module_states": True,
"fsdp_use_orig_params": False,
"fsdp_cpu_ram_efficient_loading": False,
"fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
"fsdp_state_dict_type": fsdp_state_dict_type,
"fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
}, },
], "use_tensorboard": True,
"num_epochs": 1, }
"max_steps": 2, )
"micro_batch_size": 2, | sft_prepared_dataset_alpaca_cfg
"gradient_accumulation_steps": 2,
# "gradient_checkpointing": True,
"output_dir": temp_dir,
"dataset_prepared_path": temp_dir + "/last_run_prepared",
"learning_rate": 0.00001,
"optimizer": "adamw_torch_fused",
"lr_scheduler": "cosine",
"flash_attention": True,
"fsdp": [
"full_shard",
"auto_wrap",
],
"fsdp_config": {
"fsdp_limit_all_gathers": True,
"fsdp_offload_params": False,
"fsdp_sync_module_states": True,
"fsdp_use_orig_params": False,
"fsdp_cpu_ram_efficient_loading": False,
"fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
"fsdp_state_dict_type": fsdp_state_dict_type,
"fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
},
"use_tensorboard": True,
}
) )
# write cfg to yaml file # write cfg to yaml file
@@ -458,7 +496,7 @@ class TestMultiGPULlama:
) )
check_tensorboard( check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high" temp_dir + "/runs", "train/train_loss", 2.4, "Train Loss (%s) is too high"
) )
@require_torch_2_6_0 @require_torch_2_6_0
@@ -471,51 +509,43 @@ class TestMultiGPULlama:
[True, False], [True, False],
) )
def test_fsdp2_packed( def test_fsdp2_packed(
self, temp_dir, attention_backend, fsdp_reshard_after_forward self,
temp_dir,
sft_prepared_dataset_alpaca_cfg,
attention_backend,
fsdp_reshard_after_forward,
): ):
# pylint: disable=duplicate-code # pylint: disable=duplicate-code
cfg = DictDefault( cfg = (
{ DictDefault(
"base_model": "HuggingFaceTB/SmolLM2-135M", {
"sample_packing": True, "pad_to_sequence_len": True,
"pad_to_sequence_len": True, "num_epochs": 1,
"sequence_len": 2048, "max_steps": 2,
"val_set_size": 0.1, "micro_batch_size": 4,
"special_tokens": { "gradient_accumulation_steps": 2,
"pad_token": "<|endoftext|>", "gradient_checkpointing": True,
}, "output_dir": temp_dir,
"datasets": [ "learning_rate": 0.00001,
{ "optimizer": "adamw_torch_8bit",
"path": "tatsu-lab/alpaca", "lr_scheduler": "cosine",
"type": "alpaca", "fsdp": [
"split": "train[:10%]", "auto_wrap",
],
"fsdp_config": {
"fsdp_version": 2,
# "fsdp_forward_prefetch": True, # not yet implemented in accelerate
"fsdp_offload_params": False,
"fsdp_cpu_ram_efficient_loading": False,
"fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
"fsdp_state_dict_type": "SHARDED_STATE_DICT",
"fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
"fsdp_reshard_after_forward": fsdp_reshard_after_forward,
}, },
], "use_tensorboard": True,
"num_epochs": 1, }
"max_steps": 2, )
"micro_batch_size": 4, | sft_prepared_dataset_alpaca_cfg
"gradient_accumulation_steps": 2,
"gradient_checkpointing": True,
"output_dir": temp_dir,
"dataset_prepared_path": temp_dir + "/last_run_prepared",
"learning_rate": 0.00001,
"optimizer": "adamw_torch_8bit",
"lr_scheduler": "cosine",
"fsdp": [
"auto_wrap",
],
"fsdp_config": {
"fsdp_version": 2,
# "fsdp_forward_prefetch": True, # not yet implemented in accelerate
"fsdp_offload_params": False,
"fsdp_cpu_ram_efficient_loading": False,
"fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
"fsdp_state_dict_type": "SHARDED_STATE_DICT",
"fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
"fsdp_reshard_after_forward": fsdp_reshard_after_forward,
},
"use_tensorboard": True,
}
) )
if attention_backend == "flash": if attention_backend == "flash":
cfg.flash_attention = True cfg.flash_attention = True
@@ -543,64 +573,55 @@ class TestMultiGPULlama:
temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss (%s) is too high" temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss (%s) is too high"
) )
def test_fsdp_qlora_prequant_packed(self, temp_dir): def test_fsdp_qlora_prequant_packed(
self, temp_dir, sft_prepared_dataset_alpaca_cfg
):
# pylint: disable=duplicate-code # pylint: disable=duplicate-code
cfg = DictDefault( cfg = (
{ DictDefault(
"base_model": "axolotl-ai-co/SmolLM2-135M-bnb-nf4-bf16", {
"adapter": "qlora", "base_model": "axolotl-ai-co/SmolLM2-135M-bnb-nf4-bf16",
"mean_resizing_embeddings": True, "adapter": "qlora",
"load_in_4bit": True, "mean_resizing_embeddings": True,
"lora_r": 8, "load_in_4bit": True,
"lora_alpha": 16, "lora_r": 8,
"lora_dropout": 0.05, "lora_alpha": 16,
"lora_target_linear": True, "lora_dropout": 0.05,
# "lora_modules_to_save": [ "lora_target_linear": True,
# "embed_tokens", # "lora_modules_to_save": [
# "lm_head", # "embed_tokens",
# ], # "lm_head",
"sample_packing": True, # ],
"eval_sample_packing": False, "eval_sample_packing": False,
"pad_to_sequence_len": True, "pad_to_sequence_len": True,
"sequence_len": 1024, "num_epochs": 1,
"val_set_size": 0.01, "max_steps": 2,
"special_tokens": { "micro_batch_size": 2,
"pad_token": "<|endoftext|>", "gradient_accumulation_steps": 2,
}, # "gradient_checkpointing": True,
"datasets": [ "output_dir": temp_dir,
{ "learning_rate": 0.00001,
"path": "tatsu-lab/alpaca", "optimizer": "adamw_torch_fused",
"type": "alpaca", "lr_scheduler": "cosine",
"split": "train[:10%]", "flash_attention": True,
"fsdp": [
"full_shard",
"auto_wrap",
],
"fsdp_config": {
"fsdp_limit_all_gathers": True,
"fsdp_offload_params": False,
"fsdp_sync_module_states": True,
"fsdp_use_orig_params": False,
"fsdp_cpu_ram_efficient_loading": True,
"fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
"fsdp_state_dict_type": "SHARDED_STATE_DICT",
"fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
}, },
], "use_tensorboard": True,
"num_epochs": 1, }
"max_steps": 2, )
"micro_batch_size": 2, | sft_prepared_dataset_alpaca_cfg
"gradient_accumulation_steps": 2,
# "gradient_checkpointing": True,
"output_dir": temp_dir,
"dataset_prepared_path": temp_dir + "/last_run_prepared",
"learning_rate": 0.00001,
"optimizer": "adamw_torch_fused",
"lr_scheduler": "cosine",
"flash_attention": True,
"fsdp": [
"full_shard",
"auto_wrap",
],
"fsdp_config": {
"fsdp_limit_all_gathers": True,
"fsdp_offload_params": False,
"fsdp_sync_module_states": True,
"fsdp_use_orig_params": False,
"fsdp_cpu_ram_efficient_loading": True,
"fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
"fsdp_state_dict_type": "SHARDED_STATE_DICT",
"fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
},
"use_tensorboard": True,
}
) )
# write cfg to yaml file # write cfg to yaml file
@@ -641,7 +662,12 @@ class TestMultiGPULlama:
[True, False], [True, False],
) )
def test_ds_zero3_packed( def test_ds_zero3_packed(
self, temp_dir, gradient_accumulation_steps, deepspeed, qlora self,
temp_dir,
sft_prepared_dataset_alpaca_cfg,
gradient_accumulation_steps,
deepspeed,
qlora,
): ):
# pylint: disable=duplicate-code # pylint: disable=duplicate-code
if qlora: if qlora:
@@ -655,37 +681,25 @@ class TestMultiGPULlama:
} }
else: else:
adapter = {} adapter = {}
cfg = DictDefault( cfg = (
{ DictDefault(
"base_model": "HuggingFaceTB/SmolLM2-135M", {
"sample_packing": True, "pad_to_sequence_len": True,
"pad_to_sequence_len": True, "num_epochs": 1,
"sequence_len": 1024, "max_steps": 2,
"val_set_size": 0.05, "micro_batch_size": 1,
"special_tokens": { "gradient_accumulation_steps": gradient_accumulation_steps,
"pad_token": "<|endoftext|>", "output_dir": temp_dir,
}, "learning_rate": 0.00001,
"datasets": [ "optimizer": "adamw_torch_fused",
{ "lr_scheduler": "cosine",
"path": "tatsu-lab/alpaca", "flash_attention": True,
"type": "alpaca", "deepspeed": str(AXOLOTL_ROOT / deepspeed),
"split": "train[:10%]", "use_tensorboard": True,
}, **adapter,
], }
"num_epochs": 1, )
"max_steps": 2, | sft_prepared_dataset_alpaca_cfg
"micro_batch_size": 1,
"gradient_accumulation_steps": gradient_accumulation_steps,
"output_dir": temp_dir,
"dataset_prepared_path": temp_dir + "/last_run_prepared",
"learning_rate": 0.00001,
"optimizer": "adamw_torch_fused",
"lr_scheduler": "cosine",
"flash_attention": True,
"deepspeed": str(AXOLOTL_ROOT / deepspeed),
"use_tensorboard": True,
**adapter,
}
) )
# write cfg to yaml file # write cfg to yaml file
@@ -706,7 +720,7 @@ class TestMultiGPULlama:
) )
check_tensorboard( check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.4, "Train Loss (%s) is too high" temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
) )
@pytest.mark.parametrize( @pytest.mark.parametrize(
@@ -717,7 +731,13 @@ class TestMultiGPULlama:
"qlora", "qlora",
[True, False], [True, False],
) )
def test_ds_zero2_packed(self, temp_dir, gradient_accumulation_steps, qlora): def test_ds_zero2_packed(
self,
temp_dir,
sft_prepared_dataset_alpaca_cfg,
gradient_accumulation_steps,
qlora,
):
# pylint: disable=duplicate-code # pylint: disable=duplicate-code
if qlora: if qlora:
adapter = { adapter = {
@@ -730,37 +750,25 @@ class TestMultiGPULlama:
} }
else: else:
adapter = {} adapter = {}
cfg = DictDefault( cfg = (
{ DictDefault(
"base_model": "HuggingFaceTB/SmolLM2-135M", {
"sample_packing": True, "pad_to_sequence_len": True,
"pad_to_sequence_len": True, "num_epochs": 1,
"sequence_len": 1024, "max_steps": 2,
"val_set_size": 0.01, "micro_batch_size": 1,
"special_tokens": { "gradient_accumulation_steps": gradient_accumulation_steps,
"pad_token": "<|endoftext|>", "output_dir": temp_dir,
}, "learning_rate": 0.00001,
"datasets": [ "optimizer": "adamw_torch_fused",
{ "lr_scheduler": "cosine",
"path": "tatsu-lab/alpaca", "flash_attention": True,
"type": "alpaca", "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero2.json"),
"split": "train[:10%]", "use_tensorboard": True,
}, **adapter,
], }
"num_epochs": 1, )
"max_steps": 2, | sft_prepared_dataset_alpaca_cfg
"micro_batch_size": 1,
"gradient_accumulation_steps": gradient_accumulation_steps,
"output_dir": temp_dir,
"dataset_prepared_path": temp_dir + "/last_run_prepared",
"learning_rate": 0.00001,
"optimizer": "adamw_torch_fused",
"lr_scheduler": "cosine",
"flash_attention": True,
"deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero2.json"),
"use_tensorboard": True,
**adapter,
}
) )
# write cfg to yaml file # write cfg to yaml file
@@ -781,7 +789,7 @@ class TestMultiGPULlama:
) )
check_tensorboard( check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high" temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
) )
@pytest.mark.parametrize( @pytest.mark.parametrize(
@@ -792,7 +800,13 @@ class TestMultiGPULlama:
"qlora", "qlora",
[True, False], [True, False],
) )
def test_ds_zero1_packed(self, temp_dir, gradient_accumulation_steps, qlora): def test_ds_zero1_packed(
self,
temp_dir,
sft_prepared_dataset_alpaca_cfg,
gradient_accumulation_steps,
qlora,
):
# pylint: disable=duplicate-code # pylint: disable=duplicate-code
if qlora: if qlora:
adapter = { adapter = {
@@ -805,37 +819,25 @@ class TestMultiGPULlama:
} }
else: else:
adapter = {} adapter = {}
cfg = DictDefault( cfg = (
{ DictDefault(
"base_model": "HuggingFaceTB/SmolLM2-135M", {
"sample_packing": True, "pad_to_sequence_len": True,
"pad_to_sequence_len": True, "num_epochs": 1,
"sequence_len": 1024, "max_steps": 2,
"val_set_size": 0.01, "micro_batch_size": 1,
"special_tokens": { "gradient_accumulation_steps": gradient_accumulation_steps,
"pad_token": "<|endoftext|>", "output_dir": temp_dir,
}, "learning_rate": 0.00001,
"datasets": [ "optimizer": "adamw_torch_fused",
{ "lr_scheduler": "cosine",
"path": "tatsu-lab/alpaca", "flash_attention": True,
"type": "alpaca", "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"),
"split": "train[:10%]", "use_tensorboard": True,
}, **adapter,
], }
"num_epochs": 1, )
"max_steps": 2, | sft_prepared_dataset_alpaca_cfg
"micro_batch_size": 1,
"gradient_accumulation_steps": gradient_accumulation_steps,
"output_dir": temp_dir,
"dataset_prepared_path": temp_dir + "/last_run_prepared",
"learning_rate": 0.00001,
"optimizer": "adamw_torch_fused",
"lr_scheduler": "cosine",
"flash_attention": True,
"deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"),
"use_tensorboard": True,
**adapter,
}
) )
# write cfg to yaml file # write cfg to yaml file
@@ -856,7 +858,7 @@ class TestMultiGPULlama:
) )
check_tensorboard( check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high" temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
) )
@pytest.mark.skip( @pytest.mark.skip(