Compare commits


9 Commits

Author      SHA1        Message                                               Date
Wing Lian   b79996bdc4  tweak loss                                            2025-07-06 19:42:43 -04:00
Wing Lian   68368de7ed  add seed for stable reproducibility                   2025-07-06 19:29:51 -04:00
Wing Lian   a94c4a014b  tweak acceptable loss from changed hyperparams        2025-07-06 19:25:26 -04:00
Wing Lian   0102ca5943  fix cfg merge                                         2025-07-06 19:11:46 -04:00
Wing Lian   97e8c01a70  tweak losses                                          2025-07-06 18:55:16 -04:00
Wing Lian   5c4705b185  unset fa                                              2025-07-06 13:27:55 -04:00
Wing Lian   47a88da330  set mbsz and revert non-packed test                   2025-07-06 12:27:25 -04:00
Wing Lian   07ab737a55  set tokenizer_config in fixture                       2025-07-06 12:24:21 -04:00
Wing Lian   c40da3b5eb  use shared fixture for preprocessed alpaca dataset    2025-07-06 11:44:31 -04:00
2 changed files with 291 additions and 285 deletions

View File

@@ -10,7 +10,7 @@ import shutil
 import sys
 import tempfile
 import time
-from pathlib import Path, PosixPath
+from pathlib import Path
 from typing import Generator

 import datasets
@@ -423,9 +423,13 @@ def temp_dir() -> Generator[str, None, None]:
     shutil.rmtree(_temp_dir)


-@pytest.fixture(scope="function", autouse=True)
-def unique_triton_cache_dir(temp_dir: str | PosixPath) -> None:
-    os.environ["TRITON_CACHE_DIR"] = str(temp_dir) + "/.triton/cache"
+@pytest.fixture(scope="module")
+def module_temp_dir() -> Generator[str, None, None]:
+    # Create a temporary directory
+    _temp_dir = tempfile.mkdtemp()
+    yield _temp_dir
+    # Clean up the directory after the test
+    shutil.rmtree(_temp_dir)


 @pytest.fixture(scope="function", autouse=True)
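Note on the new fixture: module_temp_dir mirrors the existing function-scoped temp_dir fixture but at module scope, which is what allows the module-scoped dataset fixture added in the test file below to depend on it; pytest does not let a module-scoped fixture request the function-scoped temp_dir. A minimal sketch of that scoping rule, with a hypothetical prepared_path fixture standing in for the real consumer:

import pytest

@pytest.fixture(scope="module")
def prepared_path(module_temp_dir):
    # OK: a module-scoped fixture may depend on another module-scoped fixture
    return module_temp_dir + "/last_run_prepared"

@pytest.fixture(scope="module")
def broken_prepared_path(temp_dir):
    # Not OK: requesting the function-scoped temp_dir from module scope
    # fails at fixture setup with pytest's ScopeMismatch error
    return temp_dir + "/last_run_prepared"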

View File

@@ -2,6 +2,8 @@
 E2E tests for multigpu lora tinyllama
 """

+# pylint: disable=redefined-outer-name
+
 from pathlib import Path

 import pytest
@@ -25,6 +27,60 @@ def download_model():
     snapshot_download("HuggingFaceTB/SmolLM2-135M")


+@pytest.fixture(scope="module")
+def sft_base_cfg():
+    cfg = DictDefault(
+        base_model="HuggingFaceTB/SmolLM2-135M",
+        tokenizer_config="HuggingFaceTB/SmolLM2-135M",  # this has to be manually set since we haven't done validation
+        sequence_len=1024,
+        special_tokens={
+            "pad_token": "<|endoftext|>",
+        },
+        datasets=[
+            {
+                "path": "tatsu-lab/alpaca",
+                "type": "alpaca",
+                "split": "train[:10%]",
+            },
+        ],
+        val_set_size=0.1,
+        sample_packing=True,
+        flash_attention=True,
+        learning_rate=0.00001,
+        optimizer="adamw_8bit",
+        seed=42,
+        # these need to be set since we aren't running schema validation
+        micro_batch_size=2,
+        gradient_accumulation_steps=1,
+    )
+    return cfg
+
+
+@pytest.fixture(scope="module", name="sft_prepared_dataset_alpaca_cfg")
+def sft_prepared_dataset_alpaca_cfg(module_temp_dir, sft_base_cfg):
+    dataset_prepared_path = module_temp_dir + "/last_run_prepared"
+    cfg = sft_base_cfg | DictDefault(
+        dataset_prepared_path=dataset_prepared_path,
+    )
+
+    Path(module_temp_dir).mkdir(parents=True, exist_ok=True)
+    with open(Path(module_temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
+        fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))
+
+    execute_subprocess_async(
+        [
+            "axolotl",
+            "preprocess",
+            str(Path(module_temp_dir) / "config.yaml"),
+        ]
+    )
+
+    # unset flash attention since we have some flex attention tests too
+    cfg.flash_attention = None
+
+    return cfg
+
+
 def transformers_version_eq(required_version):
     return version.parse(transformers.__version__) == version.parse(required_version)

@@ -97,45 +153,36 @@ class TestMultiGPULlama:
"gradient_accumulation_steps", "gradient_accumulation_steps",
[1, 2], [1, 2],
) )
def test_lora_ddp_packed(self, temp_dir, gradient_accumulation_steps): def test_lora_ddp_packed(
self, temp_dir, sft_prepared_dataset_alpaca_cfg, gradient_accumulation_steps
):
# pylint: disable=duplicate-code # pylint: disable=duplicate-code
cfg = DictDefault( cfg = (
{ DictDefault(
"base_model": "HuggingFaceTB/SmolLM2-135M", {
"sequence_len": 2048, "eval_sample_packing": False,
"sample_packing": True, "pad_to_sequence_len": True,
"eval_sample_packing": False, "adapter": "lora",
"pad_to_sequence_len": True, "lora_r": 8,
"adapter": "lora", "lora_alpha": 16,
"lora_r": 8, "lora_dropout": 0.05,
"lora_alpha": 16, "lora_target_linear": True,
"lora_dropout": 0.05, "val_set_size": 0.05,
"lora_target_linear": True, "num_epochs": 1,
"val_set_size": 0.05, "max_steps": 2,
"special_tokens": { "micro_batch_size": 1,
"pad_token": "<|endoftext|>", "gradient_accumulation_steps": gradient_accumulation_steps,
}, # "gradient_checkpointing": True,
"datasets": [ "output_dir": temp_dir,
{ "learning_rate": 0.00001,
"path": "tatsu-lab/alpaca", "optimizer": "adamw_8bit",
"type": "alpaca", "lr_scheduler": "cosine",
"split": "train[:20%]", "flash_attention": True,
}, "use_tensorboard": True,
], "bf16": True,
"num_epochs": 1, }
"max_steps": 2, )
"micro_batch_size": 1, | sft_prepared_dataset_alpaca_cfg
"gradient_accumulation_steps": gradient_accumulation_steps,
# "gradient_checkpointing": True,
"output_dir": temp_dir,
"dataset_prepared_path": temp_dir + "/last_run_prepared",
"learning_rate": 0.00001,
"optimizer": "adamw_8bit",
"lr_scheduler": "cosine",
"flash_attention": True,
"use_tensorboard": True,
"bf16": True,
}
) )
# write cfg to yaml file # write cfg to yaml file
@@ -385,59 +432,50 @@ class TestMultiGPULlama:
         )

         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
         )

     @pytest.mark.parametrize(
         "fsdp_state_dict_type",
         ["FULL_STATE_DICT", "SHARDED_STATE_DICT"],
     )
-    def test_fsdp_packed(self, temp_dir, fsdp_state_dict_type):
+    def test_fsdp_packed(
+        self, temp_dir, sft_prepared_dataset_alpaca_cfg, fsdp_state_dict_type
+    ):
         # pylint: disable=duplicate-code
-        cfg = DictDefault(
-            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "sample_packing": True,
-                "pad_to_sequence_len": True,
-                "sequence_len": 1024,
-                "val_set_size": 0.05,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                },
-                "datasets": [
-                    {
-                        "path": "tatsu-lab/alpaca",
-                        "type": "alpaca",
-                        "split": "train[:10%]",
-                    },
-                ],
-                "num_epochs": 1,
-                "max_steps": 2,
-                "micro_batch_size": 2,
-                "gradient_accumulation_steps": 2,
-                # "gradient_checkpointing": True,
-                "output_dir": temp_dir,
-                "dataset_prepared_path": temp_dir + "/last_run_prepared",
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_torch_fused",
-                "lr_scheduler": "cosine",
-                "flash_attention": True,
-                "fsdp": [
-                    "full_shard",
-                    "auto_wrap",
-                ],
-                "fsdp_config": {
-                    "fsdp_limit_all_gathers": True,
-                    "fsdp_offload_params": False,
-                    "fsdp_sync_module_states": True,
-                    "fsdp_use_orig_params": False,
-                    "fsdp_cpu_ram_efficient_loading": False,
-                    "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
-                    "fsdp_state_dict_type": fsdp_state_dict_type,
-                    "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
-                },
-                "use_tensorboard": True,
-            }
+        cfg = (
+            DictDefault(
+                {
+                    "pad_to_sequence_len": True,
+                    "num_epochs": 1,
+                    "max_steps": 2,
+                    "micro_batch_size": 2,
+                    "gradient_accumulation_steps": 2,
+                    # "gradient_checkpointing": True,
+                    "output_dir": temp_dir,
+                    "dataset_prepared_path": temp_dir + "/last_run_prepared",
+                    "learning_rate": 0.00001,
+                    "optimizer": "adamw_torch_fused",
+                    "lr_scheduler": "cosine",
+                    "flash_attention": True,
+                    "fsdp": [
+                        "full_shard",
+                        "auto_wrap",
+                    ],
+                    "fsdp_config": {
+                        "fsdp_limit_all_gathers": True,
+                        "fsdp_offload_params": False,
+                        "fsdp_sync_module_states": True,
+                        "fsdp_use_orig_params": False,
+                        "fsdp_cpu_ram_efficient_loading": False,
+                        "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
+                        "fsdp_state_dict_type": fsdp_state_dict_type,
+                        "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
+                    },
+                    "use_tensorboard": True,
+                }
+            )
+            | sft_prepared_dataset_alpaca_cfg
         )

         # write cfg to yaml file
@@ -458,7 +496,7 @@ class TestMultiGPULlama:
         )

         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/train_loss", 2.4, "Train Loss (%s) is too high"
         )

     @require_torch_2_6_0
@@ -471,51 +509,43 @@ class TestMultiGPULlama:
         [True, False],
     )
     def test_fsdp2_packed(
-        self, temp_dir, attention_backend, fsdp_reshard_after_forward
+        self,
+        temp_dir,
+        sft_prepared_dataset_alpaca_cfg,
+        attention_backend,
+        fsdp_reshard_after_forward,
     ):
         # pylint: disable=duplicate-code
-        cfg = DictDefault(
-            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "sample_packing": True,
-                "pad_to_sequence_len": True,
-                "sequence_len": 2048,
-                "val_set_size": 0.1,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                },
-                "datasets": [
-                    {
-                        "path": "tatsu-lab/alpaca",
-                        "type": "alpaca",
-                        "split": "train[:10%]",
-                    },
-                ],
-                "num_epochs": 1,
-                "max_steps": 2,
-                "micro_batch_size": 4,
-                "gradient_accumulation_steps": 2,
-                "gradient_checkpointing": True,
-                "output_dir": temp_dir,
-                "dataset_prepared_path": temp_dir + "/last_run_prepared",
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_torch_8bit",
-                "lr_scheduler": "cosine",
-                "fsdp": [
-                    "auto_wrap",
-                ],
-                "fsdp_config": {
-                    "fsdp_version": 2,
-                    # "fsdp_forward_prefetch": True, # not yet implemented in accelerate
-                    "fsdp_offload_params": False,
-                    "fsdp_cpu_ram_efficient_loading": False,
-                    "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
-                    "fsdp_state_dict_type": "SHARDED_STATE_DICT",
-                    "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
-                    "fsdp_reshard_after_forward": fsdp_reshard_after_forward,
-                },
-                "use_tensorboard": True,
-            }
+        cfg = (
+            DictDefault(
+                {
+                    "pad_to_sequence_len": True,
+                    "num_epochs": 1,
+                    "max_steps": 2,
+                    "micro_batch_size": 4,
+                    "gradient_accumulation_steps": 2,
+                    "gradient_checkpointing": True,
+                    "output_dir": temp_dir,
+                    "learning_rate": 0.00001,
+                    "optimizer": "adamw_torch_8bit",
+                    "lr_scheduler": "cosine",
+                    "fsdp": [
+                        "auto_wrap",
+                    ],
+                    "fsdp_config": {
+                        "fsdp_version": 2,
+                        # "fsdp_forward_prefetch": True, # not yet implemented in accelerate
+                        "fsdp_offload_params": False,
+                        "fsdp_cpu_ram_efficient_loading": False,
+                        "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
+                        "fsdp_state_dict_type": "SHARDED_STATE_DICT",
+                        "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
+                        "fsdp_reshard_after_forward": fsdp_reshard_after_forward,
+                    },
+                    "use_tensorboard": True,
+                }
+            )
+            | sft_prepared_dataset_alpaca_cfg
         )
         if attention_backend == "flash":
             cfg.flash_attention = True
@@ -543,64 +573,55 @@ class TestMultiGPULlama:
             temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss (%s) is too high"
         )

-    def test_fsdp_qlora_prequant_packed(self, temp_dir):
+    def test_fsdp_qlora_prequant_packed(
+        self, temp_dir, sft_prepared_dataset_alpaca_cfg
+    ):
         # pylint: disable=duplicate-code
-        cfg = DictDefault(
-            {
-                "base_model": "axolotl-ai-co/SmolLM2-135M-bnb-nf4-bf16",
-                "adapter": "qlora",
-                "mean_resizing_embeddings": True,
-                "load_in_4bit": True,
-                "lora_r": 8,
-                "lora_alpha": 16,
-                "lora_dropout": 0.05,
-                "lora_target_linear": True,
-                # "lora_modules_to_save": [
-                #     "embed_tokens",
-                #     "lm_head",
-                # ],
-                "sample_packing": True,
-                "eval_sample_packing": False,
-                "pad_to_sequence_len": True,
-                "sequence_len": 1024,
-                "val_set_size": 0.01,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                },
-                "datasets": [
-                    {
-                        "path": "tatsu-lab/alpaca",
-                        "type": "alpaca",
-                        "split": "train[:10%]",
-                    },
-                ],
-                "num_epochs": 1,
-                "max_steps": 2,
-                "micro_batch_size": 2,
-                "gradient_accumulation_steps": 2,
-                # "gradient_checkpointing": True,
-                "output_dir": temp_dir,
-                "dataset_prepared_path": temp_dir + "/last_run_prepared",
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_torch_fused",
-                "lr_scheduler": "cosine",
-                "flash_attention": True,
-                "fsdp": [
-                    "full_shard",
-                    "auto_wrap",
-                ],
-                "fsdp_config": {
-                    "fsdp_limit_all_gathers": True,
-                    "fsdp_offload_params": False,
-                    "fsdp_sync_module_states": True,
-                    "fsdp_use_orig_params": False,
-                    "fsdp_cpu_ram_efficient_loading": True,
-                    "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
-                    "fsdp_state_dict_type": "SHARDED_STATE_DICT",
-                    "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
-                },
-                "use_tensorboard": True,
-            }
+        cfg = (
+            DictDefault(
+                {
+                    "base_model": "axolotl-ai-co/SmolLM2-135M-bnb-nf4-bf16",
+                    "adapter": "qlora",
+                    "mean_resizing_embeddings": True,
+                    "load_in_4bit": True,
+                    "lora_r": 8,
+                    "lora_alpha": 16,
+                    "lora_dropout": 0.05,
+                    "lora_target_linear": True,
+                    # "lora_modules_to_save": [
+                    #     "embed_tokens",
+                    #     "lm_head",
+                    # ],
+                    "eval_sample_packing": False,
+                    "pad_to_sequence_len": True,
+                    "num_epochs": 1,
+                    "max_steps": 2,
+                    "micro_batch_size": 2,
+                    "gradient_accumulation_steps": 2,
+                    # "gradient_checkpointing": True,
+                    "output_dir": temp_dir,
+                    "learning_rate": 0.00001,
+                    "optimizer": "adamw_torch_fused",
+                    "lr_scheduler": "cosine",
+                    "flash_attention": True,
+                    "fsdp": [
+                        "full_shard",
+                        "auto_wrap",
+                    ],
+                    "fsdp_config": {
+                        "fsdp_limit_all_gathers": True,
+                        "fsdp_offload_params": False,
+                        "fsdp_sync_module_states": True,
+                        "fsdp_use_orig_params": False,
+                        "fsdp_cpu_ram_efficient_loading": True,
+                        "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
+                        "fsdp_state_dict_type": "SHARDED_STATE_DICT",
+                        "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
+                    },
+                    "use_tensorboard": True,
+                }
+            )
+            | sft_prepared_dataset_alpaca_cfg
         )

         # write cfg to yaml file
@@ -641,7 +662,12 @@ class TestMultiGPULlama:
         [True, False],
     )
     def test_ds_zero3_packed(
-        self, temp_dir, gradient_accumulation_steps, deepspeed, qlora
+        self,
+        temp_dir,
+        sft_prepared_dataset_alpaca_cfg,
+        gradient_accumulation_steps,
+        deepspeed,
+        qlora,
     ):
         # pylint: disable=duplicate-code
         if qlora:
@@ -655,37 +681,25 @@ class TestMultiGPULlama:
             }
         else:
             adapter = {}
-        cfg = DictDefault(
-            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "sample_packing": True,
-                "pad_to_sequence_len": True,
-                "sequence_len": 1024,
-                "val_set_size": 0.05,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                },
-                "datasets": [
-                    {
-                        "path": "tatsu-lab/alpaca",
-                        "type": "alpaca",
-                        "split": "train[:10%]",
-                    },
-                ],
-                "num_epochs": 1,
-                "max_steps": 2,
-                "micro_batch_size": 1,
-                "gradient_accumulation_steps": gradient_accumulation_steps,
-                "output_dir": temp_dir,
-                "dataset_prepared_path": temp_dir + "/last_run_prepared",
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_torch_fused",
-                "lr_scheduler": "cosine",
-                "flash_attention": True,
-                "deepspeed": str(AXOLOTL_ROOT / deepspeed),
-                "use_tensorboard": True,
-                **adapter,
-            }
+        cfg = (
+            DictDefault(
+                {
+                    "pad_to_sequence_len": True,
+                    "num_epochs": 1,
+                    "max_steps": 2,
+                    "micro_batch_size": 1,
+                    "gradient_accumulation_steps": gradient_accumulation_steps,
+                    "output_dir": temp_dir,
+                    "learning_rate": 0.00001,
+                    "optimizer": "adamw_torch_fused",
+                    "lr_scheduler": "cosine",
+                    "flash_attention": True,
+                    "deepspeed": str(AXOLOTL_ROOT / deepspeed),
+                    "use_tensorboard": True,
+                    **adapter,
+                }
+            )
+            | sft_prepared_dataset_alpaca_cfg
         )

         # write cfg to yaml file
@@ -706,7 +720,7 @@ class TestMultiGPULlama:
         )

         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.4, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
         )

     @pytest.mark.parametrize(
@@ -717,7 +731,13 @@ class TestMultiGPULlama:
"qlora", "qlora",
[True, False], [True, False],
) )
def test_ds_zero2_packed(self, temp_dir, gradient_accumulation_steps, qlora): def test_ds_zero2_packed(
self,
temp_dir,
sft_prepared_dataset_alpaca_cfg,
gradient_accumulation_steps,
qlora,
):
# pylint: disable=duplicate-code # pylint: disable=duplicate-code
if qlora: if qlora:
adapter = { adapter = {
@@ -730,37 +750,25 @@ class TestMultiGPULlama:
             }
         else:
             adapter = {}
-        cfg = DictDefault(
-            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "sample_packing": True,
-                "pad_to_sequence_len": True,
-                "sequence_len": 1024,
-                "val_set_size": 0.01,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                },
-                "datasets": [
-                    {
-                        "path": "tatsu-lab/alpaca",
-                        "type": "alpaca",
-                        "split": "train[:10%]",
-                    },
-                ],
-                "num_epochs": 1,
-                "max_steps": 2,
-                "micro_batch_size": 1,
-                "gradient_accumulation_steps": gradient_accumulation_steps,
-                "output_dir": temp_dir,
-                "dataset_prepared_path": temp_dir + "/last_run_prepared",
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_torch_fused",
-                "lr_scheduler": "cosine",
-                "flash_attention": True,
-                "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero2.json"),
-                "use_tensorboard": True,
-                **adapter,
-            }
+        cfg = (
+            DictDefault(
+                {
+                    "pad_to_sequence_len": True,
+                    "num_epochs": 1,
+                    "max_steps": 2,
+                    "micro_batch_size": 1,
+                    "gradient_accumulation_steps": gradient_accumulation_steps,
+                    "output_dir": temp_dir,
+                    "learning_rate": 0.00001,
+                    "optimizer": "adamw_torch_fused",
+                    "lr_scheduler": "cosine",
+                    "flash_attention": True,
+                    "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero2.json"),
+                    "use_tensorboard": True,
+                    **adapter,
+                }
+            )
+            | sft_prepared_dataset_alpaca_cfg
         )

         # write cfg to yaml file
@@ -781,7 +789,7 @@ class TestMultiGPULlama:
         )

         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
         )

     @pytest.mark.parametrize(
@@ -792,7 +800,13 @@ class TestMultiGPULlama:
"qlora", "qlora",
[True, False], [True, False],
) )
def test_ds_zero1_packed(self, temp_dir, gradient_accumulation_steps, qlora): def test_ds_zero1_packed(
self,
temp_dir,
sft_prepared_dataset_alpaca_cfg,
gradient_accumulation_steps,
qlora,
):
# pylint: disable=duplicate-code # pylint: disable=duplicate-code
if qlora: if qlora:
adapter = { adapter = {
@@ -805,37 +819,25 @@ class TestMultiGPULlama:
             }
         else:
             adapter = {}
-        cfg = DictDefault(
-            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "sample_packing": True,
-                "pad_to_sequence_len": True,
-                "sequence_len": 1024,
-                "val_set_size": 0.01,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                },
-                "datasets": [
-                    {
-                        "path": "tatsu-lab/alpaca",
-                        "type": "alpaca",
-                        "split": "train[:10%]",
-                    },
-                ],
-                "num_epochs": 1,
-                "max_steps": 2,
-                "micro_batch_size": 1,
-                "gradient_accumulation_steps": gradient_accumulation_steps,
-                "output_dir": temp_dir,
-                "dataset_prepared_path": temp_dir + "/last_run_prepared",
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_torch_fused",
-                "lr_scheduler": "cosine",
-                "flash_attention": True,
-                "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"),
-                "use_tensorboard": True,
-                **adapter,
-            }
+        cfg = (
+            DictDefault(
+                {
+                    "pad_to_sequence_len": True,
+                    "num_epochs": 1,
+                    "max_steps": 2,
+                    "micro_batch_size": 1,
+                    "gradient_accumulation_steps": gradient_accumulation_steps,
+                    "output_dir": temp_dir,
+                    "learning_rate": 0.00001,
+                    "optimizer": "adamw_torch_fused",
+                    "lr_scheduler": "cosine",
+                    "flash_attention": True,
+                    "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"),
+                    "use_tensorboard": True,
+                    **adapter,
+                }
+            )
+            | sft_prepared_dataset_alpaca_cfg
        )

         # write cfg to yaml file
@@ -856,7 +858,7 @@ class TestMultiGPULlama:
         )

         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
         )

     @pytest.mark.skip(
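All of the tests refactored above share one composition pattern: per-test overrides live in a small DictDefault literal that is merged with the shared, already-preprocessed sft_prepared_dataset_alpaca_cfg via the | operator, so base_model, tokenizer_config, datasets, dataset_prepared_path, and seed come from the module-scoped fixture. A minimal sketch of that pattern (the test name and assertions are illustrative, not from the diff; the DictDefault import path is assumed):

from axolotl.utils.dict import DictDefault

def test_example_packed(temp_dir, sft_prepared_dataset_alpaca_cfg):
    # hypothetical test: only the per-test knobs are spelled out here;
    # everything else (base_model, datasets, dataset_prepared_path, seed, ...)
    # is inherited from the shared module-scoped fixture
    cfg = (
        DictDefault(
            {
                "output_dir": temp_dir,
                "max_steps": 2,
                "micro_batch_size": 1,
            }
        )
        | sft_prepared_dataset_alpaca_cfg
    )
    # keys present only in the shared cfg survive the merge
    assert cfg.base_model == "HuggingFaceTB/SmolLM2-135M"
    assert cfg.dataset_prepared_path.endswith("/last_run_prepared")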