Compare commits
9 Commits
shared-pre
...
print_venv
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
454eea049f | ||
|
|
5a961ecadf | ||
|
|
b37ddf9778 | ||
|
|
bf38e507fb | ||
|
|
d00bd99279 | ||
|
|
2b41bfe9eb | ||
|
|
5bbbd599b4 | ||
|
|
26c782183d | ||
|
|
8065fed126 |
@@ -22,9 +22,11 @@ RUN apt-get update \
|
|||||||
&& mkdir /root/.conda \
|
&& mkdir /root/.conda \
|
||||||
&& bash Miniconda3-latest-Linux-x86_64.sh -b \
|
&& bash Miniconda3-latest-Linux-x86_64.sh -b \
|
||||||
&& rm -f Miniconda3-latest-Linux-x86_64.sh \
|
&& rm -f Miniconda3-latest-Linux-x86_64.sh \
|
||||||
&& conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}"
|
&& conda create -n "axolotl-py${PYTHON_VERSION}" python="${PYTHON_VERSION}" \
|
||||||
|
&& conda init bash \
|
||||||
|
&& echo "conda activate axolotl-py${PYTHON_VERSION}" >> ~/.bashrc
|
||||||
|
|
||||||
ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
|
ENV PATH="/root/miniconda3/envs/axolotl-py${PYTHON_VERSION}/bin:${PATH}"
|
||||||
|
|
||||||
WORKDIR /workspace
|
WORKDIR /workspace
|
||||||
|
|
||||||
|
|||||||
@@ -22,9 +22,11 @@ RUN apt-get update \
|
|||||||
&& mkdir /root/.conda \
|
&& mkdir /root/.conda \
|
||||||
&& bash Miniconda3-latest-Linux-x86_64.sh -b \
|
&& bash Miniconda3-latest-Linux-x86_64.sh -b \
|
||||||
&& rm -f Miniconda3-latest-Linux-x86_64.sh \
|
&& rm -f Miniconda3-latest-Linux-x86_64.sh \
|
||||||
&& conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}"
|
&& conda create -n "axolotl-py${PYTHON_VERSION}" python="${PYTHON_VERSION}" \
|
||||||
|
&& conda init bash \
|
||||||
|
&& echo "conda activate axolotl-py${PYTHON_VERSION}" >> ~/.bashrc
|
||||||
|
|
||||||
ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
|
ENV PATH="/root/miniconda3/envs/axolotl-py${PYTHON_VERSION}/bin:${PATH}"
|
||||||
|
|
||||||
WORKDIR /workspace
|
WORKDIR /workspace
|
||||||
|
|
||||||
|
|||||||
@@ -22,9 +22,11 @@ RUN apt-get update \
|
|||||||
&& mkdir /root/.conda \
|
&& mkdir /root/.conda \
|
||||||
&& bash Miniconda3-latest-Linux-x86_64.sh -b \
|
&& bash Miniconda3-latest-Linux-x86_64.sh -b \
|
||||||
&& rm -f Miniconda3-latest-Linux-x86_64.sh \
|
&& rm -f Miniconda3-latest-Linux-x86_64.sh \
|
||||||
&& conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}"
|
&& conda create -n "axolotl-py${PYTHON_VERSION}" python="${PYTHON_VERSION}" \
|
||||||
|
&& conda init bash \
|
||||||
|
&& echo "conda activate axolotl-py${PYTHON_VERSION}" >> ~/.bashrc
|
||||||
|
|
||||||
ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
|
ENV PATH="/root/miniconda3/envs/axolotl-py${PYTHON_VERSION}/bin:${PATH}"
|
||||||
|
|
||||||
WORKDIR /workspace
|
WORKDIR /workspace
|
||||||
|
|
||||||
|
|||||||
@@ -51,6 +51,10 @@ description: Frequently asked questions
|
|||||||
> pad_token: "..."
|
> pad_token: "..."
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
|
**Q: `IterableDataset error` or `KeyError: 'input_ids'` when using `preprocess` CLI**
|
||||||
|
|
||||||
|
> A: This happens when you run the `preprocess` CLI with `pretraining_dataset:` set (which causes the `IterableDataset error`) or with `skip_prepare_dataset: true` (which causes the `KeyError: 'input_ids'`). Please use the `axolotl train` CLI directly instead, as these datasets are prepared on demand.
|
||||||
|
|
||||||
### Chat templates
|
### Chat templates
|
||||||
|
|
||||||
**Q: `jinja2.exceptions.UndefinedError: 'dict object' has no attribute 'content' / 'role' / ____`**
|
**Q: `jinja2.exceptions.UndefinedError: 'dict object' has no attribute 'content' / 'role' / ____`**
|
||||||
|
|||||||
@@ -35,6 +35,12 @@ def do_preprocess(cfg: DictDefault, cli_args: PreprocessCliArgs) -> None:
|
|||||||
check_accelerate_default_config()
|
check_accelerate_default_config()
|
||||||
check_user_token()
|
check_user_token()
|
||||||
|
|
||||||
|
for key in ["skip_prepare_dataset", "pretraining_dataset"]:
|
||||||
|
if cfg.get(key):
|
||||||
|
raise ValueError(
|
||||||
|
f"You have set `{key}:`. `preprocess` is not needed. Run the `axolotl train` CLI directly instead."
|
||||||
|
)
|
||||||
|
|
||||||
if not cfg.dataset_prepared_path:
|
if not cfg.dataset_prepared_path:
|
||||||
msg = (
|
msg = (
|
||||||
Fore.RED
|
Fore.RED
|
||||||
|
|||||||
@@ -526,8 +526,9 @@ def merge_datasets(datasets: list[Dataset], cfg: DictDefault) -> Dataset:
|
|||||||
if len(datasets) == 1:
|
if len(datasets) == 1:
|
||||||
ds = datasets[0]
|
ds = datasets[0]
|
||||||
|
|
||||||
# Do not shuffle if curriculum sampling is enabled
|
# Do not shuffle if curriculum sampling is enabled or
|
||||||
if cfg.curriculum_sampling:
|
# shuffle_merged_datasets is disabled
|
||||||
|
if cfg.curriculum_sampling or not cfg.shuffle_merged_datasets:
|
||||||
return ds
|
return ds
|
||||||
|
|
||||||
return ds.shuffle(seed=cfg.seed)
|
return ds.shuffle(seed=cfg.seed)
|
||||||
|
|||||||
@@ -609,6 +609,9 @@ def prepare_opinionated_env(cfg):
|
|||||||
if cfg.qlora_sharded_model_loading:
|
if cfg.qlora_sharded_model_loading:
|
||||||
# model loading is forked after the tokenizer
|
# model loading is forked after the tokenizer
|
||||||
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||||
|
if cfg.sample_packing:
|
||||||
|
# multipack parallel packing sampler defaults to using fork
|
||||||
|
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||||
|
|
||||||
|
|
||||||
def setup_trainer(
|
def setup_trainer(
|
||||||
|
|||||||
@@ -10,7 +10,7 @@ import shutil
|
|||||||
import sys
|
import sys
|
||||||
import tempfile
|
import tempfile
|
||||||
import time
|
import time
|
||||||
from pathlib import Path
|
from pathlib import Path, PosixPath
|
||||||
from typing import Generator
|
from typing import Generator
|
||||||
|
|
||||||
import datasets
|
import datasets
|
||||||
@@ -423,13 +423,9 @@ def temp_dir() -> Generator[str, None, None]:
|
|||||||
shutil.rmtree(_temp_dir)
|
shutil.rmtree(_temp_dir)
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="module")
|
@pytest.fixture(scope="function", autouse=True)
|
||||||
def module_temp_dir() -> Generator[str, None, None]:
|
def unique_triton_cache_dir(temp_dir: str | PosixPath) -> None:
|
||||||
# Create a temporary directory
|
os.environ["TRITON_CACHE_DIR"] = str(temp_dir) + "/.triton/cache"
|
||||||
_temp_dir = tempfile.mkdtemp()
|
|
||||||
yield _temp_dir
|
|
||||||
# Clean up the directory after the test
|
|
||||||
shutil.rmtree(_temp_dir)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="function", autouse=True)
|
@pytest.fixture(scope="function", autouse=True)
|
||||||
|
|||||||
@@ -2,8 +2,6 @@
|
|||||||
E2E tests for multigpu lora tinyllama
|
E2E tests for multigpu lora tinyllama
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# pylint: disable=redefined-outer-name
|
|
||||||
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
@@ -27,60 +25,6 @@ def download_model():
|
|||||||
snapshot_download("HuggingFaceTB/SmolLM2-135M")
|
snapshot_download("HuggingFaceTB/SmolLM2-135M")
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="module")
|
|
||||||
def sft_base_cfg():
|
|
||||||
cfg = DictDefault(
|
|
||||||
base_model="HuggingFaceTB/SmolLM2-135M",
|
|
||||||
tokenizer_config="HuggingFaceTB/SmolLM2-135M", # this has to be manually set since we haven't done validation
|
|
||||||
sequence_len=1024,
|
|
||||||
special_tokens={
|
|
||||||
"pad_token": "<|endoftext|>",
|
|
||||||
},
|
|
||||||
datasets=[
|
|
||||||
{
|
|
||||||
"path": "tatsu-lab/alpaca",
|
|
||||||
"type": "alpaca",
|
|
||||||
"split": "train[:10%]",
|
|
||||||
},
|
|
||||||
],
|
|
||||||
val_set_size=0.1,
|
|
||||||
sample_packing=True,
|
|
||||||
flash_attention=True,
|
|
||||||
learning_rate=0.00001,
|
|
||||||
optimizer="adamw_8bit",
|
|
||||||
seed=42,
|
|
||||||
# these need to be set since we aren't running schema validation
|
|
||||||
micro_batch_size=2,
|
|
||||||
gradient_accumulation_steps=1,
|
|
||||||
)
|
|
||||||
|
|
||||||
return cfg
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="module", name="sft_prepared_dataset_alpaca_cfg")
|
|
||||||
def sft_prepared_dataset_alpaca_cfg(module_temp_dir, sft_base_cfg):
|
|
||||||
dataset_prepared_path = module_temp_dir + "/last_run_prepared"
|
|
||||||
cfg = sft_base_cfg | DictDefault(
|
|
||||||
dataset_prepared_path=dataset_prepared_path,
|
|
||||||
)
|
|
||||||
|
|
||||||
Path(module_temp_dir).mkdir(parents=True, exist_ok=True)
|
|
||||||
with open(Path(module_temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
|
|
||||||
fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))
|
|
||||||
|
|
||||||
execute_subprocess_async(
|
|
||||||
[
|
|
||||||
"axolotl",
|
|
||||||
"preprocess",
|
|
||||||
str(Path(module_temp_dir) / "config.yaml"),
|
|
||||||
]
|
|
||||||
)
|
|
||||||
|
|
||||||
# unset flash attention since we have some flex attention tests too
|
|
||||||
cfg.flash_attention = None
|
|
||||||
return cfg
|
|
||||||
|
|
||||||
|
|
||||||
def transformers_version_eq(required_version):
|
def transformers_version_eq(required_version):
|
||||||
return version.parse(transformers.__version__) == version.parse(required_version)
|
return version.parse(transformers.__version__) == version.parse(required_version)
|
||||||
|
|
||||||
@@ -153,36 +97,45 @@ class TestMultiGPULlama:
|
|||||||
"gradient_accumulation_steps",
|
"gradient_accumulation_steps",
|
||||||
[1, 2],
|
[1, 2],
|
||||||
)
|
)
|
||||||
def test_lora_ddp_packed(
|
def test_lora_ddp_packed(self, temp_dir, gradient_accumulation_steps):
|
||||||
self, temp_dir, sft_prepared_dataset_alpaca_cfg, gradient_accumulation_steps
|
|
||||||
):
|
|
||||||
# pylint: disable=duplicate-code
|
# pylint: disable=duplicate-code
|
||||||
cfg = (
|
cfg = DictDefault(
|
||||||
DictDefault(
|
{
|
||||||
{
|
"base_model": "HuggingFaceTB/SmolLM2-135M",
|
||||||
"eval_sample_packing": False,
|
"sequence_len": 2048,
|
||||||
"pad_to_sequence_len": True,
|
"sample_packing": True,
|
||||||
"adapter": "lora",
|
"eval_sample_packing": False,
|
||||||
"lora_r": 8,
|
"pad_to_sequence_len": True,
|
||||||
"lora_alpha": 16,
|
"adapter": "lora",
|
||||||
"lora_dropout": 0.05,
|
"lora_r": 8,
|
||||||
"lora_target_linear": True,
|
"lora_alpha": 16,
|
||||||
"val_set_size": 0.05,
|
"lora_dropout": 0.05,
|
||||||
"num_epochs": 1,
|
"lora_target_linear": True,
|
||||||
"max_steps": 2,
|
"val_set_size": 0.05,
|
||||||
"micro_batch_size": 1,
|
"special_tokens": {
|
||||||
"gradient_accumulation_steps": gradient_accumulation_steps,
|
"pad_token": "<|endoftext|>",
|
||||||
# "gradient_checkpointing": True,
|
},
|
||||||
"output_dir": temp_dir,
|
"datasets": [
|
||||||
"learning_rate": 0.00001,
|
{
|
||||||
"optimizer": "adamw_8bit",
|
"path": "tatsu-lab/alpaca",
|
||||||
"lr_scheduler": "cosine",
|
"type": "alpaca",
|
||||||
"flash_attention": True,
|
"split": "train[:20%]",
|
||||||
"use_tensorboard": True,
|
},
|
||||||
"bf16": True,
|
],
|
||||||
}
|
"num_epochs": 1,
|
||||||
)
|
"max_steps": 2,
|
||||||
| sft_prepared_dataset_alpaca_cfg
|
"micro_batch_size": 1,
|
||||||
|
"gradient_accumulation_steps": gradient_accumulation_steps,
|
||||||
|
# "gradient_checkpointing": True,
|
||||||
|
"output_dir": temp_dir,
|
||||||
|
"dataset_prepared_path": temp_dir + "/last_run_prepared",
|
||||||
|
"learning_rate": 0.00001,
|
||||||
|
"optimizer": "adamw_8bit",
|
||||||
|
"lr_scheduler": "cosine",
|
||||||
|
"flash_attention": True,
|
||||||
|
"use_tensorboard": True,
|
||||||
|
"bf16": True,
|
||||||
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
# write cfg to yaml file
|
# write cfg to yaml file
|
||||||
@@ -432,50 +385,59 @@ class TestMultiGPULlama:
|
|||||||
)
|
)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"fsdp_state_dict_type",
|
"fsdp_state_dict_type",
|
||||||
["FULL_STATE_DICT", "SHARDED_STATE_DICT"],
|
["FULL_STATE_DICT", "SHARDED_STATE_DICT"],
|
||||||
)
|
)
|
||||||
def test_fsdp_packed(
|
def test_fsdp_packed(self, temp_dir, fsdp_state_dict_type):
|
||||||
self, temp_dir, sft_prepared_dataset_alpaca_cfg, fsdp_state_dict_type
|
|
||||||
):
|
|
||||||
# pylint: disable=duplicate-code
|
# pylint: disable=duplicate-code
|
||||||
cfg = (
|
cfg = DictDefault(
|
||||||
DictDefault(
|
{
|
||||||
{
|
"base_model": "HuggingFaceTB/SmolLM2-135M",
|
||||||
"pad_to_sequence_len": True,
|
"sample_packing": True,
|
||||||
"num_epochs": 1,
|
"pad_to_sequence_len": True,
|
||||||
"max_steps": 2,
|
"sequence_len": 1024,
|
||||||
"micro_batch_size": 2,
|
"val_set_size": 0.05,
|
||||||
"gradient_accumulation_steps": 2,
|
"special_tokens": {
|
||||||
# "gradient_checkpointing": True,
|
"pad_token": "<|endoftext|>",
|
||||||
"output_dir": temp_dir,
|
},
|
||||||
"dataset_prepared_path": temp_dir + "/last_run_prepared",
|
"datasets": [
|
||||||
"learning_rate": 0.00001,
|
{
|
||||||
"optimizer": "adamw_torch_fused",
|
"path": "tatsu-lab/alpaca",
|
||||||
"lr_scheduler": "cosine",
|
"type": "alpaca",
|
||||||
"flash_attention": True,
|
"split": "train[:10%]",
|
||||||
"fsdp": [
|
|
||||||
"full_shard",
|
|
||||||
"auto_wrap",
|
|
||||||
],
|
|
||||||
"fsdp_config": {
|
|
||||||
"fsdp_limit_all_gathers": True,
|
|
||||||
"fsdp_offload_params": False,
|
|
||||||
"fsdp_sync_module_states": True,
|
|
||||||
"fsdp_use_orig_params": False,
|
|
||||||
"fsdp_cpu_ram_efficient_loading": False,
|
|
||||||
"fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
|
|
||||||
"fsdp_state_dict_type": fsdp_state_dict_type,
|
|
||||||
"fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
|
|
||||||
},
|
},
|
||||||
"use_tensorboard": True,
|
],
|
||||||
}
|
"num_epochs": 1,
|
||||||
)
|
"max_steps": 2,
|
||||||
| sft_prepared_dataset_alpaca_cfg
|
"micro_batch_size": 2,
|
||||||
|
"gradient_accumulation_steps": 2,
|
||||||
|
# "gradient_checkpointing": True,
|
||||||
|
"output_dir": temp_dir,
|
||||||
|
"dataset_prepared_path": temp_dir + "/last_run_prepared",
|
||||||
|
"learning_rate": 0.00001,
|
||||||
|
"optimizer": "adamw_torch_fused",
|
||||||
|
"lr_scheduler": "cosine",
|
||||||
|
"flash_attention": True,
|
||||||
|
"fsdp": [
|
||||||
|
"full_shard",
|
||||||
|
"auto_wrap",
|
||||||
|
],
|
||||||
|
"fsdp_config": {
|
||||||
|
"fsdp_limit_all_gathers": True,
|
||||||
|
"fsdp_offload_params": False,
|
||||||
|
"fsdp_sync_module_states": True,
|
||||||
|
"fsdp_use_orig_params": False,
|
||||||
|
"fsdp_cpu_ram_efficient_loading": False,
|
||||||
|
"fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
|
||||||
|
"fsdp_state_dict_type": fsdp_state_dict_type,
|
||||||
|
"fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
|
||||||
|
},
|
||||||
|
"use_tensorboard": True,
|
||||||
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
# write cfg to yaml file
|
# write cfg to yaml file
|
||||||
@@ -496,7 +458,7 @@ class TestMultiGPULlama:
|
|||||||
)
|
)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 2.4, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|
||||||
@require_torch_2_6_0
|
@require_torch_2_6_0
|
||||||
@@ -509,43 +471,51 @@ class TestMultiGPULlama:
|
|||||||
[True, False],
|
[True, False],
|
||||||
)
|
)
|
||||||
def test_fsdp2_packed(
|
def test_fsdp2_packed(
|
||||||
self,
|
self, temp_dir, attention_backend, fsdp_reshard_after_forward
|
||||||
temp_dir,
|
|
||||||
sft_prepared_dataset_alpaca_cfg,
|
|
||||||
attention_backend,
|
|
||||||
fsdp_reshard_after_forward,
|
|
||||||
):
|
):
|
||||||
# pylint: disable=duplicate-code
|
# pylint: disable=duplicate-code
|
||||||
cfg = (
|
cfg = DictDefault(
|
||||||
DictDefault(
|
{
|
||||||
{
|
"base_model": "HuggingFaceTB/SmolLM2-135M",
|
||||||
"pad_to_sequence_len": True,
|
"sample_packing": True,
|
||||||
"num_epochs": 1,
|
"pad_to_sequence_len": True,
|
||||||
"max_steps": 2,
|
"sequence_len": 2048,
|
||||||
"micro_batch_size": 4,
|
"val_set_size": 0.1,
|
||||||
"gradient_accumulation_steps": 2,
|
"special_tokens": {
|
||||||
"gradient_checkpointing": True,
|
"pad_token": "<|endoftext|>",
|
||||||
"output_dir": temp_dir,
|
},
|
||||||
"learning_rate": 0.00001,
|
"datasets": [
|
||||||
"optimizer": "adamw_torch_8bit",
|
{
|
||||||
"lr_scheduler": "cosine",
|
"path": "tatsu-lab/alpaca",
|
||||||
"fsdp": [
|
"type": "alpaca",
|
||||||
"auto_wrap",
|
"split": "train[:10%]",
|
||||||
],
|
|
||||||
"fsdp_config": {
|
|
||||||
"fsdp_version": 2,
|
|
||||||
# "fsdp_forward_prefetch": True, # not yet implemented in accelerate
|
|
||||||
"fsdp_offload_params": False,
|
|
||||||
"fsdp_cpu_ram_efficient_loading": False,
|
|
||||||
"fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
|
|
||||||
"fsdp_state_dict_type": "SHARDED_STATE_DICT",
|
|
||||||
"fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
|
|
||||||
"fsdp_reshard_after_forward": fsdp_reshard_after_forward,
|
|
||||||
},
|
},
|
||||||
"use_tensorboard": True,
|
],
|
||||||
}
|
"num_epochs": 1,
|
||||||
)
|
"max_steps": 2,
|
||||||
| sft_prepared_dataset_alpaca_cfg
|
"micro_batch_size": 4,
|
||||||
|
"gradient_accumulation_steps": 2,
|
||||||
|
"gradient_checkpointing": True,
|
||||||
|
"output_dir": temp_dir,
|
||||||
|
"dataset_prepared_path": temp_dir + "/last_run_prepared",
|
||||||
|
"learning_rate": 0.00001,
|
||||||
|
"optimizer": "adamw_torch_8bit",
|
||||||
|
"lr_scheduler": "cosine",
|
||||||
|
"fsdp": [
|
||||||
|
"auto_wrap",
|
||||||
|
],
|
||||||
|
"fsdp_config": {
|
||||||
|
"fsdp_version": 2,
|
||||||
|
# "fsdp_forward_prefetch": True, # not yet implemented in accelerate
|
||||||
|
"fsdp_offload_params": False,
|
||||||
|
"fsdp_cpu_ram_efficient_loading": False,
|
||||||
|
"fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
|
||||||
|
"fsdp_state_dict_type": "SHARDED_STATE_DICT",
|
||||||
|
"fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
|
||||||
|
"fsdp_reshard_after_forward": fsdp_reshard_after_forward,
|
||||||
|
},
|
||||||
|
"use_tensorboard": True,
|
||||||
|
}
|
||||||
)
|
)
|
||||||
if attention_backend == "flash":
|
if attention_backend == "flash":
|
||||||
cfg.flash_attention = True
|
cfg.flash_attention = True
|
||||||
@@ -573,55 +543,64 @@ class TestMultiGPULlama:
|
|||||||
temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|
||||||
def test_fsdp_qlora_prequant_packed(
|
def test_fsdp_qlora_prequant_packed(self, temp_dir):
|
||||||
self, temp_dir, sft_prepared_dataset_alpaca_cfg
|
|
||||||
):
|
|
||||||
# pylint: disable=duplicate-code
|
# pylint: disable=duplicate-code
|
||||||
cfg = (
|
cfg = DictDefault(
|
||||||
DictDefault(
|
{
|
||||||
{
|
"base_model": "axolotl-ai-co/SmolLM2-135M-bnb-nf4-bf16",
|
||||||
"base_model": "axolotl-ai-co/SmolLM2-135M-bnb-nf4-bf16",
|
"adapter": "qlora",
|
||||||
"adapter": "qlora",
|
"mean_resizing_embeddings": True,
|
||||||
"mean_resizing_embeddings": True,
|
"load_in_4bit": True,
|
||||||
"load_in_4bit": True,
|
"lora_r": 8,
|
||||||
"lora_r": 8,
|
"lora_alpha": 16,
|
||||||
"lora_alpha": 16,
|
"lora_dropout": 0.05,
|
||||||
"lora_dropout": 0.05,
|
"lora_target_linear": True,
|
||||||
"lora_target_linear": True,
|
# "lora_modules_to_save": [
|
||||||
# "lora_modules_to_save": [
|
# "embed_tokens",
|
||||||
# "embed_tokens",
|
# "lm_head",
|
||||||
# "lm_head",
|
# ],
|
||||||
# ],
|
"sample_packing": True,
|
||||||
"eval_sample_packing": False,
|
"eval_sample_packing": False,
|
||||||
"pad_to_sequence_len": True,
|
"pad_to_sequence_len": True,
|
||||||
"num_epochs": 1,
|
"sequence_len": 1024,
|
||||||
"max_steps": 2,
|
"val_set_size": 0.01,
|
||||||
"micro_batch_size": 2,
|
"special_tokens": {
|
||||||
"gradient_accumulation_steps": 2,
|
"pad_token": "<|endoftext|>",
|
||||||
# "gradient_checkpointing": True,
|
},
|
||||||
"output_dir": temp_dir,
|
"datasets": [
|
||||||
"learning_rate": 0.00001,
|
{
|
||||||
"optimizer": "adamw_torch_fused",
|
"path": "tatsu-lab/alpaca",
|
||||||
"lr_scheduler": "cosine",
|
"type": "alpaca",
|
||||||
"flash_attention": True,
|
"split": "train[:10%]",
|
||||||
"fsdp": [
|
|
||||||
"full_shard",
|
|
||||||
"auto_wrap",
|
|
||||||
],
|
|
||||||
"fsdp_config": {
|
|
||||||
"fsdp_limit_all_gathers": True,
|
|
||||||
"fsdp_offload_params": False,
|
|
||||||
"fsdp_sync_module_states": True,
|
|
||||||
"fsdp_use_orig_params": False,
|
|
||||||
"fsdp_cpu_ram_efficient_loading": True,
|
|
||||||
"fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
|
|
||||||
"fsdp_state_dict_type": "SHARDED_STATE_DICT",
|
|
||||||
"fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
|
|
||||||
},
|
},
|
||||||
"use_tensorboard": True,
|
],
|
||||||
}
|
"num_epochs": 1,
|
||||||
)
|
"max_steps": 2,
|
||||||
| sft_prepared_dataset_alpaca_cfg
|
"micro_batch_size": 2,
|
||||||
|
"gradient_accumulation_steps": 2,
|
||||||
|
# "gradient_checkpointing": True,
|
||||||
|
"output_dir": temp_dir,
|
||||||
|
"dataset_prepared_path": temp_dir + "/last_run_prepared",
|
||||||
|
"learning_rate": 0.00001,
|
||||||
|
"optimizer": "adamw_torch_fused",
|
||||||
|
"lr_scheduler": "cosine",
|
||||||
|
"flash_attention": True,
|
||||||
|
"fsdp": [
|
||||||
|
"full_shard",
|
||||||
|
"auto_wrap",
|
||||||
|
],
|
||||||
|
"fsdp_config": {
|
||||||
|
"fsdp_limit_all_gathers": True,
|
||||||
|
"fsdp_offload_params": False,
|
||||||
|
"fsdp_sync_module_states": True,
|
||||||
|
"fsdp_use_orig_params": False,
|
||||||
|
"fsdp_cpu_ram_efficient_loading": True,
|
||||||
|
"fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
|
||||||
|
"fsdp_state_dict_type": "SHARDED_STATE_DICT",
|
||||||
|
"fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
|
||||||
|
},
|
||||||
|
"use_tensorboard": True,
|
||||||
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
# write cfg to yaml file
|
# write cfg to yaml file
|
||||||
@@ -662,12 +641,7 @@ class TestMultiGPULlama:
|
|||||||
[True, False],
|
[True, False],
|
||||||
)
|
)
|
||||||
def test_ds_zero3_packed(
|
def test_ds_zero3_packed(
|
||||||
self,
|
self, temp_dir, gradient_accumulation_steps, deepspeed, qlora
|
||||||
temp_dir,
|
|
||||||
sft_prepared_dataset_alpaca_cfg,
|
|
||||||
gradient_accumulation_steps,
|
|
||||||
deepspeed,
|
|
||||||
qlora,
|
|
||||||
):
|
):
|
||||||
# pylint: disable=duplicate-code
|
# pylint: disable=duplicate-code
|
||||||
if qlora:
|
if qlora:
|
||||||
@@ -681,25 +655,37 @@ class TestMultiGPULlama:
|
|||||||
}
|
}
|
||||||
else:
|
else:
|
||||||
adapter = {}
|
adapter = {}
|
||||||
cfg = (
|
cfg = DictDefault(
|
||||||
DictDefault(
|
{
|
||||||
{
|
"base_model": "HuggingFaceTB/SmolLM2-135M",
|
||||||
"pad_to_sequence_len": True,
|
"sample_packing": True,
|
||||||
"num_epochs": 1,
|
"pad_to_sequence_len": True,
|
||||||
"max_steps": 2,
|
"sequence_len": 1024,
|
||||||
"micro_batch_size": 1,
|
"val_set_size": 0.05,
|
||||||
"gradient_accumulation_steps": gradient_accumulation_steps,
|
"special_tokens": {
|
||||||
"output_dir": temp_dir,
|
"pad_token": "<|endoftext|>",
|
||||||
"learning_rate": 0.00001,
|
},
|
||||||
"optimizer": "adamw_torch_fused",
|
"datasets": [
|
||||||
"lr_scheduler": "cosine",
|
{
|
||||||
"flash_attention": True,
|
"path": "tatsu-lab/alpaca",
|
||||||
"deepspeed": str(AXOLOTL_ROOT / deepspeed),
|
"type": "alpaca",
|
||||||
"use_tensorboard": True,
|
"split": "train[:10%]",
|
||||||
**adapter,
|
},
|
||||||
}
|
],
|
||||||
)
|
"num_epochs": 1,
|
||||||
| sft_prepared_dataset_alpaca_cfg
|
"max_steps": 2,
|
||||||
|
"micro_batch_size": 1,
|
||||||
|
"gradient_accumulation_steps": gradient_accumulation_steps,
|
||||||
|
"output_dir": temp_dir,
|
||||||
|
"dataset_prepared_path": temp_dir + "/last_run_prepared",
|
||||||
|
"learning_rate": 0.00001,
|
||||||
|
"optimizer": "adamw_torch_fused",
|
||||||
|
"lr_scheduler": "cosine",
|
||||||
|
"flash_attention": True,
|
||||||
|
"deepspeed": str(AXOLOTL_ROOT / deepspeed),
|
||||||
|
"use_tensorboard": True,
|
||||||
|
**adapter,
|
||||||
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
# write cfg to yaml file
|
# write cfg to yaml file
|
||||||
@@ -720,7 +706,7 @@ class TestMultiGPULlama:
|
|||||||
)
|
)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/train_loss", 2.4, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
@@ -731,13 +717,7 @@ class TestMultiGPULlama:
|
|||||||
"qlora",
|
"qlora",
|
||||||
[True, False],
|
[True, False],
|
||||||
)
|
)
|
||||||
def test_ds_zero2_packed(
|
def test_ds_zero2_packed(self, temp_dir, gradient_accumulation_steps, qlora):
|
||||||
self,
|
|
||||||
temp_dir,
|
|
||||||
sft_prepared_dataset_alpaca_cfg,
|
|
||||||
gradient_accumulation_steps,
|
|
||||||
qlora,
|
|
||||||
):
|
|
||||||
# pylint: disable=duplicate-code
|
# pylint: disable=duplicate-code
|
||||||
if qlora:
|
if qlora:
|
||||||
adapter = {
|
adapter = {
|
||||||
@@ -750,25 +730,37 @@ class TestMultiGPULlama:
|
|||||||
}
|
}
|
||||||
else:
|
else:
|
||||||
adapter = {}
|
adapter = {}
|
||||||
cfg = (
|
cfg = DictDefault(
|
||||||
DictDefault(
|
{
|
||||||
{
|
"base_model": "HuggingFaceTB/SmolLM2-135M",
|
||||||
"pad_to_sequence_len": True,
|
"sample_packing": True,
|
||||||
"num_epochs": 1,
|
"pad_to_sequence_len": True,
|
||||||
"max_steps": 2,
|
"sequence_len": 1024,
|
||||||
"micro_batch_size": 1,
|
"val_set_size": 0.01,
|
||||||
"gradient_accumulation_steps": gradient_accumulation_steps,
|
"special_tokens": {
|
||||||
"output_dir": temp_dir,
|
"pad_token": "<|endoftext|>",
|
||||||
"learning_rate": 0.00001,
|
},
|
||||||
"optimizer": "adamw_torch_fused",
|
"datasets": [
|
||||||
"lr_scheduler": "cosine",
|
{
|
||||||
"flash_attention": True,
|
"path": "tatsu-lab/alpaca",
|
||||||
"deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero2.json"),
|
"type": "alpaca",
|
||||||
"use_tensorboard": True,
|
"split": "train[:10%]",
|
||||||
**adapter,
|
},
|
||||||
}
|
],
|
||||||
)
|
"num_epochs": 1,
|
||||||
| sft_prepared_dataset_alpaca_cfg
|
"max_steps": 2,
|
||||||
|
"micro_batch_size": 1,
|
||||||
|
"gradient_accumulation_steps": gradient_accumulation_steps,
|
||||||
|
"output_dir": temp_dir,
|
||||||
|
"dataset_prepared_path": temp_dir + "/last_run_prepared",
|
||||||
|
"learning_rate": 0.00001,
|
||||||
|
"optimizer": "adamw_torch_fused",
|
||||||
|
"lr_scheduler": "cosine",
|
||||||
|
"flash_attention": True,
|
||||||
|
"deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero2.json"),
|
||||||
|
"use_tensorboard": True,
|
||||||
|
**adapter,
|
||||||
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
# write cfg to yaml file
|
# write cfg to yaml file
|
||||||
@@ -789,7 +781,7 @@ class TestMultiGPULlama:
|
|||||||
)
|
)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
@@ -800,13 +792,7 @@ class TestMultiGPULlama:
|
|||||||
"qlora",
|
"qlora",
|
||||||
[True, False],
|
[True, False],
|
||||||
)
|
)
|
||||||
def test_ds_zero1_packed(
|
def test_ds_zero1_packed(self, temp_dir, gradient_accumulation_steps, qlora):
|
||||||
self,
|
|
||||||
temp_dir,
|
|
||||||
sft_prepared_dataset_alpaca_cfg,
|
|
||||||
gradient_accumulation_steps,
|
|
||||||
qlora,
|
|
||||||
):
|
|
||||||
# pylint: disable=duplicate-code
|
# pylint: disable=duplicate-code
|
||||||
if qlora:
|
if qlora:
|
||||||
adapter = {
|
adapter = {
|
||||||
@@ -819,25 +805,37 @@ class TestMultiGPULlama:
|
|||||||
}
|
}
|
||||||
else:
|
else:
|
||||||
adapter = {}
|
adapter = {}
|
||||||
cfg = (
|
cfg = DictDefault(
|
||||||
DictDefault(
|
{
|
||||||
{
|
"base_model": "HuggingFaceTB/SmolLM2-135M",
|
||||||
"pad_to_sequence_len": True,
|
"sample_packing": True,
|
||||||
"num_epochs": 1,
|
"pad_to_sequence_len": True,
|
||||||
"max_steps": 2,
|
"sequence_len": 1024,
|
||||||
"micro_batch_size": 1,
|
"val_set_size": 0.01,
|
||||||
"gradient_accumulation_steps": gradient_accumulation_steps,
|
"special_tokens": {
|
||||||
"output_dir": temp_dir,
|
"pad_token": "<|endoftext|>",
|
||||||
"learning_rate": 0.00001,
|
},
|
||||||
"optimizer": "adamw_torch_fused",
|
"datasets": [
|
||||||
"lr_scheduler": "cosine",
|
{
|
||||||
"flash_attention": True,
|
"path": "tatsu-lab/alpaca",
|
||||||
"deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"),
|
"type": "alpaca",
|
||||||
"use_tensorboard": True,
|
"split": "train[:10%]",
|
||||||
**adapter,
|
},
|
||||||
}
|
],
|
||||||
)
|
"num_epochs": 1,
|
||||||
| sft_prepared_dataset_alpaca_cfg
|
"max_steps": 2,
|
||||||
|
"micro_batch_size": 1,
|
||||||
|
"gradient_accumulation_steps": gradient_accumulation_steps,
|
||||||
|
"output_dir": temp_dir,
|
||||||
|
"dataset_prepared_path": temp_dir + "/last_run_prepared",
|
||||||
|
"learning_rate": 0.00001,
|
||||||
|
"optimizer": "adamw_torch_fused",
|
||||||
|
"lr_scheduler": "cosine",
|
||||||
|
"flash_attention": True,
|
||||||
|
"deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"),
|
||||||
|
"use_tensorboard": True,
|
||||||
|
**adapter,
|
||||||
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
# write cfg to yaml file
|
# write cfg to yaml file
|
||||||
@@ -858,7 +856,7 @@ class TestMultiGPULlama:
|
|||||||
)
|
)
|
||||||
|
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|
||||||
@pytest.mark.skip(
|
@pytest.mark.skip(
|
||||||
|
|||||||
Reference in New Issue
Block a user