Compare commits

..

9 Commits

Author SHA1 Message Date
salman
454eea049f Merge branch 'main' into print_venv 2025-07-07 10:01:00 +01:00
NanoCode012
5a961ecadf Fix: do not call preprocess in multimodal or pretraining case (#2861)
* fix: let users know to not call preprocess for vision mode

* fix: improve ux for pretraining dataset and skip prepare ds

* feat: add info to doc

* Update src/axolotl/cli/preprocess.py following comment

Co-authored-by: salman <salman.mohammadi@outlook.com>

---------

Co-authored-by: salman <salman.mohammadi@outlook.com>
2025-07-06 21:55:33 -04:00
Wing Lian
b37ddf9778 don't use tokenizer parallelism when using packing (#2862) [skip ci] 2025-07-06 21:55:09 -04:00
Wing Lian
bf38e507fb respect shuffle_merged_datasets for single dataset too (#2866) [skip ci]
* respect shuffle_merged_datasets for single dataset too

* update inline comment for behavior

Co-authored-by: NanoCode012 <nano@axolotl.ai>

---------

Co-authored-by: NanoCode012 <nano@axolotl.ai>
2025-07-06 21:20:41 -04:00
Salman Mohammadi
d00bd99279 Merge branch 'print_venv' of github.com:axolotl-ai-cloud/axolotl into print_venv 2025-07-04 12:44:49 +01:00
Salman Mohammadi
2b41bfe9eb reverting 2025-07-04 12:40:58 +01:00
salman
5bbbd599b4 Merge branch 'main' into print_venv 2025-07-04 12:36:13 +01:00
Salman Mohammadi
26c782183d merging commands 2025-07-04 12:35:20 +01:00
Salman Mohammadi
8065fed126 adding venv to prompt 2025-07-02 15:27:42 +01:00
9 changed files with 313 additions and 299 deletions

View File

@@ -22,9 +22,11 @@ RUN apt-get update \
&& mkdir /root/.conda \ && mkdir /root/.conda \
&& bash Miniconda3-latest-Linux-x86_64.sh -b \ && bash Miniconda3-latest-Linux-x86_64.sh -b \
&& rm -f Miniconda3-latest-Linux-x86_64.sh \ && rm -f Miniconda3-latest-Linux-x86_64.sh \
&& conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}" && conda create -n "axolotl-py${PYTHON_VERSION}" python="${PYTHON_VERSION}" \
&& conda init bash \
&& echo "conda activate axolotl-py${PYTHON_VERSION}" >> ~/.bashrc
ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}" ENV PATH="/root/miniconda3/envs/axolotl-py${PYTHON_VERSION}/bin:${PATH}"
WORKDIR /workspace WORKDIR /workspace

View File

@@ -22,9 +22,11 @@ RUN apt-get update \
&& mkdir /root/.conda \ && mkdir /root/.conda \
&& bash Miniconda3-latest-Linux-x86_64.sh -b \ && bash Miniconda3-latest-Linux-x86_64.sh -b \
&& rm -f Miniconda3-latest-Linux-x86_64.sh \ && rm -f Miniconda3-latest-Linux-x86_64.sh \
&& conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}" && conda create -n "axolotl-py${PYTHON_VERSION}" python="${PYTHON_VERSION}" \
&& conda init bash \
&& echo "conda activate axolotl-py${PYTHON_VERSION}" >> ~/.bashrc
ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}" ENV PATH="/root/miniconda3/envs/axolotl-py${PYTHON_VERSION}/bin:${PATH}"
WORKDIR /workspace WORKDIR /workspace

View File

@@ -22,9 +22,11 @@ RUN apt-get update \
&& mkdir /root/.conda \ && mkdir /root/.conda \
&& bash Miniconda3-latest-Linux-x86_64.sh -b \ && bash Miniconda3-latest-Linux-x86_64.sh -b \
&& rm -f Miniconda3-latest-Linux-x86_64.sh \ && rm -f Miniconda3-latest-Linux-x86_64.sh \
&& conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}" && conda create -n "axolotl-py${PYTHON_VERSION}" python="${PYTHON_VERSION}" \
&& conda init bash \
&& echo "conda activate axolotl-py${PYTHON_VERSION}" >> ~/.bashrc
ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}" ENV PATH="/root/miniconda3/envs/axolotl-py${PYTHON_VERSION}/bin:${PATH}"
WORKDIR /workspace WORKDIR /workspace

View File

@@ -51,6 +51,10 @@ description: Frequently asked questions
> pad_token: "..." > pad_token: "..."
> ``` > ```
**Q: `IterableDataset error` or `KeyError: 'input_ids'` when using `preprocess` CLI**
> A: This happens when you run the `preprocess` CLI with `pretraining_dataset:` set (which causes the `IterableDataset error`) or with `skip_prepare_dataset: true` (which causes `KeyError: 'input_ids'`). Please use the `axolotl train` CLI directly instead, as these datasets are prepared on demand.
### Chat templates ### Chat templates
**Q: `jinja2.exceptions.UndefinedError: 'dict object' has no attribute 'content' / 'role' / ____`** **Q: `jinja2.exceptions.UndefinedError: 'dict object' has no attribute 'content' / 'role' / ____`**

View File

@@ -35,6 +35,12 @@ def do_preprocess(cfg: DictDefault, cli_args: PreprocessCliArgs) -> None:
check_accelerate_default_config() check_accelerate_default_config()
check_user_token() check_user_token()
# Fail early when the config uses an option for which a separate
# `preprocess` run is not needed (see the error message below).
for key in ["skip_prepare_dataset", "pretraining_dataset"]:
    # Look up the option named by the loop variable, not the literal
    # string "key", so that each listed option is actually checked.
    if cfg.get(key):
        raise ValueError(
            f"You have set `{key}:`. `preprocess` is not needed. Run the `axolotl train` CLI directly instead."
        )
if not cfg.dataset_prepared_path: if not cfg.dataset_prepared_path:
msg = ( msg = (
Fore.RED Fore.RED

View File

@@ -526,8 +526,9 @@ def merge_datasets(datasets: list[Dataset], cfg: DictDefault) -> Dataset:
if len(datasets) == 1: if len(datasets) == 1:
ds = datasets[0] ds = datasets[0]
# Do not shuffle if curriculum sampling is enabled # Do not shuffle if curriculum sampling is enabled or
if cfg.curriculum_sampling: # shuffle_merged_datasets is disabled
if cfg.curriculum_sampling or not cfg.shuffle_merged_datasets:
return ds return ds
return ds.shuffle(seed=cfg.seed) return ds.shuffle(seed=cfg.seed)

View File

@@ -609,6 +609,9 @@ def prepare_opinionated_env(cfg):
if cfg.qlora_sharded_model_loading: if cfg.qlora_sharded_model_loading:
# model loading is forked after the tokenizer # model loading is forked after the tokenizer
os.environ["TOKENIZERS_PARALLELISM"] = "false" os.environ["TOKENIZERS_PARALLELISM"] = "false"
if cfg.sample_packing:
# multipack parallel packing sampler defaults to using fork
os.environ["TOKENIZERS_PARALLELISM"] = "false"
def setup_trainer( def setup_trainer(

View File

@@ -10,7 +10,7 @@ import shutil
import sys import sys
import tempfile import tempfile
import time import time
from pathlib import Path from pathlib import Path, PosixPath
from typing import Generator from typing import Generator
import datasets import datasets
@@ -423,13 +423,9 @@ def temp_dir() -> Generator[str, None, None]:
shutil.rmtree(_temp_dir) shutil.rmtree(_temp_dir)
@pytest.fixture(scope="module") @pytest.fixture(scope="function", autouse=True)
def module_temp_dir() -> Generator[str, None, None]: def unique_triton_cache_dir(temp_dir: str | PosixPath) -> None:
# Create a temporary directory os.environ["TRITON_CACHE_DIR"] = str(temp_dir) + "/.triton/cache"
_temp_dir = tempfile.mkdtemp()
yield _temp_dir
# Clean up the directory after the test
shutil.rmtree(_temp_dir)
@pytest.fixture(scope="function", autouse=True) @pytest.fixture(scope="function", autouse=True)

View File

@@ -2,8 +2,6 @@
E2E tests for multigpu lora tinyllama E2E tests for multigpu lora tinyllama
""" """
# pylint: disable=redefined-outer-name
from pathlib import Path from pathlib import Path
import pytest import pytest
@@ -27,60 +25,6 @@ def download_model():
snapshot_download("HuggingFaceTB/SmolLM2-135M") snapshot_download("HuggingFaceTB/SmolLM2-135M")
@pytest.fixture(scope="module")
def sft_base_cfg():
cfg = DictDefault(
base_model="HuggingFaceTB/SmolLM2-135M",
tokenizer_config="HuggingFaceTB/SmolLM2-135M", # this has to be manually set since we haven't done validation
sequence_len=1024,
special_tokens={
"pad_token": "<|endoftext|>",
},
datasets=[
{
"path": "tatsu-lab/alpaca",
"type": "alpaca",
"split": "train[:10%]",
},
],
val_set_size=0.1,
sample_packing=True,
flash_attention=True,
learning_rate=0.00001,
optimizer="adamw_8bit",
seed=42,
# these need to be set since we aren't running schema validation
micro_batch_size=2,
gradient_accumulation_steps=1,
)
return cfg
@pytest.fixture(scope="module", name="sft_prepared_dataset_alpaca_cfg")
def sft_prepared_dataset_alpaca_cfg(module_temp_dir, sft_base_cfg):
dataset_prepared_path = module_temp_dir + "/last_run_prepared"
cfg = sft_base_cfg | DictDefault(
dataset_prepared_path=dataset_prepared_path,
)
Path(module_temp_dir).mkdir(parents=True, exist_ok=True)
with open(Path(module_temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))
execute_subprocess_async(
[
"axolotl",
"preprocess",
str(Path(module_temp_dir) / "config.yaml"),
]
)
# unset flash attention since we have some flex attention tests too
cfg.flash_attention = None
return cfg
def transformers_version_eq(required_version): def transformers_version_eq(required_version):
return version.parse(transformers.__version__) == version.parse(required_version) return version.parse(transformers.__version__) == version.parse(required_version)
@@ -153,36 +97,45 @@ class TestMultiGPULlama:
"gradient_accumulation_steps", "gradient_accumulation_steps",
[1, 2], [1, 2],
) )
def test_lora_ddp_packed( def test_lora_ddp_packed(self, temp_dir, gradient_accumulation_steps):
self, temp_dir, sft_prepared_dataset_alpaca_cfg, gradient_accumulation_steps
):
# pylint: disable=duplicate-code # pylint: disable=duplicate-code
cfg = ( cfg = DictDefault(
DictDefault( {
{ "base_model": "HuggingFaceTB/SmolLM2-135M",
"eval_sample_packing": False, "sequence_len": 2048,
"pad_to_sequence_len": True, "sample_packing": True,
"adapter": "lora", "eval_sample_packing": False,
"lora_r": 8, "pad_to_sequence_len": True,
"lora_alpha": 16, "adapter": "lora",
"lora_dropout": 0.05, "lora_r": 8,
"lora_target_linear": True, "lora_alpha": 16,
"val_set_size": 0.05, "lora_dropout": 0.05,
"num_epochs": 1, "lora_target_linear": True,
"max_steps": 2, "val_set_size": 0.05,
"micro_batch_size": 1, "special_tokens": {
"gradient_accumulation_steps": gradient_accumulation_steps, "pad_token": "<|endoftext|>",
# "gradient_checkpointing": True, },
"output_dir": temp_dir, "datasets": [
"learning_rate": 0.00001, {
"optimizer": "adamw_8bit", "path": "tatsu-lab/alpaca",
"lr_scheduler": "cosine", "type": "alpaca",
"flash_attention": True, "split": "train[:20%]",
"use_tensorboard": True, },
"bf16": True, ],
} "num_epochs": 1,
) "max_steps": 2,
| sft_prepared_dataset_alpaca_cfg "micro_batch_size": 1,
"gradient_accumulation_steps": gradient_accumulation_steps,
# "gradient_checkpointing": True,
"output_dir": temp_dir,
"dataset_prepared_path": temp_dir + "/last_run_prepared",
"learning_rate": 0.00001,
"optimizer": "adamw_8bit",
"lr_scheduler": "cosine",
"flash_attention": True,
"use_tensorboard": True,
"bf16": True,
}
) )
# write cfg to yaml file # write cfg to yaml file
@@ -432,50 +385,59 @@ class TestMultiGPULlama:
) )
check_tensorboard( check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high" temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
) )
@pytest.mark.parametrize( @pytest.mark.parametrize(
"fsdp_state_dict_type", "fsdp_state_dict_type",
["FULL_STATE_DICT", "SHARDED_STATE_DICT"], ["FULL_STATE_DICT", "SHARDED_STATE_DICT"],
) )
def test_fsdp_packed( def test_fsdp_packed(self, temp_dir, fsdp_state_dict_type):
self, temp_dir, sft_prepared_dataset_alpaca_cfg, fsdp_state_dict_type
):
# pylint: disable=duplicate-code # pylint: disable=duplicate-code
cfg = ( cfg = DictDefault(
DictDefault( {
{ "base_model": "HuggingFaceTB/SmolLM2-135M",
"pad_to_sequence_len": True, "sample_packing": True,
"num_epochs": 1, "pad_to_sequence_len": True,
"max_steps": 2, "sequence_len": 1024,
"micro_batch_size": 2, "val_set_size": 0.05,
"gradient_accumulation_steps": 2, "special_tokens": {
# "gradient_checkpointing": True, "pad_token": "<|endoftext|>",
"output_dir": temp_dir, },
"dataset_prepared_path": temp_dir + "/last_run_prepared", "datasets": [
"learning_rate": 0.00001, {
"optimizer": "adamw_torch_fused", "path": "tatsu-lab/alpaca",
"lr_scheduler": "cosine", "type": "alpaca",
"flash_attention": True, "split": "train[:10%]",
"fsdp": [
"full_shard",
"auto_wrap",
],
"fsdp_config": {
"fsdp_limit_all_gathers": True,
"fsdp_offload_params": False,
"fsdp_sync_module_states": True,
"fsdp_use_orig_params": False,
"fsdp_cpu_ram_efficient_loading": False,
"fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
"fsdp_state_dict_type": fsdp_state_dict_type,
"fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
}, },
"use_tensorboard": True, ],
} "num_epochs": 1,
) "max_steps": 2,
| sft_prepared_dataset_alpaca_cfg "micro_batch_size": 2,
"gradient_accumulation_steps": 2,
# "gradient_checkpointing": True,
"output_dir": temp_dir,
"dataset_prepared_path": temp_dir + "/last_run_prepared",
"learning_rate": 0.00001,
"optimizer": "adamw_torch_fused",
"lr_scheduler": "cosine",
"flash_attention": True,
"fsdp": [
"full_shard",
"auto_wrap",
],
"fsdp_config": {
"fsdp_limit_all_gathers": True,
"fsdp_offload_params": False,
"fsdp_sync_module_states": True,
"fsdp_use_orig_params": False,
"fsdp_cpu_ram_efficient_loading": False,
"fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
"fsdp_state_dict_type": fsdp_state_dict_type,
"fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
},
"use_tensorboard": True,
}
) )
# write cfg to yaml file # write cfg to yaml file
@@ -496,7 +458,7 @@ class TestMultiGPULlama:
) )
check_tensorboard( check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.4, "Train Loss (%s) is too high" temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
) )
@require_torch_2_6_0 @require_torch_2_6_0
@@ -509,43 +471,51 @@ class TestMultiGPULlama:
[True, False], [True, False],
) )
def test_fsdp2_packed( def test_fsdp2_packed(
self, self, temp_dir, attention_backend, fsdp_reshard_after_forward
temp_dir,
sft_prepared_dataset_alpaca_cfg,
attention_backend,
fsdp_reshard_after_forward,
): ):
# pylint: disable=duplicate-code # pylint: disable=duplicate-code
cfg = ( cfg = DictDefault(
DictDefault( {
{ "base_model": "HuggingFaceTB/SmolLM2-135M",
"pad_to_sequence_len": True, "sample_packing": True,
"num_epochs": 1, "pad_to_sequence_len": True,
"max_steps": 2, "sequence_len": 2048,
"micro_batch_size": 4, "val_set_size": 0.1,
"gradient_accumulation_steps": 2, "special_tokens": {
"gradient_checkpointing": True, "pad_token": "<|endoftext|>",
"output_dir": temp_dir, },
"learning_rate": 0.00001, "datasets": [
"optimizer": "adamw_torch_8bit", {
"lr_scheduler": "cosine", "path": "tatsu-lab/alpaca",
"fsdp": [ "type": "alpaca",
"auto_wrap", "split": "train[:10%]",
],
"fsdp_config": {
"fsdp_version": 2,
# "fsdp_forward_prefetch": True, # not yet implemented in accelerate
"fsdp_offload_params": False,
"fsdp_cpu_ram_efficient_loading": False,
"fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
"fsdp_state_dict_type": "SHARDED_STATE_DICT",
"fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
"fsdp_reshard_after_forward": fsdp_reshard_after_forward,
}, },
"use_tensorboard": True, ],
} "num_epochs": 1,
) "max_steps": 2,
| sft_prepared_dataset_alpaca_cfg "micro_batch_size": 4,
"gradient_accumulation_steps": 2,
"gradient_checkpointing": True,
"output_dir": temp_dir,
"dataset_prepared_path": temp_dir + "/last_run_prepared",
"learning_rate": 0.00001,
"optimizer": "adamw_torch_8bit",
"lr_scheduler": "cosine",
"fsdp": [
"auto_wrap",
],
"fsdp_config": {
"fsdp_version": 2,
# "fsdp_forward_prefetch": True, # not yet implemented in accelerate
"fsdp_offload_params": False,
"fsdp_cpu_ram_efficient_loading": False,
"fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
"fsdp_state_dict_type": "SHARDED_STATE_DICT",
"fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
"fsdp_reshard_after_forward": fsdp_reshard_after_forward,
},
"use_tensorboard": True,
}
) )
if attention_backend == "flash": if attention_backend == "flash":
cfg.flash_attention = True cfg.flash_attention = True
@@ -573,55 +543,64 @@ class TestMultiGPULlama:
temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss (%s) is too high" temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss (%s) is too high"
) )
def test_fsdp_qlora_prequant_packed( def test_fsdp_qlora_prequant_packed(self, temp_dir):
self, temp_dir, sft_prepared_dataset_alpaca_cfg
):
# pylint: disable=duplicate-code # pylint: disable=duplicate-code
cfg = ( cfg = DictDefault(
DictDefault( {
{ "base_model": "axolotl-ai-co/SmolLM2-135M-bnb-nf4-bf16",
"base_model": "axolotl-ai-co/SmolLM2-135M-bnb-nf4-bf16", "adapter": "qlora",
"adapter": "qlora", "mean_resizing_embeddings": True,
"mean_resizing_embeddings": True, "load_in_4bit": True,
"load_in_4bit": True, "lora_r": 8,
"lora_r": 8, "lora_alpha": 16,
"lora_alpha": 16, "lora_dropout": 0.05,
"lora_dropout": 0.05, "lora_target_linear": True,
"lora_target_linear": True, # "lora_modules_to_save": [
# "lora_modules_to_save": [ # "embed_tokens",
# "embed_tokens", # "lm_head",
# "lm_head", # ],
# ], "sample_packing": True,
"eval_sample_packing": False, "eval_sample_packing": False,
"pad_to_sequence_len": True, "pad_to_sequence_len": True,
"num_epochs": 1, "sequence_len": 1024,
"max_steps": 2, "val_set_size": 0.01,
"micro_batch_size": 2, "special_tokens": {
"gradient_accumulation_steps": 2, "pad_token": "<|endoftext|>",
# "gradient_checkpointing": True, },
"output_dir": temp_dir, "datasets": [
"learning_rate": 0.00001, {
"optimizer": "adamw_torch_fused", "path": "tatsu-lab/alpaca",
"lr_scheduler": "cosine", "type": "alpaca",
"flash_attention": True, "split": "train[:10%]",
"fsdp": [
"full_shard",
"auto_wrap",
],
"fsdp_config": {
"fsdp_limit_all_gathers": True,
"fsdp_offload_params": False,
"fsdp_sync_module_states": True,
"fsdp_use_orig_params": False,
"fsdp_cpu_ram_efficient_loading": True,
"fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
"fsdp_state_dict_type": "SHARDED_STATE_DICT",
"fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
}, },
"use_tensorboard": True, ],
} "num_epochs": 1,
) "max_steps": 2,
| sft_prepared_dataset_alpaca_cfg "micro_batch_size": 2,
"gradient_accumulation_steps": 2,
# "gradient_checkpointing": True,
"output_dir": temp_dir,
"dataset_prepared_path": temp_dir + "/last_run_prepared",
"learning_rate": 0.00001,
"optimizer": "adamw_torch_fused",
"lr_scheduler": "cosine",
"flash_attention": True,
"fsdp": [
"full_shard",
"auto_wrap",
],
"fsdp_config": {
"fsdp_limit_all_gathers": True,
"fsdp_offload_params": False,
"fsdp_sync_module_states": True,
"fsdp_use_orig_params": False,
"fsdp_cpu_ram_efficient_loading": True,
"fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
"fsdp_state_dict_type": "SHARDED_STATE_DICT",
"fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
},
"use_tensorboard": True,
}
) )
# write cfg to yaml file # write cfg to yaml file
@@ -662,12 +641,7 @@ class TestMultiGPULlama:
[True, False], [True, False],
) )
def test_ds_zero3_packed( def test_ds_zero3_packed(
self, self, temp_dir, gradient_accumulation_steps, deepspeed, qlora
temp_dir,
sft_prepared_dataset_alpaca_cfg,
gradient_accumulation_steps,
deepspeed,
qlora,
): ):
# pylint: disable=duplicate-code # pylint: disable=duplicate-code
if qlora: if qlora:
@@ -681,25 +655,37 @@ class TestMultiGPULlama:
} }
else: else:
adapter = {} adapter = {}
cfg = ( cfg = DictDefault(
DictDefault( {
{ "base_model": "HuggingFaceTB/SmolLM2-135M",
"pad_to_sequence_len": True, "sample_packing": True,
"num_epochs": 1, "pad_to_sequence_len": True,
"max_steps": 2, "sequence_len": 1024,
"micro_batch_size": 1, "val_set_size": 0.05,
"gradient_accumulation_steps": gradient_accumulation_steps, "special_tokens": {
"output_dir": temp_dir, "pad_token": "<|endoftext|>",
"learning_rate": 0.00001, },
"optimizer": "adamw_torch_fused", "datasets": [
"lr_scheduler": "cosine", {
"flash_attention": True, "path": "tatsu-lab/alpaca",
"deepspeed": str(AXOLOTL_ROOT / deepspeed), "type": "alpaca",
"use_tensorboard": True, "split": "train[:10%]",
**adapter, },
} ],
) "num_epochs": 1,
| sft_prepared_dataset_alpaca_cfg "max_steps": 2,
"micro_batch_size": 1,
"gradient_accumulation_steps": gradient_accumulation_steps,
"output_dir": temp_dir,
"dataset_prepared_path": temp_dir + "/last_run_prepared",
"learning_rate": 0.00001,
"optimizer": "adamw_torch_fused",
"lr_scheduler": "cosine",
"flash_attention": True,
"deepspeed": str(AXOLOTL_ROOT / deepspeed),
"use_tensorboard": True,
**adapter,
}
) )
# write cfg to yaml file # write cfg to yaml file
@@ -720,7 +706,7 @@ class TestMultiGPULlama:
) )
check_tensorboard( check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high" temp_dir + "/runs", "train/train_loss", 2.4, "Train Loss (%s) is too high"
) )
@pytest.mark.parametrize( @pytest.mark.parametrize(
@@ -731,13 +717,7 @@ class TestMultiGPULlama:
"qlora", "qlora",
[True, False], [True, False],
) )
def test_ds_zero2_packed( def test_ds_zero2_packed(self, temp_dir, gradient_accumulation_steps, qlora):
self,
temp_dir,
sft_prepared_dataset_alpaca_cfg,
gradient_accumulation_steps,
qlora,
):
# pylint: disable=duplicate-code # pylint: disable=duplicate-code
if qlora: if qlora:
adapter = { adapter = {
@@ -750,25 +730,37 @@ class TestMultiGPULlama:
} }
else: else:
adapter = {} adapter = {}
cfg = ( cfg = DictDefault(
DictDefault( {
{ "base_model": "HuggingFaceTB/SmolLM2-135M",
"pad_to_sequence_len": True, "sample_packing": True,
"num_epochs": 1, "pad_to_sequence_len": True,
"max_steps": 2, "sequence_len": 1024,
"micro_batch_size": 1, "val_set_size": 0.01,
"gradient_accumulation_steps": gradient_accumulation_steps, "special_tokens": {
"output_dir": temp_dir, "pad_token": "<|endoftext|>",
"learning_rate": 0.00001, },
"optimizer": "adamw_torch_fused", "datasets": [
"lr_scheduler": "cosine", {
"flash_attention": True, "path": "tatsu-lab/alpaca",
"deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero2.json"), "type": "alpaca",
"use_tensorboard": True, "split": "train[:10%]",
**adapter, },
} ],
) "num_epochs": 1,
| sft_prepared_dataset_alpaca_cfg "max_steps": 2,
"micro_batch_size": 1,
"gradient_accumulation_steps": gradient_accumulation_steps,
"output_dir": temp_dir,
"dataset_prepared_path": temp_dir + "/last_run_prepared",
"learning_rate": 0.00001,
"optimizer": "adamw_torch_fused",
"lr_scheduler": "cosine",
"flash_attention": True,
"deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero2.json"),
"use_tensorboard": True,
**adapter,
}
) )
# write cfg to yaml file # write cfg to yaml file
@@ -789,7 +781,7 @@ class TestMultiGPULlama:
) )
check_tensorboard( check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high" temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
) )
@pytest.mark.parametrize( @pytest.mark.parametrize(
@@ -800,13 +792,7 @@ class TestMultiGPULlama:
"qlora", "qlora",
[True, False], [True, False],
) )
def test_ds_zero1_packed( def test_ds_zero1_packed(self, temp_dir, gradient_accumulation_steps, qlora):
self,
temp_dir,
sft_prepared_dataset_alpaca_cfg,
gradient_accumulation_steps,
qlora,
):
# pylint: disable=duplicate-code # pylint: disable=duplicate-code
if qlora: if qlora:
adapter = { adapter = {
@@ -819,25 +805,37 @@ class TestMultiGPULlama:
} }
else: else:
adapter = {} adapter = {}
cfg = ( cfg = DictDefault(
DictDefault( {
{ "base_model": "HuggingFaceTB/SmolLM2-135M",
"pad_to_sequence_len": True, "sample_packing": True,
"num_epochs": 1, "pad_to_sequence_len": True,
"max_steps": 2, "sequence_len": 1024,
"micro_batch_size": 1, "val_set_size": 0.01,
"gradient_accumulation_steps": gradient_accumulation_steps, "special_tokens": {
"output_dir": temp_dir, "pad_token": "<|endoftext|>",
"learning_rate": 0.00001, },
"optimizer": "adamw_torch_fused", "datasets": [
"lr_scheduler": "cosine", {
"flash_attention": True, "path": "tatsu-lab/alpaca",
"deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"), "type": "alpaca",
"use_tensorboard": True, "split": "train[:10%]",
**adapter, },
} ],
) "num_epochs": 1,
| sft_prepared_dataset_alpaca_cfg "max_steps": 2,
"micro_batch_size": 1,
"gradient_accumulation_steps": gradient_accumulation_steps,
"output_dir": temp_dir,
"dataset_prepared_path": temp_dir + "/last_run_prepared",
"learning_rate": 0.00001,
"optimizer": "adamw_torch_fused",
"lr_scheduler": "cosine",
"flash_attention": True,
"deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"),
"use_tensorboard": True,
**adapter,
}
) )
# write cfg to yaml file # write cfg to yaml file
@@ -858,7 +856,7 @@ class TestMultiGPULlama:
) )
check_tensorboard( check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high" temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
) )
@pytest.mark.skip( @pytest.mark.skip(