Compare commits
4 Commits
fix/merge-
...
smaller-ra
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a0670abc94 | ||
|
|
08f287b57f | ||
|
|
b4c7d9c29d | ||
|
|
d2637fb01d |
@@ -496,6 +496,12 @@ def dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff(
|
||||
return datasets.load_from_disk(ds_path)["train"]
|
||||
|
||||
|
||||
@pytest.fixture(scope="session", autouse=True)
|
||||
def download_tiny_llama_7m_model():
|
||||
# download the model
|
||||
return snapshot_download_w_retry("axolotl-ai-internal/llama-7m", repo_type="model")
|
||||
|
||||
|
||||
# # pylint: disable=redefined-outer-name,unused-argument
|
||||
# def test_load_fixtures(
|
||||
# download_smollm2_135m_model,
|
||||
|
||||
@@ -90,7 +90,7 @@ class TestKnowledgeDistillation:
|
||||
train(cfg=cfg, dataset_meta=dataset_meta)
|
||||
assert (Path(temp_dir) / "model.safetensors").exists()
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/loss", 1.0, "Train Loss is too high"
|
||||
temp_dir + "/runs", "train/loss", 1.0, "Train loss (%s) is too high"
|
||||
)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@@ -121,5 +121,5 @@ class TestKnowledgeDistillation:
|
||||
train(cfg=cfg, dataset_meta=dataset_meta)
|
||||
assert (Path(temp_dir) / "adapter_model.safetensors").exists()
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/loss", 1.0, "Train Loss is too high"
|
||||
temp_dir + "/runs", "train/loss", 1.0, "Train loss (%s) is too high"
|
||||
)
|
||||
|
||||
@@ -89,5 +89,5 @@ class TestPackedFlex:
|
||||
)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high"
|
||||
temp_dir + "/runs", "train/train_loss", 2.0, "Train loss (%s) is too high"
|
||||
)
|
||||
|
||||
@@ -96,5 +96,5 @@ class TestMultiGPUGemma3:
|
||||
)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 1.8, "Train Loss is too high"
|
||||
temp_dir + "/runs", "train/train_loss", 1.8, "Train loss (%s) is too high"
|
||||
)
|
||||
|
||||
@@ -43,7 +43,7 @@ class TestMultiGPULlama:
|
||||
# pylint: disable=duplicate-code
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"base_model": "HuggingFaceTB/SmolLM2-135M",
|
||||
"base_model": "axolotl-ai-internal/llama-7m",
|
||||
"sequence_len": 2048,
|
||||
"adapter": "lora",
|
||||
"lora_r": 8,
|
||||
@@ -94,7 +94,7 @@ class TestMultiGPULlama:
|
||||
)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high"
|
||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train loss (%s) is too high"
|
||||
)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@@ -105,7 +105,7 @@ class TestMultiGPULlama:
|
||||
# pylint: disable=duplicate-code
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"base_model": "HuggingFaceTB/SmolLM2-135M",
|
||||
"base_model": "axolotl-ai-internal/llama-7m",
|
||||
"sequence_len": 2048,
|
||||
"sample_packing": True,
|
||||
"eval_sample_packing": False,
|
||||
@@ -159,14 +159,14 @@ class TestMultiGPULlama:
|
||||
)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high"
|
||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train loss (%s) is too high"
|
||||
)
|
||||
|
||||
def test_dpo_lora_ddp(self, temp_dir):
|
||||
# pylint: disable=duplicate-code
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"base_model": "HuggingFaceTB/SmolLM2-135M",
|
||||
"base_model": "axolotl-ai-internal/llama-7m",
|
||||
"sequence_len": 2048,
|
||||
"sample_packing": False,
|
||||
"eval_sample_packing": False,
|
||||
@@ -244,7 +244,7 @@ class TestMultiGPULlama:
|
||||
# pylint: disable=duplicate-code
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"base_model": "HuggingFaceTB/SmolLM2-135M",
|
||||
"base_model": "axolotl-ai-internal/llama-7m",
|
||||
"sequence_len": 2048,
|
||||
"sample_packing": False,
|
||||
"eval_sample_packing": False,
|
||||
@@ -326,7 +326,7 @@ class TestMultiGPULlama:
|
||||
# pylint: disable=duplicate-code
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"base_model": "HuggingFaceTB/SmolLM2-135M",
|
||||
"base_model": "axolotl-ai-internal/llama-7m",
|
||||
"sequence_len": 2048,
|
||||
"val_set_size": 0.01,
|
||||
"special_tokens": {
|
||||
@@ -385,7 +385,7 @@ class TestMultiGPULlama:
|
||||
)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high"
|
||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train loss (%s) is too high"
|
||||
)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@@ -396,7 +396,7 @@ class TestMultiGPULlama:
|
||||
# pylint: disable=duplicate-code
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"base_model": "HuggingFaceTB/SmolLM2-135M",
|
||||
"base_model": "axolotl-ai-internal/llama-7m",
|
||||
"sample_packing": True,
|
||||
"pad_to_sequence_len": True,
|
||||
"sequence_len": 1024,
|
||||
@@ -457,7 +457,7 @@ class TestMultiGPULlama:
|
||||
)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high"
|
||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train loss (%s) is too high"
|
||||
)
|
||||
|
||||
@require_torch_2_6_0
|
||||
@@ -475,7 +475,7 @@ class TestMultiGPULlama:
|
||||
# pylint: disable=duplicate-code
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"base_model": "HuggingFaceTB/SmolLM2-135M",
|
||||
"base_model": "axolotl-ai-internal/llama-7m",
|
||||
"sample_packing": True,
|
||||
"pad_to_sequence_len": True,
|
||||
"sequence_len": 2048,
|
||||
@@ -538,7 +538,7 @@ class TestMultiGPULlama:
|
||||
)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss is too high"
|
||||
temp_dir + "/runs", "train/train_loss", 2.1, "Train loss (%s) is too high"
|
||||
)
|
||||
|
||||
def test_fsdp_qlora_prequant_packed(self, temp_dir):
|
||||
@@ -618,7 +618,7 @@ class TestMultiGPULlama:
|
||||
)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high"
|
||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train loss (%s) is too high"
|
||||
)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@@ -654,7 +654,7 @@ class TestMultiGPULlama:
|
||||
adapter = {}
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"base_model": "HuggingFaceTB/SmolLM2-135M",
|
||||
"base_model": "axolotl-ai-internal/llama-7m",
|
||||
"sample_packing": True,
|
||||
"pad_to_sequence_len": True,
|
||||
"sequence_len": 1024,
|
||||
@@ -702,7 +702,7 @@ class TestMultiGPULlama:
|
||||
)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high"
|
||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train loss (%s) is too high"
|
||||
)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@@ -728,7 +728,7 @@ class TestMultiGPULlama:
|
||||
adapter = {}
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"base_model": "HuggingFaceTB/SmolLM2-135M",
|
||||
"base_model": "axolotl-ai-internal/llama-7m",
|
||||
"sample_packing": True,
|
||||
"pad_to_sequence_len": True,
|
||||
"sequence_len": 1024,
|
||||
@@ -776,7 +776,7 @@ class TestMultiGPULlama:
|
||||
)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high"
|
||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train loss (%s) is too high"
|
||||
)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@@ -802,7 +802,7 @@ class TestMultiGPULlama:
|
||||
adapter = {}
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"base_model": "HuggingFaceTB/SmolLM2-135M",
|
||||
"base_model": "axolotl-ai-internal/llama-7m",
|
||||
"sample_packing": True,
|
||||
"pad_to_sequence_len": True,
|
||||
"sequence_len": 1024,
|
||||
@@ -850,7 +850,7 @@ class TestMultiGPULlama:
|
||||
)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high"
|
||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train loss (%s) is too high"
|
||||
)
|
||||
|
||||
@pytest.mark.skip(
|
||||
@@ -860,7 +860,7 @@ class TestMultiGPULlama:
|
||||
# pylint: disable=duplicate-code
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"base_model": "HuggingFaceTB/SmolLM2-135M",
|
||||
"base_model": "axolotl-ai-internal/llama-7m",
|
||||
"fix_untrained_tokens": True,
|
||||
"sequence_len": 512,
|
||||
"val_set_size": 0.0,
|
||||
@@ -917,5 +917,5 @@ class TestMultiGPULlama:
|
||||
)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 4.0, "Train Loss is too high"
|
||||
temp_dir + "/runs", "train/train_loss", 4.0, "Train loss (%s) is too high"
|
||||
)
|
||||
|
||||
@@ -80,7 +80,7 @@ class TestMultiGPURay:
|
||||
)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high"
|
||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train loss (%s) is too high"
|
||||
)
|
||||
|
||||
@require_torch_lt_2_6_0
|
||||
@@ -138,5 +138,5 @@ class TestMultiGPURay:
|
||||
)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high"
|
||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train loss (%s) is too high"
|
||||
)
|
||||
|
||||
@@ -93,7 +93,7 @@ class TestSequenceParallelism:
|
||||
)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.6, "Train Loss is too high"
|
||||
temp_dir + "/runs", "train/train_loss", 2.6, "Train loss (%s) is too high"
|
||||
)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
|
||||
@@ -86,5 +86,5 @@ class TestFAXentropyLlama:
|
||||
check_model_output_exists(temp_dir, cfg)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 1.5, "Train Loss is too high"
|
||||
temp_dir + "/runs", "train/train_loss", 1.5, "Train loss (%s) is too high"
|
||||
)
|
||||
|
||||
@@ -80,7 +80,7 @@ class TestUnslothQLoRA:
|
||||
check_model_output_exists(temp_dir, cfg)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high"
|
||||
temp_dir + "/runs", "train/train_loss", 2.0, "Train loss (%s) is too high"
|
||||
)
|
||||
|
||||
def test_unsloth_llama_qlora_unpacked(self, temp_dir):
|
||||
@@ -130,7 +130,7 @@ class TestUnslothQLoRA:
|
||||
check_model_output_exists(temp_dir, cfg)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high"
|
||||
temp_dir + "/runs", "train/train_loss", 2.0, "Train loss (%s) is too high"
|
||||
)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@@ -185,5 +185,5 @@ class TestUnslothQLoRA:
|
||||
check_model_output_exists(temp_dir, cfg)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high"
|
||||
temp_dir + "/runs", "train/train_loss", 2.0, "Train loss (%s) is too high"
|
||||
)
|
||||
|
||||
@@ -69,5 +69,5 @@ class TestPackedFlex(unittest.TestCase):
|
||||
train(cfg=cfg, dataset_meta=dataset_meta)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high"
|
||||
temp_dir + "/runs", "train/train_loss", 2.0, "Train loss (%s) is too high"
|
||||
)
|
||||
|
||||
@@ -84,5 +84,5 @@ class TestPretrainLlama:
|
||||
temp_dir + "/runs",
|
||||
"train/train_loss",
|
||||
loss_threshold,
|
||||
"Train Loss is too high",
|
||||
"Train Loss (%s) is too high",
|
||||
)
|
||||
|
||||
@@ -68,5 +68,5 @@ class TestPackedLlama(unittest.TestCase):
|
||||
train(cfg=cfg, dataset_meta=dataset_meta)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high"
|
||||
temp_dir + "/runs", "train/train_loss", 2.0, "Train loss (%s) is too high"
|
||||
)
|
||||
|
||||
@@ -73,6 +73,6 @@ class TestRewardModelLoraSmolLM2(unittest.TestCase):
|
||||
|
||||
train(cfg=cfg, dataset_meta=dataset_meta)
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss is too high"
|
||||
temp_dir + "/runs", "train/train_loss", 2.5, "Train loss (%s) is too high"
|
||||
)
|
||||
check_model_output_exists(temp_dir, cfg)
|
||||
|
||||
@@ -8,7 +8,7 @@ from transformers.models.auto.tokenization_auto import AutoTokenizer
|
||||
|
||||
from axolotl.utils.callbacks.perplexity import Perplexity
|
||||
|
||||
MODEL_NAME = "HuggingFaceTB/SmolLM2-135M"
|
||||
MODEL_NAME = "axolotl-ai-internal/llama-7m"
|
||||
|
||||
|
||||
@fixture()
|
||||
@@ -36,7 +36,7 @@ One day, a little fish named Fin was swimming near the shore. He saw a big crab
|
||||
"""
|
||||
result = metric.compute(model, [sample_text])
|
||||
ppl = result["score"]
|
||||
assert round(ppl, 2) == 7.41
|
||||
assert round(ppl, 2) == 75.14
|
||||
|
||||
|
||||
def test_perplexity_short(model, metric):
|
||||
@@ -44,4 +44,4 @@ def test_perplexity_short(model, metric):
|
||||
sample_text = "Once upon a time, there was a little car named Beep. Beep loved to go fast and play in the sun."
|
||||
result = metric.compute(model, [sample_text])
|
||||
ppl = result["score"]
|
||||
assert round(ppl, 2) == 10.33
|
||||
assert round(ppl, 2) == 70.54
|
||||
|
||||
Reference in New Issue
Block a user