From 431888c1de9b8059f5f8d80f73556e5df09d594f Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 23 Apr 2026 13:51:01 +0000 Subject: [PATCH] use smaller pretrained models for ci --- tests/conftest.py | 56 +++++++-- .../integrations/test_cut_cross_entropy.py | 34 ++++- tests/e2e/kernels/test_lora_features.py | 2 +- tests/e2e/multigpu/test_dist_muon_fsdp2.py | 22 +++- tests/e2e/multigpu/test_fsdp1.py | 26 +++- tests/e2e/multigpu/test_fsdp2.py | 32 +++-- tests/e2e/multigpu/test_fsdp2_lora_kernels.py | 2 +- tests/e2e/multigpu/test_tp.py | 8 +- .../lora_kernels/test_lora_kernel_patching.py | 8 +- tests/e2e/patched/test_falcon_samplepack.py | 56 ++++++--- tests/e2e/patched/test_mistral_samplepack.py | 49 ++++++-- tests/e2e/patched/test_mixtral_samplepack.py | 50 +++++--- tests/e2e/patched/test_model_patches.py | 5 +- tests/e2e/patched/test_phi_multipack.py | 50 +++++--- tests/e2e/solo/test_batch_flattening.py | 2 +- tests/e2e/test_falcon.py | 80 ++++++++---- tests/e2e/test_mistral.py | 48 ++++++-- tests/e2e/test_mixtral.py | 116 ++++++++++++------ tests/e2e/test_optimizers.py | 22 ++-- tests/e2e/test_phi.py | 50 +++++--- tests/e2e/test_preprocess.py | 2 +- tests/e2e/test_quantization.py | 2 +- tests/e2e/test_qwen.py | 2 +- tests/e2e/utils.py | 95 ++++++++++++++ 24 files changed, 614 insertions(+), 205 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 19e3dc3f0..16a01f8aa 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -119,15 +119,49 @@ def download_smollm2_135m_gptq_model(): @pytest.fixture(scope="session", autouse=True) -def download_qwen_2_5_half_billion_model(): - # download the model - snapshot_download_w_retry("Qwen/Qwen2.5-0.5B", repo_type="model") +def download_qwen3_half_billion_model(): + # download the model (still used as the KD teacher in tests/e2e/integrations/test_kd.py) + snapshot_download_w_retry("Qwen/Qwen3-0.6B", repo_type="model") @pytest.fixture(scope="session", autouse=True) -def download_qwen3_half_billion_model(): - 
# download the model - snapshot_download_w_retry("Qwen/Qwen3-0.6B", repo_type="model") +def download_tiny_llama_model(): + snapshot_download_w_retry("axolotl-ai-co/tiny-llama-50m", repo_type="model") + + +@pytest.fixture(scope="session", autouse=True) +def download_tiny_mistral_model(): + snapshot_download_w_retry("axolotl-ai-co/tiny-mistral-25m", repo_type="model") + + +@pytest.fixture(scope="session", autouse=True) +def download_tiny_mixtral_model(): + snapshot_download_w_retry("axolotl-ai-co/tiny-mixtral-30m", repo_type="model") + + +@pytest.fixture(scope="session", autouse=True) +def download_tiny_phi_model(): + snapshot_download_w_retry("axolotl-ai-co/tiny-phi-64m", repo_type="model") + + +@pytest.fixture(scope="session", autouse=True) +def download_tiny_falcon_model(): + snapshot_download_w_retry("axolotl-ai-co/tiny-falcon-42m", repo_type="model") + + +@pytest.fixture(scope="session", autouse=True) +def download_tiny_qwen2_model(): + snapshot_download_w_retry("axolotl-ai-co/tiny-qwen2-129m", repo_type="model") + + +@pytest.fixture(scope="session", autouse=True) +def download_tiny_qwen3_model(): + snapshot_download_w_retry("axolotl-ai-co/tiny-qwen3-129m", repo_type="model") + + +@pytest.fixture(scope="session", autouse=True) +def download_tiny_gemma2_model(): + snapshot_download_w_retry("axolotl-ai-co/tiny-gemma2-137m", repo_type="model") @pytest.fixture(scope="session", autouse=True) @@ -620,7 +654,15 @@ def fixture_min_base_cfg(): ) def test_load_fixtures( download_smollm2_135m_model, - download_qwen_2_5_half_billion_model, + download_qwen3_half_billion_model, + download_tiny_llama_model, + download_tiny_mistral_model, + download_tiny_mixtral_model, + download_tiny_phi_model, + download_tiny_falcon_model, + download_tiny_qwen2_model, + download_tiny_qwen3_model, + download_tiny_gemma2_model, download_tatsu_lab_alpaca_dataset, download_mhenrichsen_alpaca_2k_dataset, download_mhenrichsen_alpaca_2k_w_revision_dataset, diff --git 
a/tests/e2e/integrations/test_cut_cross_entropy.py b/tests/e2e/integrations/test_cut_cross_entropy.py index 7da644ec3..2cebefdd0 100644 --- a/tests/e2e/integrations/test_cut_cross_entropy.py +++ b/tests/e2e/integrations/test_cut_cross_entropy.py @@ -10,7 +10,10 @@ from axolotl.utils import get_pytorch_version from axolotl.utils.config import normalize_config, prepare_plugins, validate_config from axolotl.utils.dict import DictDefault -from tests.e2e.utils import check_model_output_exists +from tests.e2e.utils import ( + check_model_output_exists, + check_tensorboard_loss_decreased, +) @pytest.fixture() @@ -42,6 +45,7 @@ def min_cfg(temp_dir): "max_steps": 10, "bf16": "auto", "save_first_step": False, + "use_tensorboard": True, } @@ -64,11 +68,18 @@ class TestCutCrossEntropyIntegration: else: train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) + check_tensorboard_loss_decreased( + temp_dir + "/runs", + initial_window=5, + final_window=5, + max_initial=5.0, + max_final=4.7, + ) def test_qwen2_w_cce(self, temp_dir): cfg = DictDefault( { - "base_model": "Qwen/Qwen2.5-0.5B", + "base_model": "axolotl-ai-co/tiny-qwen2-129m", "plugins": [ "axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin", ], @@ -87,13 +98,14 @@ class TestCutCrossEntropyIntegration: "num_epochs": 1, "micro_batch_size": 4, "gradient_accumulation_steps": 1, - "learning_rate": 0.00001, + "learning_rate": 2e-4, "optimizer": "adamw_torch_fused", "output_dir": temp_dir, "lr_scheduler": "cosine", - "max_steps": 10, + "max_steps": 50, "bf16": "auto", "save_first_step": False, + "use_tensorboard": True, } ) cfg = validate_config(cfg) @@ -108,6 +120,13 @@ class TestCutCrossEntropyIntegration: else: train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) + check_tensorboard_loss_decreased( + temp_dir + "/runs", + initial_window=5, + final_window=5, + max_initial=5.0, + max_final=4.7, + ) @pytest.mark.parametrize( "attention_type", @@ -136,3 +155,10 
@@ class TestCutCrossEntropyIntegration: else: train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) + check_tensorboard_loss_decreased( + temp_dir + "/runs", + initial_window=5, + final_window=5, + max_initial=5.0, + max_final=4.7, + ) diff --git a/tests/e2e/kernels/test_lora_features.py b/tests/e2e/kernels/test_lora_features.py index 80495c68d..688710f3a 100644 --- a/tests/e2e/kernels/test_lora_features.py +++ b/tests/e2e/kernels/test_lora_features.py @@ -24,7 +24,7 @@ from axolotl.monkeypatch.lora_kernels import ( ) from axolotl.utils.dict import DictDefault -MODEL_NAME = "Qwen/Qwen3-0.6B" +MODEL_NAME = "axolotl-ai-co/tiny-qwen3-129m" DEVICE = "cuda" DTYPE = torch.bfloat16 diff --git a/tests/e2e/multigpu/test_dist_muon_fsdp2.py b/tests/e2e/multigpu/test_dist_muon_fsdp2.py index 93db473a9..a6bb0e1cc 100644 --- a/tests/e2e/multigpu/test_dist_muon_fsdp2.py +++ b/tests/e2e/multigpu/test_dist_muon_fsdp2.py @@ -37,12 +37,26 @@ def verify_training_success(temp_dir): event_file = os.path.join(tb_log_path, event_files[0]) reader = SummaryReader(event_file) df = reader.scalars - train_loss_df = df[df.tag == "train/train_loss"] + train_loss_df = df[df.tag == "train/loss"] + if len(train_loss_df) == 0: + train_loss_df = df[df.tag == "train/train_loss"] if len(train_loss_df) > 0: - final_loss = train_loss_df.value.values[-1] + values = train_loss_df.value.values + final_loss = values[-1] assert not torch.isnan(torch.tensor(final_loss)), ( f"Training loss is NaN: {final_loss}" ) + if len(values) >= 2: + initial_loss = float(values[0]) + assert float(final_loss) <= initial_loss * 1.10, ( + f"Training loss regressed: initial={initial_loss:.4f}, " + f"final={final_loss:.4f} — likely silent bug (e.g. " + "bad label masking) pushed loss scale up." + ) + assert float(final_loss) <= 10.0, ( + f"Final loss {final_loss:.4f} above sanity bound 10.0 " + "— absolute scale wrong." 
+ ) class TestDistMuon: @@ -52,7 +66,7 @@ class TestDistMuon: def test_fft_sft(self, temp_dir): cfg = DictDefault( { - "base_model": "Qwen/Qwen2.5-0.5B", + "base_model": "axolotl-ai-co/tiny-qwen2-129m", "sequence_len": 2048, "val_set_size": 0.01, "datasets": [ @@ -109,7 +123,7 @@ class TestDistMuon: def test_lora_sft(self, temp_dir): cfg = DictDefault( { - "base_model": "Qwen/Qwen2.5-0.5B", + "base_model": "axolotl-ai-co/tiny-qwen2-129m", "sequence_len": 2048, "val_set_size": 0.01, "datasets": [ diff --git a/tests/e2e/multigpu/test_fsdp1.py b/tests/e2e/multigpu/test_fsdp1.py index 5b6724791..1c11ba4de 100644 --- a/tests/e2e/multigpu/test_fsdp1.py +++ b/tests/e2e/multigpu/test_fsdp1.py @@ -38,12 +38,26 @@ def verify_training_success(temp_dir): event_file = os.path.join(tb_log_path, event_files[0]) reader = SummaryReader(event_file) df = reader.scalars - train_loss_df = df[df.tag == "train/train_loss"] + train_loss_df = df[df.tag == "train/loss"] + if len(train_loss_df) == 0: + train_loss_df = df[df.tag == "train/train_loss"] if len(train_loss_df) > 0: - final_loss = train_loss_df.value.values[-1] + values = train_loss_df.value.values + final_loss = values[-1] assert not torch.isnan(torch.tensor(final_loss)), ( f"Training loss is NaN: {final_loss}" ) + if len(values) >= 2: + initial_loss = float(values[0]) + assert float(final_loss) <= initial_loss * 1.10, ( + f"Training loss regressed: initial={initial_loss:.4f}, " + f"final={final_loss:.4f} — likely silent bug (e.g. " + "bad label masking) pushed loss scale up." + ) + assert float(final_loss) <= 10.0, ( + f"Final loss {final_loss:.4f} above sanity bound 10.0 " + "— absolute scale wrong." 
+ ) class TestFSDP1: @@ -56,7 +70,7 @@ class TestFSDP1: def test_fft_sft(self, temp_dir, fsdp_cpu_ram_efficient_loading): cfg = DictDefault( { - "base_model": "Qwen/Qwen2.5-0.5B", + "base_model": "axolotl-ai-co/tiny-qwen2-129m", "sequence_len": 2048, "val_set_size": 0.01, "datasets": [ @@ -126,7 +140,7 @@ class TestFSDP1: def test_lora_sft(self, temp_dir, adapter_config): cfg = DictDefault( { - "base_model": "Qwen/Qwen2.5-0.5B", + "base_model": "axolotl-ai-co/tiny-qwen2-129m", "sequence_len": 2048, "val_set_size": 0.01, "datasets": [ @@ -190,7 +204,7 @@ class TestFSDP1: def test_dpo_fft(self, temp_dir): cfg = DictDefault( { - "base_model": "Qwen/Qwen2.5-0.5B", + "base_model": "axolotl-ai-co/tiny-qwen2-129m", "sequence_len": 2048, "val_set_size": 0.01, "rl": "dpo", @@ -262,7 +276,7 @@ class TestFSDP1: def test_dpo_lora(self, temp_dir, adapter_config): cfg = DictDefault( { - "base_model": "Qwen/Qwen2.5-0.5B", + "base_model": "axolotl-ai-co/tiny-qwen2-129m", "load_in_4bit": adapter_config["load_in_4bit"], "rl": "dpo", "chat_template": "chatml", diff --git a/tests/e2e/multigpu/test_fsdp2.py b/tests/e2e/multigpu/test_fsdp2.py index a70ff9aa7..95e16b52b 100644 --- a/tests/e2e/multigpu/test_fsdp2.py +++ b/tests/e2e/multigpu/test_fsdp2.py @@ -38,12 +38,26 @@ def verify_training_success(temp_dir): event_file = os.path.join(tb_log_path, event_files[0]) reader = SummaryReader(event_file) df = reader.scalars - train_loss_df = df[df.tag == "train/train_loss"] + train_loss_df = df[df.tag == "train/loss"] + if len(train_loss_df) == 0: + train_loss_df = df[df.tag == "train/train_loss"] if len(train_loss_df) > 0: - final_loss = train_loss_df.value.values[-1] + values = train_loss_df.value.values + final_loss = values[-1] assert not torch.isnan(torch.tensor(final_loss)), ( f"Training loss is NaN: {final_loss}" ) + if len(values) >= 2: + initial_loss = float(values[0]) + assert float(final_loss) <= initial_loss * 1.10, ( + f"Training loss regressed: initial={initial_loss:.4f}, " + 
f"final={final_loss:.4f} — likely silent bug (e.g. " + "bad label masking) pushed loss scale up." + ) + assert float(final_loss) <= 10.0, ( + f"Final loss {final_loss:.4f} above sanity bound 10.0 " + "— absolute scale wrong." + ) class TestFSDP2: @@ -57,7 +71,7 @@ class TestFSDP2: def test_fft_sft(self, temp_dir, fsdp_cpu_ram_efficient_loading): cfg = DictDefault( { - "base_model": "Qwen/Qwen2.5-0.5B", + "base_model": "axolotl-ai-co/tiny-qwen2-129m", "sequence_len": 2048, "val_set_size": 0.01, "datasets": [ @@ -114,7 +128,7 @@ class TestFSDP2: def test_lora_sft(self, temp_dir, peft_use_dora): cfg = DictDefault( { - "base_model": "Qwen/Qwen2.5-0.5B", + "base_model": "axolotl-ai-co/tiny-qwen2-129m", "sequence_len": 2048, "val_set_size": 0.01, "datasets": [ @@ -180,7 +194,7 @@ class TestFSDP2: def test_lora_sft_kernels(self, temp_dir): cfg = DictDefault( { - "base_model": "Qwen/Qwen2.5-0.5B", + "base_model": "axolotl-ai-co/tiny-qwen2-129m", "sequence_len": 2048, "val_set_size": 0.01, "datasets": [ @@ -243,7 +257,7 @@ class TestFSDP2: def test_qlora_sft(self, temp_dir): cfg = DictDefault( { - "base_model": "Qwen/Qwen2.5-0.5B", + "base_model": "axolotl-ai-co/tiny-qwen2-129m", "sequence_len": 2048, "val_set_size": 0.01, "datasets": [ @@ -305,7 +319,7 @@ class TestFSDP2: def test_qlora_sft_kernels(self, temp_dir): cfg = DictDefault( { - "base_model": "Qwen/Qwen2.5-0.5B", + "base_model": "axolotl-ai-co/tiny-qwen2-129m", "sequence_len": 2048, "val_set_size": 0.01, "datasets": [ @@ -370,7 +384,7 @@ class TestFSDP2: def test_dpo_fft(self, temp_dir): cfg = DictDefault( { - "base_model": "Qwen/Qwen2.5-0.5B", + "base_model": "axolotl-ai-co/tiny-qwen2-129m", "sequence_len": 2048, "val_set_size": 0.01, "rl": "dpo", @@ -428,7 +442,7 @@ class TestFSDP2: def test_dpo_lora(self, temp_dir): cfg = DictDefault( { - "base_model": "Qwen/Qwen2.5-0.5B", + "base_model": "axolotl-ai-co/tiny-qwen2-129m", "sequence_len": 2048, "rl": "dpo", "chat_template": "chatml", diff --git 
a/tests/e2e/multigpu/test_fsdp2_lora_kernels.py b/tests/e2e/multigpu/test_fsdp2_lora_kernels.py index 27ad2b8e9..0f2fd421a 100644 --- a/tests/e2e/multigpu/test_fsdp2_lora_kernels.py +++ b/tests/e2e/multigpu/test_fsdp2_lora_kernels.py @@ -40,7 +40,7 @@ def _run_training(temp_dir, cfg): def _base_lora_fsdp2_config(temp_dir, **overrides): """Base config for LoRA + FSDP2 + kernel tests.""" cfg = { - "base_model": "Qwen/Qwen3-0.6B", + "base_model": "axolotl-ai-co/tiny-qwen3-129m", "sequence_len": 512, "val_set_size": 0.0, "datasets": [ diff --git a/tests/e2e/multigpu/test_tp.py b/tests/e2e/multigpu/test_tp.py index 9891a0906..965dfa8e5 100644 --- a/tests/e2e/multigpu/test_tp.py +++ b/tests/e2e/multigpu/test_tp.py @@ -8,7 +8,7 @@ from accelerate.test_utils import execute_subprocess_async, get_torch_dist_uniqu from axolotl.utils.dict import DictDefault -from tests.e2e.utils import check_tensorboard, require_torch_2_7_0 +from tests.e2e.utils import check_tensorboard_loss_decreased, require_torch_2_7_0 class TestTensorParallel: @@ -21,7 +21,7 @@ class TestTensorParallel: def test_fft_sft(self, temp_dir): cfg = DictDefault( { - "base_model": "Qwen/Qwen2.5-0.5B", + "base_model": "axolotl-ai-co/tiny-qwen2-129m", "sequence_len": 2048, "val_set_size": 0.01, "datasets": [ @@ -63,6 +63,6 @@ class TestTensorParallel: ] ) - check_tensorboard( - temp_dir + "/runs", "train/train_loss", 1.0, "Train Loss (%s) is too high" + check_tensorboard_loss_decreased( + temp_dir + "/runs", max_initial=5.0, max_final=4.7 ) diff --git a/tests/e2e/patched/lora_kernels/test_lora_kernel_patching.py b/tests/e2e/patched/lora_kernels/test_lora_kernel_patching.py index 2865a80f9..4844ce539 100644 --- a/tests/e2e/patched/lora_kernels/test_lora_kernel_patching.py +++ b/tests/e2e/patched/lora_kernels/test_lora_kernel_patching.py @@ -32,12 +32,12 @@ from axolotl.utils.dict import DictDefault MODEL_CONFIGS = [ { - "name": "trl-internal-testing/tiny-MistralForCausalLM-0.2", + "name": 
"axolotl-ai-co/tiny-mistral-25m", "expected_activation": apply_lora_mlp_swiglu, "dtype": torch.float16, }, { - "name": "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5", + "name": "axolotl-ai-co/tiny-qwen2-129m", "expected_activation": apply_lora_mlp_swiglu, "dtype": torch.float16, }, @@ -47,7 +47,7 @@ MODEL_CONFIGS = [ "dtype": torch.float32, }, { - "name": "trl-internal-testing/tiny-Gemma2ForCausalLM", + "name": "axolotl-ai-co/tiny-gemma2-137m", "expected_activation": apply_lora_mlp_geglu, "dtype": torch.float16, }, @@ -159,7 +159,7 @@ def test_swiglu_mlp_integration(small_llama_model): def test_geglu_model_integration(): """Test GeGLU activation with Gemma model.""" model = AutoModelForCausalLM.from_pretrained( - "trl-internal-testing/tiny-Gemma2ForCausalLM", + "axolotl-ai-co/tiny-gemma2-137m", dtype=torch.float16, device_map="cuda:0", ) diff --git a/tests/e2e/patched/test_falcon_samplepack.py b/tests/e2e/patched/test_falcon_samplepack.py index cc5091403..3166eff52 100644 --- a/tests/e2e/patched/test_falcon_samplepack.py +++ b/tests/e2e/patched/test_falcon_samplepack.py @@ -4,14 +4,16 @@ E2E tests for falcon import unittest -import pytest - from axolotl.common.datasets import load_datasets from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config from axolotl.utils.dict import DictDefault -from ..utils import check_model_output_exists, with_temp_dir +from ..utils import ( + check_model_output_exists, + check_tensorboard_loss_decreased, + with_temp_dir, +) class TestFalconPatched(unittest.TestCase): @@ -19,13 +21,12 @@ class TestFalconPatched(unittest.TestCase): Test case for Falcon models """ - @pytest.mark.skip(reason="no tiny models for testing with safetensors") @with_temp_dir def test_qlora(self, temp_dir): cfg = DictDefault( { - "base_model": "illuin/tiny-random-FalconForCausalLM", - "flash_attention": True, + "base_model": "axolotl-ai-co/tiny-falcon-42m", + "flash_attention": False, "sample_packing": True, 
"sequence_len": 2048, "load_in_4bit": True, @@ -47,17 +48,19 @@ class TestFalconPatched(unittest.TestCase): }, ], "num_epochs": 2, - "micro_batch_size": 2, + "micro_batch_size": 4, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.00001, + "learning_rate": 2e-4, "optimizer": "adamw_bnb_8bit", "lr_scheduler": "cosine", - "max_steps": 20, - "save_steps": 10, - "eval_steps": 10, + "max_steps": 50, + "logging_steps": 1, + "save_steps": 50, + "eval_steps": 50, "bf16": "auto", "save_first_step": False, + "use_tensorboard": True, } ) cfg = validate_config(cfg) @@ -66,14 +69,20 @@ class TestFalconPatched(unittest.TestCase): train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) + check_tensorboard_loss_decreased( + temp_dir + "/runs", + initial_window=5, + final_window=5, + max_initial=6.0, + max_final=4.7, + ) - @pytest.mark.skip(reason="no tiny models for testing with safetensors") @with_temp_dir def test_ft(self, temp_dir): cfg = DictDefault( { - "base_model": "illuin/tiny-random-FalconForCausalLM", - "flash_attention": True, + "base_model": "axolotl-ai-co/tiny-falcon-42m", + "flash_attention": False, "sample_packing": True, "sequence_len": 2048, "val_set_size": 0.05, @@ -88,17 +97,19 @@ class TestFalconPatched(unittest.TestCase): }, ], "num_epochs": 2, - "micro_batch_size": 2, + "micro_batch_size": 4, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.00001, + "learning_rate": 2e-4, "optimizer": "adamw_bnb_8bit", "lr_scheduler": "cosine", - "max_steps": 20, - "save_steps": 10, - "eval_steps": 10, + "max_steps": 50, + "logging_steps": 1, + "save_steps": 50, + "eval_steps": 50, "bf16": "auto", "save_first_step": False, + "use_tensorboard": True, } ) cfg = validate_config(cfg) @@ -107,3 +118,10 @@ class TestFalconPatched(unittest.TestCase): train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) + check_tensorboard_loss_decreased( + temp_dir + "/runs", + 
initial_window=5, + final_window=5, + max_initial=6.0, + max_final=4.7, + ) diff --git a/tests/e2e/patched/test_mistral_samplepack.py b/tests/e2e/patched/test_mistral_samplepack.py index e03941b07..7b38319b3 100644 --- a/tests/e2e/patched/test_mistral_samplepack.py +++ b/tests/e2e/patched/test_mistral_samplepack.py @@ -9,7 +9,12 @@ from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config from axolotl.utils.dict import DictDefault -from ..utils import check_model_output_exists, require_torch_2_6_0, with_temp_dir +from ..utils import ( + check_model_output_exists, + check_tensorboard_loss_decreased, + require_torch_2_6_0, + with_temp_dir, +) class TestMistral(unittest.TestCase): @@ -22,7 +27,7 @@ class TestMistral(unittest.TestCase): def test_lora_packing(self, temp_dir): cfg = DictDefault( { - "base_model": "trl-internal-testing/tiny-MistralForCausalLM-0.2", + "base_model": "axolotl-ai-co/tiny-mistral-25m", "flash_attention": True, "sample_packing": True, "sequence_len": 1024, @@ -45,17 +50,19 @@ class TestMistral(unittest.TestCase): }, ], "num_epochs": 2, - "micro_batch_size": 2, + "micro_batch_size": 4, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.00001, + "learning_rate": 2e-4, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", - "max_steps": 5, - "save_steps": 3, - "eval_steps": 4, + "max_steps": 50, + "logging_steps": 1, + "save_steps": 50, + "eval_steps": 50, "bf16": "auto", "save_first_step": False, + "use_tensorboard": True, } ) cfg = validate_config(cfg) @@ -64,12 +71,19 @@ class TestMistral(unittest.TestCase): train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) + check_tensorboard_loss_decreased( + temp_dir + "/runs", + initial_window=5, + final_window=5, + max_initial=5.5, + max_final=4.3, + ) @with_temp_dir def test_ft_packing(self, temp_dir): cfg = DictDefault( { - "base_model": "trl-internal-testing/tiny-MistralForCausalLM-0.2", + 
"base_model": "axolotl-ai-co/tiny-mistral-25m", "flash_attention": True, "sample_packing": True, "sequence_len": 1024, @@ -86,17 +100,19 @@ class TestMistral(unittest.TestCase): }, ], "num_epochs": 2, - "micro_batch_size": 2, + "micro_batch_size": 4, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.00001, + "learning_rate": 2e-4, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", - "max_steps": 5, - "save_steps": 3, - "eval_steps": 4, + "max_steps": 50, + "logging_steps": 1, + "save_steps": 50, + "eval_steps": 50, "bf16": "auto", "save_first_step": False, + "use_tensorboard": True, } ) cfg = validate_config(cfg) @@ -105,3 +121,10 @@ class TestMistral(unittest.TestCase): train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) + check_tensorboard_loss_decreased( + temp_dir + "/runs", + initial_window=5, + final_window=5, + max_initial=5.5, + max_final=4.3, + ) diff --git a/tests/e2e/patched/test_mixtral_samplepack.py b/tests/e2e/patched/test_mixtral_samplepack.py index 3517ff3db..92a3810c4 100644 --- a/tests/e2e/patched/test_mixtral_samplepack.py +++ b/tests/e2e/patched/test_mixtral_samplepack.py @@ -9,7 +9,11 @@ from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config from axolotl.utils.dict import DictDefault -from ..utils import check_model_output_exists, with_temp_dir +from ..utils import ( + check_model_output_exists, + check_tensorboard_loss_decreased, + with_temp_dir, +) class TestMixtral(unittest.TestCase): @@ -21,8 +25,7 @@ class TestMixtral(unittest.TestCase): def test_qlora(self, temp_dir): cfg = DictDefault( { - "base_model": "hf-internal-testing/Mixtral-tiny", - "tokenizer_config": "LoneStriker/Mixtral-8x7B-v0.1-HF", + "base_model": "axolotl-ai-co/tiny-mixtral-30m", "flash_attention": True, "sample_packing": True, "sequence_len": 2048, @@ -41,17 +44,19 @@ class TestMixtral(unittest.TestCase): }, ], "num_epochs": 2, - "micro_batch_size": 2, + 
"micro_batch_size": 4, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.00001, + "learning_rate": 2e-4, "optimizer": "adamw_bnb_8bit", "lr_scheduler": "cosine", - "max_steps": 5, - "save_steps": 3, - "eval_steps": 4, + "max_steps": 50, + "logging_steps": 1, + "save_steps": 50, + "eval_steps": 50, "bf16": "auto", "save_first_step": False, + "use_tensorboard": True, } ) cfg = validate_config(cfg) @@ -60,13 +65,19 @@ class TestMixtral(unittest.TestCase): train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) + check_tensorboard_loss_decreased( + temp_dir + "/runs", + initial_window=5, + final_window=5, + max_initial=6.0, + max_final=4.7, + ) @with_temp_dir def test_ft(self, temp_dir): cfg = DictDefault( { - "base_model": "hf-internal-testing/Mixtral-tiny", - "tokenizer_config": "LoneStriker/Mixtral-8x7B-v0.1-HF", + "base_model": "axolotl-ai-co/tiny-mixtral-30m", "flash_attention": True, "sample_packing": True, "sequence_len": 2048, @@ -79,17 +90,19 @@ class TestMixtral(unittest.TestCase): }, ], "num_epochs": 2, - "micro_batch_size": 2, + "micro_batch_size": 4, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.00001, + "learning_rate": 2e-4, "optimizer": "adamw_bnb_8bit", "lr_scheduler": "cosine", - "max_steps": 5, - "save_steps": 3, - "eval_steps": 4, + "max_steps": 50, + "logging_steps": 1, + "save_steps": 50, + "eval_steps": 50, "bf16": "auto", "save_first_step": False, + "use_tensorboard": True, } ) cfg = validate_config(cfg) @@ -98,3 +111,10 @@ class TestMixtral(unittest.TestCase): train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) + check_tensorboard_loss_decreased( + temp_dir + "/runs", + initial_window=5, + final_window=5, + max_initial=6.0, + max_final=4.7, + ) diff --git a/tests/e2e/patched/test_model_patches.py b/tests/e2e/patched/test_model_patches.py index aaaaf5fe2..83448d175 100644 --- a/tests/e2e/patched/test_model_patches.py +++ 
b/tests/e2e/patched/test_model_patches.py @@ -22,8 +22,7 @@ class TestModelPatches(unittest.TestCase): def test_mixtral_multipack(self, temp_dir): cfg = DictDefault( { - "base_model": "hf-internal-testing/Mixtral-tiny", - "tokenizer_config": "LoneStriker/Mixtral-8x7B-v0.1-HF", + "base_model": "axolotl-ai-co/tiny-mixtral-30m", "flash_attention": True, "sample_packing": True, "sequence_len": 2048, @@ -57,7 +56,7 @@ class TestModelPatches(unittest.TestCase): def test_mistral_multipack(self, temp_dir): cfg = DictDefault( { - "base_model": "trl-internal-testing/tiny-MistralForCausalLM-0.2", + "base_model": "axolotl-ai-co/tiny-mistral-25m", "flash_attention": True, "sample_packing": True, "sequence_len": 2048, diff --git a/tests/e2e/patched/test_phi_multipack.py b/tests/e2e/patched/test_phi_multipack.py index 77b2d99e5..2e8d45f05 100644 --- a/tests/e2e/patched/test_phi_multipack.py +++ b/tests/e2e/patched/test_phi_multipack.py @@ -9,7 +9,11 @@ from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config from axolotl.utils.dict import DictDefault -from ..utils import check_model_output_exists, with_temp_dir +from ..utils import ( + check_model_output_exists, + check_tensorboard_loss_decreased, + with_temp_dir, +) class TestPhiMultipack(unittest.TestCase): @@ -21,7 +25,7 @@ class TestPhiMultipack(unittest.TestCase): def test_ft_packed(self, temp_dir): cfg = DictDefault( { - "base_model": "microsoft/phi-1_5", + "base_model": "axolotl-ai-co/tiny-phi-64m", "model_type": "PhiForCausalLM", "tokenizer_type": "AutoTokenizer", "sequence_len": 1024, @@ -43,17 +47,19 @@ class TestPhiMultipack(unittest.TestCase): "dataset_shard_num": 10, "dataset_shard_idx": 0, "num_epochs": 1, - "micro_batch_size": 1, + "micro_batch_size": 2, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.00001, - "optimizer": "adamw_bnb_8bit", + "learning_rate": 2e-4, + "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", - "max_steps": 5, 
- "eval_steps": 3, - "save_steps": 4, + "max_steps": 50, + "logging_steps": 1, + "eval_steps": 50, + "save_steps": 50, "bf16": "auto", "save_first_step": False, + "use_tensorboard": True, } ) @@ -63,12 +69,19 @@ class TestPhiMultipack(unittest.TestCase): train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) + check_tensorboard_loss_decreased( + temp_dir + "/runs", + initial_window=5, + final_window=5, + max_initial=6.0, + max_final=4.7, + ) @with_temp_dir def test_qlora_packed(self, temp_dir): cfg = DictDefault( { - "base_model": "microsoft/phi-1_5", + "base_model": "axolotl-ai-co/tiny-phi-64m", "model_type": "PhiForCausalLM", "tokenizer_type": "AutoTokenizer", "sequence_len": 1024, @@ -94,17 +107,19 @@ class TestPhiMultipack(unittest.TestCase): "dataset_shard_num": 10, "dataset_shard_idx": 0, "num_epochs": 1, - "micro_batch_size": 1, + "micro_batch_size": 2, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.00001, + "learning_rate": 2e-4, "optimizer": "adamw_bnb_8bit", "lr_scheduler": "cosine", - "max_steps": 5, - "eval_steps": 3, - "save_steps": 4, + "max_steps": 50, + "logging_steps": 1, + "eval_steps": 50, + "save_steps": 50, "bf16": "auto", "save_first_step": False, + "use_tensorboard": True, } ) @@ -114,3 +129,10 @@ class TestPhiMultipack(unittest.TestCase): train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) + check_tensorboard_loss_decreased( + temp_dir + "/runs", + initial_window=5, + final_window=5, + max_initial=6.0, + max_final=4.7, + ) diff --git a/tests/e2e/solo/test_batch_flattening.py b/tests/e2e/solo/test_batch_flattening.py index 80b7b0259..7b6c59119 100644 --- a/tests/e2e/solo/test_batch_flattening.py +++ b/tests/e2e/solo/test_batch_flattening.py @@ -18,7 +18,7 @@ from transformers import AutoModelForCausalLM # Import the actual trainer methods we want to test from axolotl.core.trainers.grpo.async_trainer import AsyncGRPOTrainer -MODEL_NAME = 
"Qwen/Qwen3-0.6B" +MODEL_NAME = "axolotl-ai-co/tiny-qwen3-129m" def _fix_patched_attention(model): diff --git a/tests/e2e/test_falcon.py b/tests/e2e/test_falcon.py index 1a363fe6a..6a7ead1e0 100644 --- a/tests/e2e/test_falcon.py +++ b/tests/e2e/test_falcon.py @@ -4,14 +4,16 @@ E2E tests for falcon import unittest -import pytest - from axolotl.common.datasets import load_datasets from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config from axolotl.utils.dict import DictDefault -from .utils import check_model_output_exists, with_temp_dir +from .utils import ( + check_model_output_exists, + check_tensorboard_loss_decreased, + with_temp_dir, +) class TestFalcon(unittest.TestCase): @@ -19,13 +21,12 @@ class TestFalcon(unittest.TestCase): Test case for falcon """ - @pytest.mark.skip(reason="no tiny models for testing with safetensors") @with_temp_dir def test_lora(self, temp_dir): cfg = DictDefault( { - "base_model": "illuin/tiny-random-FalconForCausalLM", - "flash_attention": True, + "base_model": "axolotl-ai-co/tiny-falcon-42m", + "flash_attention": False, "sequence_len": 1024, "load_in_8bit": True, "adapter": "lora", @@ -49,17 +50,19 @@ class TestFalcon(unittest.TestCase): }, ], "num_epochs": 2, - "micro_batch_size": 2, + "micro_batch_size": 4, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.00001, + "learning_rate": 2e-4, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", - "max_steps": 20, - "save_steps": 10, - "eval_steps": 10, + "max_steps": 50, + "logging_steps": 1, + "save_steps": 50, + "eval_steps": 50, "bf16": "auto", "save_first_step": False, + "use_tensorboard": True, } ) @@ -69,14 +72,20 @@ class TestFalcon(unittest.TestCase): train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) + check_tensorboard_loss_decreased( + temp_dir + "/runs", + initial_window=5, + final_window=5, + max_initial=5.0, + max_final=4.7, + ) - @pytest.mark.skip(reason="no 
tiny models for testing with safetensors") @with_temp_dir def test_lora_added_vocab(self, temp_dir): cfg = DictDefault( { - "base_model": "illuin/tiny-random-FalconForCausalLM", - "flash_attention": True, + "base_model": "axolotl-ai-co/tiny-falcon-42m", + "flash_attention": False, "sequence_len": 1024, "load_in_8bit": True, "adapter": "lora", @@ -104,17 +113,19 @@ class TestFalcon(unittest.TestCase): }, ], "num_epochs": 2, - "micro_batch_size": 2, + "micro_batch_size": 4, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.00001, + "learning_rate": 2e-4, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", - "max_steps": 20, - "save_steps": 10, - "eval_steps": 10, + "max_steps": 50, + "logging_steps": 1, + "save_steps": 50, + "eval_steps": 50, "bf16": "auto", "save_first_step": False, + "use_tensorboard": True, } ) @@ -124,14 +135,20 @@ class TestFalcon(unittest.TestCase): train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) + check_tensorboard_loss_decreased( + temp_dir + "/runs", + initial_window=5, + final_window=5, + max_initial=5.0, + max_final=4.7, + ) - @pytest.mark.skip(reason="no tiny models for testing with safetensors") @with_temp_dir def test_ft(self, temp_dir): cfg = DictDefault( { - "base_model": "illuin/tiny-random-FalconForCausalLM", - "flash_attention": True, + "base_model": "axolotl-ai-co/tiny-falcon-42m", + "flash_attention": False, "sequence_len": 1024, "val_set_size": 0.02, "special_tokens": { @@ -145,17 +162,19 @@ class TestFalcon(unittest.TestCase): }, ], "num_epochs": 2, - "micro_batch_size": 2, + "micro_batch_size": 4, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.00001, + "learning_rate": 2e-4, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", - "max_steps": 20, - "save_steps": 10, - "eval_steps": 10, + "max_steps": 50, + "logging_steps": 1, + "save_steps": 50, + "eval_steps": 50, "bf16": "auto", "save_first_step": False, + 
"use_tensorboard": True, } ) @@ -165,3 +184,10 @@ class TestFalcon(unittest.TestCase): train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) + check_tensorboard_loss_decreased( + temp_dir + "/runs", + initial_window=5, + final_window=5, + max_initial=5.0, + max_final=4.7, + ) diff --git a/tests/e2e/test_mistral.py b/tests/e2e/test_mistral.py index 08b3b05af..37cae7ce7 100644 --- a/tests/e2e/test_mistral.py +++ b/tests/e2e/test_mistral.py @@ -11,7 +11,11 @@ from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config from axolotl.utils.dict import DictDefault -from .utils import check_model_output_exists, with_temp_dir +from .utils import ( + check_model_output_exists, + check_tensorboard_loss_decreased, + with_temp_dir, +) class TestMistral(unittest.TestCase): @@ -23,7 +27,7 @@ class TestMistral(unittest.TestCase): def test_lora(self, temp_dir): cfg = DictDefault( { - "base_model": "trl-internal-testing/tiny-MistralForCausalLM-0.2", + "base_model": "axolotl-ai-co/tiny-mistral-25m", "flash_attention": True, "sequence_len": 1024, "load_in_8bit": True, @@ -45,16 +49,18 @@ class TestMistral(unittest.TestCase): }, ], "num_epochs": 2, - "micro_batch_size": 2, + "micro_batch_size": 4, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.00001, + "learning_rate": 2e-4, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", - "max_steps": 20, - "save_steps": 10, - "eval_steps": 10, + "max_steps": 50, + "logging_steps": 1, + "save_steps": 50, + "eval_steps": 50, "save_first_step": False, + "use_tensorboard": True, } ) @@ -64,12 +70,19 @@ class TestMistral(unittest.TestCase): train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) + check_tensorboard_loss_decreased( + temp_dir + "/runs", + initial_window=5, + final_window=5, + max_initial=4.5, + max_final=4.3, + ) @with_temp_dir def test_ft(self, temp_dir): cfg = DictDefault( { - "base_model": 
"trl-internal-testing/tiny-MistralForCausalLM-0.2", + "base_model": "axolotl-ai-co/tiny-mistral-25m", "flash_attention": True, "sequence_len": 1024, "val_set_size": 0.02, @@ -85,16 +98,18 @@ class TestMistral(unittest.TestCase): }, ], "num_epochs": 2, - "micro_batch_size": 2, + "micro_batch_size": 4, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.00001, + "learning_rate": 2e-4, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", - "max_steps": 20, - "save_steps": 10, - "eval_steps": 10, + "max_steps": 50, + "logging_steps": 1, + "save_steps": 50, + "eval_steps": 50, "save_first_step": False, + "use_tensorboard": True, } ) if is_torch_bf16_gpu_available(): @@ -108,3 +123,10 @@ class TestMistral(unittest.TestCase): train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) + check_tensorboard_loss_decreased( + temp_dir + "/runs", + initial_window=5, + final_window=5, + max_initial=4.5, + max_final=4.3, + ) diff --git a/tests/e2e/test_mixtral.py b/tests/e2e/test_mixtral.py index c47486b3c..00c75f426 100644 --- a/tests/e2e/test_mixtral.py +++ b/tests/e2e/test_mixtral.py @@ -12,7 +12,11 @@ from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config from axolotl.utils.dict import DictDefault -from .utils import check_model_output_exists, with_temp_dir +from .utils import ( + check_model_output_exists, + check_tensorboard_loss_decreased, + with_temp_dir, +) class TestMixtral(unittest.TestCase): @@ -24,8 +28,7 @@ class TestMixtral(unittest.TestCase): def test_qlora_w_fa2(self, temp_dir): cfg = DictDefault( { - "base_model": "hf-internal-testing/Mixtral-tiny", - "tokenizer_config": "LoneStriker/Mixtral-8x7B-v0.1-HF", + "base_model": "axolotl-ai-co/tiny-mixtral-30m", "flash_attention": True, "sequence_len": 1024, "load_in_4bit": True, @@ -51,16 +54,18 @@ class TestMixtral(unittest.TestCase): }, ], "num_epochs": 2, - "micro_batch_size": 2, + "micro_batch_size": 4, 
"gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.00001, + "learning_rate": 2e-4, "optimizer": "adamw_bnb_8bit", "lr_scheduler": "cosine", - "max_steps": 20, - "save_steps": 10, - "eval_steps": 10, + "max_steps": 50, + "logging_steps": 1, + "save_steps": 50, + "eval_steps": 50, "save_first_step": False, + "use_tensorboard": True, } ) @@ -74,13 +79,19 @@ class TestMixtral(unittest.TestCase): == torch.float32 ) check_model_output_exists(temp_dir, cfg) + check_tensorboard_loss_decreased( + temp_dir + "/runs", + initial_window=5, + final_window=5, + max_initial=5.0, + max_final=4.7, + ) @with_temp_dir def test_qlora_wo_fa2(self, temp_dir): cfg = DictDefault( { - "base_model": "hf-internal-testing/Mixtral-tiny", - "tokenizer_config": "LoneStriker/Mixtral-8x7B-v0.1-HF", + "base_model": "axolotl-ai-co/tiny-mixtral-30m", "flash_attention": False, "sequence_len": 1024, "load_in_4bit": True, @@ -106,16 +117,18 @@ class TestMixtral(unittest.TestCase): }, ], "num_epochs": 2, - "micro_batch_size": 2, + "micro_batch_size": 4, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.00001, + "learning_rate": 2e-4, "optimizer": "adamw_bnb_8bit", "lr_scheduler": "cosine", - "max_steps": 20, - "save_steps": 10, - "eval_steps": 10, + "max_steps": 50, + "logging_steps": 1, + "save_steps": 50, + "eval_steps": 50, "save_first_step": False, + "use_tensorboard": True, } ) @@ -129,13 +142,19 @@ class TestMixtral(unittest.TestCase): == torch.float32 ) check_model_output_exists(temp_dir, cfg) + check_tensorboard_loss_decreased( + temp_dir + "/runs", + initial_window=5, + final_window=5, + max_initial=5.0, + max_final=4.7, + ) @with_temp_dir def test_16bit_lora_w_fa2(self, temp_dir): cfg = DictDefault( { - "base_model": "hf-internal-testing/Mixtral-tiny", - "tokenizer_config": "LoneStriker/Mixtral-8x7B-v0.1-HF", + "base_model": "axolotl-ai-co/tiny-mixtral-30m", "flash_attention": True, "sequence_len": 1024, "adapter": "lora", @@ -160,16 
+179,18 @@ class TestMixtral(unittest.TestCase): }, ], "num_epochs": 2, - "micro_batch_size": 2, + "micro_batch_size": 4, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.00001, + "learning_rate": 2e-4, "optimizer": "adamw_bnb_8bit", "lr_scheduler": "cosine", - "max_steps": 20, - "save_steps": 10, - "eval_steps": 10, + "max_steps": 50, + "logging_steps": 1, + "save_steps": 50, + "eval_steps": 50, "save_first_step": False, + "use_tensorboard": True, } ) if is_torch_bf16_gpu_available(): @@ -187,13 +208,19 @@ class TestMixtral(unittest.TestCase): == torch.float32 ) check_model_output_exists(temp_dir, cfg) + check_tensorboard_loss_decreased( + temp_dir + "/runs", + initial_window=5, + final_window=5, + max_initial=5.0, + max_final=4.7, + ) @with_temp_dir def test_16bit_lora_wo_fa2(self, temp_dir): cfg = DictDefault( { - "base_model": "hf-internal-testing/Mixtral-tiny", - "tokenizer_config": "LoneStriker/Mixtral-8x7B-v0.1-HF", + "base_model": "axolotl-ai-co/tiny-mixtral-30m", "flash_attention": False, "sequence_len": 1024, "adapter": "lora", @@ -218,16 +245,18 @@ class TestMixtral(unittest.TestCase): }, ], "num_epochs": 2, - "micro_batch_size": 2, + "micro_batch_size": 4, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.00001, + "learning_rate": 2e-4, "optimizer": "adamw_bnb_8bit", "lr_scheduler": "cosine", - "max_steps": 20, - "save_steps": 10, - "eval_steps": 10, + "max_steps": 50, + "logging_steps": 1, + "save_steps": 50, + "eval_steps": 50, "save_first_step": False, + "use_tensorboard": True, } ) @@ -245,13 +274,19 @@ class TestMixtral(unittest.TestCase): == torch.float32 ) check_model_output_exists(temp_dir, cfg) + check_tensorboard_loss_decreased( + temp_dir + "/runs", + initial_window=5, + final_window=5, + max_initial=5.0, + max_final=4.7, + ) @with_temp_dir def test_ft(self, temp_dir): cfg = DictDefault( { - "base_model": "hf-internal-testing/Mixtral-tiny", - "tokenizer_config": 
"LoneStriker/Mixtral-8x7B-v0.1-HF", + "base_model": "axolotl-ai-co/tiny-mixtral-30m", "flash_attention": True, "sequence_len": 1024, "val_set_size": 0.02, @@ -263,16 +298,18 @@ class TestMixtral(unittest.TestCase): }, ], "num_epochs": 2, - "micro_batch_size": 2, + "micro_batch_size": 4, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.00001, + "learning_rate": 2e-4, "optimizer": "adamw_bnb_8bit", "lr_scheduler": "cosine", - "max_steps": 20, - "save_steps": 10, - "eval_steps": 10, + "max_steps": 50, + "logging_steps": 1, + "save_steps": 50, + "eval_steps": 50, "save_first_step": False, + "use_tensorboard": True, } ) if is_torch_bf16_gpu_available(): @@ -286,3 +323,10 @@ class TestMixtral(unittest.TestCase): train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) + check_tensorboard_loss_decreased( + temp_dir + "/runs", + initial_window=5, + final_window=5, + max_initial=5.0, + max_final=4.7, + ) diff --git a/tests/e2e/test_optimizers.py b/tests/e2e/test_optimizers.py index a53e8b005..55183eaae 100644 --- a/tests/e2e/test_optimizers.py +++ b/tests/e2e/test_optimizers.py @@ -13,6 +13,7 @@ from axolotl.utils.dict import DictDefault from .utils import ( check_model_output_exists, + check_tensorboard_loss_decreased, require_torch_2_5_1, require_torch_2_6_0, require_torch_2_7_0, @@ -243,8 +244,8 @@ class TestCustomOptimizers(unittest.TestCase): def test_came_pytorch(self, temp_dir): cfg = DictDefault( { - "base_model": "JackFram/llama-68m", - "tokenizer_type": "LlamaTokenizer", + "base_model": "axolotl-ai-co/tiny-llama-50m", + "tokenizer_type": "AutoTokenizer", "sequence_len": 1024, "load_in_8bit": True, "adapter": "lora", @@ -254,9 +255,7 @@ class TestCustomOptimizers(unittest.TestCase): "lora_target_linear": True, "val_set_size": 0.1, "special_tokens": { - "unk_token": "", - "bos_token": "", - "eos_token": "", + "pad_token": "<|endoftext|>", }, "datasets": [ { @@ -268,13 +267,15 @@ class 
TestCustomOptimizers(unittest.TestCase): "micro_batch_size": 8, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.00001, + "learning_rate": 2e-4, "optimizer": "came_pytorch", "adam_beta3": 0.9999, "adam_epsilon2": 1e-16, - "max_steps": 5, + "max_steps": 50, + "logging_steps": 1, "lr_scheduler": "cosine", "save_first_step": False, + "use_tensorboard": True, } ) @@ -284,6 +285,13 @@ class TestCustomOptimizers(unittest.TestCase): train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) + check_tensorboard_loss_decreased( + temp_dir + "/runs", + initial_window=5, + final_window=5, + max_initial=4.5, + max_final=4.3, + ) @require_torch_2_7_0 diff --git a/tests/e2e/test_phi.py b/tests/e2e/test_phi.py index ae2210249..393713b20 100644 --- a/tests/e2e/test_phi.py +++ b/tests/e2e/test_phi.py @@ -9,7 +9,11 @@ from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config from axolotl.utils.dict import DictDefault -from .utils import check_model_output_exists, with_temp_dir +from .utils import ( + check_model_output_exists, + check_tensorboard_loss_decreased, + with_temp_dir, +) class TestPhi(unittest.TestCase): @@ -21,7 +25,7 @@ class TestPhi(unittest.TestCase): def test_phi_ft(self, temp_dir): cfg = DictDefault( { - "base_model": "microsoft/phi-1_5", + "base_model": "axolotl-ai-co/tiny-phi-64m", "model_type": "AutoModelForCausalLM", "tokenizer_type": "AutoTokenizer", "sequence_len": 2048, @@ -41,18 +45,20 @@ class TestPhi(unittest.TestCase): "dataset_shard_num": 10, "dataset_shard_idx": 0, "num_epochs": 1, - "micro_batch_size": 1, + "micro_batch_size": 4, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.00001, - "optimizer": "paged_adamw_8bit", + "learning_rate": 2e-4, + "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "flash_attention": True, - "max_steps": 10, - "save_steps": 10, - "eval_steps": 10, + "max_steps": 50, + "logging_steps": 1, + 
"save_steps": 50, + "eval_steps": 50, "bf16": "auto", "save_first_step": False, + "use_tensorboard": True, } ) cfg = validate_config(cfg) @@ -61,12 +67,19 @@ class TestPhi(unittest.TestCase): train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) + check_tensorboard_loss_decreased( + temp_dir + "/runs", + initial_window=5, + final_window=5, + max_initial=5.0, + max_final=4.7, + ) @with_temp_dir def test_phi_qlora(self, temp_dir): cfg = DictDefault( { - "base_model": "microsoft/phi-1_5", + "base_model": "axolotl-ai-co/tiny-phi-64m", "model_type": "AutoModelForCausalLM", "tokenizer_type": "AutoTokenizer", "sequence_len": 2048, @@ -90,18 +103,20 @@ class TestPhi(unittest.TestCase): "dataset_shard_num": 10, "dataset_shard_idx": 0, "num_epochs": 1, - "micro_batch_size": 1, + "micro_batch_size": 4, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.00001, + "learning_rate": 2e-4, "optimizer": "paged_adamw_8bit", "lr_scheduler": "cosine", "flash_attention": True, - "max_steps": 10, - "save_steps": 10, - "eval_steps": 10, + "max_steps": 50, + "logging_steps": 1, + "save_steps": 50, + "eval_steps": 50, "bf16": "auto", "save_first_step": False, + "use_tensorboard": True, } ) cfg = validate_config(cfg) @@ -110,3 +125,10 @@ class TestPhi(unittest.TestCase): train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) + check_tensorboard_loss_decreased( + temp_dir + "/runs", + initial_window=5, + final_window=5, + max_initial=5.0, + max_final=4.7, + ) diff --git a/tests/e2e/test_preprocess.py b/tests/e2e/test_preprocess.py index 8f15cbe55..895c29c87 100644 --- a/tests/e2e/test_preprocess.py +++ b/tests/e2e/test_preprocess.py @@ -18,7 +18,7 @@ class TestPreprocess: cfg = DictDefault( { - "base_model": "Qwen/Qwen2.5-0.5B", + "base_model": "axolotl-ai-co/tiny-qwen2-129m", "sequence_len": 2048, "val_set_size": 0.01, "datasets": [ diff --git a/tests/e2e/test_quantization.py b/tests/e2e/test_quantization.py 
index a70a46194..bb8ce969c 100644
--- a/tests/e2e/test_quantization.py
+++ b/tests/e2e/test_quantization.py
@@ -45,7 +45,7 @@ def _get_fake_quant_config_dtype(config):
 @pytest.fixture()
 def model():
     dummy_model = AutoModelForCausalLM.from_pretrained(
-        "Qwen/Qwen2-0.5B",
+        "axolotl-ai-co/tiny-qwen2-129m",
         device_map="auto",
         dtype=torch.bfloat16,
     )
diff --git a/tests/e2e/test_qwen.py b/tests/e2e/test_qwen.py
index 1c75d817b..b8654c0ad 100644
--- a/tests/e2e/test_qwen.py
+++ b/tests/e2e/test_qwen.py
@@ -17,7 +17,7 @@ class TestE2eQwen:
     Test cases for qwen models
     """
 
-    @pytest.mark.parametrize("base_model", ["Qwen/Qwen2-0.5B", "Qwen/Qwen2.5-0.5B"])
+    @pytest.mark.parametrize("base_model", ["axolotl-ai-co/tiny-qwen2-129m"])
     def test_dpo(self, base_model, temp_dir):
         cfg = DictDefault(
             {
diff --git a/tests/e2e/utils.py b/tests/e2e/utils.py
index e1eaca050..f0f5d9f13 100644
--- a/tests/e2e/utils.py
+++ b/tests/e2e/utils.py
@@ -199,6 +199,127 @@ def check_tensorboard(
     assert df.value.values[-1] > 1e-5, "Expected loss to be greater than zero"
 
 
+def check_tensorboard_loss_decreased(
+    temp_run_dir: str,
+    tag: str | None = None,
+    initial_window: int = 1,
+    final_window: int = 1,
+    min_delta: float | None = None,
+    max_initial: float | None = None,
+    max_final: float | None = None,
+    max_loss_ratio: float = 1.10,
+) -> None:
+    """Check that training didn't regress — loss stayed in a sensible range.
+
+    Used with the tiny ``axolotl-ai-co/tiny-*`` CI models, where pretraining
+    was brief enough that final loss won't clear the absolute thresholds used
+    for 135M+ models — but the training pipeline should still behave.
+
+    ``train/train_loss`` is only logged once (end-of-training aggregate). The
+    per-step tag is ``train/loss`` for SFT/LM trainers and may vary across
+    trainers (e.g. DPO). When ``tag`` is None we try common per-step tags in
+    order and use the first with enough samples.
+
+    Two kinds of regression we guard against:
+
+    1. **Loss blew up.** A silent bug (e.g. broken label masking) can start
+       training at an absurdly high loss. ``max_initial`` / ``max_final``
+       assert the measured means stay at-or-below bounds measured from a
+       known-good run. Both are optional but strongly encouraged — loss
+       going *down* from a bad starting scale still looks like "learning."
+
+    2. **Training diverged.** ``max_loss_ratio`` (default 1.10) requires
+       ``final <= initial * ratio``. Allows small noise in flat-loss cases
+       (common with tiny pretrained models that start near optimum), but
+       a final loss 10%+ above initial flags instability / NaNs / drift.
+
+    ``min_delta`` is optional; when set, additionally requires
+    ``final + min_delta <= initial`` — use for configs with enough signal
+    to demand a strict decrease.
+
+    Args:
+        temp_run_dir: Directory passed to ``most_recent_subdir`` to locate
+            the TensorBoard log directory to read.
+        tag: Scalar tag to check. ``None`` tries ``train/loss`` and then
+            ``train/train_loss``.
+        initial_window: Number of leading samples averaged into ``initial``.
+        final_window: Number of trailing samples averaged into ``final``.
+        min_delta: Optional minimum required drop from ``initial`` to
+            ``final``.
+        max_initial: Optional upper bound on ``initial``.
+        max_final: Optional upper bound on ``final``.
+        max_loss_ratio: Multiplier; ``final`` must not exceed
+            ``initial * max_loss_ratio``.
+
+    Raises:
+        ValueError: If ``initial_window`` or ``final_window`` is below 1.
+        AssertionError: If no candidate tag has enough samples, or any of
+            the loss checks fails.
+    """
+    # A zero window is a silent trap: ``values[-0:]`` is the *whole* series
+    # and ``values[:0].mean()`` is NaN, so reject bad windows up front.
+    if initial_window < 1 or final_window < 1:
+        raise ValueError(
+            "initial_window and final_window must be >= 1, got "
+            f"initial_window={initial_window}, final_window={final_window}"
+        )
+
+    tb_log_path = most_recent_subdir(temp_run_dir)
+    event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0])
+    reader = SummaryReader(event_file)
+    df = reader.scalars
+
+    candidates = ["train/loss", "train/train_loss"] if tag is None else [tag]
+
+    required = initial_window + final_window
+    chosen_tag, values = None, None
+    for candidate in candidates:
+        sub = df[df.tag == candidate]
+        if len(sub) >= required:
+            chosen_tag = candidate
+            values = sub.value.values
+            break
+
+    available = sorted({t for t in df.tag.unique() if "loss" in t.lower()})
+    assert values is not None, (
+        f"None of the tags {candidates} had ≥{required} logged steps. "
+        f"Loss tags present: {available}"
+    )
+
+    initial = float(values[:initial_window].mean())
+    final = float(values[-final_window:].mean())
+    print(
+        f"[check_tensorboard_loss_decreased] tag={chosen_tag} n={len(values)} "
+        f"initial_mean{initial_window}={initial:.4f} final_mean{final_window}={final:.4f}"
+    )
+    assert final > 1e-5, "Expected loss to be greater than zero"
+    # Pre-compute the diagnostic ratio: dividing inside the message would raise
+    # ZeroDivisionError for a zero initial loss and hide the real failure.
+    ratio = final / initial if initial != 0 else float("inf")
+    assert final <= initial * max_loss_ratio, (
+        f"Loss regressed for {chosen_tag}: "
+        f"initial(mean of first {initial_window})={initial:.4f}, "
+        f"final(mean of last {final_window})={final:.4f}, "
+        f"ratio={ratio:.4f} (max allowed {max_loss_ratio})"
+    )
+    if min_delta is not None:
+        assert final + min_delta <= initial, (
+            f"Expected loss to decrease by at least {min_delta} for {chosen_tag}: "
+            f"initial={initial:.4f}, final={final:.4f}, delta={initial - final:.4f}"
+        )
+    if max_initial is not None:
+        assert initial <= max_initial, (
+            f"Initial loss {initial:.4f} is above the expected max {max_initial}. "
+            f"Absolute scale is wrong — probably a silent regression "
+            f"(e.g. bad label masking) that bumped the starting point."
+        )
+    if max_final is not None:
+        assert final <= max_final, (
+            f"Final loss {final:.4f} is above the expected max {max_final}. "
+            f"Absolute scale is wrong — probably a silent regression "
+            f"(e.g. bad label masking) that bumped the endpoint."
+        )
+
+
 def check_model_output_exists(temp_dir: str, cfg: DictDefault) -> None:
     """
     helper function to check if a model output file exists after training