From ac77da96daf074475fc8f207b179510ed7ba3f17 Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Mon, 27 Apr 2026 13:22:56 -0400
Subject: [PATCH] use smaller pretrained models for ci (#3620) [skip ci]

* use smaller pretrained models for ci
* more steps for loss check
* fix tests
* more train steps
* fix losses
---
 tests/conftest.py                             |  56 +++++++--
 .../integrations/test_cut_cross_entropy.py    |  41 ++++++-
 tests/e2e/kernels/test_lora_features.py       |   2 +-
 tests/e2e/multigpu/test_dist_muon_fsdp2.py    |  51 ++++----
 tests/e2e/multigpu/test_fsdp1.py              |  69 ++++++-----
 tests/e2e/multigpu/test_fsdp2.py              | 101 ++++++++------
 tests/e2e/multigpu/test_fsdp2_lora_kernels.py |   2 +-
 tests/e2e/multigpu/test_tp.py                 |   8 +-
 .../lora_kernels/test_lora_kernel_patching.py |   8 +-
 tests/e2e/patched/test_falcon_samplepack.py   |  58 ++++++---
 tests/e2e/patched/test_mistral_samplepack.py  |  51 ++++++--
 tests/e2e/patched/test_mixtral_samplepack.py  |  58 ++++++---
 tests/e2e/patched/test_model_patches.py       |   5 +-
 tests/e2e/patched/test_phi_multipack.py       |  52 +++++---
 tests/e2e/solo/test_batch_flattening.py       |   2 +-
 tests/e2e/test_falcon.py                      |  88 +++++++++----
 tests/e2e/test_mistral.py                     |  48 ++++++--
 tests/e2e/test_mixtral.py                     | 116 ++++++++++++------
 tests/e2e/test_optimizers.py                  |  28 +++--
 tests/e2e/test_phi.py                         |  54 +++++---
 tests/e2e/test_preprocess.py                  |   2 +-
 tests/e2e/test_quantization.py                |   2 +-
 tests/e2e/test_qwen.py                        |   2 +-
 tests/e2e/utils.py                            | 100 +++++++++++++++
 24 files changed, 716 insertions(+), 288 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 19e3dc3f0..16a01f8aa 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -119,15 +119,49 @@ def download_smollm2_135m_gptq_model():
 
 
 @pytest.fixture(scope="session", autouse=True)
-def download_qwen_2_5_half_billion_model():
-    # download the model
-    snapshot_download_w_retry("Qwen/Qwen2.5-0.5B", repo_type="model")
+def download_qwen3_half_billion_model():
+    # download the model (still used as the KD teacher in tests/e2e/integrations/test_kd.py)
+    snapshot_download_w_retry("Qwen/Qwen3-0.6B", repo_type="model")
 
 
 @pytest.fixture(scope="session", autouse=True)
-def download_qwen3_half_billion_model():
-    # download the model
-    snapshot_download_w_retry("Qwen/Qwen3-0.6B", repo_type="model")
+def download_tiny_llama_model():
+    snapshot_download_w_retry("axolotl-ai-co/tiny-llama-50m", repo_type="model")
+
+
+@pytest.fixture(scope="session", autouse=True)
+def download_tiny_mistral_model():
+    snapshot_download_w_retry("axolotl-ai-co/tiny-mistral-25m", repo_type="model")
+
+
+@pytest.fixture(scope="session", autouse=True)
+def download_tiny_mixtral_model():
+    snapshot_download_w_retry("axolotl-ai-co/tiny-mixtral-30m", repo_type="model")
+
+
+@pytest.fixture(scope="session", autouse=True)
+def download_tiny_phi_model():
+    snapshot_download_w_retry("axolotl-ai-co/tiny-phi-64m", repo_type="model")
+
+
+@pytest.fixture(scope="session", autouse=True)
+def download_tiny_falcon_model():
+    snapshot_download_w_retry("axolotl-ai-co/tiny-falcon-42m", repo_type="model")
+
+
+@pytest.fixture(scope="session", autouse=True)
+def download_tiny_qwen2_model():
+    snapshot_download_w_retry("axolotl-ai-co/tiny-qwen2-129m", repo_type="model")
+
+
+@pytest.fixture(scope="session", autouse=True)
+def download_tiny_qwen3_model():
+    snapshot_download_w_retry("axolotl-ai-co/tiny-qwen3-129m", repo_type="model")
+
+
+@pytest.fixture(scope="session", autouse=True)
+def download_tiny_gemma2_model():
+    snapshot_download_w_retry("axolotl-ai-co/tiny-gemma2-137m", repo_type="model")
 
 
 @pytest.fixture(scope="session", autouse=True)
@@ -620,7 +654,15 @@ def fixture_min_base_cfg():
 )
 def test_load_fixtures(
     download_smollm2_135m_model,
-    download_qwen_2_5_half_billion_model,
+    download_qwen3_half_billion_model,
+    download_tiny_llama_model,
+    download_tiny_mistral_model,
+    download_tiny_mixtral_model,
+    download_tiny_phi_model,
+    download_tiny_falcon_model,
+    download_tiny_qwen2_model,
+    download_tiny_qwen3_model,
+    download_tiny_gemma2_model,
     download_tatsu_lab_alpaca_dataset,
     download_mhenrichsen_alpaca_2k_dataset,
     download_mhenrichsen_alpaca_2k_w_revision_dataset,
diff --git a/tests/e2e/integrations/test_cut_cross_entropy.py b/tests/e2e/integrations/test_cut_cross_entropy.py
index 7da644ec3..75fb9d0db 100644
--- a/tests/e2e/integrations/test_cut_cross_entropy.py
+++ b/tests/e2e/integrations/test_cut_cross_entropy.py
@@ -10,7 +10,10 @@
 from axolotl.utils import get_pytorch_version
 from axolotl.utils.config import normalize_config, prepare_plugins, validate_config
 from axolotl.utils.dict import DictDefault
-from tests.e2e.utils import check_model_output_exists
+from tests.e2e.utils import (
+    check_model_output_exists,
+    check_tensorboard_loss_decreased,
+)
 
 
 @pytest.fixture()
@@ -35,13 +38,16 @@ def min_cfg(temp_dir):
         "num_epochs": 1,
         "micro_batch_size": 8,
         "gradient_accumulation_steps": 1,
-        "learning_rate": 0.00001,
+        "learning_rate": 5e-4,
         "optimizer": "adamw_torch_fused",
         "output_dir": temp_dir,
         "lr_scheduler": "cosine",
-        "max_steps": 10,
+        "max_steps": 40,
+        "warmup_steps": 5,
         "bf16": "auto",
         "save_first_step": False,
+        "use_tensorboard": True,
+        "seed": 42,
     }
 
 
@@ -64,11 +70,18 @@ class TestCutCrossEntropyIntegration:
         else:
             train(cfg=cfg, dataset_meta=dataset_meta)
         check_model_output_exists(temp_dir, cfg)
+        check_tensorboard_loss_decreased(
+            temp_dir + "/runs",
+            initial_window=5,
+            final_window=5,
+            max_initial=2.2,
+            max_final=2.0,
+        )
 
     def test_qwen2_w_cce(self, temp_dir):
         cfg = DictDefault(
             {
-                "base_model": "Qwen/Qwen2.5-0.5B",
+                "base_model": "axolotl-ai-co/tiny-qwen2-129m",
                 "plugins": [
                     "axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin",
                 ],
@@ -87,13 +100,15 @@ class TestCutCrossEntropyIntegration:
                 "num_epochs": 1,
                 "micro_batch_size": 4,
                 "gradient_accumulation_steps": 1,
-                "learning_rate": 0.00001,
+                "learning_rate": 2e-4,
                 "optimizer": "adamw_torch_fused",
                 "output_dir": temp_dir,
                 "lr_scheduler": "cosine",
-                "max_steps": 10,
+                "max_steps": 50,
                 "bf16": "auto",
                 "save_first_step": False,
+                "use_tensorboard": True,
+                "seed": 42,
             }
         )
         cfg = validate_config(cfg)
@@ -108,6 +123,13 @@ class TestCutCrossEntropyIntegration:
         else:
             train(cfg=cfg, dataset_meta=dataset_meta)
         check_model_output_exists(temp_dir, cfg)
+        check_tensorboard_loss_decreased(
+            temp_dir + "/runs",
+            initial_window=5,
+            final_window=5,
+            max_initial=5.0,
+            max_final=4.7,
+        )
 
     @pytest.mark.parametrize(
         "attention_type",
@@ -136,3 +158,10 @@ class TestCutCrossEntropyIntegration:
         else:
             train(cfg=cfg, dataset_meta=dataset_meta)
         check_model_output_exists(temp_dir, cfg)
+        check_tensorboard_loss_decreased(
+            temp_dir + "/runs",
+            initial_window=5,
+            final_window=5,
+            max_initial=2.2,
+            max_final=2.0,
+        )
diff --git a/tests/e2e/kernels/test_lora_features.py b/tests/e2e/kernels/test_lora_features.py
index 80495c68d..688710f3a 100644
--- a/tests/e2e/kernels/test_lora_features.py
+++ b/tests/e2e/kernels/test_lora_features.py
@@ -24,7 +24,7 @@ from axolotl.monkeypatch.lora_kernels import (
 )
 from axolotl.utils.dict import DictDefault
 
-MODEL_NAME = "Qwen/Qwen3-0.6B"
+MODEL_NAME =
"axolotl-ai-co/tiny-qwen3-129m" DEVICE = "cuda" DTYPE = torch.bfloat16 diff --git a/tests/e2e/multigpu/test_dist_muon_fsdp2.py b/tests/e2e/multigpu/test_dist_muon_fsdp2.py index 93db473a9..05841bb64 100644 --- a/tests/e2e/multigpu/test_dist_muon_fsdp2.py +++ b/tests/e2e/multigpu/test_dist_muon_fsdp2.py @@ -1,23 +1,22 @@ """Test module for DistMuon optimizer with FSDP2 multi-GPU functionality.""" -import os from pathlib import Path -import torch import yaml from accelerate.test_utils import execute_subprocess_async -from tbparse import SummaryReader from transformers.testing_utils import get_torch_dist_unique_port from axolotl.utils.dict import DictDefault -from tests.e2e.utils import most_recent_subdir, require_torch_2_7_0 +from tests.e2e.utils import check_tensorboard_loss_decreased, require_torch_2_7_0 AXOLOTL_ROOT = Path(__file__).parent.parent.parent.parent def verify_training_success(temp_dir): - """Verify that training completed successfully by checking artifacts and loss.""" + """Verify that training completed successfully — artifacts, no-NaN, loss + stayed in qwen2-pretraining scale (tiny-qwen2-129m final pretrain CE ~3.92). + """ output_path = Path(temp_dir) model_files = list(output_path.glob("*.bin")) + list( @@ -30,19 +29,13 @@ def verify_training_success(temp_dir): "No checkpoint files found - training may have failed" ) - tb_log_path = most_recent_subdir(temp_dir + "/runs") - if tb_log_path: - event_files = sorted(os.listdir(tb_log_path)) - if event_files: - event_file = os.path.join(tb_log_path, event_files[0]) - reader = SummaryReader(event_file) - df = reader.scalars - train_loss_df = df[df.tag == "train/train_loss"] - if len(train_loss_df) > 0: - final_loss = train_loss_df.value.values[-1] - assert not torch.isnan(torch.tensor(final_loss)), ( - f"Training loss is NaN: {final_loss}" - ) + check_tensorboard_loss_decreased( + temp_dir + "/runs", + initial_window=10, + final_window=10, + max_initial=5.0, + max_final=4.7, + ) class TestDistMuon: @@ -52,7 +45,7 @@ class TestDistMuon: def test_fft_sft(self, temp_dir): cfg = DictDefault( { - "base_model": "Qwen/Qwen2.5-0.5B", + "base_model": "axolotl-ai-co/tiny-qwen2-129m", "sequence_len": 2048, "val_set_size": 0.01, "datasets": [ @@ -63,11 +56,12 @@ class TestDistMuon: }, ], "num_epochs": 1, - "max_steps": 2, + "max_steps": 80, + "warmup_steps": 5, "micro_batch_size": 2, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.02, + "learning_rate": 2e-3, "optimizer": "muon", "weight_decay": 0.01, "lr_scheduler": "cosine", @@ -82,6 +76,9 @@ class TestDistMuon: "reshard_after_forward": True, }, "use_tensorboard": True, + "seed": 42, + "sample_packing": True, + "pad_to_sequence_len": True, "bf16": True, } ) @@ -109,7 +106,7 @@ class TestDistMuon: def test_lora_sft(self, temp_dir): cfg = DictDefault( { - "base_model": "Qwen/Qwen2.5-0.5B", + "base_model": "axolotl-ai-co/tiny-qwen2-129m", "sequence_len": 2048, "val_set_size": 0.01, "datasets": [ @@ -122,14 +119,15 @@ class TestDistMuon: "adapter": "lora", "lora_r": 8, "lora_alpha": 16, - "lora_dropout": 0.05, + "lora_dropout": 0.0, "lora_target_linear": True, "num_epochs": 1, - "max_steps": 2, + "max_steps": 80, + "warmup_steps": 5, "micro_batch_size": 2, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.02, + "learning_rate": 2e-3, "optimizer": "muon", "weight_decay": 0.01, "lr_scheduler": "cosine", @@ -144,6 +142,9 @@ class TestDistMuon: "reshard_after_forward": True, }, "use_tensorboard": True, + "seed": 42, + "sample_packing": 
True, + "pad_to_sequence_len": True, "bf16": True, } ) diff --git a/tests/e2e/multigpu/test_fsdp1.py b/tests/e2e/multigpu/test_fsdp1.py index 5b6724791..c6a8a47e9 100644 --- a/tests/e2e/multigpu/test_fsdp1.py +++ b/tests/e2e/multigpu/test_fsdp1.py @@ -1,24 +1,23 @@ """Test module for FSDP1 multi-GPU functionality.""" -import os from pathlib import Path import pytest -import torch import yaml from accelerate.test_utils import execute_subprocess_async -from tbparse import SummaryReader from transformers.testing_utils import get_torch_dist_unique_port from axolotl.utils.dict import DictDefault -from tests.e2e.utils import most_recent_subdir +from tests.e2e.utils import check_tensorboard_loss_decreased AXOLOTL_ROOT = Path(__file__).parent.parent.parent.parent def verify_training_success(temp_dir): - """Verify that training completed successfully by checking artifacts and loss.""" + """Verify that training completed successfully — artifacts, no-NaN, loss + stayed in qwen2-pretraining scale (tiny-qwen2-129m final pretrain CE ~3.92). + """ output_path = Path(temp_dir) model_files = list(output_path.glob("*.bin")) + list( @@ -31,19 +30,13 @@ def verify_training_success(temp_dir): "No checkpoint files found - training may have failed" ) - tb_log_path = most_recent_subdir(temp_dir + "/runs") - if tb_log_path: - event_files = sorted(os.listdir(tb_log_path)) - if event_files: - event_file = os.path.join(tb_log_path, event_files[0]) - reader = SummaryReader(event_file) - df = reader.scalars - train_loss_df = df[df.tag == "train/train_loss"] - if len(train_loss_df) > 0: - final_loss = train_loss_df.value.values[-1] - assert not torch.isnan(torch.tensor(final_loss)), ( - f"Training loss is NaN: {final_loss}" - ) + check_tensorboard_loss_decreased( + temp_dir + "/runs", + initial_window=10, + final_window=10, + max_initial=5.0, + max_final=4.7, + ) class TestFSDP1: @@ -56,7 +49,7 @@ class TestFSDP1: def test_fft_sft(self, temp_dir, fsdp_cpu_ram_efficient_loading): cfg = DictDefault( { - "base_model": "Qwen/Qwen2.5-0.5B", + "base_model": "axolotl-ai-co/tiny-qwen2-129m", "sequence_len": 2048, "val_set_size": 0.01, "datasets": [ @@ -67,11 +60,12 @@ class TestFSDP1: }, ], "num_epochs": 1, - "max_steps": 2, + "max_steps": 80, + "warmup_steps": 5, "micro_batch_size": 2, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.00001, + "learning_rate": 2e-4, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "flash_attention": True, @@ -87,6 +81,9 @@ class TestFSDP1: "fsdp_use_orig_params": False, }, "use_tensorboard": True, + "seed": 42, + "sample_packing": True, + "pad_to_sequence_len": True, "bf16": True, } ) @@ -126,7 +123,7 @@ class TestFSDP1: def test_lora_sft(self, temp_dir, adapter_config): cfg = DictDefault( { - "base_model": "Qwen/Qwen2.5-0.5B", + "base_model": "axolotl-ai-co/tiny-qwen2-129m", "sequence_len": 2048, "val_set_size": 0.01, "datasets": [ @@ -140,14 +137,15 @@ class TestFSDP1: "load_in_4bit": adapter_config["load_in_4bit"], "lora_r": 8, "lora_alpha": 16, - "lora_dropout": 0.05, + "lora_dropout": 0.0, "lora_target_linear": True, "num_epochs": 1, - "max_steps": 2, + "max_steps": 80, + "warmup_steps": 5, "micro_batch_size": 2, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.00001, + "learning_rate": 1e-3, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "flash_attention": True, @@ -163,6 +161,9 @@ class TestFSDP1: "fsdp_use_orig_params": False, }, "use_tensorboard": True, + "seed": 42, + "sample_packing": True, + 
"pad_to_sequence_len": True, "bf16": True, } ) @@ -190,7 +191,7 @@ class TestFSDP1: def test_dpo_fft(self, temp_dir): cfg = DictDefault( { - "base_model": "Qwen/Qwen2.5-0.5B", + "base_model": "axolotl-ai-co/tiny-qwen2-129m", "sequence_len": 2048, "val_set_size": 0.01, "rl": "dpo", @@ -203,11 +204,11 @@ class TestFSDP1: }, ], "num_epochs": 1, - "max_steps": 2, + "max_steps": 20, "micro_batch_size": 2, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.00001, + "learning_rate": 2e-4, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "flash_attention": True, @@ -223,6 +224,9 @@ class TestFSDP1: "fsdp_use_orig_params": False, }, "use_tensorboard": True, + "seed": 42, + "sample_packing": True, + "pad_to_sequence_len": True, } ) @@ -262,7 +266,7 @@ class TestFSDP1: def test_dpo_lora(self, temp_dir, adapter_config): cfg = DictDefault( { - "base_model": "Qwen/Qwen2.5-0.5B", + "base_model": "axolotl-ai-co/tiny-qwen2-129m", "load_in_4bit": adapter_config["load_in_4bit"], "rl": "dpo", "chat_template": "chatml", @@ -281,11 +285,11 @@ class TestFSDP1: }, ], "num_epochs": 1, - "max_steps": 2, + "max_steps": 20, "micro_batch_size": 2, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.00001, + "learning_rate": 1e-3, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "flash_attention": True, @@ -301,6 +305,9 @@ class TestFSDP1: "fsdp_use_orig_params": False, }, "use_tensorboard": True, + "seed": 42, + "sample_packing": True, + "pad_to_sequence_len": True, "bf16": "auto", "tf32": True, } diff --git a/tests/e2e/multigpu/test_fsdp2.py b/tests/e2e/multigpu/test_fsdp2.py index a70ff9aa7..b48ddd436 100644 --- a/tests/e2e/multigpu/test_fsdp2.py +++ b/tests/e2e/multigpu/test_fsdp2.py @@ -1,24 +1,23 @@ """Test module for FSDP2 multi-GPU functionality.""" -import os from pathlib import Path import pytest -import torch import yaml from accelerate.test_utils import execute_subprocess_async -from tbparse import SummaryReader from transformers.testing_utils import get_torch_dist_unique_port from axolotl.utils.dict import DictDefault -from tests.e2e.utils import most_recent_subdir, require_torch_2_7_0 +from tests.e2e.utils import check_tensorboard_loss_decreased, require_torch_2_7_0 AXOLOTL_ROOT = Path(__file__).parent.parent.parent.parent def verify_training_success(temp_dir): - """Verify that training completed successfully by checking artifacts and loss.""" + """Verify that training completed successfully — artifacts, no-NaN, loss + stayed in qwen2-pretraining scale (tiny-qwen2-129m final pretrain CE ~3.92). 
+ """ output_path = Path(temp_dir) model_files = list(output_path.glob("*.bin")) + list( @@ -31,19 +30,13 @@ def verify_training_success(temp_dir): "No checkpoint files found - training may have failed" ) - tb_log_path = most_recent_subdir(temp_dir + "/runs") - if tb_log_path: - event_files = sorted(os.listdir(tb_log_path)) - if event_files: - event_file = os.path.join(tb_log_path, event_files[0]) - reader = SummaryReader(event_file) - df = reader.scalars - train_loss_df = df[df.tag == "train/train_loss"] - if len(train_loss_df) > 0: - final_loss = train_loss_df.value.values[-1] - assert not torch.isnan(torch.tensor(final_loss)), ( - f"Training loss is NaN: {final_loss}" - ) + check_tensorboard_loss_decreased( + temp_dir + "/runs", + initial_window=10, + final_window=10, + max_initial=5.0, + max_final=4.7, + ) class TestFSDP2: @@ -57,7 +50,7 @@ class TestFSDP2: def test_fft_sft(self, temp_dir, fsdp_cpu_ram_efficient_loading): cfg = DictDefault( { - "base_model": "Qwen/Qwen2.5-0.5B", + "base_model": "axolotl-ai-co/tiny-qwen2-129m", "sequence_len": 2048, "val_set_size": 0.01, "datasets": [ @@ -68,11 +61,12 @@ class TestFSDP2: }, ], "num_epochs": 1, - "max_steps": 2, + "max_steps": 80, + "warmup_steps": 5, "micro_batch_size": 2, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.00001, + "learning_rate": 2e-4, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "flash_attention": True, @@ -86,6 +80,9 @@ class TestFSDP2: "reshard_after_forward": True, }, "use_tensorboard": True, + "seed": 42, + "sample_packing": True, + "pad_to_sequence_len": True, "bf16": True, } ) @@ -114,7 +111,7 @@ class TestFSDP2: def test_lora_sft(self, temp_dir, peft_use_dora): cfg = DictDefault( { - "base_model": "Qwen/Qwen2.5-0.5B", + "base_model": "axolotl-ai-co/tiny-qwen2-129m", "sequence_len": 2048, "val_set_size": 0.01, "datasets": [ @@ -128,14 +125,15 @@ class TestFSDP2: "adapter": "lora", "lora_r": 8, "lora_alpha": 16, - "lora_dropout": 0.05, + "lora_dropout": 0.0, "lora_target_linear": True, "num_epochs": 1, - "max_steps": 2, + "max_steps": 80, + "warmup_steps": 5, "micro_batch_size": 2, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.00001, + "learning_rate": 1e-3, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "flash_attention": True, @@ -149,6 +147,9 @@ class TestFSDP2: "reshard_after_forward": True, }, "use_tensorboard": True, + "seed": 42, + "sample_packing": True, + "pad_to_sequence_len": True, "bf16": True, # explicitly disable LORA kernels, as they may be auto-enabled "lora_mlp_kernel": False, @@ -180,7 +181,7 @@ class TestFSDP2: def test_lora_sft_kernels(self, temp_dir): cfg = DictDefault( { - "base_model": "Qwen/Qwen2.5-0.5B", + "base_model": "axolotl-ai-co/tiny-qwen2-129m", "sequence_len": 2048, "val_set_size": 0.01, "datasets": [ @@ -195,11 +196,12 @@ class TestFSDP2: "lora_alpha": 16, "lora_target_linear": True, "num_epochs": 1, - "max_steps": 2, + "max_steps": 80, + "warmup_steps": 5, "micro_batch_size": 2, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.00001, + "learning_rate": 1e-3, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "flash_attention": True, @@ -213,6 +215,9 @@ class TestFSDP2: "reshard_after_forward": True, }, "use_tensorboard": True, + "seed": 42, + "sample_packing": True, + "pad_to_sequence_len": True, "bf16": True, "lora_mlp_kernel": True, "lora_qkv_kernel": True, @@ -243,7 +248,7 @@ class TestFSDP2: def test_qlora_sft(self, temp_dir): cfg = DictDefault( { 
- "base_model": "Qwen/Qwen2.5-0.5B", + "base_model": "axolotl-ai-co/tiny-qwen2-129m", "sequence_len": 2048, "val_set_size": 0.01, "datasets": [ @@ -257,14 +262,15 @@ class TestFSDP2: "adapter": "qlora", "lora_r": 8, "lora_alpha": 16, - "lora_dropout": 0.05, + "lora_dropout": 0.0, "lora_target_linear": True, "num_epochs": 1, - "max_steps": 2, + "max_steps": 80, + "warmup_steps": 5, "micro_batch_size": 2, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.00001, + "learning_rate": 1e-3, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "flash_attention": True, @@ -278,6 +284,9 @@ class TestFSDP2: "reshard_after_forward": True, }, "use_tensorboard": True, + "seed": 42, + "sample_packing": True, + "pad_to_sequence_len": True, "bf16": True, } ) @@ -305,7 +314,7 @@ class TestFSDP2: def test_qlora_sft_kernels(self, temp_dir): cfg = DictDefault( { - "base_model": "Qwen/Qwen2.5-0.5B", + "base_model": "axolotl-ai-co/tiny-qwen2-129m", "sequence_len": 2048, "val_set_size": 0.01, "datasets": [ @@ -321,11 +330,12 @@ class TestFSDP2: "lora_alpha": 16, "lora_target_linear": True, "num_epochs": 1, - "max_steps": 2, + "max_steps": 80, + "warmup_steps": 5, "micro_batch_size": 2, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.00001, + "learning_rate": 1e-3, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "flash_attention": True, @@ -339,6 +349,9 @@ class TestFSDP2: "reshard_after_forward": True, }, "use_tensorboard": True, + "seed": 42, + "sample_packing": True, + "pad_to_sequence_len": True, "bf16": True, "lora_mlp_kernel": True, "lora_qkv_kernel": True, @@ -370,7 +383,7 @@ class TestFSDP2: def test_dpo_fft(self, temp_dir): cfg = DictDefault( { - "base_model": "Qwen/Qwen2.5-0.5B", + "base_model": "axolotl-ai-co/tiny-qwen2-129m", "sequence_len": 2048, "val_set_size": 0.01, "rl": "dpo", @@ -383,11 +396,11 @@ class TestFSDP2: }, ], "num_epochs": 1, - "max_steps": 2, + "max_steps": 20, "micro_batch_size": 2, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.00001, + "learning_rate": 2e-4, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "flash_attention": True, @@ -401,6 +414,9 @@ class TestFSDP2: "reshard_after_forward": True, }, "use_tensorboard": True, + "seed": 42, + "sample_packing": True, + "pad_to_sequence_len": True, } ) @@ -428,7 +444,7 @@ class TestFSDP2: def test_dpo_lora(self, temp_dir): cfg = DictDefault( { - "base_model": "Qwen/Qwen2.5-0.5B", + "base_model": "axolotl-ai-co/tiny-qwen2-129m", "sequence_len": 2048, "rl": "dpo", "chat_template": "chatml", @@ -445,11 +461,11 @@ class TestFSDP2: "lora_dropout": 0.05, "lora_target_linear": True, "num_epochs": 1, - "max_steps": 2, + "max_steps": 20, "micro_batch_size": 2, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.00001, + "learning_rate": 1e-3, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "flash_attention": True, @@ -463,6 +479,9 @@ class TestFSDP2: "reshard_after_forward": True, }, "use_tensorboard": True, + "seed": 42, + "sample_packing": True, + "pad_to_sequence_len": True, } ) diff --git a/tests/e2e/multigpu/test_fsdp2_lora_kernels.py b/tests/e2e/multigpu/test_fsdp2_lora_kernels.py index 27ad2b8e9..0f2fd421a 100644 --- a/tests/e2e/multigpu/test_fsdp2_lora_kernels.py +++ b/tests/e2e/multigpu/test_fsdp2_lora_kernels.py @@ -40,7 +40,7 @@ def _run_training(temp_dir, cfg): def _base_lora_fsdp2_config(temp_dir, **overrides): """Base config for LoRA + FSDP2 + kernel tests.""" cfg = { - 
"base_model": "Qwen/Qwen3-0.6B", + "base_model": "axolotl-ai-co/tiny-qwen3-129m", "sequence_len": 512, "val_set_size": 0.0, "datasets": [ diff --git a/tests/e2e/multigpu/test_tp.py b/tests/e2e/multigpu/test_tp.py index 9891a0906..965dfa8e5 100644 --- a/tests/e2e/multigpu/test_tp.py +++ b/tests/e2e/multigpu/test_tp.py @@ -8,7 +8,7 @@ from accelerate.test_utils import execute_subprocess_async, get_torch_dist_uniqu from axolotl.utils.dict import DictDefault -from tests.e2e.utils import check_tensorboard, require_torch_2_7_0 +from tests.e2e.utils import check_tensorboard_loss_decreased, require_torch_2_7_0 class TestTensorParallel: @@ -21,7 +21,7 @@ class TestTensorParallel: def test_fft_sft(self, temp_dir): cfg = DictDefault( { - "base_model": "Qwen/Qwen2.5-0.5B", + "base_model": "axolotl-ai-co/tiny-qwen2-129m", "sequence_len": 2048, "val_set_size": 0.01, "datasets": [ @@ -63,6 +63,6 @@ class TestTensorParallel: ] ) - check_tensorboard( - temp_dir + "/runs", "train/train_loss", 1.0, "Train Loss (%s) is too high" + check_tensorboard_loss_decreased( + temp_dir + "/runs", max_initial=5.0, max_final=4.7 ) diff --git a/tests/e2e/patched/lora_kernels/test_lora_kernel_patching.py b/tests/e2e/patched/lora_kernels/test_lora_kernel_patching.py index 2865a80f9..4844ce539 100644 --- a/tests/e2e/patched/lora_kernels/test_lora_kernel_patching.py +++ b/tests/e2e/patched/lora_kernels/test_lora_kernel_patching.py @@ -32,12 +32,12 @@ from axolotl.utils.dict import DictDefault MODEL_CONFIGS = [ { - "name": "trl-internal-testing/tiny-MistralForCausalLM-0.2", + "name": "axolotl-ai-co/tiny-mistral-25m", "expected_activation": apply_lora_mlp_swiglu, "dtype": torch.float16, }, { - "name": "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5", + "name": "axolotl-ai-co/tiny-qwen2-129m", "expected_activation": apply_lora_mlp_swiglu, "dtype": torch.float16, }, @@ -47,7 +47,7 @@ MODEL_CONFIGS = [ "dtype": torch.float32, }, { - "name": "trl-internal-testing/tiny-Gemma2ForCausalLM", + "name": "axolotl-ai-co/tiny-gemma2-137m", "expected_activation": apply_lora_mlp_geglu, "dtype": torch.float16, }, @@ -159,7 +159,7 @@ def test_swiglu_mlp_integration(small_llama_model): def test_geglu_model_integration(): """Test GeGLU activation with Gemma model.""" model = AutoModelForCausalLM.from_pretrained( - "trl-internal-testing/tiny-Gemma2ForCausalLM", + "axolotl-ai-co/tiny-gemma2-137m", dtype=torch.float16, device_map="cuda:0", ) diff --git a/tests/e2e/patched/test_falcon_samplepack.py b/tests/e2e/patched/test_falcon_samplepack.py index cc5091403..1d688585e 100644 --- a/tests/e2e/patched/test_falcon_samplepack.py +++ b/tests/e2e/patched/test_falcon_samplepack.py @@ -4,14 +4,16 @@ E2E tests for falcon import unittest -import pytest - from axolotl.common.datasets import load_datasets from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config from axolotl.utils.dict import DictDefault -from ..utils import check_model_output_exists, with_temp_dir +from ..utils import ( + check_model_output_exists, + check_tensorboard_loss_decreased, + with_temp_dir, +) class TestFalconPatched(unittest.TestCase): @@ -19,13 +21,12 @@ class TestFalconPatched(unittest.TestCase): Test case for Falcon models """ - @pytest.mark.skip(reason="no tiny models for testing with safetensors") @with_temp_dir def test_qlora(self, temp_dir): cfg = DictDefault( { - "base_model": "illuin/tiny-random-FalconForCausalLM", - "flash_attention": True, + "base_model": "axolotl-ai-co/tiny-falcon-42m", + "flash_attention": False, "sample_packing": 
True, "sequence_len": 2048, "load_in_4bit": True, @@ -47,17 +48,20 @@ class TestFalconPatched(unittest.TestCase): }, ], "num_epochs": 2, - "micro_batch_size": 2, + "micro_batch_size": 4, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.00001, + "learning_rate": 2e-4, "optimizer": "adamw_bnb_8bit", "lr_scheduler": "cosine", - "max_steps": 20, - "save_steps": 10, - "eval_steps": 10, + "max_steps": 50, + "logging_steps": 1, + "save_steps": 50, + "eval_steps": 50, "bf16": "auto", "save_first_step": False, + "use_tensorboard": True, + "seed": 42, } ) cfg = validate_config(cfg) @@ -66,14 +70,20 @@ class TestFalconPatched(unittest.TestCase): train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) + check_tensorboard_loss_decreased( + temp_dir + "/runs", + initial_window=5, + final_window=5, + max_initial=6.0, + max_final=4.7, + ) - @pytest.mark.skip(reason="no tiny models for testing with safetensors") @with_temp_dir def test_ft(self, temp_dir): cfg = DictDefault( { - "base_model": "illuin/tiny-random-FalconForCausalLM", - "flash_attention": True, + "base_model": "axolotl-ai-co/tiny-falcon-42m", + "flash_attention": False, "sample_packing": True, "sequence_len": 2048, "val_set_size": 0.05, @@ -88,17 +98,20 @@ class TestFalconPatched(unittest.TestCase): }, ], "num_epochs": 2, - "micro_batch_size": 2, + "micro_batch_size": 4, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.00001, + "learning_rate": 2e-4, "optimizer": "adamw_bnb_8bit", "lr_scheduler": "cosine", - "max_steps": 20, - "save_steps": 10, - "eval_steps": 10, + "max_steps": 50, + "logging_steps": 1, + "save_steps": 50, + "eval_steps": 50, "bf16": "auto", "save_first_step": False, + "use_tensorboard": True, + "seed": 42, } ) cfg = validate_config(cfg) @@ -107,3 +120,10 @@ class TestFalconPatched(unittest.TestCase): train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) + check_tensorboard_loss_decreased( + temp_dir + "/runs", + initial_window=5, + final_window=5, + max_initial=6.0, + max_final=4.7, + ) diff --git a/tests/e2e/patched/test_mistral_samplepack.py b/tests/e2e/patched/test_mistral_samplepack.py index e03941b07..ab59a000c 100644 --- a/tests/e2e/patched/test_mistral_samplepack.py +++ b/tests/e2e/patched/test_mistral_samplepack.py @@ -9,7 +9,12 @@ from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config from axolotl.utils.dict import DictDefault -from ..utils import check_model_output_exists, require_torch_2_6_0, with_temp_dir +from ..utils import ( + check_model_output_exists, + check_tensorboard_loss_decreased, + require_torch_2_6_0, + with_temp_dir, +) class TestMistral(unittest.TestCase): @@ -22,7 +27,7 @@ class TestMistral(unittest.TestCase): def test_lora_packing(self, temp_dir): cfg = DictDefault( { - "base_model": "trl-internal-testing/tiny-MistralForCausalLM-0.2", + "base_model": "axolotl-ai-co/tiny-mistral-25m", "flash_attention": True, "sample_packing": True, "sequence_len": 1024, @@ -45,17 +50,20 @@ class TestMistral(unittest.TestCase): }, ], "num_epochs": 2, - "micro_batch_size": 2, + "micro_batch_size": 4, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.00001, + "learning_rate": 2e-4, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", - "max_steps": 5, - "save_steps": 3, - "eval_steps": 4, + "max_steps": 50, + "logging_steps": 1, + "save_steps": 50, + "eval_steps": 50, "bf16": "auto", "save_first_step": False, + 
"use_tensorboard": True, + "seed": 42, } ) cfg = validate_config(cfg) @@ -64,12 +72,19 @@ class TestMistral(unittest.TestCase): train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) + check_tensorboard_loss_decreased( + temp_dir + "/runs", + initial_window=5, + final_window=5, + max_initial=5.5, + max_final=4.3, + ) @with_temp_dir def test_ft_packing(self, temp_dir): cfg = DictDefault( { - "base_model": "trl-internal-testing/tiny-MistralForCausalLM-0.2", + "base_model": "axolotl-ai-co/tiny-mistral-25m", "flash_attention": True, "sample_packing": True, "sequence_len": 1024, @@ -86,17 +101,20 @@ class TestMistral(unittest.TestCase): }, ], "num_epochs": 2, - "micro_batch_size": 2, + "micro_batch_size": 4, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.00001, + "learning_rate": 2e-4, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", - "max_steps": 5, - "save_steps": 3, - "eval_steps": 4, + "max_steps": 50, + "logging_steps": 1, + "save_steps": 50, + "eval_steps": 50, "bf16": "auto", "save_first_step": False, + "use_tensorboard": True, + "seed": 42, } ) cfg = validate_config(cfg) @@ -105,3 +123,10 @@ class TestMistral(unittest.TestCase): train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) + check_tensorboard_loss_decreased( + temp_dir + "/runs", + initial_window=5, + final_window=5, + max_initial=5.5, + max_final=4.3, + ) diff --git a/tests/e2e/patched/test_mixtral_samplepack.py b/tests/e2e/patched/test_mixtral_samplepack.py index 3517ff3db..3c6eb8d12 100644 --- a/tests/e2e/patched/test_mixtral_samplepack.py +++ b/tests/e2e/patched/test_mixtral_samplepack.py @@ -9,7 +9,11 @@ from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config from axolotl.utils.dict import DictDefault -from ..utils import check_model_output_exists, with_temp_dir +from ..utils import ( + check_model_output_exists, + check_tensorboard_loss_decreased, + with_temp_dir, +) class TestMixtral(unittest.TestCase): @@ -21,8 +25,7 @@ class TestMixtral(unittest.TestCase): def test_qlora(self, temp_dir): cfg = DictDefault( { - "base_model": "hf-internal-testing/Mixtral-tiny", - "tokenizer_config": "LoneStriker/Mixtral-8x7B-v0.1-HF", + "base_model": "axolotl-ai-co/tiny-mixtral-30m", "flash_attention": True, "sample_packing": True, "sequence_len": 2048, @@ -30,7 +33,7 @@ class TestMixtral(unittest.TestCase): "adapter": "qlora", "lora_r": 16, "lora_alpha": 32, - "lora_dropout": 0.1, + "lora_dropout": 0.0, "lora_target_linear": True, "val_set_size": 0.05, "special_tokens": {}, @@ -41,17 +44,21 @@ class TestMixtral(unittest.TestCase): }, ], "num_epochs": 2, - "micro_batch_size": 2, + "micro_batch_size": 4, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.00001, + "learning_rate": 3e-3, "optimizer": "adamw_bnb_8bit", "lr_scheduler": "cosine", - "max_steps": 5, - "save_steps": 3, - "eval_steps": 4, + "max_steps": 80, + "warmup_steps": 5, + "logging_steps": 1, + "save_steps": 80, + "eval_steps": 80, "bf16": "auto", "save_first_step": False, + "use_tensorboard": True, + "seed": 42, } ) cfg = validate_config(cfg) @@ -60,13 +67,19 @@ class TestMixtral(unittest.TestCase): train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) + check_tensorboard_loss_decreased( + temp_dir + "/runs", + initial_window=10, + final_window=10, + max_initial=6.0, + max_final=4.7, + ) @with_temp_dir def test_ft(self, temp_dir): cfg = DictDefault( { - "base_model": 
"hf-internal-testing/Mixtral-tiny", - "tokenizer_config": "LoneStriker/Mixtral-8x7B-v0.1-HF", + "base_model": "axolotl-ai-co/tiny-mixtral-30m", "flash_attention": True, "sample_packing": True, "sequence_len": 2048, @@ -79,17 +92,21 @@ class TestMixtral(unittest.TestCase): }, ], "num_epochs": 2, - "micro_batch_size": 2, + "micro_batch_size": 4, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.00001, - "optimizer": "adamw_bnb_8bit", + "learning_rate": 5e-4, + "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", - "max_steps": 5, - "save_steps": 3, - "eval_steps": 4, + "max_steps": 80, + "warmup_steps": 5, + "logging_steps": 1, + "save_steps": 80, + "eval_steps": 80, "bf16": "auto", "save_first_step": False, + "use_tensorboard": True, + "seed": 42, } ) cfg = validate_config(cfg) @@ -98,3 +115,10 @@ class TestMixtral(unittest.TestCase): train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) + check_tensorboard_loss_decreased( + temp_dir + "/runs", + initial_window=5, + final_window=5, + max_initial=6.0, + max_final=4.7, + ) diff --git a/tests/e2e/patched/test_model_patches.py b/tests/e2e/patched/test_model_patches.py index aaaaf5fe2..83448d175 100644 --- a/tests/e2e/patched/test_model_patches.py +++ b/tests/e2e/patched/test_model_patches.py @@ -22,8 +22,7 @@ class TestModelPatches(unittest.TestCase): def test_mixtral_multipack(self, temp_dir): cfg = DictDefault( { - "base_model": "hf-internal-testing/Mixtral-tiny", - "tokenizer_config": "LoneStriker/Mixtral-8x7B-v0.1-HF", + "base_model": "axolotl-ai-co/tiny-mixtral-30m", "flash_attention": True, "sample_packing": True, "sequence_len": 2048, @@ -57,7 +56,7 @@ class TestModelPatches(unittest.TestCase): def test_mistral_multipack(self, temp_dir): cfg = DictDefault( { - "base_model": "trl-internal-testing/tiny-MistralForCausalLM-0.2", + "base_model": "axolotl-ai-co/tiny-mistral-25m", "flash_attention": True, "sample_packing": True, "sequence_len": 2048, diff --git a/tests/e2e/patched/test_phi_multipack.py b/tests/e2e/patched/test_phi_multipack.py index 77b2d99e5..c3c8ff569 100644 --- a/tests/e2e/patched/test_phi_multipack.py +++ b/tests/e2e/patched/test_phi_multipack.py @@ -9,7 +9,11 @@ from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config from axolotl.utils.dict import DictDefault -from ..utils import check_model_output_exists, with_temp_dir +from ..utils import ( + check_model_output_exists, + check_tensorboard_loss_decreased, + with_temp_dir, +) class TestPhiMultipack(unittest.TestCase): @@ -21,7 +25,7 @@ class TestPhiMultipack(unittest.TestCase): def test_ft_packed(self, temp_dir): cfg = DictDefault( { - "base_model": "microsoft/phi-1_5", + "base_model": "axolotl-ai-co/tiny-phi-64m", "model_type": "PhiForCausalLM", "tokenizer_type": "AutoTokenizer", "sequence_len": 1024, @@ -43,17 +47,20 @@ class TestPhiMultipack(unittest.TestCase): "dataset_shard_num": 10, "dataset_shard_idx": 0, "num_epochs": 1, - "micro_batch_size": 1, + "micro_batch_size": 2, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.00001, - "optimizer": "adamw_bnb_8bit", + "learning_rate": 2e-4, + "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", - "max_steps": 5, - "eval_steps": 3, - "save_steps": 4, + "max_steps": 50, + "logging_steps": 1, + "eval_steps": 50, + "save_steps": 50, "bf16": "auto", "save_first_step": False, + "use_tensorboard": True, + "seed": 42, } ) @@ -63,12 +70,19 @@ class TestPhiMultipack(unittest.TestCase): 
train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) + check_tensorboard_loss_decreased( + temp_dir + "/runs", + initial_window=5, + final_window=5, + max_initial=6.0, + max_final=4.7, + ) @with_temp_dir def test_qlora_packed(self, temp_dir): cfg = DictDefault( { - "base_model": "microsoft/phi-1_5", + "base_model": "axolotl-ai-co/tiny-phi-64m", "model_type": "PhiForCausalLM", "tokenizer_type": "AutoTokenizer", "sequence_len": 1024, @@ -94,17 +108,20 @@ class TestPhiMultipack(unittest.TestCase): "dataset_shard_num": 10, "dataset_shard_idx": 0, "num_epochs": 1, - "micro_batch_size": 1, + "micro_batch_size": 2, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.00001, + "learning_rate": 2e-4, "optimizer": "adamw_bnb_8bit", "lr_scheduler": "cosine", - "max_steps": 5, - "eval_steps": 3, - "save_steps": 4, + "max_steps": 50, + "logging_steps": 1, + "eval_steps": 50, + "save_steps": 50, "bf16": "auto", "save_first_step": False, + "use_tensorboard": True, + "seed": 42, } ) @@ -114,3 +131,10 @@ class TestPhiMultipack(unittest.TestCase): train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) + check_tensorboard_loss_decreased( + temp_dir + "/runs", + initial_window=5, + final_window=5, + max_initial=6.0, + max_final=4.7, + ) diff --git a/tests/e2e/solo/test_batch_flattening.py b/tests/e2e/solo/test_batch_flattening.py index 80b7b0259..7b6c59119 100644 --- a/tests/e2e/solo/test_batch_flattening.py +++ b/tests/e2e/solo/test_batch_flattening.py @@ -18,7 +18,7 @@ from transformers import AutoModelForCausalLM # Import the actual trainer methods we want to test from axolotl.core.trainers.grpo.async_trainer import AsyncGRPOTrainer -MODEL_NAME = "Qwen/Qwen3-0.6B" +MODEL_NAME = "axolotl-ai-co/tiny-qwen3-129m" def _fix_patched_attention(model): diff --git a/tests/e2e/test_falcon.py b/tests/e2e/test_falcon.py index 1a363fe6a..19de202d2 100644 --- a/tests/e2e/test_falcon.py +++ b/tests/e2e/test_falcon.py @@ -4,14 +4,16 @@ E2E tests for falcon import unittest -import pytest - from axolotl.common.datasets import load_datasets from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config from axolotl.utils.dict import DictDefault -from .utils import check_model_output_exists, with_temp_dir +from .utils import ( + check_model_output_exists, + check_tensorboard_loss_decreased, + with_temp_dir, +) class TestFalcon(unittest.TestCase): @@ -19,13 +21,12 @@ class TestFalcon(unittest.TestCase): Test case for falcon """ - @pytest.mark.skip(reason="no tiny models for testing with safetensors") @with_temp_dir def test_lora(self, temp_dir): cfg = DictDefault( { - "base_model": "illuin/tiny-random-FalconForCausalLM", - "flash_attention": True, + "base_model": "axolotl-ai-co/tiny-falcon-42m", + "flash_attention": False, "sequence_len": 1024, "load_in_8bit": True, "adapter": "lora", @@ -49,17 +50,21 @@ class TestFalcon(unittest.TestCase): }, ], "num_epochs": 2, - "micro_batch_size": 2, + "micro_batch_size": 4, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.00001, + "learning_rate": 2e-4, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", - "max_steps": 20, - "save_steps": 10, - "eval_steps": 10, + "max_steps": 50, + "warmup_steps": 5, + "logging_steps": 1, + "save_steps": 50, + "eval_steps": 50, "bf16": "auto", "save_first_step": False, + "use_tensorboard": True, + "seed": 42, } ) @@ -69,14 +74,20 @@ class TestFalcon(unittest.TestCase): train(cfg=cfg, 
dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) + check_tensorboard_loss_decreased( + temp_dir + "/runs", + initial_window=5, + final_window=5, + max_initial=5.0, + max_final=4.7, + ) - @pytest.mark.skip(reason="no tiny models for testing with safetensors") @with_temp_dir def test_lora_added_vocab(self, temp_dir): cfg = DictDefault( { - "base_model": "illuin/tiny-random-FalconForCausalLM", - "flash_attention": True, + "base_model": "axolotl-ai-co/tiny-falcon-42m", + "flash_attention": False, "sequence_len": 1024, "load_in_8bit": True, "adapter": "lora", @@ -104,17 +115,21 @@ class TestFalcon(unittest.TestCase): }, ], "num_epochs": 2, - "micro_batch_size": 2, + "micro_batch_size": 4, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.00001, + "learning_rate": 2e-4, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", - "max_steps": 20, - "save_steps": 10, - "eval_steps": 10, + "max_steps": 50, + "warmup_steps": 5, + "logging_steps": 1, + "save_steps": 50, + "eval_steps": 50, "bf16": "auto", "save_first_step": False, + "use_tensorboard": True, + "seed": 42, } ) @@ -124,14 +139,20 @@ class TestFalcon(unittest.TestCase): train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) + check_tensorboard_loss_decreased( + temp_dir + "/runs", + initial_window=5, + final_window=5, + max_initial=5.0, + max_final=4.7, + ) - @pytest.mark.skip(reason="no tiny models for testing with safetensors") @with_temp_dir def test_ft(self, temp_dir): cfg = DictDefault( { - "base_model": "illuin/tiny-random-FalconForCausalLM", - "flash_attention": True, + "base_model": "axolotl-ai-co/tiny-falcon-42m", + "flash_attention": False, "sequence_len": 1024, "val_set_size": 0.02, "special_tokens": { @@ -145,17 +166,23 @@ class TestFalcon(unittest.TestCase): }, ], "num_epochs": 2, - "micro_batch_size": 2, + "sample_packing": True, + "pad_to_sequence_len": True, + "micro_batch_size": 4, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.00001, + "learning_rate": 5e-4, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", - "max_steps": 20, - "save_steps": 10, - "eval_steps": 10, + "max_steps": 80, + "warmup_steps": 5, + "logging_steps": 1, + "save_steps": 80, + "eval_steps": 80, "bf16": "auto", "save_first_step": False, + "use_tensorboard": True, + "seed": 42, } ) @@ -165,3 +192,10 @@ class TestFalcon(unittest.TestCase): train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) + check_tensorboard_loss_decreased( + temp_dir + "/runs", + initial_window=10, + final_window=10, + max_initial=5.0, + max_final=4.7, + ) diff --git a/tests/e2e/test_mistral.py b/tests/e2e/test_mistral.py index 08b3b05af..37cae7ce7 100644 --- a/tests/e2e/test_mistral.py +++ b/tests/e2e/test_mistral.py @@ -11,7 +11,11 @@ from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config from axolotl.utils.dict import DictDefault -from .utils import check_model_output_exists, with_temp_dir +from .utils import ( + check_model_output_exists, + check_tensorboard_loss_decreased, + with_temp_dir, +) class TestMistral(unittest.TestCase): @@ -23,7 +27,7 @@ class TestMistral(unittest.TestCase): def test_lora(self, temp_dir): cfg = DictDefault( { - "base_model": "trl-internal-testing/tiny-MistralForCausalLM-0.2", + "base_model": "axolotl-ai-co/tiny-mistral-25m", "flash_attention": True, "sequence_len": 1024, "load_in_8bit": True, @@ -45,16 +49,18 @@ class TestMistral(unittest.TestCase): }, ], 
"num_epochs": 2, - "micro_batch_size": 2, + "micro_batch_size": 4, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.00001, + "learning_rate": 2e-4, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", - "max_steps": 20, - "save_steps": 10, - "eval_steps": 10, + "max_steps": 50, + "logging_steps": 1, + "save_steps": 50, + "eval_steps": 50, "save_first_step": False, + "use_tensorboard": True, } ) @@ -64,12 +70,19 @@ class TestMistral(unittest.TestCase): train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) + check_tensorboard_loss_decreased( + temp_dir + "/runs", + initial_window=5, + final_window=5, + max_initial=4.5, + max_final=4.3, + ) @with_temp_dir def test_ft(self, temp_dir): cfg = DictDefault( { - "base_model": "trl-internal-testing/tiny-MistralForCausalLM-0.2", + "base_model": "axolotl-ai-co/tiny-mistral-25m", "flash_attention": True, "sequence_len": 1024, "val_set_size": 0.02, @@ -85,16 +98,18 @@ class TestMistral(unittest.TestCase): }, ], "num_epochs": 2, - "micro_batch_size": 2, + "micro_batch_size": 4, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.00001, + "learning_rate": 2e-4, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", - "max_steps": 20, - "save_steps": 10, - "eval_steps": 10, + "max_steps": 50, + "logging_steps": 1, + "save_steps": 50, + "eval_steps": 50, "save_first_step": False, + "use_tensorboard": True, } ) if is_torch_bf16_gpu_available(): @@ -108,3 +123,10 @@ class TestMistral(unittest.TestCase): train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) + check_tensorboard_loss_decreased( + temp_dir + "/runs", + initial_window=5, + final_window=5, + max_initial=4.5, + max_final=4.3, + ) diff --git a/tests/e2e/test_mixtral.py b/tests/e2e/test_mixtral.py index c47486b3c..00c75f426 100644 --- a/tests/e2e/test_mixtral.py +++ b/tests/e2e/test_mixtral.py @@ -12,7 +12,11 @@ from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config from axolotl.utils.dict import DictDefault -from .utils import check_model_output_exists, with_temp_dir +from .utils import ( + check_model_output_exists, + check_tensorboard_loss_decreased, + with_temp_dir, +) class TestMixtral(unittest.TestCase): @@ -24,8 +28,7 @@ class TestMixtral(unittest.TestCase): def test_qlora_w_fa2(self, temp_dir): cfg = DictDefault( { - "base_model": "hf-internal-testing/Mixtral-tiny", - "tokenizer_config": "LoneStriker/Mixtral-8x7B-v0.1-HF", + "base_model": "axolotl-ai-co/tiny-mixtral-30m", "flash_attention": True, "sequence_len": 1024, "load_in_4bit": True, @@ -51,16 +54,18 @@ class TestMixtral(unittest.TestCase): }, ], "num_epochs": 2, - "micro_batch_size": 2, + "micro_batch_size": 4, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.00001, + "learning_rate": 2e-4, "optimizer": "adamw_bnb_8bit", "lr_scheduler": "cosine", - "max_steps": 20, - "save_steps": 10, - "eval_steps": 10, + "max_steps": 50, + "logging_steps": 1, + "save_steps": 50, + "eval_steps": 50, "save_first_step": False, + "use_tensorboard": True, } ) @@ -74,13 +79,19 @@ class TestMixtral(unittest.TestCase): == torch.float32 ) check_model_output_exists(temp_dir, cfg) + check_tensorboard_loss_decreased( + temp_dir + "/runs", + initial_window=5, + final_window=5, + max_initial=5.0, + max_final=4.7, + ) @with_temp_dir def test_qlora_wo_fa2(self, temp_dir): cfg = DictDefault( { - "base_model": "hf-internal-testing/Mixtral-tiny", - "tokenizer_config": 
"LoneStriker/Mixtral-8x7B-v0.1-HF", + "base_model": "axolotl-ai-co/tiny-mixtral-30m", "flash_attention": False, "sequence_len": 1024, "load_in_4bit": True, @@ -106,16 +117,18 @@ class TestMixtral(unittest.TestCase): }, ], "num_epochs": 2, - "micro_batch_size": 2, + "micro_batch_size": 4, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.00001, + "learning_rate": 2e-4, "optimizer": "adamw_bnb_8bit", "lr_scheduler": "cosine", - "max_steps": 20, - "save_steps": 10, - "eval_steps": 10, + "max_steps": 50, + "logging_steps": 1, + "save_steps": 50, + "eval_steps": 50, "save_first_step": False, + "use_tensorboard": True, } ) @@ -129,13 +142,19 @@ class TestMixtral(unittest.TestCase): == torch.float32 ) check_model_output_exists(temp_dir, cfg) + check_tensorboard_loss_decreased( + temp_dir + "/runs", + initial_window=5, + final_window=5, + max_initial=5.0, + max_final=4.7, + ) @with_temp_dir def test_16bit_lora_w_fa2(self, temp_dir): cfg = DictDefault( { - "base_model": "hf-internal-testing/Mixtral-tiny", - "tokenizer_config": "LoneStriker/Mixtral-8x7B-v0.1-HF", + "base_model": "axolotl-ai-co/tiny-mixtral-30m", "flash_attention": True, "sequence_len": 1024, "adapter": "lora", @@ -160,16 +179,18 @@ class TestMixtral(unittest.TestCase): }, ], "num_epochs": 2, - "micro_batch_size": 2, + "micro_batch_size": 4, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.00001, + "learning_rate": 2e-4, "optimizer": "adamw_bnb_8bit", "lr_scheduler": "cosine", - "max_steps": 20, - "save_steps": 10, - "eval_steps": 10, + "max_steps": 50, + "logging_steps": 1, + "save_steps": 50, + "eval_steps": 50, "save_first_step": False, + "use_tensorboard": True, } ) if is_torch_bf16_gpu_available(): @@ -187,13 +208,19 @@ class TestMixtral(unittest.TestCase): == torch.float32 ) check_model_output_exists(temp_dir, cfg) + check_tensorboard_loss_decreased( + temp_dir + "/runs", + initial_window=5, + final_window=5, + max_initial=5.0, + max_final=4.7, + ) @with_temp_dir def test_16bit_lora_wo_fa2(self, temp_dir): cfg = DictDefault( { - "base_model": "hf-internal-testing/Mixtral-tiny", - "tokenizer_config": "LoneStriker/Mixtral-8x7B-v0.1-HF", + "base_model": "axolotl-ai-co/tiny-mixtral-30m", "flash_attention": False, "sequence_len": 1024, "adapter": "lora", @@ -218,16 +245,18 @@ class TestMixtral(unittest.TestCase): }, ], "num_epochs": 2, - "micro_batch_size": 2, + "micro_batch_size": 4, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.00001, + "learning_rate": 2e-4, "optimizer": "adamw_bnb_8bit", "lr_scheduler": "cosine", - "max_steps": 20, - "save_steps": 10, - "eval_steps": 10, + "max_steps": 50, + "logging_steps": 1, + "save_steps": 50, + "eval_steps": 50, "save_first_step": False, + "use_tensorboard": True, } ) @@ -245,13 +274,19 @@ class TestMixtral(unittest.TestCase): == torch.float32 ) check_model_output_exists(temp_dir, cfg) + check_tensorboard_loss_decreased( + temp_dir + "/runs", + initial_window=5, + final_window=5, + max_initial=5.0, + max_final=4.7, + ) @with_temp_dir def test_ft(self, temp_dir): cfg = DictDefault( { - "base_model": "hf-internal-testing/Mixtral-tiny", - "tokenizer_config": "LoneStriker/Mixtral-8x7B-v0.1-HF", + "base_model": "axolotl-ai-co/tiny-mixtral-30m", "flash_attention": True, "sequence_len": 1024, "val_set_size": 0.02, @@ -263,16 +298,18 @@ class TestMixtral(unittest.TestCase): }, ], "num_epochs": 2, - "micro_batch_size": 2, + "micro_batch_size": 4, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - 
"learning_rate": 0.00001, + "learning_rate": 2e-4, "optimizer": "adamw_bnb_8bit", "lr_scheduler": "cosine", - "max_steps": 20, - "save_steps": 10, - "eval_steps": 10, + "max_steps": 50, + "logging_steps": 1, + "save_steps": 50, + "eval_steps": 50, "save_first_step": False, + "use_tensorboard": True, } ) if is_torch_bf16_gpu_available(): @@ -286,3 +323,10 @@ class TestMixtral(unittest.TestCase): train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) + check_tensorboard_loss_decreased( + temp_dir + "/runs", + initial_window=5, + final_window=5, + max_initial=5.0, + max_final=4.7, + ) diff --git a/tests/e2e/test_optimizers.py b/tests/e2e/test_optimizers.py index a53e8b005..430bd73c5 100644 --- a/tests/e2e/test_optimizers.py +++ b/tests/e2e/test_optimizers.py @@ -13,6 +13,7 @@ from axolotl.utils.dict import DictDefault from .utils import ( check_model_output_exists, + check_tensorboard_loss_decreased, require_torch_2_5_1, require_torch_2_6_0, require_torch_2_7_0, @@ -243,20 +244,18 @@ class TestCustomOptimizers(unittest.TestCase): def test_came_pytorch(self, temp_dir): cfg = DictDefault( { - "base_model": "JackFram/llama-68m", - "tokenizer_type": "LlamaTokenizer", + "base_model": "axolotl-ai-co/tiny-llama-50m", + "tokenizer_type": "AutoTokenizer", "sequence_len": 1024, "load_in_8bit": True, "adapter": "lora", "lora_r": 8, "lora_alpha": 16, - "lora_dropout": 0.05, + "lora_dropout": 0.0, "lora_target_linear": True, "val_set_size": 0.1, "special_tokens": { - "unk_token": "", - "bos_token": "", - "eos_token": "", + "pad_token": "<|endoftext|>", }, "datasets": [ { @@ -265,16 +264,22 @@ class TestCustomOptimizers(unittest.TestCase): }, ], "num_epochs": 1, + "sample_packing": True, + "pad_to_sequence_len": True, "micro_batch_size": 8, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.00001, + "learning_rate": 1e-4, "optimizer": "came_pytorch", "adam_beta3": 0.9999, "adam_epsilon2": 1e-16, - "max_steps": 5, + "max_steps": 80, + "warmup_steps": 5, + "logging_steps": 1, "lr_scheduler": "cosine", "save_first_step": False, + "use_tensorboard": True, + "seed": 42, } ) @@ -284,6 +289,13 @@ class TestCustomOptimizers(unittest.TestCase): train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) + check_tensorboard_loss_decreased( + temp_dir + "/runs", + initial_window=10, + final_window=10, + max_initial=4.0, + max_final=3.0, + ) @require_torch_2_7_0 diff --git a/tests/e2e/test_phi.py b/tests/e2e/test_phi.py index ae2210249..c2a637883 100644 --- a/tests/e2e/test_phi.py +++ b/tests/e2e/test_phi.py @@ -9,7 +9,11 @@ from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config from axolotl.utils.dict import DictDefault -from .utils import check_model_output_exists, with_temp_dir +from .utils import ( + check_model_output_exists, + check_tensorboard_loss_decreased, + with_temp_dir, +) class TestPhi(unittest.TestCase): @@ -21,7 +25,7 @@ class TestPhi(unittest.TestCase): def test_phi_ft(self, temp_dir): cfg = DictDefault( { - "base_model": "microsoft/phi-1_5", + "base_model": "axolotl-ai-co/tiny-phi-64m", "model_type": "AutoModelForCausalLM", "tokenizer_type": "AutoTokenizer", "sequence_len": 2048, @@ -41,18 +45,22 @@ class TestPhi(unittest.TestCase): "dataset_shard_num": 10, "dataset_shard_idx": 0, "num_epochs": 1, - "micro_batch_size": 1, + "micro_batch_size": 4, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.00001, - "optimizer": "paged_adamw_8bit", + 
"learning_rate": 2e-4, + "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "flash_attention": True, - "max_steps": 10, - "save_steps": 10, - "eval_steps": 10, + "max_steps": 50, + "warmup_steps": 5, + "logging_steps": 1, + "save_steps": 50, + "eval_steps": 50, "bf16": "auto", "save_first_step": False, + "use_tensorboard": True, + "seed": 42, } ) cfg = validate_config(cfg) @@ -61,12 +69,19 @@ class TestPhi(unittest.TestCase): train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) + check_tensorboard_loss_decreased( + temp_dir + "/runs", + initial_window=5, + final_window=5, + max_initial=5.0, + max_final=4.7, + ) @with_temp_dir def test_phi_qlora(self, temp_dir): cfg = DictDefault( { - "base_model": "microsoft/phi-1_5", + "base_model": "axolotl-ai-co/tiny-phi-64m", "model_type": "AutoModelForCausalLM", "tokenizer_type": "AutoTokenizer", "sequence_len": 2048, @@ -90,18 +105,22 @@ class TestPhi(unittest.TestCase): "dataset_shard_num": 10, "dataset_shard_idx": 0, "num_epochs": 1, - "micro_batch_size": 1, + "micro_batch_size": 4, "gradient_accumulation_steps": 1, "output_dir": temp_dir, - "learning_rate": 0.00001, + "learning_rate": 2e-4, "optimizer": "paged_adamw_8bit", "lr_scheduler": "cosine", "flash_attention": True, - "max_steps": 10, - "save_steps": 10, - "eval_steps": 10, + "max_steps": 50, + "warmup_steps": 5, + "logging_steps": 1, + "save_steps": 50, + "eval_steps": 50, "bf16": "auto", "save_first_step": False, + "use_tensorboard": True, + "seed": 42, } ) cfg = validate_config(cfg) @@ -110,3 +129,10 @@ class TestPhi(unittest.TestCase): train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) + check_tensorboard_loss_decreased( + temp_dir + "/runs", + initial_window=5, + final_window=5, + max_initial=5.0, + max_final=4.7, + ) diff --git a/tests/e2e/test_preprocess.py b/tests/e2e/test_preprocess.py index 8f15cbe55..895c29c87 100644 --- a/tests/e2e/test_preprocess.py +++ b/tests/e2e/test_preprocess.py @@ -18,7 +18,7 @@ class TestPreprocess: cfg = DictDefault( { - "base_model": "Qwen/Qwen2.5-0.5B", + "base_model": "axolotl-ai-co/tiny-qwen2-129m", "sequence_len": 2048, "val_set_size": 0.01, "datasets": [ diff --git a/tests/e2e/test_quantization.py b/tests/e2e/test_quantization.py index a70a46194..bb8ce969c 100644 --- a/tests/e2e/test_quantization.py +++ b/tests/e2e/test_quantization.py @@ -45,7 +45,7 @@ def _get_fake_quant_config_dtype(config): @pytest.fixture() def model(): dummy_model = AutoModelForCausalLM.from_pretrained( - "Qwen/Qwen2-0.5B", + "axolotl-ai-co/tiny-qwen2-129m", device_map="auto", dtype=torch.bfloat16, ) diff --git a/tests/e2e/test_qwen.py b/tests/e2e/test_qwen.py index 1c75d817b..b8654c0ad 100644 --- a/tests/e2e/test_qwen.py +++ b/tests/e2e/test_qwen.py @@ -17,7 +17,7 @@ class TestE2eQwen: Test cases for qwen models """ - @pytest.mark.parametrize("base_model", ["Qwen/Qwen2-0.5B", "Qwen/Qwen2.5-0.5B"]) + @pytest.mark.parametrize("base_model", ["axolotl-ai-co/tiny-qwen2-129m"]) def test_dpo(self, base_model, temp_dir): cfg = DictDefault( { diff --git a/tests/e2e/utils.py b/tests/e2e/utils.py index e1eaca050..8306b72ce 100644 --- a/tests/e2e/utils.py +++ b/tests/e2e/utils.py @@ -199,6 +199,106 @@ def check_tensorboard( assert df.value.values[-1] > 1e-5, "Expected loss to be greater than zero" +def check_tensorboard_loss_decreased( + temp_run_dir: str, + tag: str | None = None, + initial_window: int = 1, + final_window: int = 1, + min_delta: float | None = None, + max_initial: float | None = None, + max_final: 
float | None = None, + max_loss_ratio: float = 0.95, +) -> None: + """Check that training actually learned — loss went down and stayed in + a sensible range. + + Used with the tiny ``axolotl-ai-co/tiny-*`` CI models, where pretraining + was brief enough that final loss won't clear the absolute thresholds used + for 135M+ models — but the training pipeline should still behave. + + ``train/train_loss`` is only logged once (end-of-training aggregate). The + per-step tag is ``train/loss`` for SFT/LM trainers and may vary across + trainers (e.g. DPO). When ``tag`` is None we try common per-step tags in + order and use the first with enough samples. + + Two kinds of regression we guard against: + + 1. **Loss blew up.** A silent bug (e.g. broken label masking) can start + training at an absurdly high loss. ``max_initial`` / ``max_final`` + assert the measured means stay at-or-below bounds measured from a + known-good run. Both are optional but strongly encouraged — loss + going *down* from a bad starting scale still looks like "learning." + + 2. **Loss didn't go down enough.** ``max_loss_ratio`` (default 0.95) + requires ``final <= initial * ratio``. A default below 1.0 means the + final window mean must sit at least 5% below the initial window mean + — real learning, not noise that happened to land below start. Only + raise this for configs where a smaller drop is expected *and* + documented (e.g. DPO with near-trivial pairs); in that case you are + intentionally weakening the test. + + ``min_delta`` is optional; when set, additionally requires + ``final + min_delta <= initial`` — use for configs with enough signal + to demand a specific minimum absolute drop. + """ + tb_log_path = most_recent_subdir(temp_run_dir) + event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0]) + reader = SummaryReader(event_file) + df = reader.scalars + + if tag is None: + candidates = ["train/loss", "train/train_loss"] + else: + candidates = [tag] + + required = initial_window + final_window + chosen_tag, values = None, None + for candidate in candidates: + sub = df[df.tag == candidate] + if len(sub) >= required: + chosen_tag = candidate + values = sub.value.values + break + + available = sorted({t for t in df.tag.unique() if "loss" in t.lower()}) + assert values is not None, ( + f"None of the tags {candidates} had ≥{required} logged steps. " + f"Loss tags present: {available}" + ) + + initial = float(values[:initial_window].mean()) + final = float(values[-final_window:].mean()) + print( + f"[check_tensorboard_loss_decreased] tag={chosen_tag} n={len(values)} " + f"initial_mean{initial_window}={initial:.4f} final_mean{final_window}={final:.4f}" + ) + assert final > 1e-5, "Expected loss to be greater than zero" + assert final <= initial * max_loss_ratio, ( + f"Loss did not decrease for {chosen_tag}: " + f"initial(mean of first {initial_window})={initial:.4f}, " + f"final(mean of last {final_window})={final:.4f}, " + f"ratio={final / initial:.4f} (max allowed {max_loss_ratio}). " + f"Expected final <= initial — training did not learn." + ) + if min_delta is not None: + assert final + min_delta <= initial, ( + f"Expected loss to decrease by at least {min_delta} for {chosen_tag}: " + f"initial={initial:.4f}, final={final:.4f}, delta={initial - final:.4f}" + ) + if max_initial is not None: + assert initial <= max_initial, ( + f"Initial loss {initial:.4f} is above the expected max {max_initial}. " + f"Absolute scale is wrong — probably a silent regression " + f"(e.g. 
bad label masking) that bumped the starting point." + ) + if max_final is not None: + assert final <= max_final, ( + f"Final loss {final:.4f} is above the expected max {max_final}. " + f"Absolute scale is wrong — probably a silent regression " + f"(e.g. bad label masking) that bumped the endpoint." + ) + + def check_model_output_exists(temp_dir: str, cfg: DictDefault) -> None: """ helper function to check if a model output file exists after training