diff --git a/tests/e2e/integrations/test_cut_cross_entropy.py b/tests/e2e/integrations/test_cut_cross_entropy.py
index 25e36b5eb..753934563 100644
--- a/tests/e2e/integrations/test_cut_cross_entropy.py
+++ b/tests/e2e/integrations/test_cut_cross_entropy.py
@@ -25,7 +25,7 @@ def min_cfg(temp_dir):
         ],
         "cut_cross_entropy": True,
         "sequence_len": 1024,
-        "val_set_size": 0.1,
+        "val_set_size": 0.02,
         "special_tokens": {
             "pad_token": "<|endoftext|>",
         },
@@ -79,7 +79,7 @@ class TestCutCrossEntropyIntegration:
                 ],
                 "cut_cross_entropy": True,
                 "sequence_len": 1024,
-                "val_set_size": 0.1,
+                "val_set_size": 0.02,
                 "special_tokens": {
                     "pad_token": "<|endoftext|>",
                 },
diff --git a/tests/e2e/multigpu/solo/test_flex.py b/tests/e2e/multigpu/solo/test_flex.py
index 5e2c9e7cc..3af6d5a76 100644
--- a/tests/e2e/multigpu/solo/test_flex.py
+++ b/tests/e2e/multigpu/solo/test_flex.py
@@ -55,7 +55,7 @@ class TestPackedFlex:
                 ],
                 "num_epochs": 1,
                 "micro_batch_size": 2,
-                "gradient_accumulation_steps": 4,
+                "gradient_accumulation_steps": 2,
                 "output_dir": temp_dir,
                 "learning_rate": 0.00001,
                 "optimizer": "adamw_torch_fused",
diff --git a/tests/e2e/multigpu/test_llama.py b/tests/e2e/multigpu/test_llama.py
index f44c775c8..3bacac821 100644
--- a/tests/e2e/multigpu/test_llama.py
+++ b/tests/e2e/multigpu/test_llama.py
@@ -58,12 +58,13 @@ class TestMultiGPULlama:
                     {
                         "path": "tatsu-lab/alpaca",
                         "type": "alpaca",
+                        "split": "train[:10%]",
                     },
                 ],
                 "num_epochs": 1,
                 "max_steps": 2,
                 "micro_batch_size": 1,
-                "gradient_accumulation_steps": 4,
+                "gradient_accumulation_steps": 2,
                 # "gradient_checkpointing": True,
                 "output_dir": temp_dir,
                 "learning_rate": 0.00001,
@@ -201,7 +202,7 @@ class TestMultiGPULlama:
                 "num_epochs": 1,
                 "max_steps": 2,
                 "micro_batch_size": 2,
-                "gradient_accumulation_steps": 4,
+                "gradient_accumulation_steps": 2,
                 # "gradient_checkpointing": True,
                 "output_dir": temp_dir,
                 "warmup_steps": 0,
@@ -279,7 +280,7 @@ class TestMultiGPULlama:
                 "num_epochs": 1,
                 "max_steps": 2,
                 "micro_batch_size": 2,
-                "gradient_accumulation_steps": 4,
+                "gradient_accumulation_steps": 2,
                 # "gradient_checkpointing": True,
                 "output_dir": temp_dir,
                 "warmup_steps": 0,
@@ -335,6 +336,7 @@ class TestMultiGPULlama:
                     {
                         "path": "tatsu-lab/alpaca",
                         "type": "alpaca",
+                        "split": "train[:10%]",
                     },
                 ],
                 "num_epochs": 1,
@@ -398,7 +400,7 @@ class TestMultiGPULlama:
                 "sample_packing": True,
                 "pad_to_sequence_len": True,
                 "sequence_len": 1024,
-                "val_set_size": 0.01,
+                "val_set_size": 0.05,
                 "special_tokens": {
                     "pad_token": "<|endoftext|>",
                 },
@@ -406,6 +408,7 @@ class TestMultiGPULlama:
                     {
                         "path": "tatsu-lab/alpaca",
                         "type": "alpaca",
+                        "split": "train[:10%]",
                     },
                 ],
                 "num_epochs": 1,
@@ -484,6 +487,7 @@ class TestMultiGPULlama:
                     {
                         "path": "tatsu-lab/alpaca",
                         "type": "alpaca",
+                        "split": "train[:10%]",
                     },
                 ],
                 "num_epochs": 1,
@@ -565,7 +569,7 @@ class TestMultiGPULlama:
                     {
                         "path": "tatsu-lab/alpaca",
                         "type": "alpaca",
-                        "split": "train[:25%]",
+                        "split": "train[:10%]",
                     },
                 ],
                 "num_epochs": 1,
@@ -660,7 +664,7 @@ class TestMultiGPULlama:
                 "sample_packing": True,
                 "pad_to_sequence_len": True,
                 "sequence_len": 1024,
-                "val_set_size": 0.01,
+                "val_set_size": 0.05,
                 "special_tokens": {
                     "pad_token": "<|endoftext|>",
                 },
@@ -668,6 +672,7 @@ class TestMultiGPULlama:
                     {
                         "path": "tatsu-lab/alpaca",
                         "type": "alpaca",
+                        "split": "train[:10%]",
                     },
                 ],
                 "num_epochs": 1,
@@ -741,6 +746,7 @@ class TestMultiGPULlama:
                     {
                         "path": "tatsu-lab/alpaca",
                         "type": "alpaca",
+                        "split": "train[:10%]",
                     },
                 ],
                 "num_epochs": 1,
@@ -814,6 +820,7 @@ class TestMultiGPULlama:
                     {
                         "path": "tatsu-lab/alpaca",
                         "type": "alpaca",
+                        "split": "train[:10%]",
                     },
                 ],
                 "num_epochs": 1,
diff --git a/tests/e2e/multigpu/test_ray.py b/tests/e2e/multigpu/test_ray.py
index 9be7c6f50..843adac91 100644
--- a/tests/e2e/multigpu/test_ray.py
+++ b/tests/e2e/multigpu/test_ray.py
@@ -45,6 +45,7 @@ class TestMultiGPURay:
                     {
                         "path": "tatsu-lab/alpaca",
                         "type": "alpaca",
+                        "split": "train[:10%]",
                     },
                 ],
                 "num_epochs": 1,
@@ -103,6 +104,7 @@ class TestMultiGPURay:
                     {
                         "path": "tatsu-lab/alpaca",
                         "type": "alpaca",
+                        "split": "train[:10%]",
                     },
                 ],
                 "num_epochs": 1,
diff --git a/tests/e2e/multigpu/test_sp.py b/tests/e2e/multigpu/test_sp.py
index 2bd10beb5..288720eec 100644
--- a/tests/e2e/multigpu/test_sp.py
+++ b/tests/e2e/multigpu/test_sp.py
@@ -40,6 +40,7 @@ class TestSequenceParallelism:
                     {
                         "path": "tatsu-lab/alpaca",
                         "type": "alpaca",
+                        "split": "train[:10%]",
                     },
                 ],
                 "num_epochs": 1,
diff --git a/tests/e2e/patched/test_llama_s2_attention.py b/tests/e2e/patched/test_llama_s2_attention.py
index cfa70fd73..b8ddf10da 100644
--- a/tests/e2e/patched/test_llama_s2_attention.py
+++ b/tests/e2e/patched/test_llama_s2_attention.py
@@ -43,7 +43,7 @@ class TestLlamaShiftedSparseAttention(unittest.TestCase):
                 "lora_alpha": 16,
                 "lora_dropout": 0.05,
                 "lora_target_linear": True,
-                "val_set_size": 0.1,
+                "val_set_size": 0.02,
                 "special_tokens": {},
                 "datasets": [
                     {
@@ -83,7 +83,7 @@ class TestLlamaShiftedSparseAttention(unittest.TestCase):
                 "sample_packing": False,
                 "flash_attention": True,
                 "s2_attention": True,
-                "val_set_size": 0.1,
+                "val_set_size": 0.02,
                 "special_tokens": {},
                 "datasets": [
                     {
diff --git a/tests/e2e/patched/test_model_patches.py b/tests/e2e/patched/test_model_patches.py
index c6a13af19..ec09e0c81 100644
--- a/tests/e2e/patched/test_model_patches.py
+++ b/tests/e2e/patched/test_model_patches.py
@@ -27,7 +27,7 @@ class TestModelPatches(unittest.TestCase):
                 "flash_attention": True,
                 "sample_packing": True,
                 "sequence_len": 2048,
-                "val_set_size": 0.1,
+                "val_set_size": 0.02,
                 "special_tokens": {},
                 "datasets": [
                     {
@@ -59,7 +59,7 @@ class TestModelPatches(unittest.TestCase):
                 "flash_attention": True,
                 "sample_packing": True,
                 "sequence_len": 2048,
-                "val_set_size": 0.1,
+                "val_set_size": 0.02,
                 "special_tokens": {},
                 "datasets": [
                     {
diff --git a/tests/e2e/patched/test_phi_multipack.py b/tests/e2e/patched/test_phi_multipack.py
index ce466460e..70b3ea124 100644
--- a/tests/e2e/patched/test_phi_multipack.py
+++ b/tests/e2e/patched/test_phi_multipack.py
@@ -88,7 +88,7 @@ class TestPhiMultipack(unittest.TestCase):
                 "lora_alpha": 32,
                 "lora_dropout": 0.05,
                 "lora_target_linear": True,
-                "val_set_size": 0.1,
+                "val_set_size": 0.02,
                 "special_tokens": {
                     "pad_token": "<|endoftext|>",
                 },
diff --git a/tests/e2e/solo/test_flex.py b/tests/e2e/solo/test_flex.py
index 5e52204a6..6de813e37 100644
--- a/tests/e2e/solo/test_flex.py
+++ b/tests/e2e/solo/test_flex.py
@@ -47,7 +47,7 @@ class TestPackedFlex(unittest.TestCase):
                 ],
                 "num_epochs": 1,
                 "micro_batch_size": 2,
-                "gradient_accumulation_steps": 4,
+                "gradient_accumulation_steps": 2,
                 "output_dir": temp_dir,
                 "learning_rate": 0.00001,
                 "optimizer": "adamw_torch_fused",
diff --git a/tests/e2e/test_deepseekv3.py b/tests/e2e/test_deepseekv3.py
index cdaa2c416..2afda640f 100644
--- a/tests/e2e/test_deepseekv3.py
+++ b/tests/e2e/test_deepseekv3.py
@@ -65,7 +65,7 @@ class TestDeepseekV3:
                 "chat_template": "deepseek_v3",
                 "num_epochs": 1,
                 "micro_batch_size": 1,
-                "gradient_accumulation_steps": 4,
+                "gradient_accumulation_steps": 2,
                 "output_dir": temp_dir,
                 "learning_rate": 0.00001,
                 "optimizer": "adamw_bnb_8bit",
@@ -115,7 +115,7 @@ class TestDeepseekV3:
                 },
                 "num_epochs": 1,
                 "micro_batch_size": 1,
-                "gradient_accumulation_steps": 4,
+                "gradient_accumulation_steps": 2,
                 "output_dir": temp_dir,
                 "learning_rate": 0.00001,
                 "optimizer": "adamw_bnb_8bit",
diff --git a/tests/e2e/test_falcon.py b/tests/e2e/test_falcon.py
index 3c325459b..a1641a997 100644
--- a/tests/e2e/test_falcon.py
+++ b/tests/e2e/test_falcon.py
@@ -41,7 +41,7 @@ class TestFalcon(unittest.TestCase):
                     "word_embeddings",
                     "lm_head",
                 ],
-                "val_set_size": 0.1,
+                "val_set_size": 0.02,
                 "special_tokens": {
                     "bos_token": "<|endoftext|>",
                     "pad_token": "<|endoftext|>",
@@ -92,7 +92,7 @@ class TestFalcon(unittest.TestCase):
                     "word_embeddings",
                     "lm_head",
                 ],
-                "val_set_size": 0.1,
+                "val_set_size": 0.02,
                 "special_tokens": {
                     "bos_token": "<|endoftext|>",
                     "pad_token": "<|endoftext|>",
@@ -137,7 +137,7 @@ class TestFalcon(unittest.TestCase):
                 "base_model": "illuin/tiny-random-FalconForCausalLM",
                 "flash_attention": True,
                 "sequence_len": 1024,
-                "val_set_size": 0.1,
+                "val_set_size": 0.02,
                 "special_tokens": {
                     "bos_token": "<|endoftext|>",
                     "pad_token": "<|endoftext|>",
diff --git a/tests/e2e/test_gemma2.py b/tests/e2e/test_gemma2.py
index df777b709..68dc4855d 100644
--- a/tests/e2e/test_gemma2.py
+++ b/tests/e2e/test_gemma2.py
@@ -62,7 +62,7 @@ class TestGemma2:
                 "chat_template": "gemma",  # gemma2's template is same as gemma
                 "num_epochs": 1,
                 "micro_batch_size": 1,
-                "gradient_accumulation_steps": 4,
+                "gradient_accumulation_steps": 2,
                 "output_dir": temp_dir,
                 "learning_rate": 0.00001,
                 "optimizer": "adamw_bnb_8bit",
@@ -114,7 +114,7 @@ class TestGemma2:
                 },
                 "num_epochs": 1,
                 "micro_batch_size": 1,
-                "gradient_accumulation_steps": 4,
+                "gradient_accumulation_steps": 2,
                 "output_dir": temp_dir,
                 "learning_rate": 0.00001,
                 "optimizer": "adamw_bnb_8bit",
diff --git a/tests/e2e/test_gemma3_text.py b/tests/e2e/test_gemma3_text.py
index 14423ce73..5cbde04d1 100644
--- a/tests/e2e/test_gemma3_text.py
+++ b/tests/e2e/test_gemma3_text.py
@@ -61,7 +61,7 @@ class TestGemma3Text:
                 "chat_template": "gemma3",
                 "num_epochs": 1,
                 "micro_batch_size": 1,
-                "gradient_accumulation_steps": 4,
+                "gradient_accumulation_steps": 2,
                 "output_dir": temp_dir,
                 "learning_rate": 0.00001,
                 "optimizer": "adamw_bnb_8bit",
@@ -112,7 +112,7 @@ class TestGemma3Text:
                 },
                 "num_epochs": 1,
                 "micro_batch_size": 1,
-                "gradient_accumulation_steps": 4,
+                "gradient_accumulation_steps": 2,
                 "output_dir": temp_dir,
                 "learning_rate": 0.00001,
                 "optimizer": "adamw_bnb_8bit",
diff --git a/tests/e2e/test_llama.py b/tests/e2e/test_llama.py
index 8d6483ea4..b84e432b5 100644
--- a/tests/e2e/test_llama.py
+++ b/tests/e2e/test_llama.py
@@ -30,7 +30,7 @@ class TestLlama:
                 "tokenizer_type": "LlamaTokenizer",
                 "trust_remote_code": True,
                 "sequence_len": 512,
-                "val_set_size": 0.1,
+                "val_set_size": 0.02,
                 "special_tokens": {
                     "unk_token": "<unk>",
                     "bos_token": "<s>",
diff --git a/tests/e2e/test_llama_vision.py b/tests/e2e/test_llama_vision.py
index c4a41f521..3fc12afcc 100644
--- a/tests/e2e/test_llama_vision.py
+++ b/tests/e2e/test_llama_vision.py
@@ -52,7 +52,7 @@ class TestLlamaVision(unittest.TestCase):
                 ],
                 "num_epochs": 1,
                 "micro_batch_size": 1,
-                "gradient_accumulation_steps": 4,
+                "gradient_accumulation_steps": 2,
                 "output_dir": temp_dir,
                 "learning_rate": 0.00001,
                 "optimizer": "adamw_bnb_8bit",
@@ -99,7 +99,7 @@ class TestLlamaVision(unittest.TestCase):
                 ],
                 "num_epochs": 1,
                 "micro_batch_size": 1,
-                "gradient_accumulation_steps": 4,
+                "gradient_accumulation_steps": 2,
                 "output_dir": temp_dir,
                 "learning_rate": 0.00001,
                 "optimizer": "adamw_bnb_8bit",
diff --git a/tests/e2e/test_load_model.py b/tests/e2e/test_load_model.py
index 255b096b0..2128dbef2 100644
--- a/tests/e2e/test_load_model.py
+++ b/tests/e2e/test_load_model.py
@@ -36,7 +36,7 @@ class TestLoadModelUtils:
                 "lora_alpha": 16,
                 "lora_dropout": 0.05,
                 "lora_target_linear": True,
-                "val_set_size": 0.1,
+                "val_set_size": 0.02,
                 "special_tokens": {
                     "unk_token": "<unk>",
                     "bos_token": "<s>",
diff --git a/tests/e2e/test_lora_llama.py b/tests/e2e/test_lora_llama.py
index d314fb197..8328d5b90 100644
--- a/tests/e2e/test_lora_llama.py
+++ b/tests/e2e/test_lora_llama.py
@@ -37,7 +37,7 @@ class TestLoraLlama(unittest.TestCase):
                 "lora_alpha": 16,
                 "lora_dropout": 0.05,
                 "lora_target_linear": True,
-                "val_set_size": 0.1,
+                "val_set_size": 0.02,
                 "special_tokens": {
                     "unk_token": "<unk>",
                     "bos_token": "<s>",
diff --git a/tests/e2e/test_mistral.py b/tests/e2e/test_mistral.py
index 2468a45e9..740fa6eed 100644
--- a/tests/e2e/test_mistral.py
+++ b/tests/e2e/test_mistral.py
@@ -39,7 +39,7 @@ class TestMistral(unittest.TestCase):
                 "lora_alpha": 64,
                 "lora_dropout": 0.05,
                 "lora_target_linear": True,
-                "val_set_size": 0.1,
+                "val_set_size": 0.02,
                 "special_tokens": {
                     "unk_token": "<unk>",
                     "bos_token": "<s>",
@@ -80,7 +80,7 @@ class TestMistral(unittest.TestCase):
                 "base_model": "openaccess-ai-collective/tiny-mistral",
                 "flash_attention": True,
                 "sequence_len": 1024,
-                "val_set_size": 0.1,
+                "val_set_size": 0.02,
                 "special_tokens": {
                     "unk_token": "<unk>",
                     "bos_token": "<s>",
diff --git a/tests/e2e/test_mixtral.py b/tests/e2e/test_mixtral.py
index f31920be6..4e0693b94 100644
--- a/tests/e2e/test_mixtral.py
+++ b/tests/e2e/test_mixtral.py
@@ -49,7 +49,7 @@ class TestMixtral(unittest.TestCase):
                     "q_proj",
                     "w2",
                 ],
-                "val_set_size": 0.1,
+                "val_set_size": 0.02,
                 "special_tokens": {},
                 "datasets": [
                     {
@@ -105,7 +105,7 @@ class TestMixtral(unittest.TestCase):
                     "q_proj",
                     "w2",
                 ],
-                "val_set_size": 0.1,
+                "val_set_size": 0.02,
                 "special_tokens": {},
                 "datasets": [
                     {
@@ -160,7 +160,7 @@ class TestMixtral(unittest.TestCase):
                     "q_proj",
                     "w2",
                 ],
-                "val_set_size": 0.1,
+                "val_set_size": 0.02,
                 "special_tokens": {},
                 "datasets": [
                     {
@@ -219,7 +219,7 @@ class TestMixtral(unittest.TestCase):
                     "q_proj",
                     "w2",
                 ],
-                "val_set_size": 0.1,
+                "val_set_size": 0.02,
                 "special_tokens": {},
                 "datasets": [
                     {
@@ -265,7 +265,7 @@ class TestMixtral(unittest.TestCase):
                 "tokenizer_config": "LoneStriker/Mixtral-8x7B-v0.1-HF",
                 "flash_attention": True,
                 "sequence_len": 1024,
-                "val_set_size": 0.1,
+                "val_set_size": 0.02,
                 "special_tokens": {},
                 "datasets": [
                     {
diff --git a/tests/e2e/test_optimizers.py b/tests/e2e/test_optimizers.py
index 43a4735aa..8a82e3469 100644
--- a/tests/e2e/test_optimizers.py
+++ b/tests/e2e/test_optimizers.py
@@ -37,7 +37,7 @@ class TestCustomOptimizers(unittest.TestCase):
                 "lora_alpha": 16,
                 "lora_dropout": 0.05,
                 "lora_target_linear": True,
-                "val_set_size": 0.1,
+                "val_set_size": 0.02,
                 "special_tokens": {
                     "unk_token": "<unk>",
                     "bos_token": "<s>",
@@ -84,7 +84,7 @@ class TestCustomOptimizers(unittest.TestCase):
                 "lora_alpha": 16,
                 "lora_dropout": 0.05,
                 "lora_target_linear": True,
-                "val_set_size": 0.1,
+                "val_set_size": 0.02,
                 "special_tokens": {
                     "unk_token": "<unk>",
                     "bos_token": "<s>",
@@ -131,7 +131,7 @@ class TestCustomOptimizers(unittest.TestCase):
                 "lora_alpha": 16,
                 "lora_dropout": 0.05,
                 "lora_target_linear": True,
-                "val_set_size": 0.1,
+                "val_set_size": 0.02,
                 "special_tokens": {
                     "unk_token": "<unk>",
                     "bos_token": "<s>",
diff --git a/tests/e2e/test_packing_loss.py b/tests/e2e/test_packing_loss.py
index 30054962c..4e8e70419 100644
--- a/tests/e2e/test_packing_loss.py
+++ b/tests/e2e/test_packing_loss.py
@@ -46,7 +46,7 @@ class TestPackedLlama(unittest.TestCase):
                 ],
                 "num_epochs": 1,
                 "micro_batch_size": 2,
-                "gradient_accumulation_steps": 4,
+                "gradient_accumulation_steps": 2,
                 "output_dir": temp_dir,
                 "learning_rate": 0.00001,
                 "optimizer": "adamw_torch_fused",
diff --git a/tests/e2e/test_phi.py b/tests/e2e/test_phi.py
index 49f9261c9..268646432 100644
--- a/tests/e2e/test_phi.py
+++ b/tests/e2e/test_phi.py
@@ -35,7 +35,7 @@ class TestPhi(unittest.TestCase):
                 "sample_packing": False,
                 "load_in_8bit": False,
                 "adapter": None,
-                "val_set_size": 0.1,
+                "val_set_size": 0.02,
                 "special_tokens": {
                     "pad_token": "<|endoftext|>",
                 },
@@ -85,7 +85,7 @@ class TestPhi(unittest.TestCase):
                 "lora_alpha": 32,
                 "lora_dropout": 0.05,
                 "lora_target_linear": True,
-                "val_set_size": 0.1,
+                "val_set_size": 0.02,
                 "special_tokens": {
                     "pad_token": "<|endoftext|>",
                 },
diff --git a/tests/e2e/test_schedulers.py b/tests/e2e/test_schedulers.py
index 2d5040ae3..c20cebf4e 100644
--- a/tests/e2e/test_schedulers.py
+++ b/tests/e2e/test_schedulers.py
@@ -37,7 +37,7 @@ class TestCustomSchedulers(unittest.TestCase):
                 "lora_alpha": 16,
                 "lora_dropout": 0.05,
                 "lora_target_linear": True,
-                "val_set_size": 0.1,
+                "val_set_size": 0.02,
                 "special_tokens": {
                     "unk_token": "<unk>",
                     "bos_token": "<s>",
diff --git a/tests/test_exact_deduplication.py b/tests/test_exact_deduplication.py
index a75f97f78..4d069a11d 100644
--- a/tests/test_exact_deduplication.py
+++ b/tests/test_exact_deduplication.py
@@ -313,7 +313,7 @@ class TestDeduplicateNonRL(unittest.TestCase):
                 },
            ],
            "val_set_size": 0.0,
-           "gradient_accumulation_steps": 4,
+           "gradient_accumulation_steps": 2,
            "batch_size": 10,
            "micro_batch_size": 10,
            "num_epochs": 1,
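
Note on the recurring `"split": "train[:10%]"` additions: they use the Hugging Face `datasets` slice syntax, which downloads and tokenizes only a prefix of the split instead of all of Alpaca's ~52k rows. A minimal sketch of what that slicing does, outside of axolotl's config plumbing (the dataset name is taken from the diff; everything else is illustrative):

    # Illustrative only -- the tests load data via axolotl's `datasets` config,
    # but the "train[:10%]" string is interpreted by the same slicing API.
    from datasets import load_dataset

    # "train[:10%]" keeps just the first 10% of the train split.
    subset = load_dataset("tatsu-lab/alpaca", split="train[:10%]")
    full = load_dataset("tatsu-lab/alpaca", split="train")

    print(len(subset), len(full))  # len(subset) is roughly len(full) // 10

Together with the smaller `val_set_size` and halved `gradient_accumulation_steps` (fewer micro-batches per optimizer step), this shrinks per-test data volume and step count, which is presumably the point of the patch: faster e2e runs with the same coverage.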