make e2e tests a bit faster by reducing test split size (#2522) [skip ci]
* [ci] make e2e tests a bit faster by reducing test split size
* use 10% split of alpaca dataset to speed up dataset loading/tokenization
* reduce gradient accumulation steps (gradient_accumulation_steps) 4->2 for most e2e tests
* increase val set size for packing
This commit is contained in:
@@ -25,7 +25,7 @@ def min_cfg(temp_dir):
|
|||||||
],
|
],
|
||||||
"cut_cross_entropy": True,
|
"cut_cross_entropy": True,
|
||||||
"sequence_len": 1024,
|
"sequence_len": 1024,
|
||||||
"val_set_size": 0.1,
|
"val_set_size": 0.02,
|
||||||
"special_tokens": {
|
"special_tokens": {
|
||||||
"pad_token": "<|endoftext|>",
|
"pad_token": "<|endoftext|>",
|
||||||
},
|
},
|
||||||
@@ -79,7 +79,7 @@ class TestCutCrossEntropyIntegration:
|
|||||||
],
|
],
|
||||||
"cut_cross_entropy": True,
|
"cut_cross_entropy": True,
|
||||||
"sequence_len": 1024,
|
"sequence_len": 1024,
|
||||||
"val_set_size": 0.1,
|
"val_set_size": 0.02,
|
||||||
"special_tokens": {
|
"special_tokens": {
|
||||||
"pad_token": "<|endoftext|>",
|
"pad_token": "<|endoftext|>",
|
||||||
},
|
},
|
||||||
|
|||||||
@@ -55,7 +55,7 @@ class TestPackedFlex:
|
|||||||
],
|
],
|
||||||
"num_epochs": 1,
|
"num_epochs": 1,
|
||||||
"micro_batch_size": 2,
|
"micro_batch_size": 2,
|
||||||
"gradient_accumulation_steps": 4,
|
"gradient_accumulation_steps": 2,
|
||||||
"output_dir": temp_dir,
|
"output_dir": temp_dir,
|
||||||
"learning_rate": 0.00001,
|
"learning_rate": 0.00001,
|
||||||
"optimizer": "adamw_torch_fused",
|
"optimizer": "adamw_torch_fused",
|
||||||
|
|||||||
@@ -58,12 +58,13 @@ class TestMultiGPULlama:
|
|||||||
{
|
{
|
||||||
"path": "tatsu-lab/alpaca",
|
"path": "tatsu-lab/alpaca",
|
||||||
"type": "alpaca",
|
"type": "alpaca",
|
||||||
|
"split": "train[:10%]",
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
"num_epochs": 1,
|
"num_epochs": 1,
|
||||||
"max_steps": 2,
|
"max_steps": 2,
|
||||||
"micro_batch_size": 1,
|
"micro_batch_size": 1,
|
||||||
"gradient_accumulation_steps": 4,
|
"gradient_accumulation_steps": 2,
|
||||||
# "gradient_checkpointing": True,
|
# "gradient_checkpointing": True,
|
||||||
"output_dir": temp_dir,
|
"output_dir": temp_dir,
|
||||||
"learning_rate": 0.00001,
|
"learning_rate": 0.00001,
|
||||||
@@ -201,7 +202,7 @@ class TestMultiGPULlama:
|
|||||||
"num_epochs": 1,
|
"num_epochs": 1,
|
||||||
"max_steps": 2,
|
"max_steps": 2,
|
||||||
"micro_batch_size": 2,
|
"micro_batch_size": 2,
|
||||||
"gradient_accumulation_steps": 4,
|
"gradient_accumulation_steps": 2,
|
||||||
# "gradient_checkpointing": True,
|
# "gradient_checkpointing": True,
|
||||||
"output_dir": temp_dir,
|
"output_dir": temp_dir,
|
||||||
"warmup_steps": 0,
|
"warmup_steps": 0,
|
||||||
@@ -279,7 +280,7 @@ class TestMultiGPULlama:
|
|||||||
"num_epochs": 1,
|
"num_epochs": 1,
|
||||||
"max_steps": 2,
|
"max_steps": 2,
|
||||||
"micro_batch_size": 2,
|
"micro_batch_size": 2,
|
||||||
"gradient_accumulation_steps": 4,
|
"gradient_accumulation_steps": 2,
|
||||||
# "gradient_checkpointing": True,
|
# "gradient_checkpointing": True,
|
||||||
"output_dir": temp_dir,
|
"output_dir": temp_dir,
|
||||||
"warmup_steps": 0,
|
"warmup_steps": 0,
|
||||||
@@ -335,6 +336,7 @@ class TestMultiGPULlama:
|
|||||||
{
|
{
|
||||||
"path": "tatsu-lab/alpaca",
|
"path": "tatsu-lab/alpaca",
|
||||||
"type": "alpaca",
|
"type": "alpaca",
|
||||||
|
"split": "train[:10%]",
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
"num_epochs": 1,
|
"num_epochs": 1,
|
||||||
@@ -398,7 +400,7 @@ class TestMultiGPULlama:
|
|||||||
"sample_packing": True,
|
"sample_packing": True,
|
||||||
"pad_to_sequence_len": True,
|
"pad_to_sequence_len": True,
|
||||||
"sequence_len": 1024,
|
"sequence_len": 1024,
|
||||||
"val_set_size": 0.01,
|
"val_set_size": 0.05,
|
||||||
"special_tokens": {
|
"special_tokens": {
|
||||||
"pad_token": "<|endoftext|>",
|
"pad_token": "<|endoftext|>",
|
||||||
},
|
},
|
||||||
@@ -406,6 +408,7 @@ class TestMultiGPULlama:
|
|||||||
{
|
{
|
||||||
"path": "tatsu-lab/alpaca",
|
"path": "tatsu-lab/alpaca",
|
||||||
"type": "alpaca",
|
"type": "alpaca",
|
||||||
|
"split": "train[:10%]",
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
"num_epochs": 1,
|
"num_epochs": 1,
|
||||||
@@ -484,6 +487,7 @@ class TestMultiGPULlama:
|
|||||||
{
|
{
|
||||||
"path": "tatsu-lab/alpaca",
|
"path": "tatsu-lab/alpaca",
|
||||||
"type": "alpaca",
|
"type": "alpaca",
|
||||||
|
"split": "train[:10%]",
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
"num_epochs": 1,
|
"num_epochs": 1,
|
||||||
@@ -565,7 +569,7 @@ class TestMultiGPULlama:
|
|||||||
{
|
{
|
||||||
"path": "tatsu-lab/alpaca",
|
"path": "tatsu-lab/alpaca",
|
||||||
"type": "alpaca",
|
"type": "alpaca",
|
||||||
"split": "train[:25%]",
|
"split": "train[:10%]",
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
"num_epochs": 1,
|
"num_epochs": 1,
|
||||||
@@ -660,7 +664,7 @@ class TestMultiGPULlama:
|
|||||||
"sample_packing": True,
|
"sample_packing": True,
|
||||||
"pad_to_sequence_len": True,
|
"pad_to_sequence_len": True,
|
||||||
"sequence_len": 1024,
|
"sequence_len": 1024,
|
||||||
"val_set_size": 0.01,
|
"val_set_size": 0.05,
|
||||||
"special_tokens": {
|
"special_tokens": {
|
||||||
"pad_token": "<|endoftext|>",
|
"pad_token": "<|endoftext|>",
|
||||||
},
|
},
|
||||||
@@ -668,6 +672,7 @@ class TestMultiGPULlama:
|
|||||||
{
|
{
|
||||||
"path": "tatsu-lab/alpaca",
|
"path": "tatsu-lab/alpaca",
|
||||||
"type": "alpaca",
|
"type": "alpaca",
|
||||||
|
"split": "train[:10%]",
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
"num_epochs": 1,
|
"num_epochs": 1,
|
||||||
@@ -741,6 +746,7 @@ class TestMultiGPULlama:
|
|||||||
{
|
{
|
||||||
"path": "tatsu-lab/alpaca",
|
"path": "tatsu-lab/alpaca",
|
||||||
"type": "alpaca",
|
"type": "alpaca",
|
||||||
|
"split": "train[:10%]",
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
"num_epochs": 1,
|
"num_epochs": 1,
|
||||||
@@ -814,6 +820,7 @@ class TestMultiGPULlama:
|
|||||||
{
|
{
|
||||||
"path": "tatsu-lab/alpaca",
|
"path": "tatsu-lab/alpaca",
|
||||||
"type": "alpaca",
|
"type": "alpaca",
|
||||||
|
"split": "train[:10%]",
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
"num_epochs": 1,
|
"num_epochs": 1,
|
||||||
|
|||||||
@@ -45,6 +45,7 @@ class TestMultiGPURay:
|
|||||||
{
|
{
|
||||||
"path": "tatsu-lab/alpaca",
|
"path": "tatsu-lab/alpaca",
|
||||||
"type": "alpaca",
|
"type": "alpaca",
|
||||||
|
"split": "train[:10%]",
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
"num_epochs": 1,
|
"num_epochs": 1,
|
||||||
@@ -103,6 +104,7 @@ class TestMultiGPURay:
|
|||||||
{
|
{
|
||||||
"path": "tatsu-lab/alpaca",
|
"path": "tatsu-lab/alpaca",
|
||||||
"type": "alpaca",
|
"type": "alpaca",
|
||||||
|
"split": "train[:10%]",
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
"num_epochs": 1,
|
"num_epochs": 1,
|
||||||
|
|||||||
@@ -40,6 +40,7 @@ class TestSequenceParallelism:
|
|||||||
{
|
{
|
||||||
"path": "tatsu-lab/alpaca",
|
"path": "tatsu-lab/alpaca",
|
||||||
"type": "alpaca",
|
"type": "alpaca",
|
||||||
|
"split": "train[:10%]",
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
"num_epochs": 1,
|
"num_epochs": 1,
|
||||||
|
|||||||
@@ -43,7 +43,7 @@ class TestLlamaShiftedSparseAttention(unittest.TestCase):
|
|||||||
"lora_alpha": 16,
|
"lora_alpha": 16,
|
||||||
"lora_dropout": 0.05,
|
"lora_dropout": 0.05,
|
||||||
"lora_target_linear": True,
|
"lora_target_linear": True,
|
||||||
"val_set_size": 0.1,
|
"val_set_size": 0.02,
|
||||||
"special_tokens": {},
|
"special_tokens": {},
|
||||||
"datasets": [
|
"datasets": [
|
||||||
{
|
{
|
||||||
@@ -83,7 +83,7 @@ class TestLlamaShiftedSparseAttention(unittest.TestCase):
|
|||||||
"sample_packing": False,
|
"sample_packing": False,
|
||||||
"flash_attention": True,
|
"flash_attention": True,
|
||||||
"s2_attention": True,
|
"s2_attention": True,
|
||||||
"val_set_size": 0.1,
|
"val_set_size": 0.02,
|
||||||
"special_tokens": {},
|
"special_tokens": {},
|
||||||
"datasets": [
|
"datasets": [
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -27,7 +27,7 @@ class TestModelPatches(unittest.TestCase):
|
|||||||
"flash_attention": True,
|
"flash_attention": True,
|
||||||
"sample_packing": True,
|
"sample_packing": True,
|
||||||
"sequence_len": 2048,
|
"sequence_len": 2048,
|
||||||
"val_set_size": 0.1,
|
"val_set_size": 0.02,
|
||||||
"special_tokens": {},
|
"special_tokens": {},
|
||||||
"datasets": [
|
"datasets": [
|
||||||
{
|
{
|
||||||
@@ -59,7 +59,7 @@ class TestModelPatches(unittest.TestCase):
|
|||||||
"flash_attention": True,
|
"flash_attention": True,
|
||||||
"sample_packing": True,
|
"sample_packing": True,
|
||||||
"sequence_len": 2048,
|
"sequence_len": 2048,
|
||||||
"val_set_size": 0.1,
|
"val_set_size": 0.02,
|
||||||
"special_tokens": {},
|
"special_tokens": {},
|
||||||
"datasets": [
|
"datasets": [
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -88,7 +88,7 @@ class TestPhiMultipack(unittest.TestCase):
|
|||||||
"lora_alpha": 32,
|
"lora_alpha": 32,
|
||||||
"lora_dropout": 0.05,
|
"lora_dropout": 0.05,
|
||||||
"lora_target_linear": True,
|
"lora_target_linear": True,
|
||||||
"val_set_size": 0.1,
|
"val_set_size": 0.02,
|
||||||
"special_tokens": {
|
"special_tokens": {
|
||||||
"pad_token": "<|endoftext|>",
|
"pad_token": "<|endoftext|>",
|
||||||
},
|
},
|
||||||
|
|||||||
@@ -47,7 +47,7 @@ class TestPackedFlex(unittest.TestCase):
|
|||||||
],
|
],
|
||||||
"num_epochs": 1,
|
"num_epochs": 1,
|
||||||
"micro_batch_size": 2,
|
"micro_batch_size": 2,
|
||||||
"gradient_accumulation_steps": 4,
|
"gradient_accumulation_steps": 2,
|
||||||
"output_dir": temp_dir,
|
"output_dir": temp_dir,
|
||||||
"learning_rate": 0.00001,
|
"learning_rate": 0.00001,
|
||||||
"optimizer": "adamw_torch_fused",
|
"optimizer": "adamw_torch_fused",
|
||||||
|
|||||||
@@ -65,7 +65,7 @@ class TestDeepseekV3:
|
|||||||
"chat_template": "deepseek_v3",
|
"chat_template": "deepseek_v3",
|
||||||
"num_epochs": 1,
|
"num_epochs": 1,
|
||||||
"micro_batch_size": 1,
|
"micro_batch_size": 1,
|
||||||
"gradient_accumulation_steps": 4,
|
"gradient_accumulation_steps": 2,
|
||||||
"output_dir": temp_dir,
|
"output_dir": temp_dir,
|
||||||
"learning_rate": 0.00001,
|
"learning_rate": 0.00001,
|
||||||
"optimizer": "adamw_bnb_8bit",
|
"optimizer": "adamw_bnb_8bit",
|
||||||
@@ -115,7 +115,7 @@ class TestDeepseekV3:
|
|||||||
},
|
},
|
||||||
"num_epochs": 1,
|
"num_epochs": 1,
|
||||||
"micro_batch_size": 1,
|
"micro_batch_size": 1,
|
||||||
"gradient_accumulation_steps": 4,
|
"gradient_accumulation_steps": 2,
|
||||||
"output_dir": temp_dir,
|
"output_dir": temp_dir,
|
||||||
"learning_rate": 0.00001,
|
"learning_rate": 0.00001,
|
||||||
"optimizer": "adamw_bnb_8bit",
|
"optimizer": "adamw_bnb_8bit",
|
||||||
|
|||||||
@@ -41,7 +41,7 @@ class TestFalcon(unittest.TestCase):
|
|||||||
"word_embeddings",
|
"word_embeddings",
|
||||||
"lm_head",
|
"lm_head",
|
||||||
],
|
],
|
||||||
"val_set_size": 0.1,
|
"val_set_size": 0.02,
|
||||||
"special_tokens": {
|
"special_tokens": {
|
||||||
"bos_token": "<|endoftext|>",
|
"bos_token": "<|endoftext|>",
|
||||||
"pad_token": "<|endoftext|>",
|
"pad_token": "<|endoftext|>",
|
||||||
@@ -92,7 +92,7 @@ class TestFalcon(unittest.TestCase):
|
|||||||
"word_embeddings",
|
"word_embeddings",
|
||||||
"lm_head",
|
"lm_head",
|
||||||
],
|
],
|
||||||
"val_set_size": 0.1,
|
"val_set_size": 0.02,
|
||||||
"special_tokens": {
|
"special_tokens": {
|
||||||
"bos_token": "<|endoftext|>",
|
"bos_token": "<|endoftext|>",
|
||||||
"pad_token": "<|endoftext|>",
|
"pad_token": "<|endoftext|>",
|
||||||
@@ -137,7 +137,7 @@ class TestFalcon(unittest.TestCase):
|
|||||||
"base_model": "illuin/tiny-random-FalconForCausalLM",
|
"base_model": "illuin/tiny-random-FalconForCausalLM",
|
||||||
"flash_attention": True,
|
"flash_attention": True,
|
||||||
"sequence_len": 1024,
|
"sequence_len": 1024,
|
||||||
"val_set_size": 0.1,
|
"val_set_size": 0.02,
|
||||||
"special_tokens": {
|
"special_tokens": {
|
||||||
"bos_token": "<|endoftext|>",
|
"bos_token": "<|endoftext|>",
|
||||||
"pad_token": "<|endoftext|>",
|
"pad_token": "<|endoftext|>",
|
||||||
|
|||||||
@@ -62,7 +62,7 @@ class TestGemma2:
|
|||||||
"chat_template": "gemma", # gemma2's template is same as gemma
|
"chat_template": "gemma", # gemma2's template is same as gemma
|
||||||
"num_epochs": 1,
|
"num_epochs": 1,
|
||||||
"micro_batch_size": 1,
|
"micro_batch_size": 1,
|
||||||
"gradient_accumulation_steps": 4,
|
"gradient_accumulation_steps": 2,
|
||||||
"output_dir": temp_dir,
|
"output_dir": temp_dir,
|
||||||
"learning_rate": 0.00001,
|
"learning_rate": 0.00001,
|
||||||
"optimizer": "adamw_bnb_8bit",
|
"optimizer": "adamw_bnb_8bit",
|
||||||
@@ -114,7 +114,7 @@ class TestGemma2:
|
|||||||
},
|
},
|
||||||
"num_epochs": 1,
|
"num_epochs": 1,
|
||||||
"micro_batch_size": 1,
|
"micro_batch_size": 1,
|
||||||
"gradient_accumulation_steps": 4,
|
"gradient_accumulation_steps": 2,
|
||||||
"output_dir": temp_dir,
|
"output_dir": temp_dir,
|
||||||
"learning_rate": 0.00001,
|
"learning_rate": 0.00001,
|
||||||
"optimizer": "adamw_bnb_8bit",
|
"optimizer": "adamw_bnb_8bit",
|
||||||
|
|||||||
@@ -61,7 +61,7 @@ class TestGemma3Text:
|
|||||||
"chat_template": "gemma3",
|
"chat_template": "gemma3",
|
||||||
"num_epochs": 1,
|
"num_epochs": 1,
|
||||||
"micro_batch_size": 1,
|
"micro_batch_size": 1,
|
||||||
"gradient_accumulation_steps": 4,
|
"gradient_accumulation_steps": 2,
|
||||||
"output_dir": temp_dir,
|
"output_dir": temp_dir,
|
||||||
"learning_rate": 0.00001,
|
"learning_rate": 0.00001,
|
||||||
"optimizer": "adamw_bnb_8bit",
|
"optimizer": "adamw_bnb_8bit",
|
||||||
@@ -112,7 +112,7 @@ class TestGemma3Text:
|
|||||||
},
|
},
|
||||||
"num_epochs": 1,
|
"num_epochs": 1,
|
||||||
"micro_batch_size": 1,
|
"micro_batch_size": 1,
|
||||||
"gradient_accumulation_steps": 4,
|
"gradient_accumulation_steps": 2,
|
||||||
"output_dir": temp_dir,
|
"output_dir": temp_dir,
|
||||||
"learning_rate": 0.00001,
|
"learning_rate": 0.00001,
|
||||||
"optimizer": "adamw_bnb_8bit",
|
"optimizer": "adamw_bnb_8bit",
|
||||||
|
|||||||
@@ -30,7 +30,7 @@ class TestLlama:
|
|||||||
"tokenizer_type": "LlamaTokenizer",
|
"tokenizer_type": "LlamaTokenizer",
|
||||||
"trust_remote_code": True,
|
"trust_remote_code": True,
|
||||||
"sequence_len": 512,
|
"sequence_len": 512,
|
||||||
"val_set_size": 0.1,
|
"val_set_size": 0.02,
|
||||||
"special_tokens": {
|
"special_tokens": {
|
||||||
"unk_token": "<unk>",
|
"unk_token": "<unk>",
|
||||||
"bos_token": "<s>",
|
"bos_token": "<s>",
|
||||||
|
|||||||
@@ -52,7 +52,7 @@ class TestLlamaVision(unittest.TestCase):
|
|||||||
],
|
],
|
||||||
"num_epochs": 1,
|
"num_epochs": 1,
|
||||||
"micro_batch_size": 1,
|
"micro_batch_size": 1,
|
||||||
"gradient_accumulation_steps": 4,
|
"gradient_accumulation_steps": 2,
|
||||||
"output_dir": temp_dir,
|
"output_dir": temp_dir,
|
||||||
"learning_rate": 0.00001,
|
"learning_rate": 0.00001,
|
||||||
"optimizer": "adamw_bnb_8bit",
|
"optimizer": "adamw_bnb_8bit",
|
||||||
@@ -99,7 +99,7 @@ class TestLlamaVision(unittest.TestCase):
|
|||||||
],
|
],
|
||||||
"num_epochs": 1,
|
"num_epochs": 1,
|
||||||
"micro_batch_size": 1,
|
"micro_batch_size": 1,
|
||||||
"gradient_accumulation_steps": 4,
|
"gradient_accumulation_steps": 2,
|
||||||
"output_dir": temp_dir,
|
"output_dir": temp_dir,
|
||||||
"learning_rate": 0.00001,
|
"learning_rate": 0.00001,
|
||||||
"optimizer": "adamw_bnb_8bit",
|
"optimizer": "adamw_bnb_8bit",
|
||||||
|
|||||||
@@ -36,7 +36,7 @@ class TestLoadModelUtils:
|
|||||||
"lora_alpha": 16,
|
"lora_alpha": 16,
|
||||||
"lora_dropout": 0.05,
|
"lora_dropout": 0.05,
|
||||||
"lora_target_linear": True,
|
"lora_target_linear": True,
|
||||||
"val_set_size": 0.1,
|
"val_set_size": 0.02,
|
||||||
"special_tokens": {
|
"special_tokens": {
|
||||||
"unk_token": "<unk>",
|
"unk_token": "<unk>",
|
||||||
"bos_token": "<s>",
|
"bos_token": "<s>",
|
||||||
|
|||||||
@@ -37,7 +37,7 @@ class TestLoraLlama(unittest.TestCase):
|
|||||||
"lora_alpha": 16,
|
"lora_alpha": 16,
|
||||||
"lora_dropout": 0.05,
|
"lora_dropout": 0.05,
|
||||||
"lora_target_linear": True,
|
"lora_target_linear": True,
|
||||||
"val_set_size": 0.1,
|
"val_set_size": 0.02,
|
||||||
"special_tokens": {
|
"special_tokens": {
|
||||||
"unk_token": "<unk>",
|
"unk_token": "<unk>",
|
||||||
"bos_token": "<s>",
|
"bos_token": "<s>",
|
||||||
|
|||||||
@@ -39,7 +39,7 @@ class TestMistral(unittest.TestCase):
|
|||||||
"lora_alpha": 64,
|
"lora_alpha": 64,
|
||||||
"lora_dropout": 0.05,
|
"lora_dropout": 0.05,
|
||||||
"lora_target_linear": True,
|
"lora_target_linear": True,
|
||||||
"val_set_size": 0.1,
|
"val_set_size": 0.02,
|
||||||
"special_tokens": {
|
"special_tokens": {
|
||||||
"unk_token": "<unk>",
|
"unk_token": "<unk>",
|
||||||
"bos_token": "<s>",
|
"bos_token": "<s>",
|
||||||
@@ -80,7 +80,7 @@ class TestMistral(unittest.TestCase):
|
|||||||
"base_model": "openaccess-ai-collective/tiny-mistral",
|
"base_model": "openaccess-ai-collective/tiny-mistral",
|
||||||
"flash_attention": True,
|
"flash_attention": True,
|
||||||
"sequence_len": 1024,
|
"sequence_len": 1024,
|
||||||
"val_set_size": 0.1,
|
"val_set_size": 0.02,
|
||||||
"special_tokens": {
|
"special_tokens": {
|
||||||
"unk_token": "<unk>",
|
"unk_token": "<unk>",
|
||||||
"bos_token": "<s>",
|
"bos_token": "<s>",
|
||||||
|
|||||||
@@ -49,7 +49,7 @@ class TestMixtral(unittest.TestCase):
|
|||||||
"q_proj",
|
"q_proj",
|
||||||
"w2",
|
"w2",
|
||||||
],
|
],
|
||||||
"val_set_size": 0.1,
|
"val_set_size": 0.02,
|
||||||
"special_tokens": {},
|
"special_tokens": {},
|
||||||
"datasets": [
|
"datasets": [
|
||||||
{
|
{
|
||||||
@@ -105,7 +105,7 @@ class TestMixtral(unittest.TestCase):
|
|||||||
"q_proj",
|
"q_proj",
|
||||||
"w2",
|
"w2",
|
||||||
],
|
],
|
||||||
"val_set_size": 0.1,
|
"val_set_size": 0.02,
|
||||||
"special_tokens": {},
|
"special_tokens": {},
|
||||||
"datasets": [
|
"datasets": [
|
||||||
{
|
{
|
||||||
@@ -160,7 +160,7 @@ class TestMixtral(unittest.TestCase):
|
|||||||
"q_proj",
|
"q_proj",
|
||||||
"w2",
|
"w2",
|
||||||
],
|
],
|
||||||
"val_set_size": 0.1,
|
"val_set_size": 0.02,
|
||||||
"special_tokens": {},
|
"special_tokens": {},
|
||||||
"datasets": [
|
"datasets": [
|
||||||
{
|
{
|
||||||
@@ -219,7 +219,7 @@ class TestMixtral(unittest.TestCase):
|
|||||||
"q_proj",
|
"q_proj",
|
||||||
"w2",
|
"w2",
|
||||||
],
|
],
|
||||||
"val_set_size": 0.1,
|
"val_set_size": 0.02,
|
||||||
"special_tokens": {},
|
"special_tokens": {},
|
||||||
"datasets": [
|
"datasets": [
|
||||||
{
|
{
|
||||||
@@ -265,7 +265,7 @@ class TestMixtral(unittest.TestCase):
|
|||||||
"tokenizer_config": "LoneStriker/Mixtral-8x7B-v0.1-HF",
|
"tokenizer_config": "LoneStriker/Mixtral-8x7B-v0.1-HF",
|
||||||
"flash_attention": True,
|
"flash_attention": True,
|
||||||
"sequence_len": 1024,
|
"sequence_len": 1024,
|
||||||
"val_set_size": 0.1,
|
"val_set_size": 0.02,
|
||||||
"special_tokens": {},
|
"special_tokens": {},
|
||||||
"datasets": [
|
"datasets": [
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -37,7 +37,7 @@ class TestCustomOptimizers(unittest.TestCase):
|
|||||||
"lora_alpha": 16,
|
"lora_alpha": 16,
|
||||||
"lora_dropout": 0.05,
|
"lora_dropout": 0.05,
|
||||||
"lora_target_linear": True,
|
"lora_target_linear": True,
|
||||||
"val_set_size": 0.1,
|
"val_set_size": 0.02,
|
||||||
"special_tokens": {
|
"special_tokens": {
|
||||||
"unk_token": "<unk>",
|
"unk_token": "<unk>",
|
||||||
"bos_token": "<s>",
|
"bos_token": "<s>",
|
||||||
@@ -84,7 +84,7 @@ class TestCustomOptimizers(unittest.TestCase):
|
|||||||
"lora_alpha": 16,
|
"lora_alpha": 16,
|
||||||
"lora_dropout": 0.05,
|
"lora_dropout": 0.05,
|
||||||
"lora_target_linear": True,
|
"lora_target_linear": True,
|
||||||
"val_set_size": 0.1,
|
"val_set_size": 0.02,
|
||||||
"special_tokens": {
|
"special_tokens": {
|
||||||
"unk_token": "<unk>",
|
"unk_token": "<unk>",
|
||||||
"bos_token": "<s>",
|
"bos_token": "<s>",
|
||||||
@@ -131,7 +131,7 @@ class TestCustomOptimizers(unittest.TestCase):
|
|||||||
"lora_alpha": 16,
|
"lora_alpha": 16,
|
||||||
"lora_dropout": 0.05,
|
"lora_dropout": 0.05,
|
||||||
"lora_target_linear": True,
|
"lora_target_linear": True,
|
||||||
"val_set_size": 0.1,
|
"val_set_size": 0.02,
|
||||||
"special_tokens": {
|
"special_tokens": {
|
||||||
"unk_token": "<unk>",
|
"unk_token": "<unk>",
|
||||||
"bos_token": "<s>",
|
"bos_token": "<s>",
|
||||||
|
|||||||
@@ -46,7 +46,7 @@ class TestPackedLlama(unittest.TestCase):
|
|||||||
],
|
],
|
||||||
"num_epochs": 1,
|
"num_epochs": 1,
|
||||||
"micro_batch_size": 2,
|
"micro_batch_size": 2,
|
||||||
"gradient_accumulation_steps": 4,
|
"gradient_accumulation_steps": 2,
|
||||||
"output_dir": temp_dir,
|
"output_dir": temp_dir,
|
||||||
"learning_rate": 0.00001,
|
"learning_rate": 0.00001,
|
||||||
"optimizer": "adamw_torch_fused",
|
"optimizer": "adamw_torch_fused",
|
||||||
|
|||||||
@@ -35,7 +35,7 @@ class TestPhi(unittest.TestCase):
|
|||||||
"sample_packing": False,
|
"sample_packing": False,
|
||||||
"load_in_8bit": False,
|
"load_in_8bit": False,
|
||||||
"adapter": None,
|
"adapter": None,
|
||||||
"val_set_size": 0.1,
|
"val_set_size": 0.02,
|
||||||
"special_tokens": {
|
"special_tokens": {
|
||||||
"pad_token": "<|endoftext|>",
|
"pad_token": "<|endoftext|>",
|
||||||
},
|
},
|
||||||
@@ -85,7 +85,7 @@ class TestPhi(unittest.TestCase):
|
|||||||
"lora_alpha": 32,
|
"lora_alpha": 32,
|
||||||
"lora_dropout": 0.05,
|
"lora_dropout": 0.05,
|
||||||
"lora_target_linear": True,
|
"lora_target_linear": True,
|
||||||
"val_set_size": 0.1,
|
"val_set_size": 0.02,
|
||||||
"special_tokens": {
|
"special_tokens": {
|
||||||
"pad_token": "<|endoftext|>",
|
"pad_token": "<|endoftext|>",
|
||||||
},
|
},
|
||||||
|
|||||||
@@ -37,7 +37,7 @@ class TestCustomSchedulers(unittest.TestCase):
|
|||||||
"lora_alpha": 16,
|
"lora_alpha": 16,
|
||||||
"lora_dropout": 0.05,
|
"lora_dropout": 0.05,
|
||||||
"lora_target_linear": True,
|
"lora_target_linear": True,
|
||||||
"val_set_size": 0.1,
|
"val_set_size": 0.02,
|
||||||
"special_tokens": {
|
"special_tokens": {
|
||||||
"unk_token": "<unk>",
|
"unk_token": "<unk>",
|
||||||
"bos_token": "<s>",
|
"bos_token": "<s>",
|
||||||
|
|||||||
@@ -313,7 +313,7 @@ class TestDeduplicateNonRL(unittest.TestCase):
|
|||||||
},
|
},
|
||||||
],
|
],
|
||||||
"val_set_size": 0.0,
|
"val_set_size": 0.0,
|
||||||
"gradient_accumulation_steps": 4,
|
"gradient_accumulation_steps": 2,
|
||||||
"batch_size": 10,
|
"batch_size": 10,
|
||||||
"micro_batch_size": 10,
|
"micro_batch_size": 10,
|
||||||
"num_epochs": 1,
|
"num_epochs": 1,
|
||||||
|
|||||||
Reference in New Issue
Block a user