llama4 support

This commit is contained in:
Wing Lian
2025-04-05 17:47:16 -04:00
parent 5f4af3665d
commit 98827e8f3b
3 changed files with 11 additions and 9 deletions

View File

@@ -13,6 +13,7 @@ from axolotl.monkeypatch.utils import get_unpad_data
SUPPORTED_MULTIPACK_MODEL_TYPES = [ SUPPORTED_MULTIPACK_MODEL_TYPES = [
"mllama_text_model", "mllama_text_model",
"llama", "llama",
"llama4",
"mistral", "mistral",
"mixtral", "mixtral",
"qwen2", "qwen2",

File diff suppressed because one or more lines are too long

View File

@@ -56,7 +56,7 @@ class TestMultiGPULlama:
], ],
"num_epochs": 1, "num_epochs": 1,
"max_steps": 2, "max_steps": 2,
"micro_batch_size": 4, "micro_batch_size": 1,
"gradient_accumulation_steps": 4, "gradient_accumulation_steps": 4,
# "gradient_checkpointing": True, # "gradient_checkpointing": True,
"output_dir": temp_dir, "output_dir": temp_dir,
@@ -193,7 +193,7 @@ class TestMultiGPULlama:
], ],
"num_epochs": 1, "num_epochs": 1,
"max_steps": 2, "max_steps": 2,
"micro_batch_size": 4, "micro_batch_size": 2,
"gradient_accumulation_steps": 4, "gradient_accumulation_steps": 4,
# "gradient_checkpointing": True, # "gradient_checkpointing": True,
"output_dir": temp_dir, "output_dir": temp_dir,
@@ -390,7 +390,7 @@ class TestMultiGPULlama:
"base_model": "HuggingFaceTB/SmolLM2-135M", "base_model": "HuggingFaceTB/SmolLM2-135M",
"sample_packing": True, "sample_packing": True,
"pad_to_sequence_len": True, "pad_to_sequence_len": True,
"sequence_len": 2048, "sequence_len": 1024,
"val_set_size": 0.01, "val_set_size": 0.01,
"special_tokens": { "special_tokens": {
"pad_token": "<|endoftext|>", "pad_token": "<|endoftext|>",
@@ -403,7 +403,7 @@ class TestMultiGPULlama:
], ],
"num_epochs": 1, "num_epochs": 1,
"max_steps": 2, "max_steps": 2,
"micro_batch_size": 4, "micro_batch_size": 2,
"gradient_accumulation_steps": 2, "gradient_accumulation_steps": 2,
# "gradient_checkpointing": True, # "gradient_checkpointing": True,
"output_dir": temp_dir, "output_dir": temp_dir,
@@ -551,7 +551,7 @@ class TestMultiGPULlama:
"sample_packing": True, "sample_packing": True,
"eval_sample_packing": False, "eval_sample_packing": False,
"pad_to_sequence_len": True, "pad_to_sequence_len": True,
"sequence_len": 2048, "sequence_len": 1024,
"val_set_size": 0.01, "val_set_size": 0.01,
"special_tokens": { "special_tokens": {
"pad_token": "<|endoftext|>", "pad_token": "<|endoftext|>",
@@ -565,7 +565,7 @@ class TestMultiGPULlama:
], ],
"num_epochs": 1, "num_epochs": 1,
"max_steps": 2, "max_steps": 2,
"micro_batch_size": 4, "micro_batch_size": 2,
"gradient_accumulation_steps": 2, "gradient_accumulation_steps": 2,
# "gradient_checkpointing": True, # "gradient_checkpointing": True,
"output_dir": temp_dir, "output_dir": temp_dir,
@@ -651,7 +651,7 @@ class TestMultiGPULlama:
"base_model": "HuggingFaceTB/SmolLM2-135M", "base_model": "HuggingFaceTB/SmolLM2-135M",
"sample_packing": True, "sample_packing": True,
"pad_to_sequence_len": True, "pad_to_sequence_len": True,
"sequence_len": 2048, "sequence_len": 1024,
"val_set_size": 0.01, "val_set_size": 0.01,
"special_tokens": { "special_tokens": {
"pad_token": "<|endoftext|>", "pad_token": "<|endoftext|>",
@@ -724,7 +724,7 @@ class TestMultiGPULlama:
"base_model": "HuggingFaceTB/SmolLM2-135M", "base_model": "HuggingFaceTB/SmolLM2-135M",
"sample_packing": True, "sample_packing": True,
"pad_to_sequence_len": True, "pad_to_sequence_len": True,
"sequence_len": 2048, "sequence_len": 1024,
"val_set_size": 0.01, "val_set_size": 0.01,
"special_tokens": { "special_tokens": {
"pad_token": "<|endoftext|>", "pad_token": "<|endoftext|>",
@@ -797,7 +797,7 @@ class TestMultiGPULlama:
"base_model": "HuggingFaceTB/SmolLM2-135M", "base_model": "HuggingFaceTB/SmolLM2-135M",
"sample_packing": True, "sample_packing": True,
"pad_to_sequence_len": True, "pad_to_sequence_len": True,
"sequence_len": 2048, "sequence_len": 1024,
"val_set_size": 0.01, "val_set_size": 0.01,
"special_tokens": { "special_tokens": {
"pad_token": "<|endoftext|>", "pad_token": "<|endoftext|>",