Compare commits

...

3 Commits

Author     SHA1        Message                                                    Date
Wing Lian  9509abccdd  use yet-another-deepspeed branch from transformers#37324  2025-04-06 13:21:45 -04:00
Wing Lian  3acefba9ba  point to branch for potential zero3 fix                    2025-04-06 13:21:45 -04:00
Wing Lian  100e5ea6ea  llama4 support                                             2025-04-06 13:21:45 -04:00
4 changed files with 13 additions and 11 deletions


@@ -12,9 +12,9 @@ liger-kernel==0.5.5
 packaging==23.2
 peft==0.15.0
-transformers==4.50.3
+transformers @ git+https://github.com/huggingface/transformers.git@yet-another-deepspeed
 tokenizers>=0.21.1
-accelerate==1.5.2
+accelerate==1.6.0
 datasets==3.5.0
 deepspeed==0.15.4
 trl==0.16.0
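
The transformers change above swaps a pinned PyPI release for a PEP 508 direct git reference; pip resolves it the same way as pip install "transformers @ git+https://github.com/huggingface/transformers.git@yet-another-deepspeed". Unlike ==4.50.3, a branch pin is not reproducible (the branch head can move between installs), so this reads as a temporary pin until the fix from transformers#37324 lands in a release.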


@@ -13,6 +13,7 @@ from axolotl.monkeypatch.utils import get_unpad_data
 SUPPORTED_MULTIPACK_MODEL_TYPES = [
     "mllama_text_model",
     "llama",
+    "llama4",
     "mistral",
     "mixtral",
     "qwen2",

File diff suppressed because one or more lines are too long


@@ -56,7 +56,7 @@ class TestMultiGPULlama:
             ],
             "num_epochs": 1,
             "max_steps": 2,
-            "micro_batch_size": 4,
+            "micro_batch_size": 1,
             "gradient_accumulation_steps": 4,
             # "gradient_checkpointing": True,
             "output_dir": temp_dir,
@@ -193,7 +193,7 @@ class TestMultiGPULlama:
             ],
             "num_epochs": 1,
             "max_steps": 2,
-            "micro_batch_size": 4,
+            "micro_batch_size": 2,
             "gradient_accumulation_steps": 4,
             # "gradient_checkpointing": True,
             "output_dir": temp_dir,
@@ -390,7 +390,7 @@ class TestMultiGPULlama:
             "base_model": "HuggingFaceTB/SmolLM2-135M",
             "sample_packing": True,
             "pad_to_sequence_len": True,
-            "sequence_len": 2048,
+            "sequence_len": 1024,
             "val_set_size": 0.01,
             "special_tokens": {
                 "pad_token": "<|endoftext|>",
@@ -403,7 +403,7 @@ class TestMultiGPULlama:
             ],
             "num_epochs": 1,
             "max_steps": 2,
-            "micro_batch_size": 4,
+            "micro_batch_size": 2,
             "gradient_accumulation_steps": 2,
             # "gradient_checkpointing": True,
             "output_dir": temp_dir,
@@ -469,7 +469,7 @@ class TestMultiGPULlama:
             "sample_packing": True,
             "eval_sample_packing": False,
             "pad_to_sequence_len": True,
-            "sequence_len": 2048,
+            "sequence_len": 1024,
             "val_set_size": 0.01,
             "special_tokens": {
                 "pad_token": "<|endoftext|>",
@@ -483,7 +483,7 @@ class TestMultiGPULlama:
             ],
             "num_epochs": 1,
             "max_steps": 2,
-            "micro_batch_size": 4,
+            "micro_batch_size": 2,
             "gradient_accumulation_steps": 2,
             # "gradient_checkpointing": True,
             "output_dir": temp_dir,
@@ -566,7 +566,7 @@ class TestMultiGPULlama:
             "base_model": "HuggingFaceTB/SmolLM2-135M",
             "sample_packing": True,
             "pad_to_sequence_len": True,
-            "sequence_len": 2048,
+            "sequence_len": 1024,
             "val_set_size": 0.01,
             "special_tokens": {
                 "pad_token": "<|endoftext|>",
@@ -639,7 +639,7 @@ class TestMultiGPULlama:
             "base_model": "HuggingFaceTB/SmolLM2-135M",
             "sample_packing": True,
             "pad_to_sequence_len": True,
-            "sequence_len": 2048,
+            "sequence_len": 1024,
             "val_set_size": 0.01,
             "special_tokens": {
                 "pad_token": "<|endoftext|>",
@@ -712,7 +712,7 @@ class TestMultiGPULlama:
             "base_model": "HuggingFaceTB/SmolLM2-135M",
             "sample_packing": True,
             "pad_to_sequence_len": True,
-            "sequence_len": 2048,
+            "sequence_len": 1024,
             "val_set_size": 0.01,
             "special_tokens": {
                 "pad_token": "<|endoftext|>",