Compare commits
3 Commits
llama-4-z3
...
af3f981f51
| Author | SHA1 | Date | |
|---|---|---|---|
| | af3f981f51 | | |
| | 52b96031b4 | | |
| | 03dcf1a5ea | | |
@@ -12,7 +12,7 @@ liger-kernel==0.5.5
|
||||
packaging==23.2
|
||||
|
||||
peft==0.15.0
|
||||
transformers @ git+https://github.com/huggingface/transformers.git@yet-another-deepspeed
|
||||
transformers==4.50.3
|
||||
tokenizers>=0.21.1
|
||||
accelerate==1.6.0
|
||||
datasets==3.5.0
|
||||
|
||||
@@ -13,7 +13,6 @@ from axolotl.monkeypatch.utils import get_unpad_data
|
||||
SUPPORTED_MULTIPACK_MODEL_TYPES = [
|
||||
"mllama_text_model",
|
||||
"llama",
|
||||
"llama4",
|
||||
"mistral",
|
||||
"mixtral",
|
||||
"qwen2",
|
||||
|
||||
File diff suppressed because one or more lines are too long
@@ -950,6 +950,7 @@ class AxolotlInputConfig(
|
||||
and "8bit" in data.get("optimizer", "")
|
||||
and data.get("fsdp_config")
|
||||
and data["fsdp_config"].get("fsdp_offload_params")
|
||||
and str(data["fsdp_config"].get("fsdp_version")) != "2"
|
||||
):
|
||||
raise ValueError(
|
||||
f"FSDP Offload not compatible with {data.get('optimizer')}"
|
||||
|
||||
@@ -538,6 +538,8 @@ def setup_deepspeed_env(cfg, stage=None):
|
||||
|
||||
def setup_fsdp_envs(cfg):
|
||||
os.environ["ACCELERATE_USE_FSDP"] = "true"
|
||||
if str(cfg.fsdp_version) == "2":
|
||||
os.environ["FSDP_VERSION"] = "2"
|
||||
if cfg.fsdp_config.fsdp_activation_checkpointing:
|
||||
os.environ["FSDP_ACTIVATION_CHECKPOINTING"] = "true"
|
||||
if cfg.fsdp_config.fsdp_offload_params:
|
||||
@@ -556,6 +558,10 @@ def setup_fsdp_envs(cfg):
|
||||
os.environ["FSDP_TRANSFORMER_CLS_TO_WRAP"] = (
|
||||
cfg.fsdp_config.fsdp_transformer_layer_cls_to_wrap
|
||||
)
|
||||
if cfg.fsdp_config.fsdp_reshard_after_forward is not None:
|
||||
os.environ["FSDP_RESHARD_AFTER_FORWARD"] = (
|
||||
"true" if cfg.fsdp_config.fsdp_reshard_after_forward else "false"
|
||||
)
|
||||
|
||||
|
||||
def prepare_optim_env(cfg):
|
||||
|
||||
@@ -56,7 +56,7 @@ class TestMultiGPULlama:
|
||||
],
|
||||
"num_epochs": 1,
|
||||
"max_steps": 2,
|
||||
"micro_batch_size": 1,
|
||||
"micro_batch_size": 4,
|
||||
"gradient_accumulation_steps": 4,
|
||||
# "gradient_checkpointing": True,
|
||||
"output_dir": temp_dir,
|
||||
@@ -193,7 +193,7 @@ class TestMultiGPULlama:
|
||||
],
|
||||
"num_epochs": 1,
|
||||
"max_steps": 2,
|
||||
"micro_batch_size": 2,
|
||||
"micro_batch_size": 4,
|
||||
"gradient_accumulation_steps": 4,
|
||||
# "gradient_checkpointing": True,
|
||||
"output_dir": temp_dir,
|
||||
@@ -390,7 +390,7 @@ class TestMultiGPULlama:
|
||||
"base_model": "HuggingFaceTB/SmolLM2-135M",
|
||||
"sample_packing": True,
|
||||
"pad_to_sequence_len": True,
|
||||
"sequence_len": 1024,
|
||||
"sequence_len": 2048,
|
||||
"val_set_size": 0.01,
|
||||
"special_tokens": {
|
||||
"pad_token": "<|endoftext|>",
|
||||
@@ -403,7 +403,7 @@ class TestMultiGPULlama:
|
||||
],
|
||||
"num_epochs": 1,
|
||||
"max_steps": 2,
|
||||
"micro_batch_size": 2,
|
||||
"micro_batch_size": 4,
|
||||
"gradient_accumulation_steps": 2,
|
||||
# "gradient_checkpointing": True,
|
||||
"output_dir": temp_dir,
|
||||
@@ -469,7 +469,7 @@ class TestMultiGPULlama:
|
||||
"sample_packing": True,
|
||||
"eval_sample_packing": False,
|
||||
"pad_to_sequence_len": True,
|
||||
"sequence_len": 1024,
|
||||
"sequence_len": 2048,
|
||||
"val_set_size": 0.01,
|
||||
"special_tokens": {
|
||||
"pad_token": "<|endoftext|>",
|
||||
@@ -483,7 +483,7 @@ class TestMultiGPULlama:
|
||||
],
|
||||
"num_epochs": 1,
|
||||
"max_steps": 2,
|
||||
"micro_batch_size": 2,
|
||||
"micro_batch_size": 4,
|
||||
"gradient_accumulation_steps": 2,
|
||||
# "gradient_checkpointing": True,
|
||||
"output_dir": temp_dir,
|
||||
@@ -566,7 +566,7 @@ class TestMultiGPULlama:
|
||||
"base_model": "HuggingFaceTB/SmolLM2-135M",
|
||||
"sample_packing": True,
|
||||
"pad_to_sequence_len": True,
|
||||
"sequence_len": 1024,
|
||||
"sequence_len": 2048,
|
||||
"val_set_size": 0.01,
|
||||
"special_tokens": {
|
||||
"pad_token": "<|endoftext|>",
|
||||
@@ -639,7 +639,7 @@ class TestMultiGPULlama:
|
||||
"base_model": "HuggingFaceTB/SmolLM2-135M",
|
||||
"sample_packing": True,
|
||||
"pad_to_sequence_len": True,
|
||||
"sequence_len": 1024,
|
||||
"sequence_len": 2048,
|
||||
"val_set_size": 0.01,
|
||||
"special_tokens": {
|
||||
"pad_token": "<|endoftext|>",
|
||||
@@ -712,7 +712,7 @@ class TestMultiGPULlama:
|
||||
"base_model": "HuggingFaceTB/SmolLM2-135M",
|
||||
"sample_packing": True,
|
||||
"pad_to_sequence_len": True,
|
||||
"sequence_len": 1024,
|
||||
"sequence_len": 2048,
|
||||
"val_set_size": 0.01,
|
||||
"special_tokens": {
|
||||
"pad_token": "<|endoftext|>",
|
||||
|
||||
Reference in New Issue
Block a user