upgrade peft==0.16.0 and datasets==4.0.0 (#2917) [skip ci]
* upgrade peft to 0.16.0 * upgrade datasets to 4.0.0 * refactor dupes from merge/rebase * fix check for fsdp1 + sharded_state_dict * use full state dict for ci
This commit is contained in:
@@ -12,11 +12,11 @@ liger-kernel==0.6.0
|
|||||||
packaging==23.2
|
packaging==23.2
|
||||||
|
|
||||||
huggingface_hub>=0.33.0
|
huggingface_hub>=0.33.0
|
||||||
peft==0.15.2
|
peft==0.16.0
|
||||||
transformers==4.53.2
|
transformers==4.53.2
|
||||||
tokenizers>=0.21.1
|
tokenizers>=0.21.1
|
||||||
accelerate==1.8.1
|
accelerate==1.8.1
|
||||||
datasets==3.6.0
|
datasets==4.0.0
|
||||||
deepspeed>=0.17.0
|
deepspeed>=0.17.0
|
||||||
trl==0.19.1
|
trl==0.19.1
|
||||||
hf_xet==1.1.2
|
hf_xet==1.1.2
|
||||||
|
|||||||
@@ -865,6 +865,7 @@ class OptimizationValidationMixin:
|
|||||||
and hasattr(self, "save_safetensors")
|
and hasattr(self, "save_safetensors")
|
||||||
and self.save_safetensors
|
and self.save_safetensors
|
||||||
and self.fsdp_config.get("state_dict_type", "") == "SHARDED_STATE_DICT"
|
and self.fsdp_config.get("state_dict_type", "") == "SHARDED_STATE_DICT"
|
||||||
|
and str(getattr(self, "fsdp_version", "1")) != "2"
|
||||||
):
|
):
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"FSDP SHARDED_STATE_DICT not compatible with save_safetensors"
|
"FSDP SHARDED_STATE_DICT not compatible with save_safetensors"
|
||||||
|
|||||||
@@ -391,7 +391,10 @@ class TestMultiGPULlama:
|
|||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"fsdp_state_dict_type",
|
"fsdp_state_dict_type",
|
||||||
["FULL_STATE_DICT", "SHARDED_STATE_DICT"],
|
[
|
||||||
|
"FULL_STATE_DICT",
|
||||||
|
# "SHARDED_STATE_DICT", # not supported since intermediate checkpoints fail with fsdp1
|
||||||
|
],
|
||||||
)
|
)
|
||||||
def test_fsdp_packed(self, temp_dir, fsdp_state_dict_type):
|
def test_fsdp_packed(self, temp_dir, fsdp_state_dict_type):
|
||||||
# pylint: disable=duplicate-code
|
# pylint: disable=duplicate-code
|
||||||
@@ -413,7 +416,8 @@ class TestMultiGPULlama:
|
|||||||
},
|
},
|
||||||
],
|
],
|
||||||
"num_epochs": 1,
|
"num_epochs": 1,
|
||||||
"max_steps": 2,
|
"max_steps": 3,
|
||||||
|
"save_steps": 2,
|
||||||
"micro_batch_size": 2,
|
"micro_batch_size": 2,
|
||||||
"gradient_accumulation_steps": 2,
|
"gradient_accumulation_steps": 2,
|
||||||
# "gradient_checkpointing": True,
|
# "gradient_checkpointing": True,
|
||||||
@@ -597,7 +601,7 @@ class TestMultiGPULlama:
|
|||||||
"fsdp_use_orig_params": False,
|
"fsdp_use_orig_params": False,
|
||||||
"fsdp_cpu_ram_efficient_loading": True,
|
"fsdp_cpu_ram_efficient_loading": True,
|
||||||
"fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
|
"fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
|
||||||
"fsdp_state_dict_type": "SHARDED_STATE_DICT",
|
"fsdp_state_dict_type": "FULL_STATE_DICT",
|
||||||
"fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
|
"fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
|
||||||
},
|
},
|
||||||
"use_tensorboard": True,
|
"use_tensorboard": True,
|
||||||
|
|||||||
Reference in New Issue
Block a user