diff --git a/requirements.txt b/requirements.txt index eeb3b864d..215bc1271 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,11 +12,11 @@ liger-kernel==0.6.0 packaging==23.2 huggingface_hub>=0.33.0 -peft==0.15.2 +peft==0.16.0 transformers==4.53.2 tokenizers>=0.21.1 accelerate==1.8.1 -datasets==3.6.0 +datasets==4.0.0 deepspeed>=0.17.0 trl==0.19.1 hf_xet==1.1.2 diff --git a/src/axolotl/utils/schemas/validation.py b/src/axolotl/utils/schemas/validation.py index 534d89a98..bf2bc9070 100644 --- a/src/axolotl/utils/schemas/validation.py +++ b/src/axolotl/utils/schemas/validation.py @@ -865,6 +865,7 @@ class OptimizationValidationMixin: and hasattr(self, "save_safetensors") and self.save_safetensors and self.fsdp_config.get("state_dict_type", "") == "SHARDED_STATE_DICT" + and str(getattr(self, "fsdp_version", "1")) != "2" ): raise ValueError( "FSDP SHARDED_STATE_DICT not compatible with save_safetensors" diff --git a/tests/e2e/multigpu/test_llama.py b/tests/e2e/multigpu/test_llama.py index fcc174f27..f0c74fbf8 100644 --- a/tests/e2e/multigpu/test_llama.py +++ b/tests/e2e/multigpu/test_llama.py @@ -391,7 +391,10 @@ class TestMultiGPULlama: @pytest.mark.parametrize( "fsdp_state_dict_type", - ["FULL_STATE_DICT", "SHARDED_STATE_DICT"], + [ + "FULL_STATE_DICT", + # "SHARDED_STATE_DICT", # not supported since intermediate checkpoints fail with fsdp1 + ], ) def test_fsdp_packed(self, temp_dir, fsdp_state_dict_type): # pylint: disable=duplicate-code @@ -413,7 +416,8 @@ class TestMultiGPULlama: }, ], "num_epochs": 1, - "max_steps": 2, + "max_steps": 3, + "save_steps": 2, "micro_batch_size": 2, "gradient_accumulation_steps": 2, # "gradient_checkpointing": True, @@ -597,7 +601,7 @@ class TestMultiGPULlama: "fsdp_use_orig_params": False, "fsdp_cpu_ram_efficient_loading": True, "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer", - "fsdp_state_dict_type": "SHARDED_STATE_DICT", + "fsdp_state_dict_type": "FULL_STATE_DICT", "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP", }, "use_tensorboard": True,