upgrade trl and accelerate (#3161)
* upgrade trl==0.23.0 * upgrade accelerate patch fix * add hints when using gradient_checkpointing with DPO * set gradient-checkpointing properly
This commit is contained in:
@@ -15,10 +15,10 @@ huggingface_hub>=0.33.0
|
|||||||
peft>=0.17.0
|
peft>=0.17.0
|
||||||
transformers==4.56.1
|
transformers==4.56.1
|
||||||
tokenizers>=0.21.1
|
tokenizers>=0.21.1
|
||||||
accelerate==1.10.0
|
accelerate==1.10.1
|
||||||
datasets==4.0.0
|
datasets==4.0.0
|
||||||
deepspeed>=0.17.0
|
deepspeed>=0.17.0
|
||||||
trl==0.21.0
|
trl==0.23.0
|
||||||
hf_xet==1.1.5
|
hf_xet==1.1.5
|
||||||
kernels==0.9.0
|
kernels==0.9.0
|
||||||
trackio
|
trackio
|
||||||
|
|||||||
@@ -435,7 +435,7 @@ class TrainerBuilderBase(abc.ABC):
|
|||||||
# don't use the HF gradient checkpointing, manually wrap
|
# don't use the HF gradient checkpointing, manually wrap
|
||||||
training_args_kwargs["gradient_checkpointing"] = False
|
training_args_kwargs["gradient_checkpointing"] = False
|
||||||
training_args_kwargs["activation_offloading"] = True
|
training_args_kwargs["activation_offloading"] = True
|
||||||
elif self.cfg.gradient_checkpointing:
|
elif self.cfg.gradient_checkpointing is not None:
|
||||||
training_args_kwargs["gradient_checkpointing"] = (
|
training_args_kwargs["gradient_checkpointing"] = (
|
||||||
self.cfg.gradient_checkpointing
|
self.cfg.gradient_checkpointing
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -1378,6 +1378,21 @@ class ComplexValidationMixin:
|
|||||||
|
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
def hint_gradient_checkpointing_dpo_lora_ddp(self):
|
||||||
|
if (
|
||||||
|
(self.gradient_checkpointing is True or self.gradient_checkpointing is None)
|
||||||
|
and self.capabilities
|
||||||
|
and self.capabilities.get("n_gpu", 1) > 1
|
||||||
|
and self.adapter in ("lora", "qlora")
|
||||||
|
and self.rl == RLType.DPO
|
||||||
|
and not self.fsdp
|
||||||
|
and not self.deepspeed
|
||||||
|
):
|
||||||
|
LOG.warning(
|
||||||
|
"gradient_checkpointing with DPO + DDP + LoRA is not recommended."
|
||||||
|
)
|
||||||
|
return self
|
||||||
|
|
||||||
|
|
||||||
class DistributedValidationMixin:
|
class DistributedValidationMixin:
|
||||||
"""validation for distributed training."""
|
"""validation for distributed training."""
|
||||||
|
|||||||
@@ -199,7 +199,7 @@ class TestMultiGPULlama:
|
|||||||
"max_steps": 2,
|
"max_steps": 2,
|
||||||
"micro_batch_size": 2,
|
"micro_batch_size": 2,
|
||||||
"gradient_accumulation_steps": 2,
|
"gradient_accumulation_steps": 2,
|
||||||
# "gradient_checkpointing": True,
|
"gradient_checkpointing": False,
|
||||||
"output_dir": temp_dir,
|
"output_dir": temp_dir,
|
||||||
"dataset_prepared_path": temp_dir + "/last_run_prepared",
|
"dataset_prepared_path": temp_dir + "/last_run_prepared",
|
||||||
"warmup_steps": 0,
|
"warmup_steps": 0,
|
||||||
@@ -278,7 +278,7 @@ class TestMultiGPULlama:
|
|||||||
"max_steps": 2,
|
"max_steps": 2,
|
||||||
"micro_batch_size": 2,
|
"micro_batch_size": 2,
|
||||||
"gradient_accumulation_steps": 2,
|
"gradient_accumulation_steps": 2,
|
||||||
# "gradient_checkpointing": True,
|
"gradient_checkpointing": False,
|
||||||
"output_dir": temp_dir,
|
"output_dir": temp_dir,
|
||||||
"dataset_prepared_path": temp_dir + "/last_run_prepared",
|
"dataset_prepared_path": temp_dir + "/last_run_prepared",
|
||||||
"warmup_steps": 0,
|
"warmup_steps": 0,
|
||||||
|
|||||||
Reference in New Issue
Block a user