Fix(config): Update handling of deepspeed config (#404)
* Fix(config): Update handling of deepspeed config
* feat: auto set deepspeed env if deepspeed passed
* fix: update new deepspeed instructions
This commit is contained in:
@@ -519,7 +519,7 @@ tokens:
|
|||||||
fsdp:
|
fsdp:
|
||||||
fsdp_config:
|
fsdp_config:
|
||||||
|
|
||||||
# Deepspeed
|
# Deepspeed config path
|
||||||
deepspeed:
|
deepspeed:
|
||||||
|
|
||||||
# Path to torch distx for optim 'adamw_anyprecision'
|
# Path to torch distx for optim 'adamw_anyprecision'
|
||||||
@@ -570,7 +570,10 @@ fsdp_config:
|
|||||||
fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
|
fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
|
||||||
```
|
```
|
||||||
|
|
||||||
- llama Deepspeed: append `ACCELERATE_USE_DEEPSPEED=true` in front of finetune command
|
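- llama Deepspeed: set `deepspeed` to the path of your DeepSpeed config file (`ACCELERATE_USE_DEEPSPEED=true` is set automatically, so it no longer needs to be prepended to the finetune command)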
|
||||||
|
```yaml
|
||||||
|
deepspeed: # path to config
|
||||||
|
```
|
||||||
|
|
||||||
##### Weights & Biases Logging
|
##### Weights & Biases Logging
|
||||||
|
|
||||||
|
|||||||
@@ -147,7 +147,7 @@ def validate_config(cfg):
|
|||||||
"You should probably set bfloat16 or float16 to true to "
|
"You should probably set bfloat16 or float16 to true to "
|
||||||
"load the model in float16 for BetterTransformers"
|
"load the model in float16 for BetterTransformers"
|
||||||
)
|
)
|
||||||
if int(torch.__version__.split(".")[0]) < 2:
|
if int(torch.__version__.split(".", maxsplit=1)[0]) < 2:
|
||||||
LOG.warning("torch>=2.0.0 required")
|
LOG.warning("torch>=2.0.0 required")
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"flash_optimum for BetterTransformers may not be used with {torch.__version__}"
|
f"flash_optimum for BetterTransformers may not be used with {torch.__version__}"
|
||||||
|
|||||||
@@ -364,6 +364,9 @@ def setup_fsdp_envs(cfg):
|
|||||||
def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_steps):
|
def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_steps):
|
||||||
if cfg.fsdp:
|
if cfg.fsdp:
|
||||||
setup_fsdp_envs(cfg)
|
setup_fsdp_envs(cfg)
|
||||||
|
elif cfg.deepspeed:
|
||||||
|
os.environ["ACCELERATE_USE_DEEPSPEED"] = "true"
|
||||||
|
|
||||||
warmup_steps = (
|
warmup_steps = (
|
||||||
cfg.warmup_steps
|
cfg.warmup_steps
|
||||||
if cfg.warmup_steps is not None
|
if cfg.warmup_steps is not None
|
||||||
@@ -411,21 +414,13 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_
|
|||||||
if cfg.fsdp_config:
|
if cfg.fsdp_config:
|
||||||
training_arguments_kwargs["fsdp_config"] = dict(cfg.fsdp_config)
|
training_arguments_kwargs["fsdp_config"] = dict(cfg.fsdp_config)
|
||||||
|
|
||||||
|
# deepspeed
|
||||||
|
if cfg.deepspeed:
|
||||||
|
training_arguments_kwargs["deepspeed"] = cfg.deepspeed
|
||||||
|
|
||||||
if cfg.lr_quadratic_warmup is not None:
|
if cfg.lr_quadratic_warmup is not None:
|
||||||
training_arguments_kwargs["lr_quadratic_warmup"] = cfg.lr_quadratic_warmup
|
training_arguments_kwargs["lr_quadratic_warmup"] = cfg.lr_quadratic_warmup
|
||||||
|
|
||||||
# deepspeed
|
|
||||||
if (
|
|
||||||
os.environ.get("ACCELERATE_USE_DEEPSPEED") == "true"
|
|
||||||
and torch.cuda.device_count() > 1
|
|
||||||
):
|
|
||||||
if cfg.deepspeed:
|
|
||||||
training_arguments_kwargs["deepspeed"] = cfg.deepspeed
|
|
||||||
else:
|
|
||||||
# make a guess here
|
|
||||||
# TODO search Path("./") for one
|
|
||||||
training_arguments_kwargs["deepspeed"] = "./ds_config.json"
|
|
||||||
|
|
||||||
if cfg.adam_beta1:
|
if cfg.adam_beta1:
|
||||||
training_arguments_kwargs["adam_beta1"] = cfg.adam_beta1
|
training_arguments_kwargs["adam_beta1"] = cfg.adam_beta1
|
||||||
if cfg.adam_beta2:
|
if cfg.adam_beta2:
|
||||||
|
|||||||
Reference in New Issue
Block a user