From 28acebac365c091ac335be9e96cde25d585ae7de Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Sat, 27 May 2023 18:12:12 -0400 Subject: [PATCH] add flash attn context for efficient training and attempt setting model to train mode --- scripts/finetune.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/scripts/finetune.py b/scripts/finetune.py index c23f9bfbc..0530f486f 100644 --- a/scripts/finetune.py +++ b/scripts/finetune.py @@ -242,6 +242,24 @@ def train( model.save_pretrained(cfg.output_dir) return + if cfg.debug: + logging.info("check_dataset_labels...") + check_dataset_labels( + train_dataset.select( + [random.randrange(0, len(train_dataset) - 1) for i in range(5)] + ), + tokenizer, + ) + + if prepare_ds_only: + logging.info("Finished preparing dataset. Exiting...") + return + + try: + model.train() + except: + pass + trainer = setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer) model.config.use_cache = False @@ -284,7 +302,12 @@ def train( logging.info( f"Using Auto-resume functionality to start with checkpoint at {resume_from_checkpoint}" ) - trainer.train(resume_from_checkpoint=resume_from_checkpoint) + + if cfg.flash_optimum: + with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=True, enable_mem_efficient=True): + trainer.train(resume_from_checkpoint=resume_from_checkpoint) + else: + trainer.train(resume_from_checkpoint=resume_from_checkpoint) logging.info(f"Training Completed!!! Saving pre-trained model to {cfg.output_dir}")