Phi2 multipack (#1173)

* phi2 multipack
* update validation and examples for phi
* more updates to phi examples
* make sure to use the correct collator for phi multipack
* phi needs attention mask now for multipack
* if the special token already exists in the tokenizer, don't require in lora modules to save
* fix qlora yml for phi, fix phi test validation
* test qlora too
* make sure flash attention is enabled for the test
* don't use remote code for phi anymore
* reduce sequence len for sample packing phi
2024-01-23 12:54:36 -05:00
parent b715cd549a
commit 814aee6603
18 changed files with 201 additions and 2269 deletions
--- a/examples/phi/phi-ft.yml
+++ b/examples/phi/phi-ft.yml
@@ -1,8 +1,6 @@
 base_model: microsoft/phi-1_5
-model_type: PhiForCausalLM
+model_type: AutoModelForCausalLM
 tokenizer_type: AutoTokenizer
-is_llama_derived_model: false
-trust_remote_code: true

 load_in_8bit: false
 load_in_4bit: false
@@ -18,7 +16,7 @@ output_dir: ./phi-sft-out

 sequence_len: 2048
 sample_packing: true
-pad_to_sequence_len:
+pad_to_sequence_len: true

 adapter:
 lora_model_dir:
@@ -35,7 +33,7 @@ wandb_name:
 wandb_log_model:

 gradient_accumulation_steps: 1
-micro_batch_size: 1
+micro_batch_size: 2
 num_epochs: 4
 optimizer: adamw_torch
 adam_beta2: 0.95
@@ -45,18 +43,20 @@ lr_scheduler: cosine
 learning_rate: 0.000003

 train_on_inputs: false
-group_by_length: true
+group_by_length: false
 bf16: auto
 fp16:
 tf32: true

-gradient_checkpointing:
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: True
 early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
 logging_steps: 1
 xformers_attention:
-flash_attention:
+flash_attention: true

 warmup_steps: 100
 evals_per_epoch: 4
@@ -68,7 +68,4 @@ fsdp:
 fsdp_config:
 resize_token_embeddings_to_32x: true
 special_tokens:
-  bos_token: "<|endoftext|>"
-  eos_token: "<|endoftext|>"
-  unk_token: "<|endoftext|>"
  pad_token: "<|endoftext|>"