streaming multipack for pretraining dataset (#959)

* [Feat] streaming multipack * WIP make continued pretraining work w multipack * fix up hadrcoding, lint * fix dict check * update test for updated pretraining multipack code * fix hardcoded data collator fix for multipack pretraining * fix the collator to be the max length for multipack pretraining * don't bother with latest tag for test * cleanup docker build/test --------- Co-authored-by: jinwonkim93@github.com <jinwonkim> Co-authored-by: Wing Lian <wing.lian@gmail.com>
2024-01-06 12:13:21 +09:00
parent eb4c99431b
commit 553c80f79a
7 changed files with 282 additions and 12 deletions
--- a/examples/tiny-llama/pretrain.yml
+++ b/examples/tiny-llama/pretrain.yml
@@ -0,0 +1,58 @@
+base_model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
+
+model_type: LlamaForCausalLM
+tokenizer_type: LlamaTokenizer
+is_llama_derived_model: true
+
+load_in_8bit: false
+load_in_4bit: false
+strict: false
+
+max_steps: 200
+pretraining_dataset:
+  path: c4
+  name: en
+dataset_prepared_path:
+val_set_size: 0.0
+output_dir: ./model-out
+
+sequence_len: 2048
+sample_packing: true
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+num_epochs: 4
+optimizer: adamw_bnb_8bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+train_on_inputs: false
+group_by_length: false
+bf16: true
+fp16: false
+tf32: false
+
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+
+warmup_steps: 10
+evals_per_epoch:
+eval_table_size:
+saves_per_epoch: 1
+debug:
+deepspeed:
+weight_decay: 0.0
+fsdp:
+fsdp_config:
+special_tokens: