From 39619028a37f4af77dd0b89c9b8191c783d7049a Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Sat, 27 May 2023 19:37:24 -0400
Subject: [PATCH] use pythia-12b, neox-20b is flaky

---
 examples/pythia-12b/README.md | 10 ++++++++++
 .../pythia-12b/config.yml     | 20 +++++++++++--------
 2 files changed, 22 insertions(+), 8 deletions(-)
 create mode 100644 examples/pythia-12b/README.md
 rename configs/gpt_neox_20b.yml => examples/pythia-12b/config.yml (72%)

diff --git a/examples/pythia-12b/README.md b/examples/pythia-12b/README.md
new file mode 100644
index 000000000..0953caa4e
--- /dev/null
+++ b/examples/pythia-12b/README.md
@@ -0,0 +1,10 @@
+# Pythia 12B
+
+- Single-GPU A100 only (?)
+
+```shell
+python scripts/finetune.py examples/pythia-12b/config.yml
+```
+
+⚠️ Multiple-GPU A100 - Doesn't seem to work with multi-gpu without causing OOM! ⚠️
+
diff --git a/configs/gpt_neox_20b.yml b/examples/pythia-12b/config.yml
similarity index 72%
rename from configs/gpt_neox_20b.yml
rename to examples/pythia-12b/config.yml
index 25fdae53b..28e822c77 100644
--- a/configs/gpt_neox_20b.yml
+++ b/examples/pythia-12b/config.yml
@@ -1,11 +1,12 @@
-base_model: EleutherAI/gpt-neox-20b
-base_model_config: EleutherAI/gpt-neox-20b
+base_model: EleutherAI/pythia-12b-deduped
+base_model_config: EleutherAI/pythia-12b-deduped
 base_model_ignore_patterns: pytorch* # prefer safetensors
 model_type: GPTNeoXForCausalLM
 tokenizer_type: AutoTokenizer
 load_in_8bit: false
-load_in_4bit: true
-load_4bit: false
+load_in_4bit: false
+gptq: false
+device_map: auto
 datasets:
   - path: vicgalle/alpaca-gpt4
     type: alpaca
@@ -21,16 +22,16 @@ lora_dropout: 0.0
 lora_target_modules:
 lora_target_linear: true
 lora_fan_in_fan_out: true # pythia/GPTNeoX lora specific
-wandb_project: gpt4all-neox-20b
+wandb_project: pythia-12b
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
-output_dir: ./gpt4all-neox-20b
+output_dir: ./pythia-12b
 gradient_accumulation_steps: 1
-micro_batch_size: 2
+micro_batch_size: 1
 num_epochs: 5
 learning_rate: 0.00003
-optimizer: paged_adamw_32bit
+optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 train_on_inputs: false
 group_by_length: false
@@ -43,3 +44,6 @@ early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
 gradient_checkpointing: true
+fsdp:
+fsdp_transformer_layer_cls_to_wrap:
+collator_pad_to_longest: true
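For reference, a minimal sketch of how this patch would be exercised once saved locally, assuming a checkout of the repository it targets: apply it with `git am`, then launch the single-GPU run described in the added README. The patch filename below is hypothetical; the `finetune.py` invocation is taken verbatim from the new `examples/pythia-12b/README.md`.

```shell
# Hypothetical filename for a local copy of this patch.
git am 0001-use-pythia-12b-neox-20b-is-flaky.patch

# Single-GPU A100 run against the renamed config, as documented in the new README.
python scripts/finetune.py examples/pythia-12b/config.yml
```

Note that the README added by this patch warns that multi-GPU launches of this config tend to OOM, so the command above is shown for a single GPU only.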