Merge branch 'main' into strip-peft-device-map

2023-06-12 08:25:54 -04:00
parent cd0a6f6027 94f310c7a6
commit 5e616d91c0
30 changed files with 269 additions and 604 deletions
--- a/FAQS.md
+++ b/FAQS.md
@@ -2,3 +2,6 @@
 - Can you train StableLM with this? Yes, but only with a single GPU atm. Multi GPU support is coming soon! Just waiting on this [PR](https://github.com/huggingface/transformers/pull/22874)
 - Will this work with Deepspeed? That's still a WIP, but setting `export ACCELERATE_USE_DEEPSPEED=true` should work in some cases
 - `Error invalid argument at line 359 in file /workspace/bitsandbytes/csrc/pythonInterface.c`
 `/arrow/cpp/src/arrow/filesystem/s3fs.cc:2598:  arrow::fs::FinalizeS3 was not called even though S3 was initialized.`
 This could lead to a segmentation fault at exit. Try reinstalling bitsandbytes and transformers from source.
--- a/README.md
+++ b/README.md
@@ -16,13 +16,14 @@
 ## Axolotl supports
-|         | fp16/fp32 | fp16/fp32 w/ lora | qlora | 4bit-quant | 4bit-quant w/flash attention | flash attention | xformers attention |
+|          | fp16/fp32 | lora | qlora | gptq | gptq w/ lora | gptq w/flash attn | flash attn | xformers attn |
-|---------|:----------|:------------------|------|------------|------------------------------|-----------------|--------------------|
+|----------|:----------|:-----|-------|------|:-------------|-------------------|------------|---------------|
-| llama   | ✅         | ✅                 | ✅  | ✅          | ✅                            | ✅               | ✅                  |
+| llama    | ✅         | ✅    | ✅     | ✅    | ✅             | ✅                 | ✅          | ✅             |
-| Pythia  | ✅         | ✅                 | ❓  | ❌          | ❌                            | ❌               | ❓                  |
+| Pythia   | ✅         | ✅    | ✅     | ❌    | ❓            | ❌                 | ❌          | ❓             |
-| cerebras | ✅         | ✅                 | ❓  | ❌          | ❌                            | ❌               | ❓                  |
+| cerebras | ✅         | ✅    | ✅     | ❌    | ❓            | ❌                 | ❌          | ✅             |
-| mpt     | ✅         | ❌                 | ❓  | ❌          | ❌                            | ❌               | ❓                  |
+| mpt      | ✅         | ❌    | ❓     | ❌    | ❓            | ❌                 | ❌          | ❓             |
-| falcon  | ✅         | ✅                 | ✅  | ❌          | ❌                            | ❌               | ❓                  |
+| falcon   | ✅         | ✅    | ✅     | ❌    | ❓            | ❌                 | ❌          | ✅             |
 | gpt-j    | ✅         | ✅    | ✅     | ❌    | ❓            | ❌                 | ❓          | ✅             |
 ## Quickstart ⚡
@@ -38,10 +39,10 @@ pip3 install -U git+https://github.com/huggingface/peft.git
 accelerate config
 # finetune lora
-accelerate launch scripts/finetune.py examples/lora-openllama-3b/config.yml
+accelerate launch scripts/finetune.py examples/openllama-3b/lora.yml
 # inference
-accelerate launch scripts/finetune.py examples/lora-openllama-3b/config.yml \
+accelerate launch scripts/finetune.py examples/openllama-3b/lora.yml \
    --inference --lora_model_dir="./lora-out"
 ```
@@ -381,6 +382,8 @@ num_epochs: 3
 warmup_steps: 100
 learning_rate: 0.00003
 logging_steps:
 save_steps:
 eval_steps:
 # whether to mask out or include the human's prompt from the training labels
 train_on_inputs: false
--- a/configs/accelerate/default_config.yaml
+++ b/configs/accelerate/default_config.yaml
@@ -1,15 +0,0 @@
 compute_environment: LOCAL_MACHINE
 distributed_type: 'NO'
 downcast_bf16: 'no'
 gpu_ids: all
 machine_rank: 0
 main_training_function: main
 mixed_precision: bf16
 num_machines: 1
 num_processes: 1
 rdzv_backend: static
 same_network: true
 tpu_env: []
 tpu_use_cluster: false
 tpu_use_sudo: false
 use_cpu: false
--- a/configs/cerebras_1_3B_alpaca.yml
+++ b/configs/cerebras_1_3B_alpaca.yml
@@ -1,40 +0,0 @@
 base_model: cerebras/Cerebras-GPT-1.3B
 model_type: AutoModelForCausalLM
 tokenizer_type: AutoTokenizer
 load_in_8bit: true
 datasets:
  - path: data/alpaca_data_gpt4.jsonl
    type: alpaca
  - path: data/vicuna_cleaned.jsonl
    type: sharegpt
  - path: data/gpt4-instruct-similarity-0.6-dataset.jsonl
    type: gpteacher
  - path: data/roleplay-similarity_0.6-instruct-dataset.jsonl
    type: gpteacher
 dataset_prepared_path: last_run_prepared
 val_set_size: 0.05
 adapter: lora
 sequence_len: 2048
 lora_r: 8
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_modules:
  - c_attn
 lora_fan_in_fan_out: false
 wandb_project: pythia-1.4b-lora
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-alpaca
 gradient_accumulation_steps: 1
 micro_batch_size: 4
 num_epochs: 5
 learning_rate: 0.0003
 train_on_inputs: false
 group_by_length: false
 bf16: True
 tf32: True
 gradient_checkpointing:
 early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
--- a/configs/galactica_1_3B.yml
+++ b/configs/galactica_1_3B.yml
@@ -1,41 +0,0 @@
 base_model: facebook/galactica-1.3b
 model_type: AutoModelForCausalLM
 tokenizer_type: AutoTokenizer
 load_in_8bit: false
 datasets:
  - path: tatsu-lab/alpaca
    type: alpaca
 dataset_prepared_path: last_run_prepared
 val_set_size: 0.1
 adapter:
 lora_model_dir:
 sequence_len: 1024
 max_packed_sequence_len: 1024
 lora_r: 8
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_modules:
  - q_proj
  - v_proj
 lora_fan_in_fan_out: false
 wandb_project:
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-llama-alpaca
 gradient_accumulation_steps: 1
 micro_batch_size: 16
 num_epochs: 3
 learning_rate: 0.00003
 train_on_inputs: false
 group_by_length: false
 bf16: false
 tf32: false
 early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
 tokens:
  pad_token: "[PAD]"
  bos_token: "<s>"
  eos_token: "</s>"
  unk_token: "<unk>"
--- a/configs/gpt_neox_20b.yml
+++ b/configs/gpt_neox_20b.yml
@@ -1,39 +0,0 @@
 base_model: EleutherAI/gpt-neox-20b
 base_model_ignore_patterns: pytorch*  # prefer safetensors
 model_type: GPTNeoXForCausalLM
 tokenizer_type: AutoTokenizer
 load_in_8bit: true
 datasets:
  - path: nomic-ai/gpt4all-j-prompt-generations
    type: alpaca
    shards: 4
    shards_index: 0
 dataset_prepared_path: last_run_prepared
 val_set_size: 0.05
 adapter: lora
 lora_model_dir:
 sequence_len: 2048
 max_packed_sequence_len: 2048
 lora_r: 8
 lora_alpha: 32
 lora_dropout: 0.05
 lora_target_modules:
  - query_key_value
 lora_fan_in_fan_out: true  # pythia/GPTNeoX lora specific
 wandb_project: gpt4all-neox-20b
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./gpt4all-neox-20b
 gradient_accumulation_steps: 1
 micro_batch_size: 4
 num_epochs: 5
 learning_rate: 0.00003
 lr_scheduler: one_cycle
 train_on_inputs: false
 group_by_length: false
 bf16: True
 tf32: True
 early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
--- a/configs/llama_13B_alpaca.yml
+++ b/configs/llama_13B_alpaca.yml
@@ -1,39 +0,0 @@
 base_model: huggyllama/llama-13b
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 load_in_8bit: true
 datasets:
  - path: anon8231489123/ShareGPT_Vicuna_unfiltered
    data_files: ShareGPT_V3_unfiltered_cleaned_split_no_imsorry.json
    type: sharegpt
 dataset_prepared_path: last_run_prepared
 val_set_size: 0.002
 adapter:
 lora_model_dir:
 sequence_len: 2048
 lora_r: 8
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_modules:
  - q_proj
  - v_proj
 lora_fan_in_fan_out: false
 wandb_project:
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./llama-13b-sharegpt
 gradient_accumulation_steps: 1
 micro_batch_size: 2
 warmup_steps: 1000
 save_steps:
 eval_steps:
 num_epochs: 5
 learning_rate: 0.00003
 train_on_inputs: false
 group_by_length: false
 bf16: true
 tf32: true
 early_stopping_patience: 5
 resume_from_checkpoint:
 local_rank:
--- a/configs/llama_65B_alpaca.yml
+++ b/configs/llama_65B_alpaca.yml
@@ -1,44 +0,0 @@
 base_model: huggyllama/llama-65b
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 load_in_8bit: true
 datasets:
  - path: data/alpaca_data_gpt4.jsonl
    type: alpaca
  - path: anon8231489123/ShareGPT_Vicuna_unfiltered
    data_files: ShareGPT_V3_unfiltered_cleaned_split_no_imsorry.json
    type: sharegpt
  - path: data/gpt4-instruct-similarity-0.6-dataset.jsonl
    type: gpteacher
  - path: data/roleplay-similarity_0.6-instruct-dataset.jsonl
    type: gpteacher
 dataset_prepared_path: last_run_prepared
 val_set_size: 0.04
 adapter: lora
 lora_model_dir:
 sequence_len: 2048
 lora_r: 8
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_modules:
  - q_proj
  - v_proj
 lora_fan_in_fan_out: false
 wandb_project: llama-65b-lora
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-llama-alpaca
 gradient_accumulation_steps: 1
 micro_batch_size: 16
 warmup_steps: 1000
 save_steps:
 num_epochs: 5
 learning_rate: 0.00003
 train_on_inputs: false
 group_by_length: false
 bf16: true
 tf32: true
 early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
--- a/configs/llama_7B_4bit.yml
+++ b/configs/llama_7B_4bit.yml
@@ -1,45 +0,0 @@
 base_model: decapoda-research/llama-7b-hf-int4
 base_model_config: decapoda-research/llama-7b-hf
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 load_in_8bit: true
 datasets:
  - path: tatsu-lab/alpaca  # original alpaca dataset
    type: alpaca
 dataset_prepared_path: data/last_run_prepared
 val_set_size: 0.04
 adapter: lora
 lora_model_dir:
 sequence_len: 2048
 max_packed_sequence_len: 1024
 lora_r: 8
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_modules:
  - q_proj
  - v_proj
 #  - k_proj
 #  - o_proj
 lora_fan_in_fan_out: false
 wandb_project:
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-test
 gradient_accumulation_steps: 1
 micro_batch_size: 2
 num_epochs: 3
 warmup_steps: 100
 learning_rate: 0.00003
 train_on_inputs: false
 group_by_length: false
 bf16: true
 tf32: true
 gradient_checkpointing: false
 early_stopping_patience: 3
 resume_from_checkpoint:
 auto_resume_from_checkpoints: true
 local_rank:
 load_4bit: true
 xformers_attention: true
 flash_attention:
--- a/configs/llama_7B_alpaca.yml
+++ b/configs/llama_7B_alpaca.yml
@@ -1,41 +0,0 @@
 base_model: huggyllama/llama-7b
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 load_in_8bit: true
 datasets:
  - path: data/alpaca_data_gpt4.jsonl
    type: alpaca
  - path: data/vicuna_cleaned.jsonl
    type: sharegpt
  - path: data/gpt4-instruct-similarity-0.6-dataset.jsonl
    type: gpteacher
  - path: data/roleplay-similarity_0.6-instruct-dataset.jsonl
    type: gpteacher
 dataset_prepared_path: last_run_prepared
 val_set_size: 0.04
 adapter: lora
 lora_model_dir:
 sequence_len: 2048
 lora_r: 8
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_modules:
  - q_proj
  - v_proj
 lora_fan_in_fan_out: false
 wandb_project: llama-7b-lora
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-llama-alpaca
 gradient_accumulation_steps: 1
 micro_batch_size: 16
 num_epochs: 5
 learning_rate: 0.00003
 train_on_inputs: false
 group_by_length: false
 bf16: true
 tf32: true
 early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
--- a/configs/quickstart.yml
+++ b/configs/quickstart.yml
@@ -1,45 +0,0 @@
 base_model: decapoda-research/llama-7b-hf-int4
 base_model_config: decapoda-research/llama-7b-hf
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 load_in_8bit: true
 datasets:
  - path: tatsu-lab/alpaca  # original alpaca dataset
    type: alpaca
 dataset_prepared_path: data/last_run_prepared
 val_set_size: 0.04
 adapter: lora
 lora_model_dir:
 sequence_len: 1024
 max_packed_sequence_len: 1024
 lora_r: 8
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_modules:
  - q_proj
  - v_proj
 #  - k_proj
 #  - o_proj
 lora_fan_in_fan_out: false
 wandb_project:
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-test
 gradient_accumulation_steps: 1
 micro_batch_size: 1
 num_epochs: 3
 warmup_steps: 100
 learning_rate: 0.00003
 train_on_inputs: false
 group_by_length: false
 bf16: true
 tf32: true
 gradient_checkpointing: false
 early_stopping_patience: 3
 resume_from_checkpoint:
 auto_resume_from_checkpoints: true
 local_rank:
 gptq: true
 xformers_attention: true
 flash_attention:
--- a/configs/sample.yml
+++ b/configs/sample.yml
@@ -1,87 +0,0 @@
 # this is the huggingface model that contains *.pt, *.safetensors, or *.bin files
 # this can also be a relative path to a model on disk
 base_model: decapoda-research/llama-7b-hf-int4
 # you can specify an ignore pattern if the model repo contains more than 1 model type (*.pt, etc)
 base_model_ignore_patterns:
 # if the base_model repo on hf hub doesn't include configuration .json files,
 # you can set that here, or leave this empty to default to base_model
 base_model_config: decapoda-research/llama-7b-hf
 # If you want to specify the type of model to load, AutoModelForCausalLM is a good choice too
 model_type: AutoModelForCausalLM
 # Corresponding tokenizer for the model AutoTokenizer is a good choice
 tokenizer_type: AutoTokenizer
 # whether you are training a 4-bit quantized model
 load_4bit: true
 # this will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer
 load_in_8bit: true
 # a list of one or more datasets to finetune the model with
 datasets:
  # this can be either a hf dataset, or relative path
  - path: vicgalle/alpaca-gpt4
  # The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
    type: alpaca
 # axolotl attempts to save the dataset as an arrow after packing the data together so
 # subsequent training attempts load faster, relative path
 dataset_prepared_path: data/last_run_prepared
 # How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc
 val_set_size: 0.04
 # if you want to use lora, leave blank to train all parameters in original model
 adapter: lora
 # if you already have a lora model trained that you want to load, put that here
 lora_model_dir:
 # the maximum length of an input to train with, this should typically be less than 2048
 # as most models have a token/context limit of 2048
 sequence_len: 2048
 # max sequence length to concatenate training samples together up to
 # inspired by StackLLaMA. see https://huggingface.co/blog/stackllama#supervised-fine-tuning
 max_packed_sequence_len: 1024
 # lora hyperparameters
 lora_r: 8
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_modules:
  - q_proj
  - v_proj
 #  - k_proj
 #  - o_proj
 lora_fan_in_fan_out: false
 # wandb configuration if you're using it
 wandb_project:
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
 # where to save the finished model to
 output_dir: ./completed-model
 # training hyperparameters
 gradient_accumulation_steps: 1
 batch_size:
 micro_batch_size: 2
 num_epochs: 3
 warmup_steps: 100
 learning_rate: 0.00003
 # whether to mask out or include the human's prompt from the training labels
 train_on_inputs: false
 # don't use this, leads to wonky training (according to someone on the internet)
 group_by_length: false
 # Use CUDA bf16
 bf16: true
 # Use CUDA tf32
 tf32: true
 # does not work with current implementation of 4-bit LoRA
 gradient_checkpointing: false
 # stop training after this many evaluation losses have increased in a row
 # https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
 early_stopping_patience: 3
 # specify a scheduler to use with the optimizer. only one_cycle is supported currently
 lr_scheduler:
 # whether to use xformers attention patch https://github.com/facebookresearch/xformers:
 xformers_attention:
 # whether to use flash attention patch https://github.com/HazyResearch/flash-attention:
 flash_attention:
 # resume from a specific checkpoint dir
 resume_from_checkpoint:
 # if resume_from_checkpoint isn't set and you simply want it to start where it left off
 # be careful with this being turned on between different models
 auto_resume_from_checkpoints: false
 # don't mess with this, it's here for accelerate and torchrun
 local_rank:
--- a/configs/stability_3b.yml
+++ b/configs/stability_3b.yml
@@ -1,56 +0,0 @@
 base_model: stabilityai/stablelm-base-alpha-3b
 base_model_config: stabilityai/stablelm-base-alpha-3b
 load_in_8bit: false
 datasets:
  - path: vicgalle/alpaca-gpt4
    type: alpaca
 dataset_prepared_path: last_run_prepared
 val_set_size: 0.04
 adapter:
 lora_model_dir:
 sequence_len: 4096
 max_packed_sequence_len: 4096
 lora_r: 8
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_modules:
  - q_proj
  - v_proj
 lora_fan_in_fan_out: false
 wandb_project: stable-alpaca-3b
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./stable-alpaca-3b
 gradient_accumulation_steps: 1
 micro_batch_size: 1
 num_epochs: 1
 optimizer: adamw_bnb_8bit
 torchdistx_path:
 lr_scheduler: cosine
 learning_rate: 0.0000002
 train_on_inputs: false
 group_by_length: false
 bf16: true
 tf32: true
 early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
 logging_steps: 1
 xformers_attention: true
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 100
 eval_steps: 50
 save_steps: 200
 debug:
 deepspeed:
 weight_decay: 0.01
 fsdp:
 fsdp_config:
 #tokens:
 #  pad_token: "[PAD]"
 #  bos_token: "<s>"
 #  eos_token: "</s>"
 #  unk_token: "<unk>"
--- a/configs/vicuna_13B_4bit_reflect.yml
+++ b/configs/vicuna_13B_4bit_reflect.yml
@@ -1,45 +0,0 @@
 base_model: anon8231489123/vicuna-13b-GPTQ-4bit-128g
 base_model_config: anon8231489123/vicuna-13b-GPTQ-4bit-128g
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 load_in_8bit: false
 load_4bit: true
 gptq_groupsize: 128
 gptq_model_v1: false
 datasets:
 # https://github.com/vaguenebula/AlpacaDataReflect/blob/main/alpaca_reflect_pruned.json
  - path: data/alpaca_reflect_pruned.jsonl
    type: reflection
 dataset_prepared_path: data/last_run_prepared
 val_set_size: 0.04
 adapter: lora
 lora_model_dir:
 sequence_len: 2048
 max_packed_sequence_len: 2048
 lora_r: 8
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_modules:
  - q_proj
  - v_proj
 #  - k_proj
 #  - o_proj
 lora_fan_in_fan_out: false
 wandb_project:
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-reflect
 gradient_accumulation_steps: 1
 micro_batch_size: 2
 num_epochs: 3
 learning_rate: 0.00003
 train_on_inputs: false
 group_by_length: false
 bf16: true
 tf32: true
 gradient_checkpointing: false
 early_stopping_patience: 3
 resume_from_checkpoint:
 local_rank:
 flash_attention: true
--- a/examples/cerebras/qlora.yml
+++ b/examples/cerebras/qlora.yml
@@ -0,0 +1,60 @@
 base_model: cerebras/Cerebras-GPT-1.3B
 base_model_config: cerebras/Cerebras-GPT-1.3B
 load_in_8bit: false
 load_in_4bit: true
 strict: false
 push_dataset_to_hub:
 datasets:
  - path: teknium/GPT4-LLM-Cleaned
    type: alpaca
 dataset_prepared_path: last_run_prepared
 val_set_size: 0.01
 adapter: qlora
 lora_model_dir:
 sequence_len: 2048
 max_packed_sequence_len: 2048
 lora_r: 16
 lora_alpha: 32
 lora_dropout: 0.05
 lora_target_modules:
  - c_fc
  - c_attn
  - c_proj
 lora_target_linear:
 lora_fan_in_fan_out:
 wandb_project:
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./qlora-out
 batch_size: 4
 micro_batch_size: 4
 num_epochs: 2
 optimizer: paged_adamw_8bit
 torchdistx_path:
 lr_scheduler: cosine
 learning_rate: 0.0002
 train_on_inputs: false
 group_by_length: true
 bf16: true
 fp16: false
 tf32: true
 gradient_checkpointing: true
 early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
 logging_steps: 1
 xformers_attention: true
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 10
 eval_steps: 20
 save_steps:
 debug:
 deepspeed:
 weight_decay: 0.1
 fsdp:
 fsdp_config:
 special_tokens:
  pad_token: "<|endoftext|>"
--- a/examples/falcon/config-7b-lora.yml
+++ b/examples/falcon/config-7b-lora.yml
@@ -23,7 +23,7 @@ lora_dropout: 0.0
 lora_target_modules:
 lora_target_linear: true
 lora_fan_in_fan_out:
-wandb_project: falcon-7b
+wandb_project:
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
--- a/examples/falcon/config-7b.yml
+++ b/examples/falcon/config-7b.yml
@@ -23,7 +23,7 @@ lora_dropout: 0.0
 lora_target_modules:
 lora_target_linear: true
 lora_fan_in_fan_out:
-wandb_project: falcon-7b
+wandb_project:
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
--- a/examples/gptj/qlora.yml
+++ b/examples/gptj/qlora.yml
@@ -0,0 +1,57 @@
 base_model: EleutherAI/gpt-j-6b
 base_model_config: EleutherAI/gpt-j-6b
 load_in_8bit: false
 load_in_4bit: true
 strict: false
 push_dataset_to_hub:
 datasets:
  - path: teknium/GPT4-LLM-Cleaned
    type: alpaca
 dataset_prepared_path: last_run_prepared
 val_set_size: 0.01
 adapter: qlora
 lora_model_dir:
 sequence_len: 2048
 max_packed_sequence_len:
 lora_r: 8
 lora_alpha: 32
 lora_dropout: 0.05
 lora_target_modules:
 lora_target_linear: true
 lora_fan_in_fan_out:
 wandb_project:
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./qlora-out
 gradient_accumulation_steps: 2
 micro_batch_size: 2
 num_epochs: 2
 optimizer: paged_adamw_8bit
 torchdistx_path:
 lr_scheduler: cosine
 learning_rate: 0.0001
 train_on_inputs: false
 group_by_length: true
 bf16: true
 fp16: false
 tf32: true
 gradient_checkpointing: true
 early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
 logging_steps: 1
 xformers_attention: true
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 10
 eval_steps: 20
 save_steps:
 debug:
 deepspeed:
 weight_decay: 0.1
 fsdp:
 fsdp_config:
 special_tokens:
  pad_token: "<|endoftext|>"
--- a/examples/gptq-lora-7b/README.md
+++ b/examples/gptq-lora-7b/README.md
@@ -3,6 +3,6 @@
 This is a good place to start for beginners. This will run on an NVIDIA RTX4090 with no other changes needed.
 ```shell
-accelerate launch scripts/finetune.py examples/4bit-lora-7b/config.yml
+accelerate launch scripts/finetune.py examples/gptq-lora-7b/config.yml
 ```
--- a/examples/jeopardy-bot/config.yml
+++ b/examples/jeopardy-bot/config.yml
@@ -7,30 +7,28 @@ datasets:
  - path: openaccess-ai-collective/jeopardy
    type: jeopardy
 dataset_prepared_path: last_run_prepared
-val_set_size: 0.01
+val_set_size: 0.02
 adapter:
 lora_model_dir:
-sequence_len: 2048
+sequence_len: 512
-max_packed_sequence_len: 2048
+max_packed_sequence_len:
-lora_r: 8
+lora_r:
-lora_alpha: 16
+lora_alpha:
-lora_dropout: 0.05
+lora_dropout:
 lora_target_modules:
  - q_proj
  - v_proj
 lora_fan_in_fan_out: false
-wandb_project: jeopardy-bot-7b
+wandb_project:
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./jeopardy-bot-7b
-gradient_accumulation_steps: 2
+gradient_accumulation_steps: 1
 micro_batch_size: 1
-num_epochs: 2
+num_epochs: 3
 optimizer: adamw_bnb_8bit
 torchdistx_path:
 lr_scheduler: cosine
-learning_rate: 0.0000002
+learning_rate: 0.00003
 train_on_inputs: false
 group_by_length: false
 bf16: true
@@ -48,11 +46,10 @@ eval_steps: 110
 save_steps: 660
 debug:
 deepspeed:
-weight_decay: 0.0001
+weight_decay: 0.1
 fsdp:
 fsdp_config:
 tokens:
  pad_token: "[PAD]"
  bos_token: "<s>"
  eos_token: "</s>"
  unk_token: "<unk>"
--- a/examples/openllama-3b/README.md
+++ b/examples/openllama-3b/README.md
@@ -0,0 +1,16 @@
 # openllama-3b
 Basic full tune
 ```shell
 accelerate launch scripts/finetune.py examples/openllama-3b/config.yml
 ```
 LoRA
 ```shell
 accelerate launch scripts/finetune.py examples/openllama-3b/lora.yml
 ```
 QLoRA
 ```shell
 accelerate launch scripts/finetune.py examples/openllama-3b/qlora.yml
 ```
--- a/examples/openllama-3b/config.yml
+++ b/examples/openllama-3b/config.yml
@@ -0,0 +1,61 @@
 base_model: openlm-research/open_llama_3b
 base_model_config: openlm-research/open_llama_3b
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 load_in_8bit: false
 load_in_4bit: false
 strict: false
 push_dataset_to_hub:
 datasets:
  - path: teknium/GPT4-LLM-Cleaned
    type: alpaca
 dataset_prepared_path: last_run_prepared
 val_set_size: 0.02
 adapter:
 lora_model_dir:
 sequence_len: 256
 max_packed_sequence_len:
 lora_r:
 lora_alpha:
 lora_dropout:
 lora_target_modules:
 lora_target_linear:
 lora_fan_in_fan_out:
 wandb_project:
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./openllama-out
 batch_size: 16
 micro_batch_size: 4
 num_epochs: 3
 optimizer: adamw_bnb_8bit
 torchdistx_path:
 lr_scheduler: cosine
 learning_rate: 0.0002
 train_on_inputs: false
 group_by_length: false
 bf16: false
 fp16: true
 tf32: false
 gradient_checkpointing: true
 early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
 logging_steps: 1
 xformers_attention: true
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 10
 eval_steps: 50
 save_steps:
 debug:
 deepspeed:
 weight_decay: 0.0
 fsdp:
 fsdp_config:
 special_tokens:
  bos_token: "<s>"
  eos_token: "</s>"
  unk_token: "<unk>"
--- a/examples/lora-openllama-3b/config.yml
+++ b/examples/lora-openllama-3b/config.yml
@@ -1,5 +1,5 @@
-base_model: openlm-research/open_llama_3b_600bt_preview
+base_model: openlm-research/open_llama_3b
-base_model_config: openlm-research/open_llama_3b_600bt_preview
+base_model_config: openlm-research/open_llama_3b
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 load_in_8bit: true
@@ -49,7 +49,7 @@ early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
 logging_steps: 1
-xformers_attention:
+xformers_attention: true
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
--- a/examples/qlora-openllama-3b/config.yml
+++ b/examples/qlora-openllama-3b/config.yml
@@ -1,5 +1,5 @@
-base_model: openlm-research/open_llama_3b_600bt_preview
+base_model: openlm-research/open_llama_3b
-base_model_config: openlm-research/open_llama_3b_600bt_preview
+base_model_config: openlm-research/open_llama_3b
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 load_in_8bit: false
--- a/configs/pythia_1_2B_alpaca.yml
+++ b/configs/pythia_1_2B_alpaca.yml
@@ -1,36 +1,29 @@
 base_model: EleutherAI/pythia-1.4b-deduped
-model_type: GPTNeoXForCausalLM
+base_model_config: EleutherAI/pythia-1.4b-deduped
 tokenizer_type: AutoTokenizer
 load_in_8bit: true
 datasets:
-  - path: data/alpaca_data_gpt4.jsonl
+  - path: teknium/GPT4-LLM-Cleaned
    type: alpaca
  - path: data/vicuna_cleaned.jsonl
    type: sharegpt
  - path: data/gpt4-instruct-similarity-0.6-dataset.jsonl
    type: gpteacher
  - path: data/roleplay-similarity_0.6-instruct-dataset.jsonl
    type: gpteacher
 dataset_prepared_path: last_run_prepared
 val_set_size: 0.05
 adapter: lora
 lora_model_dir:
-sequence_len: 2048
+sequence_len: 512
-lora_r: 8
+lora_r: 16
 lora_alpha: 32
 lora_dropout: 0.05
 lora_target_modules:
  - query_key_value
-#  - xxx
+lora_target_linear:
 lora_fan_in_fan_out: true  # pythia/GPTNeoX lora specific
-wandb_project: pythia-1.4b-lora
+wandb_project:
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
-output_dir: ./lora-alpaca
+output_dir: ./lora-alpaca-pythia
 gradient_accumulation_steps: 1
 micro_batch_size: 4
-num_epochs: 5
+num_epochs: 3
 learning_rate: 0.00001
 train_on_inputs: false
 group_by_length: false
@@ -39,3 +32,6 @@ tf32: True
 early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
 weight_decay: 0.1
 eval_steps: 20
 logging_steps: 1
--- a/examples/qlora-openllama-3b/README.md
+++ b/examples/qlora-openllama-3b/README.md
@@ -1,6 +0,0 @@
 # qlora-openllama-3b
 ```shell
 accelerate launch scripts/finetune.py examples/qlora-openllama-3b/config.yml
 ```
--- a/scripts/finetune.py
+++ b/scripts/finetune.py
@@ -165,7 +165,7 @@ def train(
    cfg_keys = cfg.keys()
    for k, _ in kwargs.items():
        # if not strict, allow writing to cfg even if it's not in the yml already
-        if k in cfg_keys or cfg.strict is False:
+        if k in cfg_keys or not cfg.strict:
            # handle booleans
            if isinstance(cfg[k], bool):
                cfg[k] = bool(kwargs[k])
@@ -205,8 +205,8 @@ def train(
    logging.info(f"loading tokenizer... {tokenizer_config}")
    tokenizer = load_tokenizer(tokenizer_config, cfg.tokenizer_type, cfg)
-    if check_not_in(
+    if (
-        ["inference", "shard", "merge_lora"], kwargs
+        check_not_in(["shard", "merge_lora"], kwargs) and not cfg.inference
    ):  # don't need to load dataset for these
        train_dataset, eval_dataset = load_prepare_datasets(
            tokenizer, cfg, DEFAULT_DATASET_PREPARED_PATH
@@ -234,7 +234,6 @@ def train(
        tokenizer,
        cfg,
        adapter=cfg.adapter,
        inference=("inference" in kwargs),
    )
    if "merge_lora" in kwargs and cfg.adapter is not None:
@@ -247,7 +246,7 @@ def train(
            model.save_pretrained(str(Path(cfg.output_dir) / "merged"))
        return
-    if "inference" in kwargs:
+    if cfg.inference:
        logging.info("calling do_inference function")
        inf_kwargs: Dict[str, Any] = {}
        if "prompter" in kwargs:
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -77,15 +77,9 @@ def load_tokenizer(
 def load_model(
-    base_model,
+    base_model, base_model_config, model_type, tokenizer, cfg, adapter="lora"
    base_model_config,
    model_type,
    tokenizer,
    cfg,
    adapter="lora",
    inference=False,
 ):
-    # type: (str, str, str, AutoTokenizer, DictDefault, Optional[str], bool) -> Tuple[PreTrainedModel, Optional[PeftConfig]]
+    # type: (str, str, str, AutoTokenizer, DictDefault, Optional[str]) -> Tuple[PreTrainedModel, Optional[PeftConfig]]
    """
    Load a model from a base model and a model type.
    """
@@ -98,7 +92,7 @@ def load_model(
    )
    if cfg.is_llama_derived_model and cfg.flash_attention:
-        if cfg.device not in ["mps", "cpu"] and inference is False:
+        if cfg.device not in ["mps", "cpu"] and not cfg.inference:
            from axolotl.flash_attn import replace_llama_attn_with_flash_attn
            logging.info("patching with flash attention")
@@ -305,7 +299,9 @@ def load_model(
        or (cfg.adapter == "qlora" and cfg.load_in_4bit)
    ):
        logging.info("converting PEFT model w/ prepare_model_for_kbit_training")
-        model = prepare_model_for_kbit_training(model)
+        model = prepare_model_for_kbit_training(
            model, use_gradient_checkpointing=cfg.gradient_checkpointing
        )
    model, lora_config = load_adapter(model, cfg, adapter)
@@ -436,6 +432,7 @@ def load_lora(model, cfg):
        model = PeftModel.from_pretrained(
            model,
            cfg.lora_model_dir,
            is_trainable=not cfg.inference,
        )
    else:
        model = get_peft_model(model, lora_config)
--- a/src/axolotl/utils/validation.py
+++ b/src/axolotl/utils/validation.py
@@ -57,6 +57,11 @@ def validate_config(cfg):
    if (cfg.base_model and "falcon" in cfg.base_model.lower()) and cfg.fsdp:
        raise ValueError("FSDP is not supported for falcon models")
    if (
        cfg.base_model and "mpt" in cfg.base_model.lower()
    ) and cfg.gradient_checkpointing:
        raise ValueError("gradient_checkpointing is not supported for MPT models")
    # TODO
    # MPT 7b
    # https://github.com/facebookresearch/bitsandbytes/issues/25
--- a/tests/test_validation.py
+++ b/tests/test_validation.py
@@ -198,3 +198,17 @@ class ValidationTest(unittest.TestCase):
        )
        validate_config(cfg)
    def test_mpt_gradient_checkpointing(self):
        """Check that validate_config raises ValueError for an "mpt" base model with gradient_checkpointing enabled."""
        # Passed to pytest.raises(match=...) below.
        # NOTE(review): the trailing "s*" means "zero or more 's'", so the
        # pattern also accepts "MPT model"; harmless here, but not an exact match.
        regex_exp = r".*gradient_checkpointing is not supported for MPT models*"
        # Check for lower-case: the model id below contains "mpt" in lower case.
        cfg = DictDefault(
            {
                "base_model": "mosaicml/mpt-7b",
                "gradient_checkpointing": True,
            }
        )
        # The config above combines an MPT model id with gradient checkpointing,
        # so validation is expected to fail with the message matched by regex_exp.
        with pytest.raises(ValueError, match=regex_exp):
            validate_config(cfg)