Merge pull request #189 from OpenAccess-AI-Collective/fixes-20230711

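Various fixes: remove several example configs, update the Pythia config, pass gradient_checkpointing to prepare_model_for_kbit_training, and reject gradient_checkpointing for MPT models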
2023-06-11 09:49:23 -04:00
parent e944311442 77762a5d6b
commit f620706776
10 changed files with 33 additions and 249 deletions
--- a/configs/accelerate/default_config.yaml
+++ b/configs/accelerate/default_config.yaml
@@ -1,15 +0,0 @@
 compute_environment: LOCAL_MACHINE
 distributed_type: 'NO'
 downcast_bf16: 'no'
 gpu_ids: all
 machine_rank: 0
 main_training_function: main
 mixed_precision: bf16
 num_machines: 1
 num_processes: 1
 rdzv_backend: static
 same_network: true
 tpu_env: []
 tpu_use_cluster: false
 tpu_use_sudo: false
 use_cpu: false
--- a/configs/llama_13B_alpaca.yml
+++ b/configs/llama_13B_alpaca.yml
@@ -1,39 +0,0 @@
 base_model: huggyllama/llama-13b
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 load_in_8bit: true
 datasets:
  - path: anon8231489123/ShareGPT_Vicuna_unfiltered
    data_files: ShareGPT_V3_unfiltered_cleaned_split_no_imsorry.json
    type: sharegpt
 dataset_prepared_path: last_run_prepared
 val_set_size: 0.002
 adapter:
 lora_model_dir:
 sequence_len: 2048
 lora_r: 8
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_modules:
  - q_proj
  - v_proj
 lora_fan_in_fan_out: false
 wandb_project:
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./llama-13b-sharegpt
 gradient_accumulation_steps: 1
 micro_batch_size: 2
 warmup_steps: 1000
 save_steps:
 eval_steps:
 num_epochs: 5
 learning_rate: 0.00003
 train_on_inputs: false
 group_by_length: false
 bf16: true
 tf32: true
 early_stopping_patience: 5
 resume_from_checkpoint:
 local_rank:
--- a/configs/llama_65B_alpaca.yml
+++ b/configs/llama_65B_alpaca.yml
@@ -1,44 +0,0 @@
 base_model: huggyllama/llama-65b
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 load_in_8bit: true
 datasets:
  - path: data/alpaca_data_gpt4.jsonl
    type: alpaca
  - path: anon8231489123/ShareGPT_Vicuna_unfiltered
    data_files: ShareGPT_V3_unfiltered_cleaned_split_no_imsorry.json
    type: sharegpt
  - path: data/gpt4-instruct-similarity-0.6-dataset.jsonl
    type: gpteacher
  - path: data/roleplay-similarity_0.6-instruct-dataset.jsonl
    type: gpteacher
 dataset_prepared_path: last_run_prepared
 val_set_size: 0.04
 adapter: lora
 lora_model_dir:
 sequence_len: 2048
 lora_r: 8
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_modules:
  - q_proj
  - v_proj
 lora_fan_in_fan_out: false
 wandb_project: llama-65b-lora
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-llama-alpaca
 gradient_accumulation_steps: 1
 micro_batch_size: 16
 warmup_steps: 1000
 save_steps:
 num_epochs: 5
 learning_rate: 0.00003
 train_on_inputs: false
 group_by_length: false
 bf16: true
 tf32: true
 early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
--- a/configs/llama_7B_4bit.yml
+++ b/configs/llama_7B_4bit.yml
@@ -1,45 +0,0 @@
 base_model: decapoda-research/llama-7b-hf-int4
 base_model_config: decapoda-research/llama-7b-hf
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 load_in_8bit: true
 datasets:
  - path: tatsu-lab/alpaca  # original alpaca dataset
    type: alpaca
 dataset_prepared_path: data/last_run_prepared
 val_set_size: 0.04
 adapter: lora
 lora_model_dir:
 sequence_len: 2048
 max_packed_sequence_len: 1024
 lora_r: 8
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_modules:
  - q_proj
  - v_proj
 #  - k_proj
 #  - o_proj
 lora_fan_in_fan_out: false
 wandb_project:
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-test
 gradient_accumulation_steps: 1
 micro_batch_size: 2
 num_epochs: 3
 warmup_steps: 100
 learning_rate: 0.00003
 train_on_inputs: false
 group_by_length: false
 bf16: true
 tf32: true
 gradient_checkpointing: false
 early_stopping_patience: 3
 resume_from_checkpoint:
 auto_resume_from_checkpoints: true
 local_rank:
 load_4bit: true
 xformers_attention: true
 flash_attention:
--- a/configs/quickstart.yml
+++ b/configs/quickstart.yml
@@ -1,45 +0,0 @@
 base_model: decapoda-research/llama-7b-hf-int4
 base_model_config: decapoda-research/llama-7b-hf
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 load_in_8bit: true
 datasets:
  - path: tatsu-lab/alpaca  # original alpaca dataset
    type: alpaca
 dataset_prepared_path: data/last_run_prepared
 val_set_size: 0.04
 adapter: lora
 lora_model_dir:
 sequence_len: 1024
 max_packed_sequence_len: 1024
 lora_r: 8
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_modules:
  - q_proj
  - v_proj
 #  - k_proj
 #  - o_proj
 lora_fan_in_fan_out: false
 wandb_project:
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-test
 gradient_accumulation_steps: 1
 micro_batch_size: 1
 num_epochs: 3
 warmup_steps: 100
 learning_rate: 0.00003
 train_on_inputs: false
 group_by_length: false
 bf16: true
 tf32: true
 gradient_checkpointing: false
 early_stopping_patience: 3
 resume_from_checkpoint:
 auto_resume_from_checkpoints: true
 local_rank:
 gptq: true
 xformers_attention: true
 flash_attention:
--- a/configs/vicuna_13B_4bit_reflect.yml
+++ b/configs/vicuna_13B_4bit_reflect.yml
@@ -1,45 +0,0 @@
 base_model: anon8231489123/vicuna-13b-GPTQ-4bit-128g
 base_model_config: anon8231489123/vicuna-13b-GPTQ-4bit-128g
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 load_in_8bit: false
 load_4bit: true
 gptq_groupsize: 128
 gptq_model_v1: false
 datasets:
 # https://github.com/vaguenebula/AlpacaDataReflect/blob/main/alpaca_reflect_pruned.json
  - path: data/alpaca_reflect_pruned.jsonl
    type: reflection
 dataset_prepared_path: data/last_run_prepared
 val_set_size: 0.04
 adapter: lora
 lora_model_dir:
 sequence_len: 2048
 max_packed_sequence_len: 2048
 lora_r: 8
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_modules:
  - q_proj
  - v_proj
 #  - k_proj
 #  - o_proj
 lora_fan_in_fan_out: false
 wandb_project:
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-reflect
 gradient_accumulation_steps: 1
 micro_batch_size: 2
 num_epochs: 3
 learning_rate: 0.00003
 train_on_inputs: false
 group_by_length: false
 bf16: true
 tf32: true
 gradient_checkpointing: false
 early_stopping_patience: 3
 resume_from_checkpoint:
 local_rank:
 flash_attention: true
--- a/configs/pythia_1_2B_alpaca.yml
+++ b/configs/pythia_1_2B_alpaca.yml
@@ -1,36 +1,29 @@
 base_model: EleutherAI/pythia-1.4b-deduped
-model_type: GPTNeoXForCausalLM
+base_model_config: EleutherAI/pythia-1.4b-deduped
 tokenizer_type: AutoTokenizer
 load_in_8bit: true
 datasets:
-  - path: data/alpaca_data_gpt4.jsonl
+  - path: teknium/GPT4-LLM-Cleaned
    type: alpaca
  - path: data/vicuna_cleaned.jsonl
    type: sharegpt
  - path: data/gpt4-instruct-similarity-0.6-dataset.jsonl
    type: gpteacher
  - path: data/roleplay-similarity_0.6-instruct-dataset.jsonl
    type: gpteacher
 dataset_prepared_path: last_run_prepared
 val_set_size: 0.05
 adapter: lora
 lora_model_dir:
-sequence_len: 2048
+sequence_len: 512
-lora_r: 8
+lora_r: 16
 lora_alpha: 32
 lora_dropout: 0.05
 lora_target_modules:
  - query_key_value
-#  - xxx
+lora_target_linear:
 lora_fan_in_fan_out: true  # pythia/GPTNeoX lora specific
-wandb_project: pythia-1.4b-lora
+wandb_project:
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
-output_dir: ./lora-alpaca
+output_dir: ./lora-alpaca-pythia
 gradient_accumulation_steps: 1
 micro_batch_size: 4
-num_epochs: 5
+num_epochs: 3
 learning_rate: 0.00001
 train_on_inputs: false
 group_by_length: false
@@ -39,3 +32,6 @@ tf32: True
 early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
 weight_decay: 0.1
 eval_steps: 20
 logging_steps: 1
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -305,7 +305,9 @@ def load_model(
        or (cfg.adapter == "qlora" and cfg.load_in_4bit)
    ):
        logging.info("converting PEFT model w/ prepare_model_for_kbit_training")
-        model = prepare_model_for_kbit_training(model)
+        model = prepare_model_for_kbit_training(
            model, use_gradient_checkpointing=cfg.gradient_checkpointing
        )
    model, lora_config = load_adapter(model, cfg, adapter)
--- a/src/axolotl/utils/validation.py
+++ b/src/axolotl/utils/validation.py
@@ -57,6 +57,11 @@ def validate_config(cfg):
    if (cfg.base_model and "falcon" in cfg.base_model.lower()) and cfg.fsdp:
        raise ValueError("FSDP is not supported for falcon models")
    if (
        cfg.base_model and "mpt" in cfg.base_model.lower()
    ) and cfg.gradient_checkpointing:
        raise ValueError("gradient_checkpointing is not supported for MPT models")
    # TODO
    # MPT 7b
    # https://github.com/facebookresearch/bitsandbytes/issues/25
--- a/tests/test_validation.py
+++ b/tests/test_validation.py
@@ -198,3 +198,17 @@ class ValidationTest(unittest.TestCase):
        )
        validate_config(cfg)
    def test_mpt_gradient_checkpointing(self):
        """validate_config must reject gradient checkpointing for MPT base models."""
        # pytest.raises(match=...) uses re.search, so the pattern only needs to
        # contain the message; no trailing quantifier is required.
        regex_exp = r".*gradient_checkpointing is not supported for MPT models"

        # A base_model whose name contains "mpt", with gradient checkpointing on
        cfg = DictDefault(
            {
                "base_model": "mosaicml/mpt-7b",
                "gradient_checkpointing": True,
            }
        )

        with pytest.raises(ValueError, match=regex_exp):
            validate_config(cfg)