diff --git a/README.md b/README.md
index a81ac8b50..94427fcd0 100644
--- a/README.md
+++ b/README.md
@@ -521,7 +521,7 @@ lr_quadratic_warmup:
logging_steps:
save_strategy: # set to `no` to skip checkpoint saves
save_steps: # leave empty to save at each epoch
-eval_steps:
+eval_steps: # leave empty to eval at each epoch
save_total_limit: # checkpoints saved at a time
max_steps:
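For context, a minimal sketch of how the checkpoint/eval knobs documented in this README hunk combine in a training yml; the values are illustrative only, not defaults:

```yaml
# illustrative values, not defaults: evaluate and checkpoint every 50 steps,
# keeping only the two most recent checkpoints on disk
eval_steps: 50
save_steps: 50
save_total_limit: 2
```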
diff --git a/deepspeed/zero2.json b/deepspeed/zero2.json
new file mode 100644
index 000000000..9b8a2a9b6
--- /dev/null
+++ b/deepspeed/zero2.json
@@ -0,0 +1,46 @@
+{
+ "zero_optimization": {
+ "stage": 2,
+ "offload_optimizer": {
+ "device": "cpu"
+ },
+ "contiguous_gradients": true,
+ "overlap_comm": true
+ },
+ "bf16": {
+ "enabled": "auto"
+ },
+ "fp16": {
+ "enabled": "auto",
+ "auto_cast": false,
+ "loss_scale": 0,
+ "initial_scale_power": 32,
+ "loss_scale_window": 1000,
+ "hysteresis": 2,
+ "min_loss_scale": 1
+ },
+ "optimizer": {
+ "type": "AdamW",
+ "params": {
+ "lr": "auto",
+ "betas": [
+ 0.9,
+ 0.999
+ ],
+ "eps": 1e-8,
+ "weight_decay": "auto"
+ }
+ },
+ "scheduler": {
+ "type": "WarmupDecayLR",
+ "params": {
+ "warmup_min_lr": "auto",
+ "warmup_max_lr": "auto",
+ "warmup_num_steps": "auto",
+ "total_num_steps": "auto"
+ }
+ },
+ "train_batch_size": "auto",
+ "train_micro_batch_size_per_gpu": "auto",
+ "wall_clock_breakdown": false
+}
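A hedged usage note for the new ZeRO-2 file: the example configs in this PR expose a `deepspeed:` key, so pointing it at this JSON should be all that is needed; the `"auto"` entries are then resolved from the training arguments at launch time. The path below is an assumption (relative to the repo root):

```yaml
# in a training yml from this PR (assumed repo-root-relative path)
deepspeed: deepspeed/zero2.json
```

Launching is unchanged, e.g. `accelerate launch scripts/finetune.py <config.yml>` as in the CodeLlama README below.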
diff --git a/examples/code-llama/13b/lora.yml b/examples/code-llama/13b/lora.yml
new file mode 100644
index 000000000..637c05143
--- /dev/null
+++ b/examples/code-llama/13b/lora.yml
@@ -0,0 +1,67 @@
+base_model: codellama/CodeLlama-13b-hf
+base_model_config: codellama/CodeLlama-13b-hf
+model_type: LlamaForCausalLM
+tokenizer_type: CodeLlamaTokenizer
+is_llama_derived_model: true
+
+load_in_8bit: true
+load_in_4bit: false
+strict: false
+
+datasets:
+ - path: mhenrichsen/alpaca_2k_test
+ type: alpaca
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.01
+output_dir: ./lora-out
+
+sequence_len: 100000
+sample_packing: true
+
+adapter: lora
+lora_model_dir:
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_linear: true
+lora_fan_in_fan_out:
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_run_id:
+wandb_log_model:
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+num_epochs: 3
+optimizer: adamw_bnb_8bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+train_on_inputs: false
+group_by_length: false
+bf16: true
+fp16: false
+tf32: false
+
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+
+warmup_steps: 10
+eval_steps: 20
+save_steps:
+debug:
+deepspeed:
+weight_decay: 0.0
+fsdp:
+fsdp_config:
+special_tokens:
+ bos_token: ""
+ eos_token: ""
+ unk_token: ""
diff --git a/examples/code-llama/13b/qlora.yml b/examples/code-llama/13b/qlora.yml
new file mode 100644
index 000000000..ae78f5bf2
--- /dev/null
+++ b/examples/code-llama/13b/qlora.yml
@@ -0,0 +1,69 @@
+base_model: codellama/CodeLlama-13b-hf
+base_model_config: codellama/CodeLlama-13b-hf
+model_type: LlamaForCausalLM
+tokenizer_type: CodeLlamaTokenizer
+is_llama_derived_model: true
+
+load_in_8bit: false
+load_in_4bit: true
+strict: false
+
+datasets:
+ - path: mhenrichsen/alpaca_2k_test
+ type: alpaca
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.01
+output_dir: ./qlora-out
+
+adapter: qlora
+lora_model_dir:
+
+sequence_len: 100000
+sample_packing: true
+
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_modules:
+lora_target_linear: true
+lora_fan_in_fan_out:
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_run_id:
+wandb_log_model:
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+num_epochs: 3
+optimizer: paged_adamw_32bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+train_on_inputs: false
+group_by_length: false
+bf16: true
+fp16: false
+tf32: false
+
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+
+warmup_steps: 10
+eval_steps: 20
+save_steps:
+debug:
+deepspeed:
+weight_decay: 0.0
+fsdp:
+fsdp_config:
+special_tokens:
+ bos_token: ""
+ eos_token: ""
+ unk_token: ""
diff --git a/examples/code-llama/34b/lora.yml b/examples/code-llama/34b/lora.yml
new file mode 100644
index 000000000..9c4cfee10
--- /dev/null
+++ b/examples/code-llama/34b/lora.yml
@@ -0,0 +1,67 @@
+base_model: codellama/CodeLlama-34b-hf
+base_model_config: codellama/CodeLlama-34b-hf
+model_type: LlamaForCausalLM
+tokenizer_type: CodeLlamaTokenizer
+is_llama_derived_model: true
+
+load_in_8bit: true
+load_in_4bit: false
+strict: false
+
+datasets:
+ - path: mhenrichsen/alpaca_2k_test
+ type: alpaca
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.01
+output_dir: ./lora-out
+
+sequence_len: 100000
+sample_packing: true
+
+adapter: lora
+lora_model_dir:
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_linear: true
+lora_fan_in_fan_out:
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_run_id:
+wandb_log_model:
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+num_epochs: 3
+optimizer: adamw_bnb_8bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+train_on_inputs: false
+group_by_length: false
+bf16: true
+fp16: false
+tf32: false
+
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+
+warmup_steps: 10
+eval_steps: 20
+save_steps:
+debug:
+deepspeed:
+weight_decay: 0.0
+fsdp:
+fsdp_config:
+special_tokens:
+ bos_token: ""
+ eos_token: ""
+ unk_token: ""
diff --git a/examples/code-llama/34b/qlora.yml b/examples/code-llama/34b/qlora.yml
new file mode 100644
index 000000000..9f5ce50f9
--- /dev/null
+++ b/examples/code-llama/34b/qlora.yml
@@ -0,0 +1,69 @@
+base_model: codellama/CodeLlama-34b-hf
+base_model_config: codellama/CodeLlama-34b-hf
+model_type: LlamaForCausalLM
+tokenizer_type: CodeLlamaTokenizer
+is_llama_derived_model: true
+
+load_in_8bit: false
+load_in_4bit: true
+strict: false
+
+datasets:
+ - path: mhenrichsen/alpaca_2k_test
+ type: alpaca
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.01
+output_dir: ./qlora-out
+
+adapter: qlora
+lora_model_dir:
+
+sequence_len: 100000
+sample_packing: true
+
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_modules:
+lora_target_linear: true
+lora_fan_in_fan_out:
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_run_id:
+wandb_log_model:
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+num_epochs: 3
+optimizer: paged_adamw_32bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+train_on_inputs: false
+group_by_length: false
+bf16: true
+fp16: false
+tf32: false
+
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+
+warmup_steps: 10
+eval_steps: 20
+save_steps:
+debug:
+deepspeed:
+weight_decay: 0.0
+fsdp:
+fsdp_config:
+special_tokens:
+ bos_token: ""
+ eos_token: ""
+ unk_token: ""
diff --git a/examples/code-llama/7b/lora.yml b/examples/code-llama/7b/lora.yml
new file mode 100644
index 000000000..dfa3f2f7a
--- /dev/null
+++ b/examples/code-llama/7b/lora.yml
@@ -0,0 +1,67 @@
+base_model: codellama/CodeLlama-7b-hf
+base_model_config: codellama/CodeLlama-7b-hf
+model_type: LlamaForCausalLM
+tokenizer_type: CodeLlamaTokenizer
+is_llama_derived_model: true
+
+load_in_8bit: true
+load_in_4bit: false
+strict: false
+
+datasets:
+ - path: mhenrichsen/alpaca_2k_test
+ type: alpaca
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.01
+output_dir: ./lora-out
+
+sequence_len: 100000
+sample_packing: true
+
+adapter: lora
+lora_model_dir:
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_linear: true
+lora_fan_in_fan_out:
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_run_id:
+wandb_log_model:
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+num_epochs: 3
+optimizer: adamw_bnb_8bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+train_on_inputs: false
+group_by_length: false
+bf16: true
+fp16: false
+tf32: false
+
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+
+warmup_steps: 10
+eval_steps: 20
+save_steps:
+debug:
+deepspeed:
+weight_decay: 0.0
+fsdp:
+fsdp_config:
+special_tokens:
+ bos_token: ""
+ eos_token: ""
+ unk_token: ""
diff --git a/examples/code-llama/7b/qlora.yml b/examples/code-llama/7b/qlora.yml
new file mode 100644
index 000000000..704f058c3
--- /dev/null
+++ b/examples/code-llama/7b/qlora.yml
@@ -0,0 +1,69 @@
+base_model: codellama/CodeLlama-7b-hf
+base_model_config: codellama/CodeLlama-7b-hf
+model_type: LlamaForCausalLM
+tokenizer_type: CodeLlamaTokenizer
+is_llama_derived_model: true
+
+load_in_8bit: false
+load_in_4bit: true
+strict: false
+
+datasets:
+ - path: mhenrichsen/alpaca_2k_test
+ type: alpaca
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.01
+output_dir: ./qlora-out
+
+adapter: qlora
+lora_model_dir:
+
+sequence_len: 100000
+sample_packing: true
+
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_modules:
+lora_target_linear: true
+lora_fan_in_fan_out:
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_run_id:
+wandb_log_model:
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+num_epochs: 3
+optimizer: paged_adamw_32bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+train_on_inputs: false
+group_by_length: false
+bf16: true
+fp16: false
+tf32: false
+
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+
+warmup_steps: 10
+eval_steps: 20
+save_steps:
+debug:
+deepspeed:
+weight_decay: 0.0
+fsdp:
+fsdp_config:
+special_tokens:
+ bos_token: ""
+ eos_token: ""
+ unk_token: ""
diff --git a/examples/code-llama/README.md b/examples/code-llama/README.md
new file mode 100644
index 000000000..a5011e347
--- /dev/null
+++ b/examples/code-llama/README.md
@@ -0,0 +1,22 @@
+# Overview
+
+These are example configurations for fine-tuning CodeLlama at the 7b, 13b, and 34b sizes.
+
+The 7b variant fits on any GPU with 24 GB of VRAM: training uses roughly 17 GB with QLoRA and 20 GB with LoRA. On an RTX 4090 it trains 3 epochs on the default dataset in about 15 minutes.
+
+The 13b variant also fits in 24 GB of VRAM if you lower these settings to the following values:
+gradient_accumulation_steps: 2
+micro_batch_size: 1
+
+The 34b variant does not fit in 24 GB of VRAM; you will need a GPU with 40+ GB of VRAM that also supports Flash Attention 2, such as an A6000 or A100.
+
+```shell
+accelerate launch scripts/finetune.py examples/code-llama/[MODEL_SIZE]/qlora.yml
+```
+
+or
+
+```shell
+accelerate launch scripts/finetune.py examples/code-llama/[MODEL_SIZE]/lora.yml
+```
diff --git a/examples/llama-2/relora.yml b/examples/llama-2/relora.yml
new file mode 100644
index 000000000..66515dabc
--- /dev/null
+++ b/examples/llama-2/relora.yml
@@ -0,0 +1,73 @@
+base_model: meta-llama/Llama-2-7b-hf
+base_model_config: meta-llama/Llama-2-7b-hf
+model_type: LlamaForCausalLM
+tokenizer_type: LlamaTokenizer
+is_llama_derived_model: true
+
+load_in_8bit: false
+load_in_4bit: true
+strict: false
+
+datasets:
+ - path: teknium/GPT4-LLM-Cleaned
+ type: alpaca
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.01
+output_dir: ./relora-out
+
+adapter: qlora
+lora_model_dir:
+
+sequence_len: 4096
+sample_packing: true
+
+lora_r: 8
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_modules:
+lora_target_linear: true
+lora_fan_in_fan_out:
+
+relora_steps: 150
+relora_warmup_steps: 10
+relora_cpu_offload: false
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_run_id:
+wandb_log_model:
+
+gradient_accumulation_steps: 4
+micro_batch_size: 4
+num_epochs: 3
+optimizer: adamw_bnb_8bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+train_on_inputs: false
+group_by_length: false
+bf16: true
+fp16: false
+tf32: false
+
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+
+warmup_steps: 10
+eval_steps: 20
+save_steps: 50
+debug:
+deepspeed:
+weight_decay: 0.0
+fsdp:
+fsdp_config:
+special_tokens:
+ bos_token: ""
+ eos_token: ""
+ unk_token: ""
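A brief, hedged reading of the ReLoRA-specific keys above, assuming they follow the ReLoRA scheme implemented in `axolotl.monkeypatch.relora` (periodically merge the low-rank adapter into the base weights, reset it, and re-warm the learning rate):

```yaml
# assumed semantics of the ReLoRA keys
relora_steps: 150          # merge-and-reset the adapter every 150 optimizer steps
relora_warmup_steps: 10    # re-warm the learning rate for 10 steps after each reset
relora_cpu_offload: false  # keep the merge on GPU instead of offloading to CPU
```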
diff --git a/scripts/finetune.py b/scripts/finetune.py
index 3255a623f..d02448ec2 100644
--- a/scripts/finetune.py
+++ b/scripts/finetune.py
@@ -82,6 +82,8 @@ def do_inference(cfg, model, tokenizer, prompter: Optional[str]):
max_seq_len=255, mem_freq=50, top_k=5, max_cache_size=None
)
+ model = model.to(cfg.device)
+
while True:
print("=" * 80)
# support for multiline inputs
diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py
index c73b4a713..24be1b8c2 100644
--- a/src/axolotl/utils/trainer.py
+++ b/src/axolotl/utils/trainer.py
@@ -10,19 +10,13 @@ from functools import partial
from pathlib import Path
from typing import Optional, Union
-import bitsandbytes as bnb
import numpy as np
import torch.cuda
-import transformers
from datasets import Dataset, set_caching_enabled
-from torch import nn
from torch.optim.lr_scheduler import OneCycleLR
from torch.utils.data import DataLoader, DistributedSampler, RandomSampler
from transformers import EarlyStoppingCallback, Trainer, TrainingArguments
-from transformers.trainer_pt_utils import (
- SequentialDistributedSampler,
- get_parameter_names,
-)
+from transformers.trainer_pt_utils import SequentialDistributedSampler
from axolotl.monkeypatch.relora import ReLoRACallback, ReLoRAScheduler
from axolotl.utils.callbacks import (
@@ -32,10 +26,7 @@ from axolotl.utils.callbacks import (
)
from axolotl.utils.collators import DataCollatorForSeq2Seq
from axolotl.utils.dataloader import MultipackDistributedDataloader
-from axolotl.utils.schedulers import (
- InterpolatingLogScheduler,
- get_cosine_schedule_with_quadratic_warmup,
-)
+from axolotl.utils.schedulers import get_cosine_schedule_with_quadratic_warmup
LOG = logging.getLogger("axolotl")
@@ -570,66 +561,6 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_
if Path(cfg.torchdistx_path).exists():
sys.path.append(cfg.torchdistx_path)
importlib.import_module("torchdistx")
- if (
- cfg.optimizer == "adamw_bnb_8bit"
- and not cfg.gptq
- and "deepspeed" not in training_arguments_kwargs
- and not cfg.fsdp
- ):
- decay_parameters = get_parameter_names(model, [nn.LayerNorm])
- decay_parameters = [name for name in decay_parameters if "bias" not in name]
- optimizer_grouped_parameters = [
- {
- "params": [
- p
- for n, p in model.named_parameters()
- if (n in decay_parameters and p.requires_grad)
- ],
- "weight_decay": training_args.weight_decay,
- },
- {
- "params": [
- p
- for n, p in model.named_parameters()
- if (n not in decay_parameters and p.requires_grad)
- ],
- "weight_decay": 0.0,
- },
- ]
-
- optimizer = bnb.optim.Adam8bit(
- optimizer_grouped_parameters,
- betas=(training_args.adam_beta1, training_args.adam_beta2),
- eps=training_args.adam_epsilon,
- lr=training_args.learning_rate,
- )
-
- if cfg.lr_scheduler == "one_cycle":
- lr_scheduler_kwargs = (
- cfg.lr_scheduler_kwargs if cfg.lr_scheduler_kwargs else {}
- )
- lr_scheduler = OneCycleLR(
- optimizer,
- cfg.learning_rate,
- total_steps=total_num_steps,
- epochs=cfg.num_epochs,
- div_factor=cfg.lr_div_factor if cfg.lr_div_factor else 6,
- **lr_scheduler_kwargs,
- )
- elif cfg.lr_scheduler == "log_sweep":
- lr_scheduler = InterpolatingLogScheduler(
- optimizer,
- cfg.warmup_steps,
- cfg.log_sweep_min_lr if cfg.log_sweep_min_lr else 1e-10,
- cfg.log_sweep_max_lr if cfg.log_sweep_max_lr else 10,
- )
- else:
- lr_scheduler = transformers.get_cosine_schedule_with_warmup(
- optimizer,
- training_args.warmup_steps,
- total_num_steps,
- )
- trainer_kwargs["optimizers"] = (optimizer, lr_scheduler)
callbacks = []
callbacks.append(GPUStatsCallback(cfg))