diff --git a/README.md b/README.md
index a81ac8b50..94427fcd0 100644
--- a/README.md
+++ b/README.md
@@ -521,7 +521,7 @@ lr_quadratic_warmup:
logging_steps:
save_strategy: # set to `no` to skip checkpoint saves
save_steps: # leave empty to save at each epoch
-eval_steps:
+eval_steps: # leave empty to eval at each epoch
save_total_limit: # checkpoints saved at a time
max_steps:
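For context, a minimal sketch of how the checkpoint/eval knobs documented in this README hunk combine in a training yml; the values are illustrative only, not defaults:

```yaml
# illustrative values, not defaults: evaluate and checkpoint every 50 steps,
# keeping only the two most recent checkpoints on disk
eval_steps: 50
save_steps: 50
save_total_limit: 2
```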
diff --git a/deepspeed/zero2.json b/deepspeed/zero2.json
new file mode 100644
index 000000000..9b8a2a9b6
--- /dev/null
+++ b/deepspeed/zero2.json
@@ -0,0 +1,46 @@
+{
+ "zero_optimization": {
+ "stage": 2,
+ "offload_optimizer": {
+ "device": "cpu"
+ },
+ "contiguous_gradients": true,
+ "overlap_comm": true
+ },
+ "bf16": {
+ "enabled": "auto"
+ },
+ "fp16": {
+ "enabled": "auto",
+ "auto_cast": false,
+ "loss_scale": 0,
+ "initial_scale_power": 32,
+ "loss_scale_window": 1000,
+ "hysteresis": 2,
+ "min_loss_scale": 1
+ },
+ "optimizer": {
+ "type": "AdamW",
+ "params": {
+ "lr": "auto",
+ "betas": [
+ 0.9,
+ 0.999
+ ],
+ "eps": 1e-8,
+ "weight_decay": "auto"
+ }
+ },
+ "scheduler": {
+ "type": "WarmupDecayLR",
+ "params": {
+ "warmup_min_lr": "auto",
+ "warmup_max_lr": "auto",
+ "warmup_num_steps": "auto",
+ "total_num_steps": "auto"
+ }
+ },
+ "train_batch_size": "auto",
+ "train_micro_batch_size_per_gpu": "auto",
+ "wall_clock_breakdown": false
+}
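A hedged usage note for the new ZeRO-2 file: the example configs in this PR expose a `deepspeed:` key, so pointing it at this JSON should be all that is needed; the `"auto"` entries are then resolved from the training arguments at launch time. The path below is an assumption (relative to the repo root):

```yaml
# in a training yml from this PR (assumed repo-root-relative path)
deepspeed: deepspeed/zero2.json
```

Launching is unchanged, e.g. `accelerate launch scripts/finetune.py <config.yml>` as in the CodeLlama README below.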
diff --git a/examples/code-llama/13b/lora.yml b/examples/code-llama/13b/lora.yml
new file mode 100644
index 000000000..637c05143
--- /dev/null
+++ b/examples/code-llama/13b/lora.yml
@@ -0,0 +1,67 @@
+base_model: codellama/CodeLlama-13b-hf
+base_model_config: codellama/CodeLlama-13b-hf
+model_type: LlamaForCausalLM
+tokenizer_type: CodeLlamaTokenizer
+is_llama_derived_model: true
+
+load_in_8bit: true
+load_in_4bit: false
+strict: false
+
+datasets:
+ - path: mhenrichsen/alpaca_2k_test
+ type: alpaca
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.01
+output_dir: ./lora-out
+
+sequence_len: 100000
+sample_packing: true
+
+adapter: lora
+lora_model_dir:
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_linear: true
+lora_fan_in_fan_out:
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_run_id:
+wandb_log_model:
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+num_epochs: 3
+optimizer: adamw_bnb_8bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+train_on_inputs: false
+group_by_length: false
+bf16: true
+fp16: false
+tf32: false
+
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+
+warmup_steps: 10
+eval_steps: 20
+save_steps:
+debug:
+deepspeed:
+weight_decay: 0.0
+fsdp:
+fsdp_config:
+special_tokens:
+ bos_token: ""
+ eos_token: ""
+ unk_token: ""
diff --git a/examples/code-llama/13b/qlora.yml b/examples/code-llama/13b/qlora.yml
new file mode 100644
index 000000000..ae78f5bf2
--- /dev/null
+++ b/examples/code-llama/13b/qlora.yml
@@ -0,0 +1,69 @@
+base_model: codellama/CodeLlama-13b-hf
+base_model_config: codellama/CodeLlama-13b-hf
+model_type: LlamaForCausalLM
+tokenizer_type: CodeLlamaTokenizer
+is_llama_derived_model: true
+
+load_in_8bit: false
+load_in_4bit: true
+strict: false
+
+datasets:
+ - path: mhenrichsen/alpaca_2k_test
+ type: alpaca
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.01
+output_dir: ./qlora-out
+
+adapter: qlora
+lora_model_dir:
+
+sequence_len: 100000
+sample_packing: true
+
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_modules:
+lora_target_linear: true
+lora_fan_in_fan_out:
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_run_id:
+wandb_log_model:
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+num_epochs: 3
+optimizer: paged_adamw_32bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+train_on_inputs: false
+group_by_length: false
+bf16: true
+fp16: false
+tf32: false
+
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+
+warmup_steps: 10
+eval_steps: 20
+save_steps:
+debug:
+deepspeed:
+weight_decay: 0.0
+fsdp:
+fsdp_config:
+special_tokens:
+ bos_token: ""
+ eos_token: ""
+ unk_token: ""
diff --git a/examples/code-llama/34b/lora.yml b/examples/code-llama/34b/lora.yml
new file mode 100644
index 000000000..9c4cfee10
--- /dev/null
+++ b/examples/code-llama/34b/lora.yml
@@ -0,0 +1,67 @@
+base_model: codellama/CodeLlama-34b-hf
+base_model_config: codellama/CodeLlama-34b-hf
+model_type: LlamaForCausalLM
+tokenizer_type: CodeLlamaTokenizer
+is_llama_derived_model: true
+
+load_in_8bit: true
+load_in_4bit: false
+strict: false
+
+datasets:
+ - path: mhenrichsen/alpaca_2k_test
+ type: alpaca
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.01
+output_dir: ./lora-out
+
+sequence_len: 100000
+sample_packing: true
+
+adapter: lora
+lora_model_dir:
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_linear: true
+lora_fan_in_fan_out:
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_run_id:
+wandb_log_model:
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+num_epochs: 3
+optimizer: adamw_bnb_8bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+train_on_inputs: false
+group_by_length: false
+bf16: true
+fp16: false
+tf32: false
+
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+
+warmup_steps: 10
+eval_steps: 20
+save_steps:
+debug:
+deepspeed:
+weight_decay: 0.0
+fsdp:
+fsdp_config:
+special_tokens:
+ bos_token: ""
+ eos_token: ""
+ unk_token: ""
diff --git a/examples/code-llama/34b/qlora.yml b/examples/code-llama/34b/qlora.yml
new file mode 100644
index 000000000..9f5ce50f9
--- /dev/null
+++ b/examples/code-llama/34b/qlora.yml
@@ -0,0 +1,69 @@
+base_model: codellama/CodeLlama-34b-hf
+base_model_config: codellama/CodeLlama-34b-hf
+model_type: LlamaForCausalLM
+tokenizer_type: CodeLlamaTokenizer
+is_llama_derived_model: true
+
+load_in_8bit: false
+load_in_4bit: true
+strict: false
+
+datasets:
+ - path: mhenrichsen/alpaca_2k_test
+ type: alpaca
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.01
+output_dir: ./qlora-out
+
+adapter: qlora
+lora_model_dir:
+
+sequence_len: 100000
+sample_packing: true
+
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_modules:
+lora_target_linear: true
+lora_fan_in_fan_out:
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_run_id:
+wandb_log_model:
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+num_epochs: 3
+optimizer: paged_adamw_32bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+train_on_inputs: false
+group_by_length: false
+bf16: true
+fp16: false
+tf32: false
+
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+
+warmup_steps: 10
+eval_steps: 20
+save_steps:
+debug:
+deepspeed:
+weight_decay: 0.0
+fsdp:
+fsdp_config:
+special_tokens:
+ bos_token: ""
+ eos_token: ""
+ unk_token: ""
diff --git a/examples/code-llama/7b/lora.yml b/examples/code-llama/7b/lora.yml
new file mode 100644
index 000000000..dfa3f2f7a
--- /dev/null
+++ b/examples/code-llama/7b/lora.yml
@@ -0,0 +1,67 @@
+base_model: codellama/CodeLlama-7b-hf
+base_model_config: codellama/CodeLlama-7b-hf
+model_type: LlamaForCausalLM
+tokenizer_type: CodeLlamaTokenizer
+is_llama_derived_model: true
+
+load_in_8bit: true
+load_in_4bit: false
+strict: false
+
+datasets:
+ - path: mhenrichsen/alpaca_2k_test
+ type: alpaca
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.01
+output_dir: ./lora-out
+
+sequence_len: 100000
+sample_packing: true
+
+adapter: lora
+lora_model_dir:
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_linear: true
+lora_fan_in_fan_out:
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_run_id:
+wandb_log_model:
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+num_epochs: 3
+optimizer: adamw_bnb_8bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+train_on_inputs: false
+group_by_length: false
+bf16: true
+fp16: false
+tf32: false
+
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+
+warmup_steps: 10
+eval_steps: 20
+save_steps:
+debug:
+deepspeed:
+weight_decay: 0.0
+fsdp:
+fsdp_config:
+special_tokens:
+ bos_token: ""
+ eos_token: ""
+ unk_token: ""
diff --git a/examples/code-llama/7b/qlora.yml b/examples/code-llama/7b/qlora.yml
new file mode 100644
index 000000000..704f058c3
--- /dev/null
+++ b/examples/code-llama/7b/qlora.yml
@@ -0,0 +1,69 @@
+base_model: codellama/CodeLlama-7b-hf
+base_model_config: codellama/CodeLlama-7b-hf
+model_type: LlamaForCausalLM
+tokenizer_type: CodeLlamaTokenizer
+is_llama_derived_model: true
+
+load_in_8bit: false
+load_in_4bit: true
+strict: false
+
+datasets:
+ - path: mhenrichsen/alpaca_2k_test
+ type: alpaca
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.01
+output_dir: ./qlora-out
+
+adapter: qlora
+lora_model_dir:
+
+sequence_len: 100000
+sample_packing: true
+
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_modules:
+lora_target_linear: true
+lora_fan_in_fan_out:
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_run_id:
+wandb_log_model:
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+num_epochs: 3
+optimizer: paged_adamw_32bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+train_on_inputs: false
+group_by_length: false
+bf16: true
+fp16: false
+tf32: false
+
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+
+warmup_steps: 10
+eval_steps: 20
+save_steps:
+debug:
+deepspeed:
+weight_decay: 0.0
+fsdp:
+fsdp_config:
+special_tokens:
+ bos_token: ""
+ eos_token: ""
+ unk_token: ""
diff --git a/examples/code-llama/README.md b/examples/code-llama/README.md
new file mode 100644
index 000000000..a5011e347
--- /dev/null
+++ b/examples/code-llama/README.md
@@ -0,0 +1,22 @@
+# Overview
+
+These are example configurations for fine-tuning CodeLlama at the 7b, 13b, and 34b sizes.
+
+The 7b variant fits on any GPU with 24 GB of VRAM: training uses roughly 17 GB with QLoRA and 20 GB with LoRA. On an RTX 4090 it trains 3 epochs on the default dataset in about 15 minutes.
+
+The 13b variant also fits in 24 GB of VRAM if you lower these settings to the following values:
+gradient_accumulation_steps: 2
+micro_batch_size: 1
+
+The 34b variant does not fit in 24 GB of VRAM; you will need a GPU with 40+ GB of VRAM that also supports Flash Attention 2, such as an A6000 or A100.
+
+```shell
+accelerate launch scripts/finetune.py examples/code-llama/[MODEL_SIZE]/qlora.yml
+```
+
+or
+
+```shell
+accelerate launch scripts/finetune.py examples/code-llama/[MODEL_SIZE]/lora.yml
+```
diff --git a/examples/llama-2/relora.yml b/examples/llama-2/relora.yml
new file mode 100644
index 000000000..66515dabc
--- /dev/null
+++ b/examples/llama-2/relora.yml
@@ -0,0 +1,73 @@
+base_model: meta-llama/Llama-2-7b-hf
+base_model_config: meta-llama/Llama-2-7b-hf
+model_type: LlamaForCausalLM
+tokenizer_type: LlamaTokenizer
+is_llama_derived_model: true
+
+load_in_8bit: false
+load_in_4bit: true
+strict: false
+
+datasets:
+ - path: teknium/GPT4-LLM-Cleaned
+ type: alpaca
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.01
+output_dir: ./relora-out
+
+adapter: qlora
+lora_model_dir:
+
+sequence_len: 4096
+sample_packing: true
+
+lora_r: 8
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_modules:
+lora_target_linear: true
+lora_fan_in_fan_out:
+
+relora_steps: 150
+relora_warmup_steps: 10
+relora_cpu_offload: false
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_run_id:
+wandb_log_model:
+
+gradient_accumulation_steps: 4
+micro_batch_size: 4
+num_epochs: 3
+optimizer: adamw_bnb_8bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+train_on_inputs: false
+group_by_length: false
+bf16: true
+fp16: false
+tf32: false
+
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+
+warmup_steps: 10
+eval_steps: 20
+save_steps: 50
+debug:
+deepspeed:
+weight_decay: 0.0
+fsdp:
+fsdp_config:
+special_tokens:
+ bos_token: ""
+ eos_token: ""
+ unk_token: ""
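A brief, hedged reading of the ReLoRA-specific keys above, assuming they follow the ReLoRA scheme implemented in `axolotl.monkeypatch.relora` (periodically merge the low-rank adapter into the base weights, reset it, and re-warm the learning rate):

```yaml
# assumed semantics of the ReLoRA keys
relora_steps: 150          # merge-and-reset the adapter every 150 optimizer steps
relora_warmup_steps: 10    # re-warm the learning rate for 10 steps after each reset
relora_cpu_offload: false  # keep the merge on GPU instead of offloading to CPU
```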
diff --git a/scripts/finetune.py b/scripts/finetune.py
index 3255a623f..d02448ec2 100644
--- a/scripts/finetune.py
+++ b/scripts/finetune.py
@@ -82,6 +82,8 @@ def do_inference(cfg, model, tokenizer, prompter: Optional[str]):
max_seq_len=255, mem_freq=50, top_k=5, max_cache_size=None
)
+ model = model.to(cfg.device)
+
while True:
print("=" * 80)
# support for multiline inputs
diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py
index c73b4a713..24be1b8c2 100644
--- a/src/axolotl/utils/trainer.py
+++ b/src/axolotl/utils/trainer.py
@@ -10,19 +10,13 @@ from functools import partial
from pathlib import Path
from typing import Optional, Union
-import bitsandbytes as bnb
import numpy as np
import torch.cuda
-import transformers
from datasets import Dataset, set_caching_enabled
-from torch import nn
from torch.optim.lr_scheduler import OneCycleLR
from torch.utils.data import DataLoader, DistributedSampler, RandomSampler
from transformers import EarlyStoppingCallback, Trainer, TrainingArguments
-from transformers.trainer_pt_utils import (
- SequentialDistributedSampler,
- get_parameter_names,
-)
+from transformers.trainer_pt_utils import SequentialDistributedSampler
from axolotl.monkeypatch.relora import ReLoRACallback, ReLoRAScheduler
from axolotl.utils.callbacks import (
@@ -32,10 +26,7 @@ from axolotl.utils.callbacks import (
)
from axolotl.utils.collators import DataCollatorForSeq2Seq
from axolotl.utils.dataloader import MultipackDistributedDataloader
-from axolotl.utils.schedulers import (
- InterpolatingLogScheduler,
- get_cosine_schedule_with_quadratic_warmup,
-)
+from axolotl.utils.schedulers import get_cosine_schedule_with_quadratic_warmup
LOG = logging.getLogger("axolotl")
@@ -570,66 +561,6 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_
if Path(cfg.torchdistx_path).exists():
sys.path.append(cfg.torchdistx_path)
importlib.import_module("torchdistx")
- if (
- cfg.optimizer == "adamw_bnb_8bit"
- and not cfg.gptq
- and "deepspeed" not in training_arguments_kwargs
- and not cfg.fsdp
- ):
- decay_parameters = get_parameter_names(model, [nn.LayerNorm])
- decay_parameters = [name for name in decay_parameters if "bias" not in name]
- optimizer_grouped_parameters = [
- {
- "params": [
- p
- for n, p in model.named_parameters()
- if (n in decay_parameters and p.requires_grad)
- ],
- "weight_decay": training_args.weight_decay,
- },
- {
- "params": [
- p
- for n, p in model.named_parameters()
- if (n not in decay_parameters and p.requires_grad)
- ],
- "weight_decay": 0.0,
- },
- ]
-
- optimizer = bnb.optim.Adam8bit(
- optimizer_grouped_parameters,
- betas=(training_args.adam_beta1, training_args.adam_beta2),
- eps=training_args.adam_epsilon,
- lr=training_args.learning_rate,
- )
-
- if cfg.lr_scheduler == "one_cycle":
- lr_scheduler_kwargs = (
- cfg.lr_scheduler_kwargs if cfg.lr_scheduler_kwargs else {}
- )
- lr_scheduler = OneCycleLR(
- optimizer,
- cfg.learning_rate,
- total_steps=total_num_steps,
- epochs=cfg.num_epochs,
- div_factor=cfg.lr_div_factor if cfg.lr_div_factor else 6,
- **lr_scheduler_kwargs,
- )
- elif cfg.lr_scheduler == "log_sweep":
- lr_scheduler = InterpolatingLogScheduler(
- optimizer,
- cfg.warmup_steps,
- cfg.log_sweep_min_lr if cfg.log_sweep_min_lr else 1e-10,
- cfg.log_sweep_max_lr if cfg.log_sweep_max_lr else 10,
- )
- else:
- lr_scheduler = transformers.get_cosine_schedule_with_warmup(
- optimizer,
- training_args.warmup_steps,
- total_num_steps,
- )
- trainer_kwargs["optimizers"] = (optimizer, lr_scheduler)
callbacks = []
callbacks.append(GPUStatsCallback(cfg))