remove debugging, use gpt2 since starcoder requires consent

fix packing for tokenizers that don't use a bos_token, i.e. when the bos token and eos token are the same
2023-06-13 21:32:47 -04:00 · 2023-06-13 21:26:13 -04:00
31 changed files with 179 additions and 1346 deletions
--- a/.github/workflows/base.yml
+++ b/.github/workflows/base.yml
@@ -12,7 +12,6 @@ jobs:
    # this job needs to be run on self-hosted GPU runners...
    runs-on: self-hosted
    strategy:
-      fail-fast: false
      matrix:
        include:
          - cuda: "118"
@@ -26,7 +25,7 @@ jobs:
            pytorch: 2.0.0
            axolotl_extras:
          - cuda: "117"
-            cuda_version: 11.7.1
+            cuda_version: 11.7.0
            python_version: "3.9"
            pytorch: 1.13.1
            axolotl_extras:
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -11,7 +11,6 @@ jobs:
    if: github.repository_owner == 'OpenAccess-AI-Collective'
    # this job needs to be run on self-hosted GPU runners...
    strategy:
-      fail-fast: false
      matrix:
        include:
          - cuda: cu118
@@ -30,7 +29,7 @@ jobs:
            pytorch: 2.0.0
            axolotl_extras: gptq
          - cuda: cu117
-            cuda_version: 11.7.1
+            cuda_version: 11.7.0
            python_version: "3.9"
            pytorch: 1.13.1
            axolotl_extras:
@@ -85,7 +84,7 @@ jobs:
            pytorch: 2.0.0
            axolotl_extras: gptq
          - cuda: cu117
-            cuda_version: 11.7.1
+            cuda_version: 11.7.0
            python_version: "3.9"
            pytorch: 1.13.1
            axolotl_extras:
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -7,7 +7,6 @@ jobs:
  test:
    runs-on: ubuntu-latest
    strategy:
-      fail-fast: false
      matrix:
        python_version: ["3.9", "3.10"]
    timeout-minutes: 10
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,5 +1,5 @@
 default_language_version:
-    python: python3
+    python: python3.9

 repos:
 -   repo: https://github.com/pre-commit/pre-commit-hooks
--- a/README.md
+++ b/README.md
@@ -195,10 +195,6 @@ Have dataset(s) in one of the following format (JSONL recommended):
  ```json
  {"message_1": "...", "message_2": "..."}
  ```
- `alpaca_w_system.load_open_orca`: support for open orca datasets with included system prompts, instruct
-  ```json
-  {"system_prompt": "...", "question": "...", "response": "..."}
-  ```
 - `context_qa`: in context question answering from an article
  ```json
  {"article": "...", "question": "...", "answer": "..."}
@@ -237,7 +233,7 @@ Have dataset(s) in one of the following format (JSONL recommended):
 #### How to add custom prompts

  1. Add your method to a file in [prompt_strategies](src/axolotl/prompt_strategies). Please see other files as example.
-  2. Use your custom file name as the dataset type `<prompt_strategies_file>.load_<load_fn>`.
+  2. Use your custom file name as the dataset type.

 Optionally, download some datasets, see [data/README.md](data/README.md)

@@ -255,18 +251,10 @@ See sample configs in [configs](configs) folder or [examples](examples) for quic

 - dataset
  ```yaml
-  sequence_len: 2048 # max token length for prompt
-
-  # huggingface repo
  datasets:
-    - path: vicgalle/alpaca-gpt4
-      type: alpaca # format from earlier
-
-  # local
-  datasets:
-    - path: json
-      data_files: data.jsonl # or json
+    - path: vicgalle/alpaca-gpt4 # local or huggingface repo
      type: alpaca # format from earlier
+  sequence_len: 2048 # max token length / prompt
  ```

 - loading
@@ -276,8 +264,6 @@ See sample configs in [configs](configs) folder or [examples](examples) for quic
  bf16: true # require >=ampere
  fp16: true
  tf32: true # require >=ampere
-  bfloat16: true # require >=ampere, use instead of bf16 when you don't want AMP (automatic mixed precision)
-  float16: true # use instead of fp16 when you don't want AMP
  ```
  Note: Repo does not do 4-bit quantization.

@@ -305,8 +291,6 @@ base_model_ignore_patterns:
 # if the base_model repo on hf hub doesn't include configuration .json files,
 # you can set that here, or leave this empty to default to base_model
 base_model_config: ./llama-7b-hf
-# you can specify to choose a specific model revision from huggingface hub
-model_revision:
 # Optional tokenizer configuration override in case you want to use a different tokenizer
 # than the one defined in the base model
 tokenizer_config:
@@ -316,8 +300,6 @@ model_type: AutoModelForCausalLM
 tokenizer_type: AutoTokenizer
 # Trust remote code for untrusted source
 trust_remote_code:
-# use_fast option for tokenizer loading from_pretrained, default to True
-tokenizer_use_fast:

 # whether you are training a 4-bit GPTQ quantized model
 gptq: true
@@ -338,10 +320,10 @@ tf32: true # require >=ampere

 # a list of one or more datasets to finetune the model with
 datasets:
-  # hf dataset repo | "json" for local dataset, make sure to fill data_files
+  # this can be either a hf dataset, or relative path
  - path: vicgalle/alpaca-gpt4
  # The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
-    type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
+    type: alpaca # format OR format:prompt_style (chat/instruct)
    data_files: # path to source data files
    shards: # number of shards to split data into

@@ -350,8 +332,6 @@ datasets:
 dataset_prepared_path: data/last_run_prepared
 # push prepared dataset to hub
 push_dataset_to_hub: # repo path
-# push checkpoints to hub
-hub_model_id: # repo path
 # whether to use hf `use_auth_token` for loading datasets. Useful for fetching private datasets
 # required to be true when used in combination with `push_dataset_to_hub`
 hf_use_auth_token: # boolean
@@ -413,9 +393,6 @@ logging_steps:
 save_steps:
 eval_steps:

-# save model as safetensors (require safetensors package)
-save_safetensors:
-
 # whether to mask out or include the human's prompt from the training labels
 train_on_inputs: false
 # don't use this, leads to wonky training (according to someone on the internet)
@@ -443,15 +420,7 @@ log_sweep_max_lr:
 optimizer:
 # specify weight decay
 weight_decay:
-# adamw hyperparams
-adam_beta1:
-adam_beta2:
-adam_epsilon:
-# Gradient clipping max norm
-max_grad_norm:

-# whether to bettertransformers
-flash_optimum:
 # whether to use xformers attention patch https://github.com/facebookresearch/xformers:
 xformers_attention:
 # whether to use flash attention patch https://github.com/HazyResearch/flash-attention:
@@ -551,12 +520,6 @@ Add below flag to train command above
 --merge_lora --lora_model_dir="./completed-model" --load_in_8bit=False --load_in_4bit=False
 ```

-If you run out of CUDA memory, you can try to merge in system RAM with
-
-```bash
-CUDA_VISIBLE_DEVICES="" python3 scripts/finetune.py ...
-```
-
 ## Common Errors 🧰

 > Cuda out of memory
--- a/data/README.md
+++ b/data/README.md
@@ -10,10 +10,10 @@ curl https://github.com/teknium1/GPTeacher/blob/main/Roleplay/roleplay-similarit
 ## Convert the JSON data files to JSONL.

 ```shell
-python3 ./scripts/alpaca_json_to_jsonl.py --file data/alpaca_data_gpt4.json --output data/alpaca_data_gpt4.jsonl
-python3 ./scripts/alpaca_json_to_jsonl.py --file data/raw/vicuna_cleaned.json --output data/vicuna_cleaned.jsonl
-python3 ./scripts/alpaca_json_to_jsonl.py --file data/raw/roleplay-similarity_0.6-instruct-dataset.json --output data/roleplay-similarity_0.6-instruct-dataset.jsonl
-python3 ./scripts/alpaca_json_to_jsonl.py --file data/raw/gpt4-instruct-similarity-0.6-dataset.json --output data/gpt4-instruct-similarity-0.6-dataset.jsonl
+python3 ./scripts/alpaca_json_to_jsonl.py --input data/alpaca_data_gpt4.json > data/alpaca_data_gpt4.jsonl
+python3 ./scripts/alpaca_json_to_jsonl.py --input data/raw/vicuna_cleaned.json > data/vicuna_cleaned.jsonl
+python3 ./scripts/alpaca_json_to_jsonl.py --input data/raw/roleplay-similarity_0.6-instruct-dataset.json > data/roleplay-similarity_0.6-instruct-dataset.jsonl
+python3 ./scripts/alpaca_json_to_jsonl.py --input data/raw/gpt4-instruct-similarity-0.6-dataset.json > data/gpt4-instruct-similarity-0.6-dataset.jsonl
 ```
 ---

--- a/docker/Dockerfile-base
+++ b/docker/Dockerfile-base
@@ -77,7 +77,7 @@ FROM base-builder
 RUN python3 -m pip uninstall -y apex
 RUN git clone https://github.com/NVIDIA/apex
 #  `MAX_JOBS=1` disables parallel building to avoid cpu memory OOM when building image on GitHub Action (standard) runners
-RUN cd apex && MAX_JOBS=1 python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
+RUN cd apex && MAX_JOBS=1 python3 -m pip install --global-option="--cpp_ext" --global-option="--cuda_ext" --no-cache -v --disable-pip-version-check .

 RUN mkdir -p /workspace/builds
 COPY --from=bnb-builder /workspace/bitsandbytes /workspace/builds/bitsandbytes
@@ -97,4 +97,4 @@ RUN cd /workspace/builds/bitsandbytes && python3 setup.py install
 RUN git lfs install --skip-repo
 RUN pip3 install awscli && \
    # The base image ships with `pydantic==1.8.2` which is not working
-    pip3 install -U --no-cache-dir pydantic==1.10.10
+    pip3 install -U --no-cache-dir pydantic
--- a/examples/pythia-12b/README.md
+++ b/examples/pythia-12b/README.md
@@ -1,9 +0,0 @@
-# Pythia 12B
-
- Single-GPU A100 only (?)
-
-```shell
-python scripts/finetune.py examples/pythia-12b/config.yml
-```
-
-⚠️ Multiple-GPU A100 - Doesn't seem to work with multi-gpu without causing OOM! ⚠️
--- a/examples/pythia-12b/config.yml
+++ b/examples/pythia-12b/config.yml
@@ -1,49 +0,0 @@
-base_model: EleutherAI/pythia-12b-deduped
-base_model_config: EleutherAI/pythia-12b-deduped
-base_model_ignore_patterns: pytorch*  # prefer safetensors
-model_type: GPTNeoXForCausalLM
-tokenizer_type: AutoTokenizer
-load_in_8bit: false
-load_in_4bit: false
-gptq: false
-device_map: auto
-datasets:
-  - path: vicgalle/alpaca-gpt4
-    type: alpaca
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.05
-adapter:
-lora_model_dir:
-sequence_len: 2048
-max_packed_sequence_len: 2048
-lora_r: 64
-lora_alpha: 32
-lora_dropout: 0.0
-lora_target_modules:
-lora_target_linear: true
-lora_fan_in_fan_out: true  # pythia/GPTNeoX lora specific
-wandb_project:
-wandb_watch:
-wandb_run_id:
-wandb_log_model:
-output_dir: ./pythia-12b
-gradient_accumulation_steps: 1
-micro_batch_size: 1
-num_epochs: 5
-learning_rate: 0.00003
-optimizer: adamw_bnb_8bit
-lr_scheduler: cosine
-train_on_inputs: false
-group_by_length: false
-bf16: false
-fp16: false
-float16: true
-tf32: true
-flash_optimum: true
-early_stopping_patience:
-resume_from_checkpoint:
-local_rank:
-gradient_checkpointing: true
-fsdp:
-fsdp_config:
-collator_pad_to_longest: true
--- a/examples/redpajama/config-3b.yml
+++ b/examples/redpajama/config-3b.yml
@@ -1,7 +1,7 @@
 base_model: togethercomputer/RedPajama-INCITE-Chat-3B-v1
 base_model_config: togethercomputer/RedPajama-INCITE-Chat-3B-v1
 model_type: GPTNeoXForCausalLM
-tokenizer_type: AutoTokenizer
+tokenizer_type: GPTNeoXTokenizer
 trust_remote_code:
 load_in_8bit: false
 datasets:
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,7 @@
 peft @ git+https://github.com/huggingface/peft.git
 transformers @ git+https://github.com/huggingface/transformers.git
 bitsandbytes>=0.39.0
+accelerate
 addict
 fire
 PyYAML==6.0
@@ -10,11 +11,9 @@ sentencepiece
 wandb
 einops
 xformers
-optimum
 # qlora things
 bert-score==0.3.13
 evaluate==0.4.0
 rouge-score==0.1.2
 scipy
 scikit-learn==1.2.2
-numba
--- a/scripts/finetune.py
+++ b/scripts/finetune.py
@@ -12,14 +12,13 @@ from typing import Any, Dict, List, Optional, Union
 import fire
 import torch
 import yaml
-
-# add src to the pythonpath so we don't need to pip install this
-from optimum.bettertransformer import BetterTransformer
 from transformers import GenerationConfig, TextStreamer

-from axolotl.utils.data import load_prepare_datasets, load_pretraining_dataset
+from axolotl.utils.data import load_prepare_datasets
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.models import load_model, load_tokenizer
+
+# add src to the pythonpath so we don't need to pip install this
 from axolotl.utils.tokenization import check_dataset_labels
 from axolotl.utils.trainer import setup_trainer
 from axolotl.utils.validation import validate_config
@@ -218,20 +217,9 @@ def train(
    if (
        check_not_in(["shard", "merge_lora"], kwargs) and not cfg.inference
    ):  # don't need to load dataset for these
-        if not cfg.pretraining_dataset:
-            train_dataset, eval_dataset = load_prepare_datasets(
-                tokenizer, cfg, DEFAULT_DATASET_PREPARED_PATH
-            )
-        else:
-            train_dataset = load_pretraining_dataset(
-                cfg.pretraining_dataset,
-                tokenizer,
-                max_tokens=cfg.sequence_len,
-                seed=cfg.seed,
-            )
-            # https://discuss.huggingface.co/t/how-to-use-huggingface-trainer-streaming-datasets-without-wrapping-it-with-torchdatas-iterablewrapper/25230
-            train_dataset = train_dataset.with_format("torch")
-            eval_dataset = None
+        train_dataset, eval_dataset = load_prepare_datasets(
+            tokenizer, cfg, DEFAULT_DATASET_PREPARED_PATH
+        )

    if cfg.debug or "debug" in kwargs:
        logging.info("check_dataset_labels...")
@@ -297,15 +285,12 @@ def train(

    # In case we want to stop early with ctrl+c, this is a nice to have to save the pretrained model
    if cfg.local_rank == 0:
-
-        def terminate_handler(_, __, model):
-            if cfg.flash_optimum:
-                model = BetterTransformer.reverse(model)
-            model.save_pretrained(cfg.output_dir)
-            sys.exit(0)
-
        signal.signal(
-            signal.SIGINT, lambda signum, frame: terminate_handler(signum, frame, model)
+            signal.SIGINT,
+            lambda signal, frame: (
+                model.save_pretrained(cfg.output_dir),
+                sys.exit(0),
+            ),
        )

    logging.info("Starting trainer...")
@@ -328,21 +313,13 @@ def train(

    if not Path(cfg.output_dir).is_dir():
        os.makedirs(cfg.output_dir, exist_ok=True)
-    if cfg.flash_optimum:
-        with torch.backends.cuda.sdp_kernel(
-            enable_flash=True, enable_math=True, enable_mem_efficient=True
-        ):
-            trainer.train(resume_from_checkpoint=resume_from_checkpoint)
-    else:
-        trainer.train(resume_from_checkpoint=resume_from_checkpoint)
+    trainer.train(resume_from_checkpoint=resume_from_checkpoint)

    logging.info(f"Training Completed!!! Saving pre-trained model to {cfg.output_dir}")

    # TODO do we need this fix? https://huggingface.co/docs/accelerate/usage_guides/fsdp#saving-and-loading
    # only save on rank 0, otherwise it corrupts output on multi-GPU when multiple processes attempt to write the same file
    if cfg.local_rank == 0:
-        if cfg.flash_optimum:
-            model = BetterTransformer.reverse(model)
        model.save_pretrained(cfg.output_dir)

    # trainer.save_model(cfg.output_dir)  # TODO this may be needed for deepspeed to work? need to review another time
--- a/src/axolotl/datasets.py
+++ b/src/axolotl/datasets.py
@@ -126,15 +126,18 @@ class ConstantLengthDataset(IterableDataset):
                    buffer_len = 0

                if example:
-                    # FIXME
                    # just going to drop data points that are too long
                    if len(example["input_ids"]) <= self.seq_length:
                        input_ids = example["input_ids"]
                        attention_mask = example["attention_mask"]
                        labels = example["labels"]
                        if (
-                            buffer["input_ids"]
-                            and input_ids[0] == self.tokenizer.bos_token_id
+                            (
+                                buffer["input_ids"]
+                                and input_ids[0] == self.tokenizer.bos_token_id
+                            )
+                            or self.tokenizer.bos_token_id
+                            == self.tokenizer.eos_token_id
                        ):
                            attention_mask[0] = 0

--- a/src/axolotl/prompt_strategies/alpaca_chat.py
+++ b/src/axolotl/prompt_strategies/alpaca_chat.py
@@ -6,7 +6,7 @@ from axolotl.prompt_tokenizers import (
    AlpacaPromptTokenizingStrategy,
    InstructionPromptTokenizingStrategy,
 )
-from axolotl.prompters import AlpacaPrompter, PromptStyle, UnpromptedPrompter
+from axolotl.prompters import AlpacaPrompter, PromptStyle


 def load(tokenizer, cfg):
@@ -20,38 +20,11 @@ def load(tokenizer, cfg):

 class AlpacaConcisePrompter(AlpacaPrompter):
    """
-    Alpaca Prompter extending the system prompt to ask for concise chat-instruct answers
+    Alpaca Prompter extending the system prompt to ask for concise answers
    """

-    system_prompt = "Below is an instruction from a USER that describes a task, paired with an input that provides further context. The ASSISTANT writes a response that concisely and appropriately completes the request.\n\n"
-    system_no_input_prompt = "Below is an instruction from a USER that describes a task. The ASSISTANT writes a response that appropriately and concisely completes the request.\n\n"
-
-
-class AlpacaChatPrompter(AlpacaPrompter):
-    """
-    Alpaca Chat Prompter extending the system prompt to for chat-instruct answers
-    """
-
-    system_prompt = "Below is an instruction from a USER that describes a task, paired with an input that provides further context. The ASSISTANT writes a response that concisely and appropriately completes the request.\n\n"
-    system_no_input_prompt = "Below is an instruction from a USER that describes a task. The ASSISTANT writes a response that appropriately and concisely completes the request.\n\n"
-
-    def __init__(self):  # pylint: disable=super-init-not-called
-        self.prompt_style = PromptStyle.CHAT.value
-        self.match_prompt_style()
-
-
-class NoSystemPrompter(AlpacaPrompter):
-    """
-    Null Prompter with no system prompts
-    """
-
-    system_prompt = ""
-    system_no_input_prompt = ""
-    turn_format = "{instruction} {input} "
-    turn_no_input_format = "{instruction} "
-
-    def __init__(self):  # pylint: disable=super-init-not-called
-        pass
+    system_prompt = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that concisely and appropriately completes the request.\n\n"
+    system_no_input_prompt = "Below is an instruction that describes a task. Write a response that appropriately and concisely completes the request.\n\n"


 class AlpacaQAPromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
@@ -91,7 +64,7 @@ def load_concise(tokenizer, cfg):

 def load_qa(tokenizer, cfg):
    return AlpacaQAPromptTokenizingStrategy(
-        AlpacaChatPrompter(),
+        AlpacaPrompter(PromptStyle.CHAT.value),
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
@@ -100,16 +73,7 @@ def load_qa(tokenizer, cfg):

 def load_camel_ai(tokenizer, cfg):
    return CamelAIPromptTokenizingStrategy(
-        AlpacaChatPrompter(),
-        tokenizer,
-        cfg.train_on_inputs,
-        cfg.sequence_len,
-    )
-
-
-def load_no_prompt(tokenizer, cfg):
-    return AlpacaPromptTokenizingStrategy(
-        UnpromptedPrompter(PromptStyle.CHAT.value),
+        AlpacaPrompter(PromptStyle.CHAT.value),
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
--- a/src/axolotl/prompt_strategies/alpaca_instruct.py
+++ b/src/axolotl/prompt_strategies/alpaca_instruct.py
@@ -1,7 +1,7 @@
 """Module loading the AlpacaInstructPromptTokenizingStrategy class"""

 from axolotl.prompt_tokenizers import AlpacaPromptTokenizingStrategy
-from axolotl.prompters import AlpacaPrompter, PromptStyle, UnpromptedPrompter
+from axolotl.prompters import AlpacaPrompter, PromptStyle


 def load(tokenizer, cfg):
@@ -11,12 +11,3 @@ def load(tokenizer, cfg):
        cfg.train_on_inputs,
        cfg.sequence_len,
    )
-
-
-def load_no_prompt(tokenizer, cfg):
-    return AlpacaPromptTokenizingStrategy(
-        UnpromptedPrompter(PromptStyle.INSTRUCT.value),
-        tokenizer,
-        cfg.train_on_inputs,
-        cfg.sequence_len,
-    )
--- a/src/axolotl/prompt_strategies/alpaca_w_system.py
+++ b/src/axolotl/prompt_strategies/alpaca_w_system.py
@@ -1,120 +0,0 @@
-"""
-Prompt strategies loader for alpaca instruction datasets with system prompts
-"""
-from typing import Generator, Tuple, Union
-
-from axolotl.prompt_tokenizers import PromptTokenizingStrategy
-from axolotl.prompters import AlpacaPrompter, PromptStyle
-
-
-class InstructionWSystemPromptTokenizingStrategy(PromptTokenizingStrategy):
-    """
-    Tokenizing strategy for instruction-based prompts.
-    """
-
-    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str, str]:
-        return (
-            prompt["instruction"],
-            prompt["input"] if "input" in prompt else "",
-            prompt["output"],
-            prompt["system"],
-        )
-
-    def tokenize_prompt(self, prompt):
-        # pylint: disable=duplicate-code
-        (
-            instruction,
-            input,  # pylint: disable=redefined-builtin
-            response,
-            system,
-        ) = self.parse_instruction_fields(prompt)
-        user_prompt = next(
-            iter(
-                self.prompter.build_prompt_w_system(
-                    system,
-                    instruction,
-                    input,
-                )
-            )
-        )
-        tokenized_prompt = self._tokenize(user_prompt, add_eos_token=False)
-        if not self.train_on_inputs:
-            user_prompt_len = len(tokenized_prompt["input_ids"])
-            # TODO this could be sped up using numpy array slicing
-            tokenized_prompt["labels"] = [-100] * user_prompt_len
-        tokenized_res_prompt = self._tokenize(
-            response, strip_bos_token=True, add_eos_token=True
-        )
-        tokenized_prompt["input_ids"] += tokenized_res_prompt["input_ids"]
-        tokenized_prompt["attention_mask"] += tokenized_res_prompt["attention_mask"]
-        tokenized_prompt["labels"] += tokenized_res_prompt["input_ids"]
-
-        return tokenized_prompt
-
-
-class SystemDataPrompter(AlpacaPrompter):
-    """
-    Alpaca Style Prompter that uses system prompts from the dataset
-    """
-
-    def build_prompt_w_system(
-        self,
-        system: str,
-        instruction: str,
-        input: Union[None, str] = None,  # pylint: disable=redefined-builtin
-        output: Union[None, str] = None,
-    ) -> Generator[str, None, None]:
-        # returns the full prompt from instruction and optional input
-        # if a label (=response, =output) is provided, it's also appended.
-        if input:
-            res = system + self.turn_format.format(instruction=instruction, input=input)
-        else:
-            res = system + self.turn_no_input_format.format(instruction=instruction)
-        if output:
-            res = f"{res}{output}"
-        yield res
-
-
-class OpenOrcaPromptTokenizingStrategy(InstructionWSystemPromptTokenizingStrategy):
-    """
-    Tokenizing strategy for OpenOrca datasets
-    """
-
-    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str, str]:
-        return (
-            prompt["question"],
-            "",
-            prompt["response"],
-            prompt["system_prompt"],
-        )
-
-
-def load(tokenizer, cfg):
-    return load_chat(tokenizer, cfg)
-
-
-def load_instruct(tokenizer, cfg):
-    return InstructionWSystemPromptTokenizingStrategy(
-        SystemDataPrompter(PromptStyle.INSTRUCT.value),
-        tokenizer,
-        cfg.train_on_inputs,
-        cfg.sequence_len,
-    )
-
-
-def load_chat(tokenizer, cfg):
-    return InstructionWSystemPromptTokenizingStrategy(
-        SystemDataPrompter(PromptStyle.CHAT.value),
-        tokenizer,
-        cfg.train_on_inputs,
-        cfg.sequence_len,
-    )
-
-
-def load_open_orca(tokenizer, cfg):
-    return OpenOrcaPromptTokenizingStrategy(
-        SystemDataPrompter(PromptStyle.INSTRUCT.value),
-        tokenizer,
-        cfg.train_on_inputs,
-        cfg.sequence_len,
-    )
--- a/src/axolotl/prompt_tokenizers.py
+++ b/src/axolotl/prompt_tokenizers.py
@@ -87,9 +87,7 @@ class InstructionPromptTokenizingStrategy(PromptTokenizingStrategy):
    Tokenizing strategy for instruction-based prompts.
    """

-    def parse_instruction_fields(
-        self, prompt
-    ) -> Union[Tuple[str, str, str], Tuple[str, str, str, str]]:
+    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]:
        raise NotImplementedError

    def tokenize_prompt(self, prompt):
@@ -98,27 +96,25 @@ class InstructionPromptTokenizingStrategy(PromptTokenizingStrategy):
            input,  # pylint: disable=redefined-builtin
            response,
        ) = self.parse_instruction_fields(prompt)
-        user_prompt = next(
-            iter(
-                self.prompter.build_prompt(
-                    instruction,
-                    input,
+        full_prompt = self._build_full_prompt(instruction, input, response)
+        tokenized_full_prompt = self._tokenize(full_prompt)
+        if not self.train_on_inputs:
+            user_prompt = next(
+                iter(
+                    self.prompter.build_prompt(
+                        instruction,
+                        input,
+                    )
                )
            )
-        )
-        tokenized_prompt = self._tokenize(user_prompt, add_eos_token=False)
-        if not self.train_on_inputs:
-            user_prompt_len = len(tokenized_prompt["input_ids"])
+            tokenized_user_prompt = self._tokenize(user_prompt, add_eos_token=False)
+            user_prompt_len = len(tokenized_user_prompt["input_ids"])
            # TODO this could be sped up using numpy array slicing
-            tokenized_prompt["labels"] = [-100] * user_prompt_len
-        tokenized_res_prompt = self._tokenize(
-            response, strip_bos_token=True, add_eos_token=True
-        )
-        tokenized_prompt["input_ids"] += tokenized_res_prompt["input_ids"]
-        tokenized_prompt["attention_mask"] += tokenized_res_prompt["attention_mask"]
-        tokenized_prompt["labels"] += tokenized_res_prompt["input_ids"]
+            tokenized_full_prompt["labels"] = [
+                -100
+            ] * user_prompt_len + tokenized_full_prompt["labels"][user_prompt_len:]

-        return tokenized_prompt
+        return tokenized_full_prompt

    def _build_full_prompt(
        self, instruction, input, response  # pylint: disable=redefined-builtin
@@ -440,7 +436,7 @@ def parse_tokenized_to_result(
    result: Dict[str, List[int]],
    current_len: int,
    res: Dict[str, List[int]],
-    labels: List[int],
+    labels: list[int],
    pad_token_id: Union[int, None] = None,
 ) -> Tuple[Dict[str, List[int]], int]:
    """
--- a/src/axolotl/prompters.py
+++ b/src/axolotl/prompters.py
@@ -24,8 +24,6 @@ class AlpacaPrompter:

    system_prompt = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n"
    system_no_input_prompt = "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n"
-    turn_format: str
-    turn_no_input_format: str
    prompt_style: Optional[PromptStyle] = None

    def __init__(self, prompt_style=PromptStyle.INSTRUCT.value):
@@ -34,13 +32,23 @@ class AlpacaPrompter:

    def match_prompt_style(self):
        if self.prompt_style == PromptStyle.INSTRUCT.value:
-            self.turn_format = "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"
-            self.turn_no_input_format = (
-                "### Instruction:\n{instruction}\n\n### Response:\n"
+            self.prompt_input = (
+                self.system_prompt
+                + "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"
            )
+            self.prompt_no_input = (
+                self.system_no_input_prompt
+                + "### Instruction:\n{instruction}\n\n### Response:\n"
+            )
+            self.response_split = "### Response:"
        if self.prompt_style == PromptStyle.CHAT.value:
-            self.turn_format = "USER: {instruction}\n{input}\nASSISTANT:"
-            self.turn_no_input_format = "USER: {instruction}\nASSISTANT:"
+            self.prompt_input = (
+                self.system_prompt + "USER: {instruction}\n{input}\nASSISTANT:"
+            )
+            self.prompt_no_input = (
+                self.system_no_input_prompt + "USER: {instruction}\nASSISTANT:"
+            )
+            self.response_split = "ASSISTANT:"

    def build_prompt(
        self,
@@ -51,17 +59,16 @@ class AlpacaPrompter:
        # returns the full prompt from instruction and optional input
        # if a label (=response, =output) is provided, it's also appended.
        if input:
-            res = self.system_prompt + self.turn_format.format(
-                instruction=instruction, input=input
-            )
+            res = self.prompt_input.format(instruction=instruction, input=input)
        else:
-            res = self.system_no_input_prompt + self.turn_no_input_format.format(
-                instruction=instruction
-            )
+            res = self.prompt_no_input.format(instruction=instruction)
        if output:
            res = f"{res}{output}"
        yield res

+    def get_response(self, output: str) -> str:
+        return output.split(self.response_split)[1].strip()
+

 class UnpromptedPrompter(AlpacaPrompter):
    """
@@ -86,10 +93,7 @@ class MultipleChoiceExplainPrompter(AlpacaPrompter):
    """

    system_prompt = (
-        "Choose the answer that best answers the question. Explain your reasoning.\n"
-    )
-    system_no_input_prompt = (
-        "Choose the answer that best answers the question. Explain your reasoning.\n"
+        "Choose the answer that best answers the question. Explain your reasoning."
    )


@@ -98,12 +102,7 @@ class MultipleChoiceConcisePrompter(AlpacaPrompter):
    Prompter for multiple choice concise
    """

-    system_prompt = "Choose the answer that best answers the question. Be concise in your response.\n\n"
-    system_no_input_prompt = "Choose the answer that best answers the question. Be concise in your response.\n\n"
-
-    def match_prompt_style(self):
-        self.turn_format = "USER: {instruction}\n{input}\nASSISTANT:"
-        self.turn_no_input_format = "USER: {instruction}\nASSISTANT:"
+    prompt_input = "Choose the answer that best answers the question. Be concise in your response.\n\nUSER: {instruction}\n{input}\nASSISTANT:\n"


 class SummarizeTLDRPrompter(AlpacaPrompter):
@@ -111,12 +110,9 @@ class SummarizeTLDRPrompter(AlpacaPrompter):
    Prompter for summarize TLDR
    """

-    system_prompt = ""
-    system_no_input_prompt = ""
-
-    def match_prompt_style(self):
-        self.turn_format = "USER: Summarize the following article as a TL;DR.\n{instruction}\n{input}\nASSISTANT:"
-        self.turn_no_input_format = "USER: Summarize the following article as a TL;DR.\n{instruction}\nASSISTANT:"
+    prompt_no_input = (
+        "USER: Summarize the following article as a TL;DR.\n{instruction}\nASSISTANT:"
+    )


 class CompletionPrompter:
@@ -132,6 +128,9 @@ class CompletionPrompter:
    ) -> Generator[str, None, None]:
        yield instruction

+    def get_response(self, output: str) -> str:
+        return output.strip()
+

 class GPTeacherPrompter(AlpacaPrompter):
    """
@@ -211,6 +210,9 @@ class ReflectAlpacaPrompter:
            res = f"{res}{label}"
        yield res

+    def get_response(self, output: str) -> str:
+        return output.split(self.response_split)[1].strip()
+

 class SeparatorStyle(Enum):
    """Different separator style."""
@@ -287,6 +289,12 @@ class ShareGPTPrompter:  # pylint: disable=too-few-public-methods
            sep2=" ",
        )

+    # def match_prompt_style(self):
+    #     if self.prompt_style == PromptStyle.chat.value:
+    #         self.prompt_input = self.system_prompt + "USER: {instruction}\n{input}\nASSISTANT:"
+    #         self.prompt_no_input = self.system_no_input_prompt + "USER: {instruction}\nASSISTANT:"
+    #         self.response_split = "ASSISTANT:"
+
    def build_prompt(self, source) -> Generator[str, None, None]:
        # ignore the system prompt if provided
        if source[0]["from"] == "system":
--- a/src/axolotl/utils/callbacks.py
+++ b/src/axolotl/utils/callbacks.py
@@ -2,14 +2,13 @@

 import os

-from optimum.bettertransformer import BetterTransformer
 from transformers import (
    TrainerCallback,
    TrainerControl,
    TrainerState,
    TrainingArguments,
 )
-from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR, IntervalStrategy
+from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR


 class SavePeftModelCallback(TrainerCallback):  # pylint: disable=too-few-public-methods
@@ -31,39 +30,3 @@ class SavePeftModelCallback(TrainerCallback):  # pylint: disable=too-few-public-
        kwargs["model"].save_pretrained(peft_model_path)

        return control
-
-
-class SaveBetterTransformerModelCallback(
-    TrainerCallback
-):  # pylint: disable=too-few-public-methods
-    """Callback to save the BetterTransformer wrapped model"""
-
-    def on_step_end(
-        self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        **kwargs,
-    ):
-        # Save
-        if (
-            args.save_strategy == IntervalStrategy.STEPS
-            and args.save_steps > 0
-            and state.global_step % args.save_steps == 0
-        ):
-            control.should_save = True
-
-        if control.should_save:
-            checkpoint_folder = os.path.join(
-                args.output_dir,
-                f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}",
-            )
-
-            model = BetterTransformer.reverse(kwargs["model"])
-            model.save_pretrained(checkpoint_folder)
-            # FIXME - need to cleanup old checkpoints
-
-            # since we're saving here, we don't need the trainer loop to attempt to save too b/c
-            # the trainer will raise an exception since it can't save a BetterTransformer wrapped model
-            control.should_save = False
-        return control
--- a/src/axolotl/utils/data.py
+++ b/src/axolotl/utils/data.py
@@ -1,11 +1,10 @@
 """Module containing data utilities"""
-import functools
+
 import logging
 from hashlib import md5
 from pathlib import Path
 from typing import List, Tuple, Union

-import torch
 from datasets import Dataset, DatasetDict, load_dataset, load_from_disk
 from huggingface_hub import hf_hub_download
 from transformers import PreTrainedTokenizerBase
@@ -102,26 +101,13 @@ def load_tokenized_prepared_datasets(
                pass

            # prefer local dataset, even if hub exists
-            local_path = Path(d.path)
-            if local_path.exists():
-                if local_path.is_dir():
-                    ds = load_dataset(
-                        d.path,
-                        data_files=d.data_files,
-                        streaming=False,
-                        split=None,
-                    )
-                elif local_path.is_file():
-                    ds = load_dataset(
-                        "json",
-                        data_files=d.path,
-                        streaming=False,
-                        split=None,
-                    )
-                else:
-                    raise ValueError(
-                        "unhandled dataset load: local path exists, but is neither a directory or a file"
-                    )
+            if Path(d.path).exists():
+                ds = load_dataset(
+                    "json",
+                    data_files=d.path,
+                    streaming=False,
+                    split=None,
+                )
            elif ds_from_hub:
                if d.data_files:
                    ds = load_dataset(
@@ -408,127 +394,8 @@ def load_prepare_datasets(
            index=cfg.dataset_shard_idx,
        )

-    if cfg.val_set_size:
-        dataset = dataset.train_test_split(test_size=cfg.val_set_size, shuffle=False)
-        train_dataset = dataset["train"]
-        eval_dataset = dataset["test"]
-    else:
-        train_dataset = dataset
-        eval_dataset = None
+    dataset = dataset.train_test_split(test_size=cfg.val_set_size, shuffle=False)
+    train_dataset = dataset["train"]
+    eval_dataset = dataset["test"]

    return train_dataset, eval_dataset
-
-
-def encode_pretraining(tokenizer, max_tokens, examples):
-    res = tokenizer(
-        examples["text"],
-        truncation=True,
-        max_length=max_tokens - 2,
-        add_special_tokens=True,
-    )
-    # Convert to PyTorch tensors
-    input_ids = [torch.tensor(seq) for seq in res["input_ids"]]
-    attention_mask = [torch.tensor(seq) for seq in res["attention_mask"]]
-    new_input_ids = []
-    new_attention_mask = []
-    # Append EOS and PAD tokens to input_ids, and correct attention_mask
-    for i, _ in enumerate(input_ids):
-        input_ids[i] = torch.cat(
-            (
-                input_ids[i],
-                torch.tensor([tokenizer.eos_token_id, tokenizer.pad_token_id]),
-            ),
-            dim=0,
-        )
-        attention_mask[i] = torch.cat((attention_mask[i], torch.tensor([1, 0])), dim=0)
-
-    # Concatenate tokens so that their lengths are less than max_tokens
-    buffer_input_ids = torch.tensor([], dtype=torch.long)
-    buffer_attention_mask = torch.tensor([], dtype=torch.long)
-
-    for ids, mask in zip(input_ids, attention_mask):
-        if buffer_input_ids.numel() == max_tokens:
-            new_input_ids.append(buffer_input_ids)
-            new_attention_mask.append(buffer_attention_mask)
-            buffer_input_ids = torch.tensor([], dtype=torch.long)
-            buffer_attention_mask = torch.tensor([], dtype=torch.long)
-            buffer_input_ids = torch.cat((buffer_input_ids, ids), dim=0)
-            buffer_attention_mask = torch.cat((buffer_attention_mask, mask), dim=0)
-        elif buffer_input_ids.numel() + ids.numel() <= max_tokens:
-            buffer_input_ids = torch.cat((buffer_input_ids, ids), dim=0)
-            buffer_attention_mask = torch.cat((buffer_attention_mask, mask), dim=0)
-        else:
-            buffer_input_ids = torch.cat(
-                (
-                    buffer_input_ids,
-                    torch.full(
-                        (max_tokens - buffer_input_ids.numel(),),
-                        tokenizer.pad_token_id,
-                        dtype=torch.long,
-                    ),
-                ),
-                dim=0,
-            )
-            buffer_attention_mask = torch.cat(
-                (
-                    buffer_attention_mask,
-                    torch.full(
-                        (max_tokens - buffer_attention_mask.numel(),),
-                        0,
-                        dtype=torch.long,
-                    ),
-                ),
-                dim=0,
-            )
-            new_input_ids.append(buffer_input_ids)
-            new_attention_mask.append(buffer_attention_mask)
-            buffer_input_ids = torch.tensor([], dtype=torch.long)
-            buffer_attention_mask = torch.tensor([], dtype=torch.long)
-
-            buffer_input_ids = torch.cat((buffer_input_ids, ids), dim=0)
-            buffer_attention_mask = torch.cat((buffer_attention_mask, mask), dim=0)
-
-    if buffer_input_ids.numel() > 0:  # for any leftover tokens
-        while buffer_input_ids.numel() < max_tokens:  # make all sequences equal in size
-            buffer_input_ids = torch.cat(
-                (
-                    buffer_input_ids,
-                    torch.full(
-                        (max_tokens - buffer_input_ids.numel(),),
-                        tokenizer.pad_token_id,
-                        dtype=torch.long,
-                    ),
-                ),
-                dim=0,
-            )
-            buffer_attention_mask = torch.cat(
-                (
-                    buffer_attention_mask,
-                    torch.full(
-                        (max_tokens - buffer_attention_mask.numel(),),
-                        0,
-                        dtype=torch.long,
-                    ),
-                ),
-                dim=0,
-            )
-        new_input_ids.append(buffer_input_ids)
-        new_attention_mask.append(buffer_attention_mask)
-
-    ret = {
-        "input_ids": [seq.tolist() for seq in new_input_ids],
-        "labels": [seq.tolist() for seq in new_input_ids],
-        "attention_mask": [seq.tolist() for seq in new_attention_mask],
-    }
-
-    logging.debug(len(ret["input_ids"]))
-    return ret
-
-
-def load_pretraining_dataset(path, tokenizer, max_tokens=2048, seed=42):
-    encode = functools.partial(encode_pretraining, tokenizer, max_tokens)
-    dataset = load_dataset(path, streaming=True, split="train")
-    dataset = dataset.shuffle(seed=seed, buffer_size=10_000)
-    # TODO dynamically figure out which columns/features to remove
-    dataset = dataset.map(encode, batched=True, remove_columns=["text", "meta"])
-    return dataset
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -10,15 +10,13 @@ from typing import TYPE_CHECKING, Optional, Tuple  # noqa: F401
 import bitsandbytes as bnb
 import torch
 import transformers
-from optimum.bettertransformer import BetterTransformer
+from transformers import PreTrainedModel  # noqa: F401
 from transformers import (  # noqa: F401
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    LlamaConfig,
-    PreTrainedModel,
-    PreTrainedTokenizerBase,
 )

 from axolotl.prompt_tokenizers import LLAMA_DEFAULT_PAD_TOKEN
@@ -34,20 +32,15 @@ def load_tokenizer(
    tokenizer_type,
    cfg,
 ):
-    use_fast = True  # this is the default
-    if cfg.tokenizer_use_fast is not None:
-        use_fast = cfg.tokenizer_use_fast
    if tokenizer_type:
        tokenizer = getattr(transformers, tokenizer_type).from_pretrained(
            tokenizer_config,
            trust_remote_code=cfg.trust_remote_code or False,
-            use_fast=use_fast,
        )
    else:
        tokenizer = AutoTokenizer.from_pretrained(
            tokenizer_config,
            trust_remote_code=cfg.trust_remote_code or False,
-            use_fast=use_fast,
        )

    logging.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}")
@@ -77,7 +70,7 @@ def load_tokenizer(
 def load_model(
    base_model, base_model_config, model_type, tokenizer, cfg, adapter="lora"
 ):
-    # type: (str, str, str, PreTrainedTokenizerBase, DictDefault, Optional[str]) -> Tuple[PreTrainedModel, Optional[PeftConfig]]
+    # type: (str, str, str, AutoTokenizer, DictDefault, Optional[str]) -> Tuple[PreTrainedModel, Optional[PeftConfig]]
    """
    Load a model from a base model and a model type.
    """
@@ -128,9 +121,9 @@ def load_model(
        logging.info("patching with xpos rope")
        replace_llama_rope_with_xpos_rope()

-    if cfg.bf16 or cfg.bfloat16:
+    if cfg.bf16:
        torch_dtype = torch.bfloat16
-    elif cfg.load_in_8bit or cfg.fp16 or cfg.float16:
+    elif cfg.load_in_8bit or cfg.fp16:
        torch_dtype = torch.float16
    else:
        torch_dtype = torch.float32
@@ -154,8 +147,6 @@ def load_model(
        )

    model_kwargs = {}
-    if cfg.model_revision:
-        model_kwargs["revision"] = cfg.model_revision
    if cfg.adapter == "qlora" and cfg.load_in_4bit:
        model_kwargs["quantization_config"] = BitsAndBytesConfig(
            load_in_4bit=True,
@@ -204,7 +195,7 @@ def load_model(
                else True,
            )
            load_in_8bit = False
-        elif cfg.is_llama_derived_model and not cfg.trust_remote_code:
+        elif cfg.is_llama_derived_model:
            from transformers import LlamaForCausalLM

            config = LlamaConfig.from_pretrained(base_model_config)
@@ -243,7 +234,7 @@ def load_model(
        #         device=cfg.device,
        #     )
        #     model.train() # sets to train instead of eval mode
-        elif model_type and not cfg.trust_remote_code:
+        elif model_type:
            model = getattr(transformers, model_type).from_pretrained(
                base_model,
                load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
@@ -260,16 +251,11 @@ def load_model(
            )
            # Shouldn't be a problem most of the time. will obviously error if the model doesn't support this
            # when training starts
-            if (
-                hasattr(config, "max_seq_len")
-                and config.max_seq_len
-                and cfg.sequence_len > config.max_seq_len
-            ):
+            if hasattr(config, "max_seq_len") and cfg.sequence_len > config.max_seq_len:
                config.max_seq_len = cfg.sequence_len
                logging.warning(f"increasing context length to {cfg.sequence_len}")
            elif (
                hasattr(config, "max_sequence_length")
-                and config.max_sequence_length
                and cfg.sequence_len > config.max_sequence_length
            ):
                config.max_sequence_length = cfg.sequence_len
@@ -292,7 +278,6 @@ def load_model(
        model = AutoModelForCausalLM.from_pretrained(
            base_model,
            load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
-            load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None,
            torch_dtype=torch_dtype,
            device_map=cfg.device_map,
            trust_remote_code=cfg.trust_remote_code or False,
@@ -302,16 +287,6 @@ def load_model(
    embeddings_len = math.ceil(len(tokenizer) / 32) * 32
    model.resize_token_embeddings(embeddings_len)

-    if (
-        hasattr(model.config, "max_position_embeddings")
-        and model.config.max_position_embeddings
-        and cfg.sequence_len >= model.config.max_position_embeddings
-    ):
-        logging.warning(
-            f"increasing model.config.max_position_embeddings to {cfg.sequence_len}"
-        )
-        model.config.max_position_embeddings = cfg.sequence_len
-
    if not cfg.gptq and (
        (cfg.adapter == "lora" and load_in_8bit)
        or (cfg.adapter == "qlora" and cfg.load_in_4bit)
@@ -357,9 +332,6 @@ def load_model(
        logging.warning("there are no parameters that require gradient updates")
    model.config.use_cache = False

-    if cfg.flash_optimum:
-        model = BetterTransformer.transform(model)
-
    # TODO resume_from_checkpoint handling
    return model, lora_config

--- a/src/axolotl/utils/sampler.py
+++ b/src/axolotl/utils/sampler.py
@@ -1,173 +0,0 @@
-# pylint: skip-file
-
-from typing import Any, List, Optional
-
-import numba
-import numpy as np
-import torch.distributed as dist
-from torch.utils.data import Sampler
-
-
-@numba.njit
-def ffd_check(a: np.ndarray, c: int, n: int):
-    # First-fit-decreasing bin packing
-    # Check if a[] could fit in n bins with capacity c
-    # https://en.wikipedia.org/wiki/First-fit-decreasing_bin_packing
-
-    a = np.sort(a)[::-1]
-    bins = np.full((n,), c, dtype=a.dtype)
-    for size in a:
-        not_found = True
-        for idx in range(n):
-            if bins[idx] >= size:
-                bins[idx] -= size
-                not_found = False
-                break
-
-        if not_found:
-            return False
-
-    return True
-
-
-@numba.njit
-def ffd_with_result(a: np.ndarray, c: int, start_index: int):
-    # First-fit-decreasing bin packing (with result return)
-
-    indices = np.argsort(a)[::-1]
-    a = a[indices]
-
-    bins: List[int] = []
-    bins_result: List[Any] = []
-    for a_id, size in enumerate(a):
-        add_new = True
-        for idx in range(len(bins)):
-            if bins[idx] >= size:
-                bins[idx] -= size
-                bins_result[idx].append(indices[a_id] + start_index)
-                add_new = False
-                break
-
-        if add_new:
-            bins.append(c - size)
-            bins_result.append([indices[a_id] + start_index])
-
-    return bins_result
-
-
-@numba.njit
-def allocate(
-    lengths: np.ndarray, lengths_cumsum: np.ndarray, rank: int, c: int, n: int
-):
-    # Dynamic batch allocator, similar to Multifit
-    # https://en.wikipedia.org/wiki/Multifit_algorithm
-    # ~99.5% efficiency on OpenChat training set (12 * 2048 ctx len)
-
-    s = 0
-    start_index = 0
-    result = []
-
-    while True:
-        # binary search [l, r)
-        left = 1
-        right = 1 + np.searchsorted(lengths_cumsum[start_index:], s + c * n, "right")
-
-        while right - left > 1:
-            m = (left + right) // 2
-            if ffd_check(lengths[start_index : start_index + m], c, n):
-                left = m
-            else:
-                right = m
-
-        # use length l
-        batch = ffd_with_result(
-            lengths[start_index : start_index + left], c, start_index
-        )
-        assert len(batch) <= n
-        if len(batch) < n:
-            break
-
-        start_index += left
-        s = lengths_cumsum[start_index - 1]
-
-        # add local rank
-        result.append(batch[rank])
-
-    return result, s, len(result) * c * n
-
-
-class MultipackDistributedBatchSampler(Sampler):
-    """Unpadded length sampling using Multipack.
-    Approximate (at most ~1.22x) the optimal solution of the identical-machines scheduling problem, which is NP-hard.
-    """
-
-    def __init__(
-        self,
-        batch_max_length: int,
-        lengths: List[int],
-        num_replicas: Optional[int] = None,
-        rank: Optional[int] = None,
-        seed: int = 0,
-    ):
-        # Get rank
-        if num_replicas is None:
-            if not dist.is_available():
-                raise RuntimeError("Requires distributed package to be available")
-            num_replicas = dist.get_world_size()
-        if rank is None:
-            if not dist.is_available():
-                raise RuntimeError("Requires distributed package to be available")
-            rank = dist.get_rank()
-
-        self.num_replicas = num_replicas
-        self.rank = rank
-        self.seed = seed
-
-        self.batch_max_length = batch_max_length
-        self.lengths = lengths
-        assert isinstance(self.lengths, np.ndarray)
-
-        self.epoch = 0
-
-        # statistics
-        self.eff_total_used = 0
-        self.eff_total_slots = 0
-
-    def set_epoch(self, epoch: int):
-        self.epoch = epoch
-
-    def generate_batches(self, set_stats=False):
-        indices = np.random.default_rng(seed=self.seed + self.epoch).permutation(
-            len(self.lengths)
-        )
-
-        lengths = self.lengths[indices]
-        lengths_cumsum = np.cumsum(lengths)
-
-        batches, total_used, total_slots = allocate(
-            lengths=lengths,
-            lengths_cumsum=lengths_cumsum,
-            rank=self.rank,
-            c=self.batch_max_length,
-            n=self.num_replicas,
-        )
-
-        batches = [indices[batch] for batch in batches]
-
-        # statistics
-        if set_stats:
-            self.eff_total_used += total_used
-            self.eff_total_slots += total_slots
-
-        return batches
-
-    def __iter__(self):
-        batches = self.generate_batches(set_stats=True)
-        return iter(batches)
-
-    def num_batches(self):
-        batches = self.generate_batches()
-        return len(batches)
-
-    def efficiency(self):
-        return self.eff_total_used / self.eff_total_slots
--- a/src/axolotl/utils/schedulers.py
+++ b/src/axolotl/utils/schedulers.py
@@ -1,9 +1,6 @@
 """Module for custom LRScheduler class"""
-import math
-from functools import partial

-from torch.optim import Optimizer
-from torch.optim.lr_scheduler import LambdaLR, LRScheduler
+from torch.optim.lr_scheduler import LRScheduler


 class InterpolatingLogScheduler(LRScheduler):
@@ -45,58 +42,3 @@ class InterpolatingLogScheduler(LRScheduler):
            lrs = [self.max_lr for base_lr in self.base_lrs]

        return lrs
-
-
-def _get_cosine_schedule_with_quadratic_warmup_lr_lambda(
-    current_step: int,
-    *,
-    num_warmup_steps: int,
-    num_training_steps: int,
-    num_cycles: float
-):
-    if current_step < num_warmup_steps:
-        return (float(current_step) / float(max(1, num_warmup_steps))) ** 2
-    progress = float(current_step - num_warmup_steps) / float(
-        max(1, num_training_steps - num_warmup_steps)
-    )
-    return max(
-        0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))
-    )
-
-
-def get_cosine_schedule_with_quadratic_warmup(
-    optimizer: Optimizer,
-    num_warmup_steps: int,
-    num_training_steps: int,
-    num_cycles: float = 0.5,
-    last_epoch: int = -1,
-):
-    """
-    Create a schedule with a learning rate that decreases following the values of the cosine function between the
-    initial lr set in the optimizer to 0, after a warmup period during which it increases linearly between 0 and the
-    initial lr set in the optimizer.
-
-    Args:
-        optimizer ([`~torch.optim.Optimizer`]):
-            The optimizer for which to schedule the learning rate.
-        num_warmup_steps (`int`):
-            The number of steps for the warmup phase.
-        num_training_steps (`int`):
-            The total number of training steps.
-        num_cycles (`float`, *optional*, defaults to 0.5):
-            The number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0
-            following a half-cosine).
-        last_epoch (`int`, *optional*, defaults to -1):
-            The index of the last epoch when resuming training.
-
-    Return:
-        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
-    """
-
-    lr_lambda = partial(
-        _get_cosine_schedule_with_quadratic_warmup_lr_lambda,
-        num_warmup_steps=num_warmup_steps,
-        num_training_steps=num_training_steps,
-        num_cycles=num_cycles,
-    )
-    return LambdaLR(optimizer, lr_lambda, last_epoch)
--- a/src/axolotl/utils/tokenization.py
+++ b/src/axolotl/utils/tokenization.py
@@ -35,4 +35,4 @@ def check_example_labels(example, tokenizer):
    logging.info(" ".join(colored_tokens))
    logging.info("\n\n\n")

-    return " ".join(colored_tokens)
+    print(" ".join(colored_tokens))
--- a/src/axolotl/utils/trainer.py
+++ b/src/axolotl/utils/trainer.py
@@ -5,185 +5,22 @@ import logging
 import math
 import os
 import sys
-from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Optional

 import bitsandbytes as bnb
-import numpy as np
 import torch.cuda
 import transformers
 from torch import nn
 from torch.optim.lr_scheduler import OneCycleLR
-from torch.utils.data import Dataset
-from transformers import EarlyStoppingCallback, Trainer, TrainingArguments
+from transformers import EarlyStoppingCallback, Trainer
 from transformers.trainer_pt_utils import get_parameter_names

-from axolotl.utils.callbacks import (
-    SaveBetterTransformerModelCallback,
-    SavePeftModelCallback,
-)
-from axolotl.utils.sampler import MultipackDistributedBatchSampler
-from axolotl.utils.schedulers import (
-    InterpolatingLogScheduler,
-    get_cosine_schedule_with_quadratic_warmup,
-)
-
-IGNORE_LABEL_ID = -100
+from axolotl.utils.callbacks import SavePeftModelCallback
+from axolotl.utils.schedulers import InterpolatingLogScheduler


-def _find_multiple(val1, val2):
-    return (-(val1 // -val2)) * val2
-
-
-def batch_to_tensor(batch, pad_id=0, dtype=torch.long, loss_dtype=torch.bfloat16):
-    # Pad an unused item to reach multiple of 64, for faster GEMM
-    pad_cur_len = sum(list(batch["length"]))
-    pad_len = _find_multiple(pad_cur_len, 64) - pad_cur_len
-
-    if pad_len > 0:
-        assert pad_len < 64
-
-        batch["input_ids"].append([pad_id] * pad_len)
-        batch["labels"].append([pad_id] * pad_len)
-        batch["attention_mask"].append([0] * pad_len)
-        batch["length"].append(pad_len)
-
-    # seqlen
-    batch_lengths = torch.tensor(list(batch["length"]), dtype=torch.int32, device="cpu")
-
-    max_seqlen = torch.max(batch_lengths)
-    cu_seqlens = torch.nn.functional.pad(
-        batch_lengths.cumsum(-1, dtype=torch.int32), (1, 0)
-    )
-
-    # nz elements
-    nz_num = cu_seqlens[-1]
-    nz_input_ids = torch.zeros((nz_num,), dtype=dtype, pin_memory=True, device="cpu")
-    nz_position_ids = torch.zeros((nz_num,), dtype=dtype, pin_memory=True, device="cpu")
-    nz_shifted_label_ids = torch.zeros(
-        (nz_num,), dtype=dtype, pin_memory=True, device="cpu"
-    )
-    nz_shifted_loss_weights = torch.zeros(
-        (nz_num,), dtype=loss_dtype, pin_memory=True, device="cpu"
-    )
-
-    index = 0
-    for token_list, length, labels_list in zip(
-        batch["input_ids"], batch["length"], batch["labels"]
-    ):
-        tokens = torch.tensor(token_list, dtype=dtype, device="cpu")
-        position_ids = torch.arange(length, dtype=dtype, device="cpu")
-
-        # Input IDs & shifted labels
-        # shifted_label_ids = torch.where(masks, tokens, IGNORE_LABEL_ID)
-        shifted_label_ids = labels_list
-        shifted_label_ids = torch.nn.functional.pad(
-            shifted_label_ids[1:], (0, 1), "constant", IGNORE_LABEL_ID
-        )
-
-        nz_input_ids[index : index + length] = tokens
-        nz_position_ids[index : index + length] = position_ids
-        nz_shifted_label_ids[index : index + length] = shifted_label_ids
-
-        # Loss weights
-        mask_count = sum(1 for label in labels_list[1:] if label != IGNORE_LABEL_ID)
-        loss_weight = (
-            1 / mask_count if mask_count > 0 else 0
-        )  # Avoid division by zero for paddings
-
-        nz_shifted_loss_weights[index : index + length] = loss_weight
-
-        index += length
-
-    # inputs
-    return {
-        "max_seqlen": max_seqlen,
-        "cu_seqlens": cu_seqlens,
-        "nz_input_ids": nz_input_ids,
-        "nz_position_ids": nz_position_ids,
-        "nz_shifted_label_ids": nz_shifted_label_ids,
-        "nz_shifted_loss_weights": nz_shifted_loss_weights,
-    }
-
-
-@dataclass
-class AxolotlTrainingArguments(TrainingArguments):
-    """
-    Extend the base TrainingArguments for axolotl helpers
-    """
-
-    lr_quadratic_warmup: bool = field(
-        default=False,
-        metadata={"help": "Use quadratic warmup for cosine scheduling."},
-    )
-    sample_packing: bool = field(
-        default=True,
-        metadata={"help": "Use sample packing for efficient training."},
-    )
-    max_seq_length: int = field(
-        default=2048,
-        metadata={"help": "The maximum sequence length the model can handle"},
-    )
-
-
-class AxolotlTrainer(Trainer):
-    """
-    Extend the base Trainer for axolotl helpers
-    """
-
-    args = None  # type: AxolotlTrainingArguments
-
-    def create_scheduler(
-        self, num_training_steps: int, optimizer: torch.optim.Optimizer = None
-    ):
-        """
-        Setup the scheduler. The optimizer of the trainer must have been set up either before this method is called or
-        passed as an argument.
-
-        Args:
-            num_training_steps (int): The number of training steps to do.
-            optimizer (torch.optim.Optimizer): The training optimizer
-        """
-
-        # fmt: off
-        if self.lr_scheduler is None:  # type: ignore  # pylint: disable=access-member-before-definition
-            # fmt: on
-            if (
-                self.args.lr_scheduler_type == "cosine"
-                and self.args.lr_quadratic_warmup is True
-            ):
-                self.lr_scheduler = get_cosine_schedule_with_quadratic_warmup(  # pylint: disable=attribute-defined-outside-init
-                    optimizer,
-                    num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
-                    num_training_steps=num_training_steps,
-                )
-            else:
-                return super().create_scheduler(num_training_steps, optimizer)
-        return self.lr_scheduler
-
-    def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]:
-        lengths = np.array([len(sample["input_ids"]) for sample in self.train_dataset])
-        return MultipackDistributedBatchSampler(
-            batch_max_length=self.args.per_device_train_batch_size
-            * self.args.max_seq_length,
-            lengths=lengths,
-            seed=self.args.seed,
-        )
-
-    def _get_eval_sampler(
-        self, eval_dataset: Dataset
-    ) -> Optional[torch.utils.data.Sampler]:
-        lengths = np.array([len(sample["input_ids"]) for sample in eval_dataset])
-        return MultipackDistributedBatchSampler(
-            batch_max_length=self.args.per_device_eval_batch_size
-            * self.args.max_seq_length,
-            lengths=lengths,
-            seed=self.args.seed,
-        )
-
-
-class OneCycleLRSchedulerTrainer(AxolotlTrainer):
+class OneCycleLRSchedulerTrainer(Trainer):
    """
    Trainer subclass that uses the OneCycleLR scheduler
    """
@@ -263,9 +100,6 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
        if cfg.fsdp_config:
            training_arguments_kwargs["fsdp_config"] = dict(cfg.fsdp_config)

-    if cfg.lr_quadratic_warmup is not None:
-        training_arguments_kwargs["lr_quadratic_warmup"] = cfg.lr_quadratic_warmup
-
    # deepspeed
    if (
        os.environ.get("ACCELERATE_USE_DEEPSPEED") == "true"
@@ -278,25 +112,7 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
            # TODO search Path("./") for one
            training_arguments_kwargs["deepspeed"] = "./ds_config.json"

-    if cfg.adam_beta1:
-        training_arguments_kwargs["adam_beta1"] = cfg.adam_beta1
-    if cfg.adam_beta2:
-        training_arguments_kwargs["adam_beta2"] = cfg.adam_beta2
-    if cfg.adam_epsilon:
-        training_arguments_kwargs["adam_epsilon"] = cfg.adam_epsilon
-    if cfg.max_grad_norm:
-        training_arguments_kwargs["max_grad_norm"] = cfg.max_grad_norm
-
-    if cfg.hub_model_id:
-        training_arguments_kwargs["hub_model_id"] = cfg.hub_model_id
-        training_arguments_kwargs["push_to_hub"] = True
-        training_arguments_kwargs["hub_private_repo"] = True
-
-    if cfg.save_safetensors:
-        training_arguments_kwargs["save_safetensors"] = cfg.save_safetensors
-
-    training_args = AxolotlTrainingArguments(  # pylint: disable=unexpected-keyword-arg
-        max_steps=total_num_steps * cfg.num_epochs,
+    training_args = transformers.TrainingArguments(
        per_device_train_batch_size=cfg.micro_batch_size,
        per_device_eval_batch_size=cfg.eval_batch_size
        if cfg.eval_batch_size is not None
@@ -412,9 +228,6 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
    ]:  # only save in rank 0
        callbacks.append(SavePeftModelCallback)

-    if hasattr(model, "use_bettertransformer") and model.use_bettertransformer is True:
-        callbacks.append(SaveBetterTransformerModelCallback)
-
    data_collator_kwargs = {
        "padding": True,
    }
@@ -446,7 +259,7 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
    trainer_cls = (
        OneCycleLRSchedulerTrainer
        if cfg.lr_scheduler == "one_cycle" and (cfg.fsdp or cfg.adapter == "qlora")
-        else AxolotlTrainer
+        else transformers.Trainer
    )
    trainer = trainer_cls(
        model=model,
--- a/src/axolotl/utils/validation.py
+++ b/src/axolotl/utils/validation.py
@@ -2,8 +2,6 @@

 import logging

-import torch
-

 def validate_config(cfg):
    if cfg.gradient_accumulation_steps and cfg.batch_size:
@@ -64,47 +62,7 @@ def validate_config(cfg):
    ) and cfg.gradient_checkpointing:
        raise ValueError("gradient_checkpointing is not supported for MPT models")

-    if cfg.flash_optimum is True:
-        if cfg.adapter:
-            logging.warning(
-                "BetterTransformers probably doesn't work with PEFT adapters"
-            )
-        if cfg.fp16 or cfg.bf16:
-            raise ValueError("AMP is not supported with BetterTransformer")
-        if cfg.float16 is not True and cfg.bloat16 is not True:
-            logging.warning(
-                "You should probably set bfloat16 or float16 to true to "
-                "load the model in float16 for BetterTransformers"
-            )
-        if int(torch.__version__.split(".")[0]) < 2:
-            logging.warning("torch>=2.0.0 required")
-            raise ValueError(
-                f"flash_optimum for BetterTransformers may not be used with {torch.__version__}"
-            )
-
-    if cfg.pretraining_dataset and cfg.group_by_length:
-        logging.warning(
-            "You probably want to disable group_by_length as it will force a streamed dataset to download completely."
-        )
-
-    if any([cfg.adam_beta1, cfg.adam_beta2, cfg.adam_epsilon]) and (
-        not cfg.optimizer or "adamw" not in cfg.optimizer
-    ):
-        logging.warning("adamw hyperparameters found, but no adamw optimizer set")
-
-    if cfg.push_to_hub_model_id:
-        raise ValueError(
-            "push_to_hub_model_id is deprecated. Please use hub_model_id instead."
-        )
-
    # TODO
    # MPT 7b
    # https://github.com/facebookresearch/bitsandbytes/issues/25
-    # no 8bit adaAmw w bf16
-
-    # GPT-NeoX
-    # evals broken when extending context len
-    # File "/root/miniconda3/envs/py3.9/lib/python3.9/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py", line 162, in forward                        attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)
-    # File "/root/miniconda3/envs/py3.9/lib/python3.9/site-packages/optimum/bettertransformer/models/attention.py", line 74, in gpt2_wrapped_scaled_dot_product
-    # attention_mask = causal_mask + attention_mask
-    # RuntimeError: The size of tensor a (2048) must match the size of tensor b (8132) at non-singleton dimension 3
+    # no 8bit adamw w bf16
--- a/tests/test_packed_dataset.py
+++ b/tests/test_packed_dataset.py
@@ -11,7 +11,57 @@ from axolotl.prompt_tokenizers import AlpacaPromptTokenizingStrategy
 from axolotl.prompters import AlpacaPrompter


-class TestPacking(unittest.TestCase):
+class TestGpt2Packing(unittest.TestCase):
+    """
+    Test class for packing dataset sequences
+    """
+
+    def setUp(self) -> None:
+        # pylint: disable=duplicate-code
+        self.tokenizer = AutoTokenizer.from_pretrained("gpt2")
+        self.tokenizer.add_special_tokens(
+            {
+                "bos_token": "<|endoftext|>",
+                "eos_token": "<|endoftext|>",
+                "unk_token": "<|endoftext|>",
+            }
+        )
+        self.tokenizer.bos_token_id = 0
+        self.tokenizer.eos_token_id = 0
+        self.tokenizer.unk_token_id = 0
+
+    def test_resets_attention(self):
+        prompter = AlpacaPrompter("chat")
+        strat = AlpacaPromptTokenizingStrategy(
+            prompter,
+            self.tokenizer,
+            False,
+            2048,
+        )
+        dateset = load_dataset(
+            "json",
+            data_files=str(Path(__file__).parent / "fixtures/alpaca/alpaca.json"),
+        )["train"]
+        dataset = Dataset.from_list(list(TokenizedPromptDataset(strat, dateset)))
+
+        constant_len_dataset = ConstantLengthDataset(
+            self.tokenizer,
+            [dataset],
+            seq_length=2048,
+        )
+        packed_dataset = Dataset.from_list(list(constant_len_dataset))
+
+        example = packed_dataset[0]
+        # tokenizers where eos and bos tokens are the same, don't have a bos token
+        next_eos_index = (
+            example["input_ids"][1:].index(self.tokenizer.eos_token_id) + 1
+        )  # add one since we sliced
+
+        assert example["input_ids"][next_eos_index] == self.tokenizer.eos_token_id
+        assert example["attention_mask"][next_eos_index + 1] == 0
+
+
+class TestLlamaPacking(unittest.TestCase):
    """
    Test class for packing dataset sequences
    """
--- a/tests/test_prompt_tokenizers.py
+++ b/tests/test_prompt_tokenizers.py
@@ -6,16 +6,8 @@ from pathlib import Path

 from transformers import AutoTokenizer

-from axolotl.prompt_strategies.alpaca_chat import NoSystemPrompter
-from axolotl.prompt_strategies.alpaca_w_system import (
-    InstructionWSystemPromptTokenizingStrategy,
-    SystemDataPrompter,
-)
-from axolotl.prompt_tokenizers import (
-    AlpacaPromptTokenizingStrategy,
-    ShareGPTPromptTokenizingStrategy,
-)
-from axolotl.prompters import AlpacaPrompter, PromptStyle, ShareGPTPrompter
+from axolotl.prompt_tokenizers import ShareGPTPromptTokenizingStrategy
+from axolotl.prompters import ShareGPTPrompter

 logging.basicConfig(level="INFO")

@@ -37,6 +29,7 @@ class TestPromptTokenizationStrategies(unittest.TestCase):
        )

    def test_sharegpt_integration(self):
+        print(Path(__file__).parent)
        with open(
            Path(__file__).parent / "fixtures/conversation.json", encoding="utf-8"
        ) as fin:
@@ -60,79 +53,6 @@ class TestPromptTokenizationStrategies(unittest.TestCase):
            self.assertEqual(len(example[fields]), len(tokenized_conversation[fields]))
            self.assertEqual(example[fields], tokenized_conversation[fields])

-    def test_no_sys_prompt(self):
-        """
-        tests the interface between the user and assistant parts
-        """
-        prompter = NoSystemPrompter()
-        # pylint: disable=duplicate-code
-        strat = AlpacaPromptTokenizingStrategy(
-            prompter,
-            self.tokenizer,
-            False,
-            2048,
-        )
-        sample = {
-            "instruction": "hello cruel. lorem ipsum dolor sit amet.",
-            "output": "world!",
-        }
-        example = strat.tokenize_prompt(sample)
-        world_idx = example["input_ids"].index(3186)
-        assert example["labels"][world_idx] == 3186
-        assert example["labels"][world_idx - 1] == -100
-
-    def test_alpaca(self):
-        """
-        tests the interface between the user and assistant parts
-        """
-        # pylint: disable=duplicate-code
-        prompter = AlpacaPrompter()
-        strat = AlpacaPromptTokenizingStrategy(
-            prompter,
-            self.tokenizer,
-            False,
-            2048,
-        )
-        sample = {"instruction": "hello!", "output": "Hi! How can I help?"}
-        example = strat.tokenize_prompt(sample)
-        world_idx = example["input_ids"].index(6324)
-        assert example["labels"][world_idx] == 6324
-        assert example["labels"][world_idx - 1] == -100
-
-
-class InstructionWSystemPromptTokenizingStrategyTest(unittest.TestCase):
-    """
-    Test class for prompt tokenization strategies with sys prompt from the dataset
-    """
-
-    def setUp(self) -> None:
-        # pylint: disable=duplicate-code
-        self.tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
-        self.tokenizer.add_special_tokens(
-            {
-                "bos_token": "<s>",
-                "eos_token": "</s>",
-                "unk_token": "<unk>",
-            }
-        )
-
-    def test_system_alpaca(self):
-        prompter = SystemDataPrompter(PromptStyle.CHAT.value)
-        strat = InstructionWSystemPromptTokenizingStrategy(
-            prompter,
-            self.tokenizer,
-            False,
-            2048,
-        )
-        sample = {
-            "system": "use cot",
-            "instruction": "hello!",
-            "output": "Hi! How can I help?",
-        }
-        example = strat.tokenize_prompt(sample)
-        assert example["input_ids"][0:3] == [1, 671, 20118]  # <s>use cot
-        assert example["input_ids"][3] == 11889  # USER
-

 if __name__ == "__main__":
    unittest.main()
--- a/tests/test_prompters.py
+++ b/tests/test_prompters.py
@@ -2,13 +2,7 @@

 import unittest

-from axolotl.prompt_strategies.alpaca_w_system import SystemDataPrompter
-from axolotl.prompters import (
-    AlpacaPrompter,
-    MultipleChoiceExplainPrompter,
-    PromptStyle,
-    UnpromptedPrompter,
-)
+from axolotl.prompters import AlpacaPrompter, PromptStyle


 class AlpacaPrompterTest(unittest.TestCase):
@@ -61,64 +55,3 @@ class AlpacaPrompterTest(unittest.TestCase):
        assert "### Response:" not in res
        assert "USER:" in res
        assert "ASSISTANT:" in res
-
-    def test_system_prompt(self):
-        prompter = SystemDataPrompter(prompt_style=PromptStyle.CHAT.value)
-        res = next(
-            prompter.build_prompt_w_system(
-                "use cot", "tell me a joke about the following", "alpacas"
-            )
-        )
-        assert "use cot" in res
-        assert res.startswith("use cot")
-        assert "### Instruction:" not in res
-        assert "### Input:" not in res
-        assert "alpacas" in res
-        assert "### Response:" not in res
-        assert "USER:" in res
-        assert "ASSISTANT:" in res
-
-
-class UnpromptedPrompterTest(unittest.TestCase):
-    """
-    Test class for UnpromptedPrompter with no system prompts
-    """
-
-    def test_prompt_style_w_none(self):
-        prompter = UnpromptedPrompter(prompt_style=None)
-        res = next(prompter.build_prompt("tell me a joke"))
-        assert "### Instruction:" in res
-        assert "tell me a joke" in res
-        assert res.startswith("###")
-
-    def test_prompt_style_w_instruct(self):
-        prompter = UnpromptedPrompter(prompt_style=PromptStyle.INSTRUCT.value)
-        res = next(
-            prompter.build_prompt("tell me a joke about the following", "alpacas")
-        )
-        assert "### Instruction:" in res
-        assert "tell me a joke" in res
-        assert res.startswith("###")
-
-    def test_prompt_style_w_chat(self):
-        prompter = UnpromptedPrompter(prompt_style=PromptStyle.CHAT.value)
-        res = next(
-            prompter.build_prompt("tell me a joke about the following", "alpacas")
-        )
-        assert "USER:" in res
-        assert "tell me a joke" in res
-        assert res.startswith("USER:")
-
-
-class MultipleChoiceExplainPrompterTest(unittest.TestCase):
-    """
-    Test class for MultipleChoiceExplainPrompter
-    """
-
-    def test_prompt_style_w_chat(self):
-        prompter = MultipleChoiceExplainPrompter(prompt_style=PromptStyle.CHAT.value)
-        res = next(prompter.build_prompt("choose one", "- A\n- B\n- C", "C"))
-        assert "USER:" in res
-        assert "choose one" in res
-        assert "Choose the answer that best answers the question." in res
-        assert "- A\n- B\n- C" in res
--- a/tests/test_tokenizers.py
+++ b/tests/test_tokenizers.py
@@ -1,31 +0,0 @@
-"""
-Test cases for the tokenizer loading
-"""
-import unittest
-
-from axolotl.utils.dict import DictDefault
-from axolotl.utils.models import load_tokenizer
-
-
-class TestTokenizers(unittest.TestCase):
-    """
-    test class for the load_tokenizer fn
-    """
-
-    def test_default_use_fast(self):
-        cfg = DictDefault({})
-        tokenizer = load_tokenizer("huggyllama/llama-7b", None, cfg)
-        assert "Fast" in tokenizer.__class__.__name__
-
-    def test_dont_use_fast(self):
-        cfg = DictDefault(
-            {
-                "tokenizer_use_fast": False,
-            }
-        )
-        tokenizer = load_tokenizer("huggyllama/llama-7b", None, cfg)
-        assert "Fast" not in tokenizer.__class__.__name__
-
-
-if __name__ == "__main__":
-    unittest.main()
--- a/tests/test_validation.py
+++ b/tests/test_validation.py
@@ -212,104 +212,3 @@ class ValidationTest(unittest.TestCase):

        with pytest.raises(ValueError, match=regex_exp):
            validate_config(cfg)
-
-    def test_flash_optimum(self):
-        cfg = DictDefault(
-            {
-                "flash_optimum": True,
-                "adapter": "lora",
-            }
-        )
-
-        with self._caplog.at_level(logging.WARNING):
-            validate_config(cfg)
-            assert any(
-                "BetterTransformers probably doesn't work with PEFT adapters"
-                in record.message
-                for record in self._caplog.records
-            )
-
-        cfg = DictDefault(
-            {
-                "flash_optimum": True,
-            }
-        )
-
-        with self._caplog.at_level(logging.WARNING):
-            validate_config(cfg)
-            assert any(
-                "probably set bfloat16 or float16" in record.message
-                for record in self._caplog.records
-            )
-
-        cfg = DictDefault(
-            {
-                "flash_optimum": True,
-                "fp16": True,
-            }
-        )
-        regex_exp = r".*AMP is not supported.*"
-
-        with pytest.raises(ValueError, match=regex_exp):
-            validate_config(cfg)
-
-        cfg = DictDefault(
-            {
-                "flash_optimum": True,
-                "bf16": True,
-            }
-        )
-        regex_exp = r".*AMP is not supported.*"
-
-        with pytest.raises(ValueError, match=regex_exp):
-            validate_config(cfg)
-
-    def test_adamw_hyperparams(self):
-        cfg = DictDefault(
-            {
-                "optimizer": None,
-                "adam_epsilon": 0.0001,
-            }
-        )
-
-        with self._caplog.at_level(logging.WARNING):
-            validate_config(cfg)
-            assert any(
-                "adamw hyperparameters found, but no adamw optimizer set"
-                in record.message
-                for record in self._caplog.records
-            )
-
-        cfg = DictDefault(
-            {
-                "optimizer": "adafactor",
-                "adam_beta1": 0.0001,
-            }
-        )
-
-        with self._caplog.at_level(logging.WARNING):
-            validate_config(cfg)
-            assert any(
-                "adamw hyperparameters found, but no adamw optimizer set"
-                in record.message
-                for record in self._caplog.records
-            )
-
-        cfg = DictDefault(
-            {
-                "optimizer": "adamw_bnb_8bit",
-                "adam_beta1": 0.9,
-                "adam_beta2": 0.99,
-                "adam_epsilon": 0.0001,
-            }
-        )
-
-        validate_config(cfg)
-
-        cfg = DictDefault(
-            {
-                "optimizer": "adafactor",
-            }
-        )
-
-        validate_config(cfg)
Author	SHA1	Message	Date
Wing Lian	05d19d2037	remove debugging, use gpt2 since starcoder requires consent (CI: some checks failed — pre-commit / pre-commit (push): cancelled; PyTest / test (3.10) (push): cancelled; PyTest / test (3.9) (push): cancelled)	2023-06-13 21:32:47 -04:00
Wing Lian	61f44f311e	fix packing for tokenizers that don't use a bos_token when the bos token and eos token are both the same	2023-06-13 21:26:13 -04:00