better handling for tokenizers like flan that don't have a bos token

2023-06-23 15:47:40 -04:00
21 changed files with 93 additions and 527 deletions
--- a/.github/workflows/base.yml
+++ b/.github/workflows/base.yml
@@ -26,7 +26,7 @@ jobs:
            pytorch: 2.0.0
            axolotl_extras:
          - cuda: "117"
-            cuda_version: 11.7.1
+            cuda_version: 11.7.0
            python_version: "3.9"
            pytorch: 1.13.1
            axolotl_extras:
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -30,7 +30,7 @@ jobs:
            pytorch: 2.0.0
            axolotl_extras: gptq
          - cuda: cu117
-            cuda_version: 11.7.1
+            cuda_version: 11.7.0
            python_version: "3.9"
            pytorch: 1.13.1
            axolotl_extras:
@@ -85,7 +85,7 @@ jobs:
            pytorch: 2.0.0
            axolotl_extras: gptq
          - cuda: cu117
-            cuda_version: 11.7.1
+            cuda_version: 11.7.0
            python_version: "3.9"
            pytorch: 1.13.1
            axolotl_extras:
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,5 +1,5 @@
 default_language_version:
-    python: python3
+    python: python3.9

 repos:
 -   repo: https://github.com/pre-commit/pre-commit-hooks
--- a/README.md
+++ b/README.md
@@ -195,10 +195,6 @@ Have dataset(s) in one of the following format (JSONL recommended):
  ```json
  {"message_1": "...", "message_2": "..."}
  ```
- `alpaca_w_system.load_open_orca`: support for open orca datasets with included system prompts, instruct
-  ```json
-  {"system_prompt": "...", "question": "...", "response": "..."}
-  ```
 - `context_qa`: in context question answering from an article
  ```json
  {"article": "...", "question": "...", "answer": "..."}
@@ -237,7 +233,7 @@ Have dataset(s) in one of the following format (JSONL recommended):
 #### How to add custom prompts

  1. Add your method to a file in [prompt_strategies](src/axolotl/prompt_strategies). Please see other files as example.
-  2. Use your custom file name as the dataset type `<prompt_strategies_file>.load_<load_fn>`.
+  2. Use your custom file name as the dataset type.

 Optionally, download some datasets, see [data/README.md](data/README.md)

@@ -255,18 +251,10 @@ See sample configs in [configs](configs) folder or [examples](examples) for quic

 - dataset
  ```yaml
-  sequence_len: 2048 # max token length for prompt
-
-  # huggingface repo
  datasets:
-    - path: vicgalle/alpaca-gpt4
-      type: alpaca # format from earlier
-
-  # local
-  datasets:
-    - path: json
-      data_files: data.jsonl # or json
+    - path: vicgalle/alpaca-gpt4 # local or huggingface repo
      type: alpaca # format from earlier
+  sequence_len: 2048 # max token length for prompt
  ```

 - loading
@@ -314,8 +302,6 @@ model_type: AutoModelForCausalLM
 tokenizer_type: AutoTokenizer
 # Trust remote code for untrusted source
 trust_remote_code:
-# use_fast option for tokenizer loading from_pretrained, default to True
-tokenizer_use_fast:

 # whether you are training a 4-bit GPTQ quantized model
 gptq: true
@@ -336,10 +322,10 @@ tf32: true # require >=ampere

 # a list of one or more datasets to finetune the model with
 datasets:
-  # hf dataset repo | "json" for local dataset, make sure to fill data_files
+  # this can be either a Hugging Face dataset repo or a relative path to a local file
  - path: vicgalle/alpaca-gpt4
  # The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
-    type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
+    type: alpaca # format OR format:prompt_style (chat/instruct)
    data_files: # path to source data files
    shards: # number of shards to split data into

@@ -348,8 +334,6 @@ datasets:
 dataset_prepared_path: data/last_run_prepared
 # push prepared dataset to hub
 push_dataset_to_hub: # repo path
-# push checkpoints to hub
-hub_model_id: # repo path
 # whether to use hf `use_auth_token` for loading datasets. Useful for fetching private datasets
 # required to be true when used in combination with `push_dataset_to_hub`
 hf_use_auth_token: # boolean
--- a/docker/Dockerfile-base
+++ b/docker/Dockerfile-base
@@ -77,7 +77,7 @@ FROM base-builder
 RUN python3 -m pip uninstall -y apex
 RUN git clone https://github.com/NVIDIA/apex
 #  `MAX_JOBS=1` disables parallel building to avoid cpu memory OOM when building image on GitHub Action (standard) runners
-RUN cd apex && MAX_JOBS=1 python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
+RUN cd apex && MAX_JOBS=1 python3 -m pip install --global-option="--cpp_ext" --global-option="--cuda_ext" --no-cache -v --disable-pip-version-check .

 RUN mkdir -p /workspace/builds
 COPY --from=bnb-builder /workspace/bitsandbytes /workspace/builds/bitsandbytes
@@ -97,4 +97,4 @@ RUN cd /workspace/builds/bitsandbytes && python3 setup.py install
 RUN git lfs install --skip-repo
 RUN pip3 install awscli && \
    # The base image ships with `pydantic==1.8.2` which is not working
-    pip3 install -U --no-cache-dir pydantic==1.10.10
+    pip3 install -U --no-cache-dir pydantic
--- a/src/axolotl/datasets.py
+++ b/src/axolotl/datasets.py
@@ -126,7 +126,6 @@ class ConstantLengthDataset(IterableDataset):
                    buffer_len = 0

                if example:
-                    # FIXME
                    # just going to drop data points that are too long
                    if len(example["input_ids"]) <= self.seq_length:
                        input_ids = example["input_ids"]
--- a/src/axolotl/prompt_strategies/alpaca_chat.py
+++ b/src/axolotl/prompt_strategies/alpaca_chat.py
@@ -6,7 +6,7 @@ from axolotl.prompt_tokenizers import (
    AlpacaPromptTokenizingStrategy,
    InstructionPromptTokenizingStrategy,
 )
-from axolotl.prompters import AlpacaPrompter, PromptStyle, UnpromptedPrompter
+from axolotl.prompters import AlpacaPrompter, PromptStyle


 def load(tokenizer, cfg):
@@ -45,10 +45,8 @@ class NoSystemPrompter(AlpacaPrompter):
    Null Prompter with no system prompts
    """

-    system_prompt = ""
-    system_no_input_prompt = ""
-    turn_format = "{instruction} {input} "
-    turn_no_input_format = "{instruction} "
+    prompt_input = "{instruction} {input} "
+    prompt_no_input = "{instruction} "

    def __init__(self):  # pylint: disable=super-init-not-called
        pass
@@ -105,12 +103,3 @@ def load_camel_ai(tokenizer, cfg):
        cfg.train_on_inputs,
        cfg.sequence_len,
    )
-
-
-def load_no_prompt(tokenizer, cfg):
-    return AlpacaPromptTokenizingStrategy(
-        UnpromptedPrompter(PromptStyle.CHAT.value),
-        tokenizer,
-        cfg.train_on_inputs,
-        cfg.sequence_len,
-    )
--- a/src/axolotl/prompt_strategies/alpaca_instruct.py
+++ b/src/axolotl/prompt_strategies/alpaca_instruct.py
@@ -1,7 +1,7 @@
 """Module loading the AlpacaInstructPromptTokenizingStrategy class"""

 from axolotl.prompt_tokenizers import AlpacaPromptTokenizingStrategy
-from axolotl.prompters import AlpacaPrompter, PromptStyle, UnpromptedPrompter
+from axolotl.prompters import AlpacaPrompter, PromptStyle


 def load(tokenizer, cfg):
@@ -11,12 +11,3 @@ def load(tokenizer, cfg):
        cfg.train_on_inputs,
        cfg.sequence_len,
    )
-
-
-def load_no_prompt(tokenizer, cfg):
-    return AlpacaPromptTokenizingStrategy(
-        UnpromptedPrompter(PromptStyle.INSTRUCT.value),
-        tokenizer,
-        cfg.train_on_inputs,
-        cfg.sequence_len,
-    )
--- a/src/axolotl/prompt_strategies/alpaca_w_system.py
+++ b/src/axolotl/prompt_strategies/alpaca_w_system.py
@@ -1,120 +0,0 @@
-"""
-Prompt strategies loader for alpaca instruction datasets with system prompts
-"""
-from typing import Generator, Tuple, Union
-
-from axolotl.prompt_tokenizers import PromptTokenizingStrategy
-from axolotl.prompters import AlpacaPrompter, PromptStyle
-
-
-class InstructionWSystemPromptTokenizingStrategy(PromptTokenizingStrategy):
-    """
-    Tokenizing strategy for instruction-based prompts.
-    """
-
-    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str, str]:
-        return (
-            prompt["instruction"],
-            prompt["input"] if "input" in prompt else "",
-            prompt["output"],
-            prompt["system"],
-        )
-
-    def tokenize_prompt(self, prompt):
-        # pylint: disable=duplicate-code
-        (
-            instruction,
-            input,  # pylint: disable=redefined-builtin
-            response,
-            system,
-        ) = self.parse_instruction_fields(prompt)
-        user_prompt = next(
-            iter(
-                self.prompter.build_prompt_w_system(
-                    system,
-                    instruction,
-                    input,
-                )
-            )
-        )
-        tokenized_prompt = self._tokenize(user_prompt, add_eos_token=False)
-        if not self.train_on_inputs:
-            user_prompt_len = len(tokenized_prompt["input_ids"])
-            # TODO this could be sped up using numpy array slicing
-            tokenized_prompt["labels"] = [-100] * user_prompt_len
-        tokenized_res_prompt = self._tokenize(
-            response, strip_bos_token=True, add_eos_token=True
-        )
-        tokenized_prompt["input_ids"] += tokenized_res_prompt["input_ids"]
-        tokenized_prompt["attention_mask"] += tokenized_res_prompt["attention_mask"]
-        tokenized_prompt["labels"] += tokenized_res_prompt["input_ids"]
-
-        return tokenized_prompt
-
-
-class SystemDataPrompter(AlpacaPrompter):
-    """
-    Alpaca Style Prompter that uses system prompts from the dataset
-    """
-
-    def build_prompt_w_system(
-        self,
-        system: str,
-        instruction: str,
-        input: Union[None, str] = None,  # pylint: disable=redefined-builtin
-        output: Union[None, str] = None,
-    ) -> Generator[str, None, None]:
-        # returns the full prompt from instruction and optional input
-        # if a label (=response, =output) is provided, it's also appended.
-        if input:
-            res = system + self.turn_format.format(instruction=instruction, input=input)
-        else:
-            res = system + self.turn_no_input_format.format(instruction=instruction)
-        if output:
-            res = f"{res}{output}"
-        yield res
-
-
-class OpenOrcaPromptTokenizingStrategy(InstructionWSystemPromptTokenizingStrategy):
-    """
-    Tokenizing strategy for OpenOrca datasets
-    """
-
-    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str, str]:
-        return (
-            prompt["question"],
-            "",
-            prompt["response"],
-            prompt["system_prompt"],
-        )
-
-
-def load(tokenizer, cfg):
-    return load_chat(tokenizer, cfg)
-
-
-def load_instruct(tokenizer, cfg):
-    return InstructionWSystemPromptTokenizingStrategy(
-        SystemDataPrompter(PromptStyle.INSTRUCT.value),
-        tokenizer,
-        cfg.train_on_inputs,
-        cfg.sequence_len,
-    )
-
-
-def load_chat(tokenizer, cfg):
-    return InstructionWSystemPromptTokenizingStrategy(
-        SystemDataPrompter(PromptStyle.CHAT.value),
-        tokenizer,
-        cfg.train_on_inputs,
-        cfg.sequence_len,
-    )
-
-
-def load_open_orca(tokenizer, cfg):
-    return OpenOrcaPromptTokenizingStrategy(
-        SystemDataPrompter(PromptStyle.INSTRUCT.value),
-        tokenizer,
-        cfg.train_on_inputs,
-        cfg.sequence_len,
-    )
--- a/src/axolotl/prompt_tokenizers.py
+++ b/src/axolotl/prompt_tokenizers.py
@@ -73,8 +73,17 @@ class PromptTokenizingStrategy(abc.ABC):
        ):
            result["input_ids"].append(self.tokenizer.eos_token_id)
            result["attention_mask"].append(1)
+        elif (  # some tokenizers automatically add an eos token, let's remove it
+            not add_eos_token and result["input_ids"][-1] == self.tokenizer.eos_token_id
+        ):
+            result["input_ids"] = result["input_ids"][:-1]
+            result["attention_mask"] = result["attention_mask"][:-1]

-        if result["input_ids"][0] == self.tokenizer.bos_token_id and strip_bos_token:
+        if (
+            self.tokenizer.bos_token_id
+            and result["input_ids"][0] == self.tokenizer.bos_token_id
+            and strip_bos_token
+        ):
            result["input_ids"] = result["input_ids"][1:]
            result["attention_mask"] = result["attention_mask"][1:]

@@ -87,9 +96,7 @@ class InstructionPromptTokenizingStrategy(PromptTokenizingStrategy):
    Tokenizing strategy for instruction-based prompts.
    """

-    def parse_instruction_fields(
-        self, prompt
-    ) -> Union[Tuple[str, str, str], Tuple[str, str, str, str]]:
+    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]:
        raise NotImplementedError

    def tokenize_prompt(self, prompt):
@@ -414,7 +421,11 @@ class ShareGPTPromptTokenizingStrategy(PromptTokenizingStrategy):
            result["input_ids"].append(self.tokenizer.eos_token_id)
            result["attention_mask"].append(1)

-        if result["input_ids"][0] == self.tokenizer.bos_token_id and strip_bos_token:
+        if (
+            self.tokenizer.bos_token_id
+            and result["input_ids"][0] == self.tokenizer.bos_token_id
+            and strip_bos_token
+        ):
            result["input_ids"] = result["input_ids"][1:]
            result["attention_mask"] = result["attention_mask"][1:]

@@ -440,7 +451,7 @@ def parse_tokenized_to_result(
    result: Dict[str, List[int]],
    current_len: int,
    res: Dict[str, List[int]],
-    labels: List[int],
+    labels: list[int],
    pad_token_id: Union[int, None] = None,
 ) -> Tuple[Dict[str, List[int]], int]:
    """
--- a/src/axolotl/prompters.py
+++ b/src/axolotl/prompters.py
@@ -24,8 +24,6 @@ class AlpacaPrompter:

    system_prompt = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n"
    system_no_input_prompt = "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n"
-    turn_format: str
-    turn_no_input_format: str
    prompt_style: Optional[PromptStyle] = None

    def __init__(self, prompt_style=PromptStyle.INSTRUCT.value):
@@ -34,13 +32,23 @@ class AlpacaPrompter:

    def match_prompt_style(self):
        if self.prompt_style == PromptStyle.INSTRUCT.value:
-            self.turn_format = "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"
-            self.turn_no_input_format = (
-                "### Instruction:\n{instruction}\n\n### Response:\n"
+            self.prompt_input = (
+                self.system_prompt
+                + "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"
            )
+            self.prompt_no_input = (
+                self.system_no_input_prompt
+                + "### Instruction:\n{instruction}\n\n### Response:\n"
+            )
+            self.response_split = "### Response:"
        if self.prompt_style == PromptStyle.CHAT.value:
-            self.turn_format = "USER: {instruction}\n{input}\nASSISTANT:"
-            self.turn_no_input_format = "USER: {instruction}\nASSISTANT:"
+            self.prompt_input = (
+                self.system_prompt + "USER: {instruction}\n{input}\nASSISTANT:"
+            )
+            self.prompt_no_input = (
+                self.system_no_input_prompt + "USER: {instruction}\nASSISTANT:"
+            )
+            self.response_split = "ASSISTANT:"

    def build_prompt(
        self,
@@ -51,17 +59,16 @@ class AlpacaPrompter:
        # returns the full prompt from instruction and optional input
        # if a label (=response, =output) is provided, it's also appended.
        if input:
-            res = self.system_prompt + self.turn_format.format(
-                instruction=instruction, input=input
-            )
+            res = self.prompt_input.format(instruction=instruction, input=input)
        else:
-            res = self.system_no_input_prompt + self.turn_no_input_format.format(
-                instruction=instruction
-            )
+            res = self.prompt_no_input.format(instruction=instruction)
        if output:
            res = f"{res}{output}"
        yield res

+    def get_response(self, output: str) -> str:
+        return output.split(self.response_split)[1].strip()
+

 class UnpromptedPrompter(AlpacaPrompter):
    """
@@ -86,10 +93,7 @@ class MultipleChoiceExplainPrompter(AlpacaPrompter):
    """

    system_prompt = (
-        "Choose the answer that best answers the question. Explain your reasoning.\n"
-    )
-    system_no_input_prompt = (
-        "Choose the answer that best answers the question. Explain your reasoning.\n"
+        "Choose the answer that best answers the question. Explain your reasoning."
    )


@@ -98,12 +102,7 @@ class MultipleChoiceConcisePrompter(AlpacaPrompter):
    Prompter for multiple choice concise
    """

-    system_prompt = "Choose the answer that best answers the question. Be concise in your response.\n\n"
-    system_no_input_prompt = "Choose the answer that best answers the question. Be concise in your response.\n\n"
-
-    def match_prompt_style(self):
-        self.turn_format = "USER: {instruction}\n{input}\nASSISTANT:"
-        self.turn_no_input_format = "USER: {instruction}\nASSISTANT:"
+    prompt_input = "Choose the answer that best answers the question. Be concise in your response.\n\nUSER: {instruction}\n{input}\nASSISTANT:\n"


 class SummarizeTLDRPrompter(AlpacaPrompter):
@@ -111,12 +110,9 @@ class SummarizeTLDRPrompter(AlpacaPrompter):
    Prompter for summarize TLDR
    """

-    system_prompt = ""
-    system_no_input_prompt = ""
-
-    def match_prompt_style(self):
-        self.turn_format = "USER: Summarize the following article as a TL;DR.\n{instruction}\n{input}\nASSISTANT:"
-        self.turn_no_input_format = "USER: Summarize the following article as a TL;DR.\n{instruction}\nASSISTANT:"
+    prompt_no_input = (
+        "USER: Summarize the following article as a TL;DR.\n{instruction}\nASSISTANT:"
+    )


 class CompletionPrompter:
@@ -132,6 +128,9 @@ class CompletionPrompter:
    ) -> Generator[str, None, None]:
        yield instruction

+    def get_response(self, output: str) -> str:
+        return output.strip()
+

 class GPTeacherPrompter(AlpacaPrompter):
    """
@@ -211,6 +210,9 @@ class ReflectAlpacaPrompter:
            res = f"{res}{label}"
        yield res

+    def get_response(self, output: str) -> str:
+        return output.split(self.response_split)[1].strip()
+

 class SeparatorStyle(Enum):
    """Different separator style."""
@@ -287,6 +289,12 @@ class ShareGPTPrompter:  # pylint: disable=too-few-public-methods
            sep2=" ",
        )

+    # def match_prompt_style(self):
+    #     if self.prompt_style == PromptStyle.chat.value:
+    #         self.prompt_input = self.system_prompt + "USER: {instruction}\n{input}\nASSISTANT:"
+    #         self.prompt_no_input = self.system_no_input_prompt + "USER: {instruction}\nASSISTANT:"
+    #         self.response_split = "ASSISTANT:"
+
    def build_prompt(self, source) -> Generator[str, None, None]:
        # ignore the system prompt if provided
        if source[0]["from"] == "system":
--- a/src/axolotl/utils/data.py
+++ b/src/axolotl/utils/data.py
@@ -102,26 +102,13 @@ def load_tokenized_prepared_datasets(
                pass

            # prefer local dataset, even if hub exists
-            local_path = Path(d.path)
-            if local_path.exists():
-                if local_path.is_dir():
-                    ds = load_dataset(
-                        d.path,
-                        data_files=d.data_files,
-                        streaming=False,
-                        split=None,
-                    )
-                elif local_path.is_file():
-                    ds = load_dataset(
-                        "json",
-                        data_files=d.path,
-                        streaming=False,
-                        split=None,
-                    )
-                else:
-                    raise ValueError(
-                        "unhandled dataset load: local path exists, but is neither a directory or a file"
-                    )
+            if Path(d.path).exists():
+                ds = load_dataset(
+                    "json",
+                    data_files=d.path,
+                    streaming=False,
+                    split=None,
+                )
            elif ds_from_hub:
                if d.data_files:
                    ds = load_dataset(
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -34,20 +34,15 @@ def load_tokenizer(
    tokenizer_type,
    cfg,
 ):
-    use_fast = True  # this is the default
-    if cfg.tokenizer_use_fast is not None:
-        use_fast = cfg.tokenizer_use_fast
    if tokenizer_type:
        tokenizer = getattr(transformers, tokenizer_type).from_pretrained(
            tokenizer_config,
            trust_remote_code=cfg.trust_remote_code or False,
-            use_fast=use_fast,
        )
    else:
        tokenizer = AutoTokenizer.from_pretrained(
            tokenizer_config,
            trust_remote_code=cfg.trust_remote_code or False,
-            use_fast=use_fast,
        )

    logging.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}")
@@ -202,7 +197,7 @@ def load_model(
                else True,
            )
            load_in_8bit = False
-        elif cfg.is_llama_derived_model and not cfg.trust_remote_code:
+        elif cfg.is_llama_derived_model:
            from transformers import LlamaForCausalLM

            config = LlamaConfig.from_pretrained(base_model_config)
@@ -241,7 +236,7 @@ def load_model(
        #         device=cfg.device,
        #     )
        #     model.train() # sets to train instead of eval mode
-        elif model_type and not cfg.trust_remote_code:
+        elif model_type:
            model = getattr(transformers, model_type).from_pretrained(
                base_model,
                load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
--- a/src/axolotl/utils/schedulers.py
+++ b/src/axolotl/utils/schedulers.py
@@ -1,9 +1,6 @@
 """Module for custom LRScheduler class"""
-import math
-from functools import partial

-from torch.optim import Optimizer
-from torch.optim.lr_scheduler import LambdaLR, LRScheduler
+from torch.optim.lr_scheduler import LRScheduler


 class InterpolatingLogScheduler(LRScheduler):
@@ -45,58 +42,3 @@ class InterpolatingLogScheduler(LRScheduler):
            lrs = [self.max_lr for base_lr in self.base_lrs]

        return lrs
-
-
-def _get_cosine_schedule_with_quadratic_warmup_lr_lambda(
-    current_step: int,
-    *,
-    num_warmup_steps: int,
-    num_training_steps: int,
-    num_cycles: float
-):
-    if current_step < num_warmup_steps:
-        return (float(current_step) / float(max(1, num_warmup_steps))) ** 2
-    progress = float(current_step - num_warmup_steps) / float(
-        max(1, num_training_steps - num_warmup_steps)
-    )
-    return max(
-        0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))
-    )
-
-
-def get_cosine_schedule_with_quadratic_warmup(
-    optimizer: Optimizer,
-    num_warmup_steps: int,
-    num_training_steps: int,
-    num_cycles: float = 0.5,
-    last_epoch: int = -1,
-):
-    """
-    Create a schedule with a learning rate that decreases following the values of the cosine function between the
-    initial lr set in the optimizer to 0, after a warmup period during which it increases linearly between 0 and the
-    initial lr set in the optimizer.
-
-    Args:
-        optimizer ([`~torch.optim.Optimizer`]):
-            The optimizer for which to schedule the learning rate.
-        num_warmup_steps (`int`):
-            The number of steps for the warmup phase.
-        num_training_steps (`int`):
-            The total number of training steps.
-        num_cycles (`float`, *optional*, defaults to 0.5):
-            The number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0
-            following a half-cosine).
-        last_epoch (`int`, *optional*, defaults to -1):
-            The index of the last epoch when resuming training.
-
-    Return:
-        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
-    """
-
-    lr_lambda = partial(
-        _get_cosine_schedule_with_quadratic_warmup_lr_lambda,
-        num_warmup_steps=num_warmup_steps,
-        num_training_steps=num_training_steps,
-        num_cycles=num_cycles,
-    )
-    return LambdaLR(optimizer, lr_lambda, last_epoch)
--- a/src/axolotl/utils/tokenization.py
+++ b/src/axolotl/utils/tokenization.py
@@ -34,5 +34,3 @@ def check_example_labels(example, tokenizer):

    logging.info(" ".join(colored_tokens))
    logging.info("\n\n\n")
-
-    return " ".join(colored_tokens)
--- a/src/axolotl/utils/trainer.py
+++ b/src/axolotl/utils/trainer.py
@@ -5,82 +5,25 @@ import logging
 import math
 import os
 import sys
-from dataclasses import field
 from pathlib import Path
-from typing import Any, Dict, Optional
+from typing import Optional

 import bitsandbytes as bnb
 import torch.cuda
-import torch.nn.functional as F
 import transformers
 from torch import nn
 from torch.optim.lr_scheduler import OneCycleLR
-from transformers import (
-    EarlyStoppingCallback,
-    EvalPrediction,
-    Trainer,
-    TrainingArguments,
-)
+from transformers import EarlyStoppingCallback, Trainer
 from transformers.trainer_pt_utils import get_parameter_names

 from axolotl.utils.callbacks import (
    SaveBetterTransformerModelCallback,
    SavePeftModelCallback,
 )
-from axolotl.utils.schedulers import (
-    InterpolatingLogScheduler,
-    get_cosine_schedule_with_quadratic_warmup,
-)
+from axolotl.utils.schedulers import InterpolatingLogScheduler


-class AxolotlTrainingArguments(TrainingArguments):
-    """
-    Extend the base TrainingArguments for axolotl helpers
-    """
-
-    lr_quadratic_warmup: bool = field(
-        default=False,
-        metadata={"help": "Use quadratic warmup for cosine scheduling."},
-    )
-
-
-class AxolotlTrainer(Trainer):
-    """
-    Extend the base Trainer for axolotl helpers
-    """
-
-    args = None  # type: AxolotlTrainingArguments
-
-    def create_scheduler(
-        self, num_training_steps: int, optimizer: torch.optim.Optimizer = None
-    ):
-        """
-        Setup the scheduler. The optimizer of the trainer must have been set up either before this method is called or
-        passed as an argument.
-
-        Args:
-            num_training_steps (int): The number of training steps to do.
-            optimizer (torch.optim.Optimizer): The training optimizer
-        """
-
-        # fmt: off
-        if self.lr_scheduler is None:  # type: ignore  # pylint: disable=access-member-before-definition
-            # fmt: on
-            if (
-                self.args.lr_scheduler_type == "cosine"
-                and self.args.lr_quadratic_warmup is True
-            ):
-                self.lr_scheduler = get_cosine_schedule_with_quadratic_warmup(  # pylint: disable=attribute-defined-outside-init
-                    optimizer,
-                    num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
-                    num_training_steps=num_training_steps,
-                )
-            else:
-                return super().create_scheduler(num_training_steps, optimizer)
-        return self.lr_scheduler
-
-
-class OneCycleLRSchedulerTrainer(AxolotlTrainer):
+class OneCycleLRSchedulerTrainer(Trainer):
    """
    Trainer subclass that uses the OneCycleLR scheduler
    """
@@ -160,9 +103,6 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
        if cfg.fsdp_config:
            training_arguments_kwargs["fsdp_config"] = dict(cfg.fsdp_config)

-    if cfg.lr_quadratic_warmup is not None:
-        training_arguments_kwargs["lr_quadratic_warmup"] = cfg.lr_quadratic_warmup
-
    # deepspeed
    if (
        os.environ.get("ACCELERATE_USE_DEEPSPEED") == "true"
@@ -184,11 +124,7 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
    if cfg.max_grad_norm:
        training_arguments_kwargs["max_grad_norm"] = cfg.max_grad_norm

-    if cfg.hub_model_id:
-        training_arguments_kwargs["hub_model_id"] = cfg.hub_model_id
-        training_arguments_kwargs["push_to_hub"] = True
-
-    training_args = AxolotlTrainingArguments(
+    training_args = transformers.TrainingArguments(
        per_device_train_batch_size=cfg.micro_batch_size,
        per_device_eval_batch_size=cfg.eval_batch_size
        if cfg.eval_batch_size is not None
@@ -335,23 +271,10 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
                num_proc=32,
            )

-    if cfg.compute_perplexity_metrics:
-
-        def compute_metrics(eval_preds: EvalPrediction) -> Dict[str, Any]:
-            logits = eval_preds.predictions
-            labels = eval_preds.label_ids
-            cross_entropy_loss = F.cross_entropy(
-                logits.view(-1, model.config.vocab_size), labels.view(-1)
-            )
-            perplexity = torch.exp(cross_entropy_loss)
-            return {"cross_entropy_loss": cross_entropy_loss, "perplexity": perplexity}
-
-        trainer_kwargs["compute_metrics"] = compute_metrics
-
    trainer_cls = (
        OneCycleLRSchedulerTrainer
        if cfg.lr_scheduler == "one_cycle" and (cfg.fsdp or cfg.adapter == "qlora")
-        else AxolotlTrainer
+        else transformers.Trainer
    )
    trainer = trainer_cls(
        model=model,
--- a/src/axolotl/utils/validation.py
+++ b/src/axolotl/utils/validation.py
@@ -87,16 +87,11 @@ def validate_config(cfg):
            "You probably want to disable group_by_length as it will force a streamed dataset to download completely."
        )

-    if any([cfg.adam_beta1, cfg.adam_beta2, cfg.adam_epsilon]) and (
+    if any([cfg.adamw_beta1, cfg.adamw_beta2, cfg.adamw_epsilon]) and (
        not cfg.optimizer or "adamw" not in cfg.optimizer
    ):
        logging.warning("adamw hyperparameters found, but no adamw optimizer set")

-    if cfg.push_to_hub_model_id:
-        raise ValueError(
-            "push_to_hub_model_id is deprecated. Please use hub_model_id instead."
-        )
-
    # TODO
    # MPT 7b
    # https://github.com/facebookresearch/bitsandbytes/issues/25
--- a/tests/test_prompt_tokenizers.py
+++ b/tests/test_prompt_tokenizers.py
@@ -7,15 +7,11 @@ from pathlib import Path
 from transformers import AutoTokenizer

 from axolotl.prompt_strategies.alpaca_chat import NoSystemPrompter
-from axolotl.prompt_strategies.alpaca_w_system import (
-    InstructionWSystemPromptTokenizingStrategy,
-    SystemDataPrompter,
-)
 from axolotl.prompt_tokenizers import (
    AlpacaPromptTokenizingStrategy,
    ShareGPTPromptTokenizingStrategy,
 )
-from axolotl.prompters import AlpacaPrompter, PromptStyle, ShareGPTPrompter
+from axolotl.prompters import AlpacaPrompter, ShareGPTPrompter

 logging.basicConfig(level="INFO")

@@ -100,39 +96,5 @@ class TestPromptTokenizationStrategies(unittest.TestCase):
        assert example["labels"][world_idx - 1] == -100


-class InstructionWSystemPromptTokenizingStrategyTest(unittest.TestCase):
-    """
-    Test class for prompt tokenization strategies with sys prompt from the dataset
-    """
-
-    def setUp(self) -> None:
-        # pylint: disable=duplicate-code
-        self.tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
-        self.tokenizer.add_special_tokens(
-            {
-                "bos_token": "<s>",
-                "eos_token": "</s>",
-                "unk_token": "<unk>",
-            }
-        )
-
-    def test_system_alpaca(self):
-        prompter = SystemDataPrompter(PromptStyle.CHAT.value)
-        strat = InstructionWSystemPromptTokenizingStrategy(
-            prompter,
-            self.tokenizer,
-            False,
-            2048,
-        )
-        sample = {
-            "system": "use cot",
-            "instruction": "hello!",
-            "output": "Hi! How can I help?",
-        }
-        example = strat.tokenize_prompt(sample)
-        assert example["input_ids"][0:3] == [1, 671, 20118]  # <s>use cot
-        assert example["input_ids"][3] == 11889  # USER
-
-
 if __name__ == "__main__":
    unittest.main()
--- a/tests/test_prompters.py
+++ b/tests/test_prompters.py
@@ -2,13 +2,7 @@

 import unittest

-from axolotl.prompt_strategies.alpaca_w_system import SystemDataPrompter
-from axolotl.prompters import (
-    AlpacaPrompter,
-    MultipleChoiceExplainPrompter,
-    PromptStyle,
-    UnpromptedPrompter,
-)
+from axolotl.prompters import AlpacaPrompter, PromptStyle


 class AlpacaPrompterTest(unittest.TestCase):
@@ -61,64 +55,3 @@ class AlpacaPrompterTest(unittest.TestCase):
        assert "### Response:" not in res
        assert "USER:" in res
        assert "ASSISTANT:" in res
-
-    def test_system_prompt(self):
-        prompter = SystemDataPrompter(prompt_style=PromptStyle.CHAT.value)
-        res = next(
-            prompter.build_prompt_w_system(
-                "use cot", "tell me a joke about the following", "alpacas"
-            )
-        )
-        assert "use cot" in res
-        assert res.startswith("use cot")
-        assert "### Instruction:" not in res
-        assert "### Input:" not in res
-        assert "alpacas" in res
-        assert "### Response:" not in res
-        assert "USER:" in res
-        assert "ASSISTANT:" in res
-
-
-class UnpromptedPrompterTest(unittest.TestCase):
-    """
-    Test class for UnpromptedPrompter with no system prompts
-    """
-
-    def test_prompt_style_w_none(self):
-        prompter = UnpromptedPrompter(prompt_style=None)
-        res = next(prompter.build_prompt("tell me a joke"))
-        assert "### Instruction:" in res
-        assert "tell me a joke" in res
-        assert res.startswith("###")
-
-    def test_prompt_style_w_instruct(self):
-        prompter = UnpromptedPrompter(prompt_style=PromptStyle.INSTRUCT.value)
-        res = next(
-            prompter.build_prompt("tell me a joke about the following", "alpacas")
-        )
-        assert "### Instruction:" in res
-        assert "tell me a joke" in res
-        assert res.startswith("###")
-
-    def test_prompt_style_w_chat(self):
-        prompter = UnpromptedPrompter(prompt_style=PromptStyle.CHAT.value)
-        res = next(
-            prompter.build_prompt("tell me a joke about the following", "alpacas")
-        )
-        assert "USER:" in res
-        assert "tell me a joke" in res
-        assert res.startswith("USER:")
-
-
-class MultipleChoiceExplainPrompterTest(unittest.TestCase):
-    """
-    Test class for MultipleChoiceExplainPrompter
-    """
-
-    def test_prompt_style_w_chat(self):
-        prompter = MultipleChoiceExplainPrompter(prompt_style=PromptStyle.CHAT.value)
-        res = next(prompter.build_prompt("choose one", "- A\n- B\n- C", "C"))
-        assert "USER:" in res
-        assert "choose one" in res
-        assert "Choose the answer that best answers the question." in res
-        assert "- A\n- B\n- C" in res
--- a/tests/test_tokenizers.py
+++ b/tests/test_tokenizers.py
@@ -1,31 +0,0 @@
-"""
-Test cases for the tokenizer loading
-"""
-import unittest
-
-from axolotl.utils.dict import DictDefault
-from axolotl.utils.models import load_tokenizer
-
-
-class TestTokenizers(unittest.TestCase):
-    """
-    test class for the load_tokenizer fn
-    """
-
-    def test_default_use_fast(self):
-        cfg = DictDefault({})
-        tokenizer = load_tokenizer("huggyllama/llama-7b", None, cfg)
-        assert "Fast" in tokenizer.__class__.__name__
-
-    def test_dont_use_fast(self):
-        cfg = DictDefault(
-            {
-                "tokenizer_use_fast": False,
-            }
-        )
-        tokenizer = load_tokenizer("huggyllama/llama-7b", None, cfg)
-        assert "Fast" not in tokenizer.__class__.__name__
-
-
-if __name__ == "__main__":
-    unittest.main()
--- a/tests/test_validation.py
+++ b/tests/test_validation.py
@@ -268,7 +268,7 @@ class ValidationTest(unittest.TestCase):
        cfg = DictDefault(
            {
                "optimizer": None,
-                "adam_epsilon": 0.0001,
+                "adamw_epsilon": 0.0001,
            }
        )

@@ -283,7 +283,7 @@ class ValidationTest(unittest.TestCase):
        cfg = DictDefault(
            {
                "optimizer": "adafactor",
-                "adam_beta1": 0.0001,
+                "adamw_beta1": 0.0001,
            }
        )

@@ -298,9 +298,9 @@ class ValidationTest(unittest.TestCase):
        cfg = DictDefault(
            {
                "optimizer": "adamw_bnb_8bit",
-                "adam_beta1": 0.9,
-                "adam_beta2": 0.99,
-                "adam_epsilon": 0.0001,
+                "adamw_beta1": 0.0001,
+                "adamw_beta2": 0.0001,
+                "adamw_epsilon": 0.0001,
            }
        )