use different perplexity calc

fix perplexity calculation and make it configurable
compute perplexity from cross entropy
2023-07-10 13:43:50 -04:00 · 2023-07-10 12:49:51 -04:00 · 2023-07-10 12:49:47 -04:00 · 2023-07-10 12:48:02 -04:00 · 2023-07-10 12:42:12 -04:00 · 2023-07-10 11:52:59 -04:00
21 changed files with 527 additions and 93 deletions
--- a/.github/workflows/base.yml
+++ b/.github/workflows/base.yml
@@ -26,7 +26,7 @@ jobs:
            pytorch: 2.0.0
            axolotl_extras:
          - cuda: "117"
-            cuda_version: 11.7.0
+            cuda_version: 11.7.1
            python_version: "3.9"
            pytorch: 1.13.1
            axolotl_extras:
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -30,7 +30,7 @@ jobs:
            pytorch: 2.0.0
            axolotl_extras: gptq
          - cuda: cu117
-            cuda_version: 11.7.0
+            cuda_version: 11.7.1
            python_version: "3.9"
            pytorch: 1.13.1
            axolotl_extras:
@@ -85,7 +85,7 @@ jobs:
            pytorch: 2.0.0
            axolotl_extras: gptq
          - cuda: cu117
-            cuda_version: 11.7.0
+            cuda_version: 11.7.1
            python_version: "3.9"
            pytorch: 1.13.1
            axolotl_extras:
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,5 +1,5 @@
 default_language_version:
-    python: python3.9
+    python: python3

 repos:
 -   repo: https://github.com/pre-commit/pre-commit-hooks
--- a/README.md
+++ b/README.md
@@ -195,6 +195,10 @@ Have dataset(s) in one of the following format (JSONL recommended):
  ```json
  {"message_1": "...", "message_2": "..."}
  ```
+- `alpaca_w_system.load_open_orca`: support for OpenOrca datasets with included system prompts, using the instruct prompt style
+  ```json
+  {"system_prompt": "...", "question": "...", "response": "..."}
+  ```
 - `context_qa`: in context question answering from an article
  ```json
  {"article": "...", "question": "...", "answer": "..."}
@@ -233,7 +237,7 @@ Have dataset(s) in one of the following format (JSONL recommended):
 #### How to add custom prompts

  1. Add your method to a file in [prompt_strategies](src/axolotl/prompt_strategies). Please see other files as example.
-  2. Use your custom file name as the dataset type.
+  2. Use your custom file name as the dataset type `<prompt_strategies_file>.load_<load_fn>`.

 Optionally, download some datasets, see [data/README.md](data/README.md)

@@ -251,10 +255,18 @@ See sample configs in [configs](configs) folder or [examples](examples) for quic

 - dataset
  ```yaml
+  sequence_len: 2048 # max token length for prompt
+
+  # huggingface repo
  datasets:
-    - path: vicgalle/alpaca-gpt4 # local or huggingface repo
+    - path: vicgalle/alpaca-gpt4
+      type: alpaca # format from earlier
+
+  # local
+  datasets:
+    - path: json
+      data_files: data.jsonl # or json
      type: alpaca # format from earlier
-  sequence_len: 2048 # max token length / prompt
  ```

 - loading
@@ -302,6 +314,8 @@ model_type: AutoModelForCausalLM
 tokenizer_type: AutoTokenizer
 # Trust remote code for untrusted source
 trust_remote_code:
+# use_fast option passed to the tokenizer's from_pretrained; defaults to True
+tokenizer_use_fast:

 # whether you are training a 4-bit GPTQ quantized model
 gptq: true
@@ -322,10 +336,10 @@ tf32: true # require >=ampere

 # a list of one or more datasets to finetune the model with
 datasets:
-  # this can be either a hf dataset, or relative path
+  # hf dataset repo | "json" for local dataset, make sure to fill data_files
  - path: vicgalle/alpaca-gpt4
  # The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
-    type: alpaca # format OR format:prompt_style (chat/instruct)
+    type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
    data_files: # path to source data files
    shards: # number of shards to split data into

@@ -334,6 +348,8 @@ datasets:
 dataset_prepared_path: data/last_run_prepared
 # push prepared dataset to hub
 push_dataset_to_hub: # repo path
+# push checkpoints to hub
+hub_model_id: # repo path
 # whether to use hf `use_auth_token` for loading datasets. Useful for fetching private datasets
 # required to be true when used in combination with `push_dataset_to_hub`
 hf_use_auth_token: # boolean
--- a/docker/Dockerfile-base
+++ b/docker/Dockerfile-base
@@ -77,7 +77,7 @@ FROM base-builder
 RUN python3 -m pip uninstall -y apex
 RUN git clone https://github.com/NVIDIA/apex
 #  `MAX_JOBS=1` disables parallel building to avoid cpu memory OOM when building image on GitHub Action (standard) runners
-RUN cd apex && MAX_JOBS=1 python3 -m pip install --global-option="--cpp_ext" --global-option="--cuda_ext" --no-cache -v --disable-pip-version-check .
+RUN cd apex && MAX_JOBS=1 python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./

 RUN mkdir -p /workspace/builds
 COPY --from=bnb-builder /workspace/bitsandbytes /workspace/builds/bitsandbytes
@@ -97,4 +97,4 @@ RUN cd /workspace/builds/bitsandbytes && python3 setup.py install
 RUN git lfs install --skip-repo
 RUN pip3 install awscli && \
    # The base image ships with `pydantic==1.8.2` which is not working
-    pip3 install -U --no-cache-dir pydantic
+    pip3 install -U --no-cache-dir pydantic==1.10.10
--- a/src/axolotl/datasets.py
+++ b/src/axolotl/datasets.py
@@ -126,6 +126,7 @@ class ConstantLengthDataset(IterableDataset):
                    buffer_len = 0

                if example:
+                    # FIXME
                    # just going to drop data points that are too long
                    if len(example["input_ids"]) <= self.seq_length:
                        input_ids = example["input_ids"]
--- a/src/axolotl/prompt_strategies/alpaca_chat.py
+++ b/src/axolotl/prompt_strategies/alpaca_chat.py
@@ -6,7 +6,7 @@ from axolotl.prompt_tokenizers import (
    AlpacaPromptTokenizingStrategy,
    InstructionPromptTokenizingStrategy,
 )
-from axolotl.prompters import AlpacaPrompter, PromptStyle
+from axolotl.prompters import AlpacaPrompter, PromptStyle, UnpromptedPrompter


 def load(tokenizer, cfg):
@@ -45,8 +45,10 @@ class NoSystemPrompter(AlpacaPrompter):
    Null Prompter with no system prompts
    """

-    prompt_input = "{instruction} {input} "
-    prompt_no_input = "{instruction} "
+    system_prompt = ""
+    system_no_input_prompt = ""
+    turn_format = "{instruction} {input} "
+    turn_no_input_format = "{instruction} "

    def __init__(self):  # pylint: disable=super-init-not-called
        pass
@@ -103,3 +105,12 @@ def load_camel_ai(tokenizer, cfg):
        cfg.train_on_inputs,
        cfg.sequence_len,
    )
+
+
+def load_no_prompt(tokenizer, cfg):
+    return AlpacaPromptTokenizingStrategy(
+        UnpromptedPrompter(PromptStyle.CHAT.value),
+        tokenizer,
+        cfg.train_on_inputs,
+        cfg.sequence_len,
+    )
--- a/src/axolotl/prompt_strategies/alpaca_instruct.py
+++ b/src/axolotl/prompt_strategies/alpaca_instruct.py
@@ -1,7 +1,7 @@
 """Module loading the AlpacaInstructPromptTokenizingStrategy class"""

 from axolotl.prompt_tokenizers import AlpacaPromptTokenizingStrategy
-from axolotl.prompters import AlpacaPrompter, PromptStyle
+from axolotl.prompters import AlpacaPrompter, PromptStyle, UnpromptedPrompter


 def load(tokenizer, cfg):
@@ -11,3 +11,12 @@ def load(tokenizer, cfg):
        cfg.train_on_inputs,
        cfg.sequence_len,
    )
+
+
+def load_no_prompt(tokenizer, cfg):
+    return AlpacaPromptTokenizingStrategy(
+        UnpromptedPrompter(PromptStyle.INSTRUCT.value),
+        tokenizer,
+        cfg.train_on_inputs,
+        cfg.sequence_len,
+    )
--- a/src/axolotl/prompt_strategies/alpaca_w_system.py
+++ b/src/axolotl/prompt_strategies/alpaca_w_system.py
@@ -0,0 +1,120 @@
+"""
+Prompt strategies loader for alpaca instruction datasets with system prompts
+"""
+from typing import Generator, Tuple, Union
+
+from axolotl.prompt_tokenizers import PromptTokenizingStrategy
+from axolotl.prompters import AlpacaPrompter, PromptStyle
+
+
+class InstructionWSystemPromptTokenizingStrategy(PromptTokenizingStrategy):
+    """
+    Tokenizing strategy for instruction-based prompts whose rows also carry a system prompt.
+    """
+
+    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str, str]:
+        return (
+            prompt["instruction"],
+            prompt["input"] if "input" in prompt else "",
+            prompt["output"],
+            prompt["system"],  # system prompt is read from the row itself
+        )
+
+    def tokenize_prompt(self, prompt):
+        # pylint: disable=duplicate-code
+        (
+            instruction,
+            input,  # pylint: disable=redefined-builtin
+            response,
+            system,
+        ) = self.parse_instruction_fields(prompt)
+        user_prompt = next(
+            iter(
+                self.prompter.build_prompt_w_system(
+                    system,
+                    instruction,
+                    input,
+                )
+            )
+        )
+        tokenized_prompt = self._tokenize(user_prompt, add_eos_token=False)
+        if not self.train_on_inputs:
+            user_prompt_len = len(tokenized_prompt["input_ids"])
+            # TODO this could be sped up using numpy array slicing
+            tokenized_prompt["labels"] = [-100] * user_prompt_len  # presumably -100 is the loss ignore index; confirm
+        tokenized_res_prompt = self._tokenize(
+            response, strip_bos_token=True, add_eos_token=True
+        )
+        tokenized_prompt["input_ids"] += tokenized_res_prompt["input_ids"]
+        tokenized_prompt["attention_mask"] += tokenized_res_prompt["attention_mask"]
+        tokenized_prompt["labels"] += tokenized_res_prompt["input_ids"]  # response tokens always serve as labels
+
+        return tokenized_prompt
+
+
+class SystemDataPrompter(AlpacaPrompter):
+    """
+    Alpaca-style prompter that prepends a caller-supplied system prompt (e.g. one taken from the dataset row)
+    """
+
+    def build_prompt_w_system(
+        self,
+        system: str,
+        instruction: str,
+        input: Union[None, str] = None,  # pylint: disable=redefined-builtin
+        output: Union[None, str] = None,
+    ) -> Generator[str, None, None]:
+        # yields one prompt string: `system` is prepended verbatim (no separator is added) to the formatted turn;
+        # if a label (=response, =output) is provided, it's also appended.
+        if input:
+            res = system + self.turn_format.format(instruction=instruction, input=input)
+        else:
+            res = system + self.turn_no_input_format.format(instruction=instruction)
+        if output:
+            res = f"{res}{output}"
+        yield res
+
+
+class OpenOrcaPromptTokenizingStrategy(InstructionWSystemPromptTokenizingStrategy):
+    """
+    Tokenizing strategy for OpenOrca datasets
+    """
+
+    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str, str]:
+        return (
+            prompt["question"],
+            "",
+            prompt["response"],
+            prompt["system_prompt"],
+        )
+
+
+def load(tokenizer, cfg):
+    return load_chat(tokenizer, cfg)
+
+
+def load_instruct(tokenizer, cfg):
+    return InstructionWSystemPromptTokenizingStrategy(
+        SystemDataPrompter(PromptStyle.INSTRUCT.value),
+        tokenizer,
+        cfg.train_on_inputs,
+        cfg.sequence_len,
+    )
+
+
+def load_chat(tokenizer, cfg):
+    return InstructionWSystemPromptTokenizingStrategy(
+        SystemDataPrompter(PromptStyle.CHAT.value),
+        tokenizer,
+        cfg.train_on_inputs,
+        cfg.sequence_len,
+    )
+
+
+def load_open_orca(tokenizer, cfg):
+    return OpenOrcaPromptTokenizingStrategy(
+        SystemDataPrompter(PromptStyle.INSTRUCT.value),
+        tokenizer,
+        cfg.train_on_inputs,
+        cfg.sequence_len,
+    )
--- a/src/axolotl/prompt_tokenizers.py
+++ b/src/axolotl/prompt_tokenizers.py
@@ -73,17 +73,8 @@ class PromptTokenizingStrategy(abc.ABC):
        ):
            result["input_ids"].append(self.tokenizer.eos_token_id)
            result["attention_mask"].append(1)
-        elif (  # some tokenizers automatically add an eos token, let's remove it
-            not add_eos_token and result["input_ids"][-1] == self.tokenizer.eos_token_id
-        ):
-            result["input_ids"] = result["input_ids"][:-1]
-            result["attention_mask"] = result["attention_mask"][:-1]

-        if (
-            self.tokenizer.bos_token_id
-            and result["input_ids"][0] == self.tokenizer.bos_token_id
-            and strip_bos_token
-        ):
+        if result["input_ids"][0] == self.tokenizer.bos_token_id and strip_bos_token:
            result["input_ids"] = result["input_ids"][1:]
            result["attention_mask"] = result["attention_mask"][1:]

@@ -96,7 +87,9 @@ class InstructionPromptTokenizingStrategy(PromptTokenizingStrategy):
    Tokenizing strategy for instruction-based prompts.
    """

-    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]:
+    def parse_instruction_fields(
+        self, prompt
+    ) -> Union[Tuple[str, str, str], Tuple[str, str, str, str]]:
        raise NotImplementedError

    def tokenize_prompt(self, prompt):
@@ -421,11 +414,7 @@ class ShareGPTPromptTokenizingStrategy(PromptTokenizingStrategy):
            result["input_ids"].append(self.tokenizer.eos_token_id)
            result["attention_mask"].append(1)

-        if (
-            self.tokenizer.bos_token_id
-            and result["input_ids"][0] == self.tokenizer.bos_token_id
-            and strip_bos_token
-        ):
+        if result["input_ids"][0] == self.tokenizer.bos_token_id and strip_bos_token:
            result["input_ids"] = result["input_ids"][1:]
            result["attention_mask"] = result["attention_mask"][1:]

@@ -451,7 +440,7 @@ def parse_tokenized_to_result(
    result: Dict[str, List[int]],
    current_len: int,
    res: Dict[str, List[int]],
-    labels: list[int],
+    labels: List[int],
    pad_token_id: Union[int, None] = None,
 ) -> Tuple[Dict[str, List[int]], int]:
    """
--- a/src/axolotl/prompters.py
+++ b/src/axolotl/prompters.py
@@ -24,6 +24,8 @@ class AlpacaPrompter:

    system_prompt = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n"
    system_no_input_prompt = "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n"
+    turn_format: str
+    turn_no_input_format: str
    prompt_style: Optional[PromptStyle] = None

    def __init__(self, prompt_style=PromptStyle.INSTRUCT.value):
@@ -32,23 +34,13 @@ class AlpacaPrompter:

    def match_prompt_style(self):
        if self.prompt_style == PromptStyle.INSTRUCT.value:
-            self.prompt_input = (
-                self.system_prompt
-                + "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"
+            self.turn_format = "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"
+            self.turn_no_input_format = (
+                "### Instruction:\n{instruction}\n\n### Response:\n"
            )
-            self.prompt_no_input = (
-                self.system_no_input_prompt
-                + "### Instruction:\n{instruction}\n\n### Response:\n"
-            )
-            self.response_split = "### Response:"
        if self.prompt_style == PromptStyle.CHAT.value:
-            self.prompt_input = (
-                self.system_prompt + "USER: {instruction}\n{input}\nASSISTANT:"
-            )
-            self.prompt_no_input = (
-                self.system_no_input_prompt + "USER: {instruction}\nASSISTANT:"
-            )
-            self.response_split = "ASSISTANT:"
+            self.turn_format = "USER: {instruction}\n{input}\nASSISTANT:"
+            self.turn_no_input_format = "USER: {instruction}\nASSISTANT:"

    def build_prompt(
        self,
@@ -59,16 +51,17 @@ class AlpacaPrompter:
        # returns the full prompt from instruction and optional input
        # if a label (=response, =output) is provided, it's also appended.
        if input:
-            res = self.prompt_input.format(instruction=instruction, input=input)
+            res = self.system_prompt + self.turn_format.format(
+                instruction=instruction, input=input
+            )
        else:
-            res = self.prompt_no_input.format(instruction=instruction)
+            res = self.system_no_input_prompt + self.turn_no_input_format.format(
+                instruction=instruction
+            )
        if output:
            res = f"{res}{output}"
        yield res

-    def get_response(self, output: str) -> str:
-        return output.split(self.response_split)[1].strip()
-

 class UnpromptedPrompter(AlpacaPrompter):
    """
@@ -93,7 +86,10 @@ class MultipleChoiceExplainPrompter(AlpacaPrompter):
    """

    system_prompt = (
-        "Choose the answer that best answers the question. Explain your reasoning."
+        "Choose the answer that best answers the question. Explain your reasoning.\n"
+    )
+    system_no_input_prompt = (
+        "Choose the answer that best answers the question. Explain your reasoning.\n"
    )


@@ -102,7 +98,12 @@ class MultipleChoiceConcisePrompter(AlpacaPrompter):
    Prompter for multiple choice concise
    """

-    prompt_input = "Choose the answer that best answers the question. Be concise in your response.\n\nUSER: {instruction}\n{input}\nASSISTANT:\n"
+    system_prompt = "Choose the answer that best answers the question. Be concise in your response.\n\n"
+    system_no_input_prompt = "Choose the answer that best answers the question. Be concise in your response.\n\n"
+
+    def match_prompt_style(self):
+        self.turn_format = "USER: {instruction}\n{input}\nASSISTANT:"
+        self.turn_no_input_format = "USER: {instruction}\nASSISTANT:"


 class SummarizeTLDRPrompter(AlpacaPrompter):
@@ -110,9 +111,12 @@ class SummarizeTLDRPrompter(AlpacaPrompter):
    Prompter for summarize TLDR
    """

-    prompt_no_input = (
-        "USER: Summarize the following article as a TL;DR.\n{instruction}\nASSISTANT:"
-    )
+    system_prompt = ""
+    system_no_input_prompt = ""
+
+    def match_prompt_style(self):
+        self.turn_format = "USER: Summarize the following article as a TL;DR.\n{instruction}\n{input}\nASSISTANT:"
+        self.turn_no_input_format = "USER: Summarize the following article as a TL;DR.\n{instruction}\nASSISTANT:"


 class CompletionPrompter:
@@ -128,9 +132,6 @@ class CompletionPrompter:
    ) -> Generator[str, None, None]:
        yield instruction

-    def get_response(self, output: str) -> str:
-        return output.strip()
-

 class GPTeacherPrompter(AlpacaPrompter):
    """
@@ -210,9 +211,6 @@ class ReflectAlpacaPrompter:
            res = f"{res}{label}"
        yield res

-    def get_response(self, output: str) -> str:
-        return output.split(self.response_split)[1].strip()
-

 class SeparatorStyle(Enum):
    """Different separator style."""
@@ -289,12 +287,6 @@ class ShareGPTPrompter:  # pylint: disable=too-few-public-methods
            sep2=" ",
        )

-    # def match_prompt_style(self):
-    #     if self.prompt_style == PromptStyle.chat.value:
-    #         self.prompt_input = self.system_prompt + "USER: {instruction}\n{input}\nASSISTANT:"
-    #         self.prompt_no_input = self.system_no_input_prompt + "USER: {instruction}\nASSISTANT:"
-    #         self.response_split = "ASSISTANT:"
-
    def build_prompt(self, source) -> Generator[str, None, None]:
        # ignore the system prompt if provided
        if source[0]["from"] == "system":
--- a/src/axolotl/utils/data.py
+++ b/src/axolotl/utils/data.py
@@ -102,13 +102,26 @@ def load_tokenized_prepared_datasets(
                pass

            # prefer local dataset, even if hub exists
-            if Path(d.path).exists():
-                ds = load_dataset(
-                    "json",
-                    data_files=d.path,
-                    streaming=False,
-                    split=None,
-                )
+            local_path = Path(d.path)
+            if local_path.exists():
+                if local_path.is_dir():
+                    ds = load_dataset(
+                        d.path,
+                        data_files=d.data_files,
+                        streaming=False,
+                        split=None,
+                    )
+                elif local_path.is_file():
+                    ds = load_dataset(
+                        "json",
+                        data_files=d.path,
+                        streaming=False,
+                        split=None,
+                    )
+                else:
+                    raise ValueError(
+                        "unhandled dataset load: local path exists, but is neither a directory or a file"
+                    )
            elif ds_from_hub:
                if d.data_files:
                    ds = load_dataset(
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -34,15 +34,20 @@ def load_tokenizer(
    tokenizer_type,
    cfg,
 ):
+    use_fast = True  # this is the default
+    if cfg.tokenizer_use_fast is not None:
+        use_fast = cfg.tokenizer_use_fast
    if tokenizer_type:
        tokenizer = getattr(transformers, tokenizer_type).from_pretrained(
            tokenizer_config,
            trust_remote_code=cfg.trust_remote_code or False,
+            use_fast=use_fast,
        )
    else:
        tokenizer = AutoTokenizer.from_pretrained(
            tokenizer_config,
            trust_remote_code=cfg.trust_remote_code or False,
+            use_fast=use_fast,
        )

    logging.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}")
@@ -197,7 +202,7 @@ def load_model(
                else True,
            )
            load_in_8bit = False
-        elif cfg.is_llama_derived_model:
+        elif cfg.is_llama_derived_model and not cfg.trust_remote_code:
            from transformers import LlamaForCausalLM

            config = LlamaConfig.from_pretrained(base_model_config)
@@ -236,7 +241,7 @@ def load_model(
        #         device=cfg.device,
        #     )
        #     model.train() # sets to train instead of eval mode
-        elif model_type:
+        elif model_type and not cfg.trust_remote_code:
            model = getattr(transformers, model_type).from_pretrained(
                base_model,
                load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
--- a/src/axolotl/utils/schedulers.py
+++ b/src/axolotl/utils/schedulers.py
@@ -1,6 +1,9 @@
 """Module for custom LRScheduler class"""
+import math
+from functools import partial

-from torch.optim.lr_scheduler import LRScheduler
+from torch.optim import Optimizer
+from torch.optim.lr_scheduler import LambdaLR, LRScheduler


 class InterpolatingLogScheduler(LRScheduler):
@@ -42,3 +45,58 @@ class InterpolatingLogScheduler(LRScheduler):
            lrs = [self.max_lr for base_lr in self.base_lrs]

        return lrs
+
+
+def _get_cosine_schedule_with_quadratic_warmup_lr_lambda(
+    current_step: int,
+    *,
+    num_warmup_steps: int,
+    num_training_steps: int,
+    num_cycles: float
+):
+    if current_step < num_warmup_steps:  # warmup: multiplier grows as (step / warmup_steps) squared
+        return (float(current_step) / float(max(1, num_warmup_steps))) ** 2
+    progress = float(current_step - num_warmup_steps) / float(  # fraction of the post-warmup steps completed
+        max(1, num_training_steps - num_warmup_steps)  # max(1, ...) guards against division by zero
+    )
+    return max(  # cosine multiplier, clamped so it never goes below 0
+        0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))
+    )
+
+
+def get_cosine_schedule_with_quadratic_warmup(
+    optimizer: Optimizer,
+    num_warmup_steps: int,
+    num_training_steps: int,
+    num_cycles: float = 0.5,
+    last_epoch: int = -1,
+):
+    """
+    Create a schedule with a learning rate that decreases following the values of the cosine function between the
+    initial lr set in the optimizer to 0, after a warmup period during which it increases quadratically between 0 and
+    the initial lr set in the optimizer.
+
+    Args:
+        optimizer ([`~torch.optim.Optimizer`]):
+            The optimizer for which to schedule the learning rate.
+        num_warmup_steps (`int`):
+            The number of steps for the warmup phase.
+        num_training_steps (`int`):
+            The total number of training steps.
+        num_cycles (`float`, *optional*, defaults to 0.5):
+            The number of waves in the cosine schedule (the default is to just decrease from the max value to 0
+            following a half-cosine).
+        last_epoch (`int`, *optional*, defaults to -1):
+            The index of the last epoch when resuming training.
+
+    Return:
+        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
+    """
+
+    lr_lambda = partial(
+        _get_cosine_schedule_with_quadratic_warmup_lr_lambda,
+        num_warmup_steps=num_warmup_steps,
+        num_training_steps=num_training_steps,
+        num_cycles=num_cycles,
+    )
+    return LambdaLR(optimizer, lr_lambda, last_epoch)
--- a/src/axolotl/utils/tokenization.py
+++ b/src/axolotl/utils/tokenization.py
@@ -34,3 +34,5 @@ def check_example_labels(example, tokenizer):

    logging.info(" ".join(colored_tokens))
    logging.info("\n\n\n")
+
+    return " ".join(colored_tokens)
--- a/src/axolotl/utils/trainer.py
+++ b/src/axolotl/utils/trainer.py
@@ -5,25 +5,82 @@ import logging
 import math
 import os
 import sys
+from dataclasses import dataclass, field
 from pathlib import Path
-from typing import Optional
+from typing import Any, Dict, Optional

 import bitsandbytes as bnb
 import torch.cuda
+import torch.nn.functional as F
 import transformers
 from torch import nn
 from torch.optim.lr_scheduler import OneCycleLR
-from transformers import EarlyStoppingCallback, Trainer
+from transformers import (
+    EarlyStoppingCallback,
+    EvalPrediction,
+    Trainer,
+    TrainingArguments,
+)
 from transformers.trainer_pt_utils import get_parameter_names

 from axolotl.utils.callbacks import (
    SaveBetterTransformerModelCallback,
    SavePeftModelCallback,
 )
-from axolotl.utils.schedulers import InterpolatingLogScheduler
+from axolotl.utils.schedulers import (
+    InterpolatingLogScheduler,
+    get_cosine_schedule_with_quadratic_warmup,
+)


-class OneCycleLRSchedulerTrainer(Trainer):
+class AxolotlTrainingArguments(TrainingArguments):
+    """
+    Extend the base TrainingArguments for axolotl helpers
+    """
+
+    lr_quadratic_warmup: bool = field(
+        default=False,
+        metadata={"help": "Use quadratic warmup for cosine scheduling."},
+    )
+
+
+class AxolotlTrainer(Trainer):
+    """
+    Extend the base Trainer for axolotl helpers
+    """
+
+    args = None  # type: AxolotlTrainingArguments
+
+    def create_scheduler(
+        self, num_training_steps: int, optimizer: torch.optim.Optimizer = None
+    ):
+        """
+        Setup the scheduler. The optimizer of the trainer must have been set up either before this method is called or
+        passed as an argument.
+
+        Args:
+            num_training_steps (int): The number of training steps to do.
+            optimizer (torch.optim.Optimizer): The training optimizer
+        """
+
+        # fmt: off
+        if self.lr_scheduler is None:  # type: ignore  # pylint: disable=access-member-before-definition
+            # fmt: on
+            if (
+                self.args.lr_scheduler_type == "cosine"
+                and self.args.lr_quadratic_warmup is True
+            ):
+                self.lr_scheduler = get_cosine_schedule_with_quadratic_warmup(  # pylint: disable=attribute-defined-outside-init
+                    optimizer,
+                    num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
+                    num_training_steps=num_training_steps,
+                )
+            else:
+                return super().create_scheduler(num_training_steps, optimizer)
+        return self.lr_scheduler
+
+
+class OneCycleLRSchedulerTrainer(AxolotlTrainer):
    """
    Trainer subclass that uses the OneCycleLR scheduler
    """
@@ -103,6 +160,9 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
        if cfg.fsdp_config:
            training_arguments_kwargs["fsdp_config"] = dict(cfg.fsdp_config)

+    if cfg.lr_quadratic_warmup is not None:
+        training_arguments_kwargs["lr_quadratic_warmup"] = cfg.lr_quadratic_warmup
+
    # deepspeed
    if (
        os.environ.get("ACCELERATE_USE_DEEPSPEED") == "true"
@@ -124,7 +184,11 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
    if cfg.max_grad_norm:
        training_arguments_kwargs["max_grad_norm"] = cfg.max_grad_norm

-    training_args = transformers.TrainingArguments(
+    if cfg.hub_model_id:
+        training_arguments_kwargs["hub_model_id"] = cfg.hub_model_id
+        training_arguments_kwargs["push_to_hub"] = True
+
+    training_args = AxolotlTrainingArguments(
        per_device_train_batch_size=cfg.micro_batch_size,
        per_device_eval_batch_size=cfg.eval_batch_size
        if cfg.eval_batch_size is not None
@@ -271,10 +335,23 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
                num_proc=32,
            )

+    if cfg.compute_perplexity_metrics:
+
+        def compute_metrics(eval_preds: EvalPrediction) -> Dict[str, Any]:
+            logits = eval_preds.predictions
+            labels = eval_preds.label_ids
+            cross_entropy_loss = F.cross_entropy(
+                logits.view(-1, model.config.vocab_size), labels.view(-1)
+            )
+            perplexity = torch.exp(cross_entropy_loss)
+            return {"cross_entropy_loss": cross_entropy_loss, "perplexity": perplexity}
+
+        trainer_kwargs["compute_metrics"] = compute_metrics
+
    trainer_cls = (
        OneCycleLRSchedulerTrainer
        if cfg.lr_scheduler == "one_cycle" and (cfg.fsdp or cfg.adapter == "qlora")
-        else transformers.Trainer
+        else AxolotlTrainer
    )
    trainer = trainer_cls(
        model=model,
--- a/src/axolotl/utils/validation.py
+++ b/src/axolotl/utils/validation.py
@@ -87,11 +87,16 @@ def validate_config(cfg):
            "You probably want to disable group_by_length as it will force a streamed dataset to download completely."
        )

-    if any([cfg.adamw_beta1, cfg.adamw_beta2, cfg.adamw_epsilon]) and (
+    if any([cfg.adam_beta1, cfg.adam_beta2, cfg.adam_epsilon]) and (
        not cfg.optimizer or "adamw" not in cfg.optimizer
    ):
        logging.warning("adamw hyperparameters found, but no adamw optimizer set")

+    if cfg.push_to_hub_model_id:
+        raise ValueError(
+            "push_to_hub_model_id is deprecated. Please use hub_model_id instead."
+        )
+
    # TODO
    # MPT 7b
    # https://github.com/facebookresearch/bitsandbytes/issues/25
--- a/tests/test_prompt_tokenizers.py
+++ b/tests/test_prompt_tokenizers.py
@@ -7,11 +7,15 @@ from pathlib import Path
 from transformers import AutoTokenizer

 from axolotl.prompt_strategies.alpaca_chat import NoSystemPrompter
+from axolotl.prompt_strategies.alpaca_w_system import (
+    InstructionWSystemPromptTokenizingStrategy,
+    SystemDataPrompter,
+)
 from axolotl.prompt_tokenizers import (
    AlpacaPromptTokenizingStrategy,
    ShareGPTPromptTokenizingStrategy,
 )
-from axolotl.prompters import AlpacaPrompter, ShareGPTPrompter
+from axolotl.prompters import AlpacaPrompter, PromptStyle, ShareGPTPrompter

 logging.basicConfig(level="INFO")

@@ -96,5 +100,39 @@ class TestPromptTokenizationStrategies(unittest.TestCase):
        assert example["labels"][world_idx - 1] == -100


+class InstructionWSystemPromptTokenizingStrategyTest(unittest.TestCase):
+    """
+    Test class for prompt tokenization strategies with sys prompt from the dataset
+    """
+
+    def setUp(self) -> None:
+        # pylint: disable=duplicate-code
+        self.tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
+        self.tokenizer.add_special_tokens(
+            {
+                "bos_token": "<s>",
+                "eos_token": "</s>",
+                "unk_token": "<unk>",
+            }
+        )
+
+    def test_system_alpaca(self):
+        prompter = SystemDataPrompter(PromptStyle.CHAT.value)
+        strat = InstructionWSystemPromptTokenizingStrategy(
+            prompter,
+            self.tokenizer,
+            False,
+            2048,
+        )
+        sample = {
+            "system": "use cot",
+            "instruction": "hello!",
+            "output": "Hi! How can I help?",
+        }
+        example = strat.tokenize_prompt(sample)
+        assert example["input_ids"][0:3] == [1, 671, 20118]  # <s>use cot
+        assert example["input_ids"][3] == 11889  # USER
+
+
 if __name__ == "__main__":
    unittest.main()
--- a/tests/test_prompters.py
+++ b/tests/test_prompters.py
@@ -2,7 +2,13 @@

 import unittest

-from axolotl.prompters import AlpacaPrompter, PromptStyle
+from axolotl.prompt_strategies.alpaca_w_system import SystemDataPrompter
+from axolotl.prompters import (
+    AlpacaPrompter,
+    MultipleChoiceExplainPrompter,
+    PromptStyle,
+    UnpromptedPrompter,
+)


 class AlpacaPrompterTest(unittest.TestCase):
@@ -55,3 +61,64 @@ class AlpacaPrompterTest(unittest.TestCase):
        assert "### Response:" not in res
        assert "USER:" in res
        assert "ASSISTANT:" in res
+
+    def test_system_prompt(self):
+        prompter = SystemDataPrompter(prompt_style=PromptStyle.CHAT.value)
+        res = next(
+            prompter.build_prompt_w_system(
+                "use cot", "tell me a joke about the following", "alpacas"
+            )
+        )
+        assert "use cot" in res
+        assert res.startswith("use cot")
+        assert "### Instruction:" not in res
+        assert "### Input:" not in res
+        assert "alpacas" in res
+        assert "### Response:" not in res
+        assert "USER:" in res
+        assert "ASSISTANT:" in res
+
+
+class UnpromptedPrompterTest(unittest.TestCase):
+    """
+    Test class for UnpromptedPrompter with no system prompts
+    """
+
+    def test_prompt_style_w_none(self):
+        prompter = UnpromptedPrompter(prompt_style=None)
+        res = next(prompter.build_prompt("tell me a joke"))
+        assert "### Instruction:" in res
+        assert "tell me a joke" in res
+        assert res.startswith("###")
+
+    def test_prompt_style_w_instruct(self):
+        prompter = UnpromptedPrompter(prompt_style=PromptStyle.INSTRUCT.value)
+        res = next(
+            prompter.build_prompt("tell me a joke about the following", "alpacas")
+        )
+        assert "### Instruction:" in res
+        assert "tell me a joke" in res
+        assert res.startswith("###")
+
+    def test_prompt_style_w_chat(self):
+        prompter = UnpromptedPrompter(prompt_style=PromptStyle.CHAT.value)
+        res = next(
+            prompter.build_prompt("tell me a joke about the following", "alpacas")
+        )
+        assert "USER:" in res
+        assert "tell me a joke" in res
+        assert res.startswith("USER:")
+
+
+class MultipleChoiceExplainPrompterTest(unittest.TestCase):
+    """
+    Test class for MultipleChoiceExplainPrompter
+    """
+
+    def test_prompt_style_w_chat(self):
+        prompter = MultipleChoiceExplainPrompter(prompt_style=PromptStyle.CHAT.value)
+        res = next(prompter.build_prompt("choose one", "- A\n- B\n- C", "C"))
+        assert "USER:" in res
+        assert "choose one" in res
+        assert "Choose the answer that best answers the question." in res
+        assert "- A\n- B\n- C" in res
--- a/tests/test_tokenizers.py
+++ b/tests/test_tokenizers.py
@@ -0,0 +1,31 @@
+"""
+Test cases for the tokenizer loading
+"""
+import unittest
+
+from axolotl.utils.dict import DictDefault
+from axolotl.utils.models import load_tokenizer
+
+
+class TestTokenizers(unittest.TestCase):
+    """
+    test class for the load_tokenizer fn
+    """
+
+    def test_default_use_fast(self):
+        cfg = DictDefault({})
+        tokenizer = load_tokenizer("huggyllama/llama-7b", None, cfg)
+        assert "Fast" in tokenizer.__class__.__name__
+
+    def test_dont_use_fast(self):
+        cfg = DictDefault(
+            {
+                "tokenizer_use_fast": False,
+            }
+        )
+        tokenizer = load_tokenizer("huggyllama/llama-7b", None, cfg)
+        assert "Fast" not in tokenizer.__class__.__name__
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/tests/test_validation.py
+++ b/tests/test_validation.py
@@ -268,7 +268,7 @@ class ValidationTest(unittest.TestCase):
        cfg = DictDefault(
            {
                "optimizer": None,
-                "adamw_epsilon": 0.0001,
+                "adam_epsilon": 0.0001,
            }
        )

@@ -283,7 +283,7 @@ class ValidationTest(unittest.TestCase):
        cfg = DictDefault(
            {
                "optimizer": "adafactor",
-                "adamw_beta1": 0.0001,
+                "adam_beta1": 0.0001,
            }
        )

@@ -298,9 +298,9 @@ class ValidationTest(unittest.TestCase):
        cfg = DictDefault(
            {
                "optimizer": "adamw_bnb_8bit",
-                "adamw_beta1": 0.0001,
-                "adamw_beta2": 0.0001,
-                "adamw_epsilon": 0.0001,
+                "adam_beta1": 0.9,
+                "adam_beta2": 0.99,
+                "adam_epsilon": 0.0001,
            }
        )
Author	SHA1	Message	Date
Wing Lian	0f2a16aa33	use different perplexity calc Some checks failed pre-commit / pre-commit (push) Has been cancelled Details PyTest / test (3.10) (push) Has been cancelled Details PyTest / test (3.9) (push) Has been cancelled Details	2023-07-10 13:43:50 -04:00
Wing Lian	e7c84254ba	fix perplexity calculation and make it configurable	2023-07-10 12:49:51 -04:00
Wing Lian	1d02606934	compute perplexity from cross entropy	2023-07-10 12:49:47 -04:00
Wing Lian	687d889928	Merge pull request #271 from OpenAccess-AI-Collective/quadratic-warmup Quadratic warmup	2023-07-10 12:48:02 -04:00
Wing Lian	c4cf567b55	Merge branch 'main' into quadratic-warmup	2023-07-10 12:42:12 -04:00
Wing Lian	c49729d2bc	better configuration for quadratic warmup	2023-07-10 11:52:59 -04:00
Wing Lian	13ac4d8de2	Merge pull request #268 from OpenAccess-AI-Collective/fix-adam-args params are adam_, not adamw_	2023-07-08 12:33:34 -04:00
Wing Lian	19cf0bda99	params are adam_, not adamw_	2023-07-08 12:13:39 -04:00
Wing Lian	f74edd5b56	Merge pull request #266 from OpenAccess-AI-Collective/trust-remote-no-llama	2023-07-07 21:38:11 -04:00
Wing Lian	d69da99c2c	skip explicit model type too if using trust_remote_code	2023-07-07 21:33:11 -04:00
Wing Lian	66afb76a15	don't use llama if trust_remote_code is set since that needs to use AutoModel path	2023-07-07 21:31:02 -04:00
NanoCode012	a692ad3f4c	Merge pull request #264 from OpenAccess-AI-Collective/NanoCode012-patch-1 Fix(readme): local path loading and custom strategy type	2023-07-06 23:34:57 +09:00
NanoCode012	41da98b982	Fix for linter	2023-07-06 23:20:11 +09:00
NanoCode012	9e64f42e0f	Fix local path loading and custom strategy type	2023-07-06 23:08:09 +09:00
Wing Lian	b9b7d4ce92	Merge pull request #221 from utensil/local_dataset [WIP] Support loading data files from a local directory	2023-07-03 09:10:13 -04:00
Wing Lian	9bed281867	Merge pull request #258 from NanoCode012/fix/deprecate-push Fix future deprecation push_to_hub_model_id	2023-07-03 09:08:26 -04:00
NanoCode012	e79c8e617e	Fix future deprecation push_to_hub_model_id	2023-07-03 12:44:29 +09:00
Wing Lian	71456955f5	pin pydantic so deepspeed isn't broken	2023-07-02 22:26:51 -04:00
Wing Lian	3a783c04e4	Merge pull request #247 from OpenAccess-AI-Collective/fix-apex-base update pip install command for apex	2023-07-01 06:18:25 -04:00
Wing Lian	1e5014acec	Merge pull request #255 from OpenAccess-AI-Collective/open-orca-prompts open orca support	2023-07-01 01:11:23 -04:00
Wing Lian	a10da1caff	11.7.0 nvidia/cuda docker images are deprecated, move to 11.7.1 Some checks failed ci-cd-base / build-base (<nil>, 117, 11.7.1, 3.9, 1.13.1) (push) Has been cancelled Details ci-cd-base / build-base (<nil>, 118, 11.8.0, 3.10, 2.0.0) (push) Has been cancelled Details ci-cd-base / build-base (<nil>, 118, 11.8.0, 3.9, 2.0.0) (push) Has been cancelled Details ci-cd-base / build-base (gptq, 118, 11.8.0, 3.9, 2.0.0) (push) Has been cancelled Details pre-commit / pre-commit (push) Has been cancelled Details PyTest / test (3.10) (push) Has been cancelled Details PyTest / test (3.9) (push) Has been cancelled Details	2023-07-01 00:29:07 -04:00
Wing Lian	4066c78631	Merge pull request #246 from OpenAccess-AI-Collective/sys-prompts-instruct add option for instruct w sys prompts	2023-07-01 00:27:29 -04:00
Wing Lian	78a1e1fa12	open orca support	2023-07-01 00:19:41 -04:00
NanoCode012	bc8a2e5547	Merge pull request #249 from OpenAccess-AI-Collective/NanoCode012-patch-1 Fix typing list in prompt tokenizer	2023-06-30 15:01:41 +09:00
NanoCode012	910ebe47f5	Merge pull request #252 from OpenAccess-AI-Collective/NanoCode012-readme-fix Add cfg.push_to_hub_model_id to readme	2023-06-30 14:56:55 +09:00
NanoCode012	c146880a75	Update README.md	2023-06-30 11:33:53 +09:00
NanoCode012	77bdb7d144	Fix typing list	2023-06-29 14:29:55 +09:00
Wing Lian	530809fd74	update pip install command for apex	2023-06-28 22:36:28 -04:00
Wing Lian	924bbfddec	add option for instruct w sys prompts	2023-06-28 22:27:17 -04:00
Wing Lian	f150c027e3	Merge pull request #224 from OpenAccess-AI-Collective/system-prompt-data System prompt data	2023-06-27 17:57:43 -04:00
Wing Lian	5c39c006c9	Merge pull request #244 from OpenAccess-AI-Collective/push-to-hub push intermediate model checkpoints to hub	2023-06-27 17:57:30 -04:00
Wing Lian	612aabd8c4	push intermediate model checkpoints to hub	2023-06-27 15:40:25 -04:00
Wing Lian	af05883f75	Merge pull request #243 from OpenAccess-AI-Collective/unprompted-instruct skip the system prompt	2023-06-25 22:50:35 -04:00
Wing Lian	05ab9092e3	skip the system prompt	2023-06-25 22:40:50 -04:00
Wing Lian	7b57ed7618	pylint for duplicated code for system prompts	2023-06-25 22:28:07 -04:00
Wing Lian	3a38271276	add tests and supoort for loader for sys prompt data	2023-06-25 22:28:07 -04:00
Wing Lian	8d20e0a3d3	initial wip to get sys prompt from dataset	2023-06-25 22:28:07 -04:00
Wing Lian	de8ed229c3	Merge pull request #240 from OpenAccess-AI-Collective/tokenizer-fast optionally define whether to use_fast tokenizer	2023-06-25 12:47:55 -04:00
Wing Lian	478d8c7b8e	Merge pull request #241 from OpenAccess-AI-Collective/py3-pre-commit better py3 support w pre-commit	2023-06-25 12:47:02 -04:00
Wing Lian	645c13592c	better py3 support w pre-commit	2023-06-25 10:26:02 -04:00
Wing Lian	47d601fa23	optionally define whether to use_fast tokenizer	2023-06-25 10:19:49 -04:00
Utensil	9bdd30cdfd	Support loading data files from a local directory ref: https://huggingface.co/docs/datasets/v2.13.0/en/package_reference/loading_methods#datasets.load_dataset.path	2023-06-21 08:00:58 +00:00
Wing Lian	7dc580b837	add axolotl trainer and quadratic warmup	2023-06-12 13:16:40 -04:00