Compare commits: flan-no-bo...multipack (49 commits)
| Author | SHA1 | Date |
|---|---|---|
|  | 81d60e96f0 |  |
|  | 168a7a09cc |  |
|  | 231031a0e1 |  |
|  | 5daf7d5299 |  |
|  | 5491278a79 |  |
|  | 1514739f0f |  |
|  | 896c1aebcf |  |
|  | ef17e15483 |  |
|  | 69a235061b |  |
|  | 687d889928 |  |
|  | c4cf567b55 |  |
|  | c49729d2bc |  |
|  | 13ac4d8de2 |  |
|  | 19cf0bda99 |  |
|  | f74edd5b56 |  |
|  | d69da99c2c |  |
|  | 66afb76a15 |  |
|  | a692ad3f4c |  |
|  | 41da98b982 |  |
|  | 9e64f42e0f |  |
|  | b9b7d4ce92 |  |
|  | 9bed281867 |  |
|  | e79c8e617e |  |
|  | 71456955f5 |  |
|  | 3a783c04e4 |  |
|  | 1e5014acec |  |
|  | a10da1caff |  |
|  | 4066c78631 |  |
|  | 78a1e1fa12 |  |
|  | bc8a2e5547 |  |
|  | 910ebe47f5 |  |
|  | c146880a75 |  |
|  | 77bdb7d144 |  |
|  | 530809fd74 |  |
|  | 924bbfddec |  |
|  | f150c027e3 |  |
|  | 5c39c006c9 |  |
|  | 612aabd8c4 |  |
|  | af05883f75 |  |
|  | 05ab9092e3 |  |
|  | 7b57ed7618 |  |
|  | 3a38271276 |  |
|  | 8d20e0a3d3 |  |
|  | de8ed229c3 |  |
|  | 478d8c7b8e |  |
|  | 645c13592c |  |
|  | 47d601fa23 |  |
|  | 9bdd30cdfd |  |
|  | 7dc580b837 |  |
.github/workflows/base.yml (vendored), 2 changes:

@@ -26,7 +26,7 @@ jobs:
           pytorch: 2.0.0
           axolotl_extras:
         - cuda: "117"
-          cuda_version: 11.7.0
+          cuda_version: 11.7.1
           python_version: "3.9"
           pytorch: 1.13.1
           axolotl_extras:
.github/workflows/main.yml (vendored), 4 changes:

@@ -30,7 +30,7 @@ jobs:
           pytorch: 2.0.0
           axolotl_extras: gptq
         - cuda: cu117
-          cuda_version: 11.7.0
+          cuda_version: 11.7.1
           python_version: "3.9"
           pytorch: 1.13.1
           axolotl_extras:
@@ -85,7 +85,7 @@ jobs:
           pytorch: 2.0.0
           axolotl_extras: gptq
         - cuda: cu117
-          cuda_version: 11.7.0
+          cuda_version: 11.7.1
           python_version: "3.9"
           pytorch: 1.13.1
           axolotl_extras:
.pre-commit-config.yaml:

@@ -1,5 +1,5 @@
 default_language_version:
-  python: python3.9
+  python: python3

 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
README.md, 31 changes:

@@ -195,6 +195,10 @@ Have dataset(s) in one of the following format (JSONL recommended):
   ```json
   {"message_1": "...", "message_2": "..."}
   ```
+- `alpaca_w_system.load_open_orca`: support for open orca datasets with included system prompts, instruct
+  ```json
+  {"system_prompt": "...", "question": "...", "response": "..."}
+  ```
 - `context_qa`: in context question answering from an article
   ```json
   {"article": "...", "question": "...", "answer": "..."}
@@ -233,7 +237,7 @@ Have dataset(s) in one of the following format (JSONL recommended):
 #### How to add custom prompts

 1. Add your method to a file in [prompt_strategies](src/axolotl/prompt_strategies). Please see other files as example.
-2. Use your custom file name as the dataset type.
+2. Use your custom file name as the dataset type `<prompt_strategies_file>.load_<load_fn>`.

 Optionally, download some datasets, see [data/README.md](data/README.md)

@@ -251,10 +255,18 @@ See sample configs in [configs](configs) folder or [examples](examples) for quic

 - dataset
   ```yaml
+  sequence_len: 2048 # max token length for prompt
+
+  # huggingface repo
   datasets:
-    - path: vicgalle/alpaca-gpt4 # local or huggingface repo
+    - path: vicgalle/alpaca-gpt4
+      type: alpaca # format from earlier
+
+  # local
+  datasets:
+    - path: json
+      data_files: data.jsonl # or json
       type: alpaca # format from earlier
-  sequence_len: 2048 # max token length / prompt
   ```

 - loading
@@ -293,6 +305,8 @@ base_model_ignore_patterns:
 # if the base_model repo on hf hub doesn't include configuration .json files,
 # you can set that here, or leave this empty to default to base_model
 base_model_config: ./llama-7b-hf
+# you can specify to choose a specific model revision from huggingface hub
+model_revision:
 # Optional tokenizer configuration override in case you want to use a different tokenizer
 # than the one defined in the base model
 tokenizer_config:
@@ -302,6 +316,8 @@ model_type: AutoModelForCausalLM
 tokenizer_type: AutoTokenizer
 # Trust remote code for untrusted source
 trust_remote_code:
+# use_fast option for tokenizer loading from_pretrained, default to True
+tokenizer_use_fast:

 # whether you are training a 4-bit GPTQ quantized model
 gptq: true
@@ -322,10 +338,10 @@ tf32: true # require >=ampere

 # a list of one or more datasets to finetune the model with
 datasets:
-  # this can be either a hf dataset, or relative path
+  # hf dataset repo | "json" for local dataset, make sure to fill data_files
   - path: vicgalle/alpaca-gpt4
     # The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
-    type: alpaca # format OR format:prompt_style (chat/instruct)
+    type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
     data_files: # path to source data files
     shards: # number of shards to split data into

@@ -334,6 +350,8 @@ datasets:
 dataset_prepared_path: data/last_run_prepared
 # push prepared dataset to hub
 push_dataset_to_hub: # repo path
+# push checkpoints to hub
+hub_model_id: # repo path
 # whether to use hf `use_auth_token` for loading datasets. Useful for fetching private datasets
 # required to be true when used in combination with `push_dataset_to_hub`
 hf_use_auth_token: # boolean
@@ -395,6 +413,9 @@ logging_steps:
 save_steps:
 eval_steps:

+# save model as safetensors (require safetensors package)
+save_safetensors:
+
 # whether to mask out or include the human's prompt from the training labels
 train_on_inputs: false
 # don't use this, leads to wonky training (according to someone on the internet)
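For illustration of the `<prompt_strategies_file>.load_<load_fn>` naming convention documented above, here is a minimal sketch of how a dataset `type` such as `alpaca_w_system.load_open_orca` maps onto a loader function. The `resolve_strategy` helper is hypothetical and not part of this branch; it only mirrors the convention.

```python
# Hypothetical helper, not from this branch: shows how "<file>.load_<fn>" resolves.
import importlib


def resolve_strategy(dataset_type: str, tokenizer, cfg):
    # "alpaca_w_system.load_open_orca" -> module "alpaca_w_system", function "load_open_orca"
    module_name, fn_name = dataset_type.split(".", 1)
    module = importlib.import_module(f"axolotl.prompt_strategies.{module_name}")
    return getattr(module, fn_name)(tokenizer, cfg)
```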
Dockerfile:

@@ -77,7 +77,7 @@ FROM base-builder
 RUN python3 -m pip uninstall -y apex
 RUN git clone https://github.com/NVIDIA/apex
 # `MAX_JOBS=1` disables parallel building to avoid cpu memory OOM when building image on GitHub Action (standard) runners
-RUN cd apex && MAX_JOBS=1 python3 -m pip install --global-option="--cpp_ext" --global-option="--cuda_ext" --no-cache -v --disable-pip-version-check .
+RUN cd apex && MAX_JOBS=1 python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./

 RUN mkdir -p /workspace/builds
 COPY --from=bnb-builder /workspace/bitsandbytes /workspace/builds/bitsandbytes
@@ -97,4 +97,4 @@ RUN cd /workspace/builds/bitsandbytes && python3 setup.py install
 RUN git lfs install --skip-repo
 RUN pip3 install awscli && \
     # The base image ships with `pydantic==1.8.2` which is not working
-    pip3 install -U --no-cache-dir pydantic
+    pip3 install -U --no-cache-dir pydantic==1.10.10
requirements.txt:

@@ -1,7 +1,6 @@
 peft @ git+https://github.com/huggingface/peft.git
 transformers @ git+https://github.com/huggingface/transformers.git
 bitsandbytes>=0.39.0
-accelerate
 addict
 fire
 PyYAML==6.0
@@ -18,3 +17,4 @@ evaluate==0.4.0
 rouge-score==0.1.2
 scipy
 scikit-learn==1.2.2
+numba
@@ -126,6 +126,7 @@ class ConstantLengthDataset(IterableDataset):
             buffer_len = 0

         if example:
+            # FIXME
             # just going to drop data points that are too long
             if len(example["input_ids"]) <= self.seq_length:
                 input_ids = example["input_ids"]
src/axolotl/prompt_strategies/alpaca_chat.py:

@@ -6,7 +6,7 @@ from axolotl.prompt_tokenizers import (
     AlpacaPromptTokenizingStrategy,
     InstructionPromptTokenizingStrategy,
 )
-from axolotl.prompters import AlpacaPrompter, PromptStyle
+from axolotl.prompters import AlpacaPrompter, PromptStyle, UnpromptedPrompter


 def load(tokenizer, cfg):
@@ -45,8 +45,10 @@ class NoSystemPrompter(AlpacaPrompter):
     Null Prompter with no system prompts
     """

-    prompt_input = "{instruction} {input} "
-    prompt_no_input = "{instruction} "
+    system_prompt = ""
+    system_no_input_prompt = ""
+    turn_format = "{instruction} {input} "
+    turn_no_input_format = "{instruction} "

     def __init__(self):  # pylint: disable=super-init-not-called
         pass
@@ -103,3 +105,12 @@ def load_camel_ai(tokenizer, cfg):
         cfg.train_on_inputs,
         cfg.sequence_len,
     )
+
+
+def load_no_prompt(tokenizer, cfg):
+    return AlpacaPromptTokenizingStrategy(
+        UnpromptedPrompter(PromptStyle.CHAT.value),
+        tokenizer,
+        cfg.train_on_inputs,
+        cfg.sequence_len,
+    )
@@ -1,7 +1,7 @@
 """Module loading the AlpacaInstructPromptTokenizingStrategy class"""

 from axolotl.prompt_tokenizers import AlpacaPromptTokenizingStrategy
-from axolotl.prompters import AlpacaPrompter, PromptStyle
+from axolotl.prompters import AlpacaPrompter, PromptStyle, UnpromptedPrompter


 def load(tokenizer, cfg):
@@ -11,3 +11,12 @@ def load(tokenizer, cfg):
         cfg.train_on_inputs,
         cfg.sequence_len,
     )
+
+
+def load_no_prompt(tokenizer, cfg):
+    return AlpacaPromptTokenizingStrategy(
+        UnpromptedPrompter(PromptStyle.INSTRUCT.value),
+        tokenizer,
+        cfg.train_on_inputs,
+        cfg.sequence_len,
+    )
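As a usage sketch (the model name and config values here are assumed, not taken from this branch), the new `load_no_prompt` strategies can be called directly; `DictDefault` is the same config wrapper used in the new tests further down.

```python
from transformers import AutoTokenizer

from axolotl.prompt_strategies.alpaca_chat import load_no_prompt
from axolotl.utils.dict import DictDefault

tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
cfg = DictDefault({"train_on_inputs": False, "sequence_len": 2048})

# Returns an AlpacaPromptTokenizingStrategy backed by UnpromptedPrompter,
# i.e. tokenization with no system prompt text prepended.
strategy = load_no_prompt(tokenizer, cfg)
```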
src/axolotl/prompt_strategies/alpaca_w_system.py (new file, 120 lines):

@@ -0,0 +1,120 @@
"""
Prompt strategies loader for alpaca instruction datasets with system prompts
"""
from typing import Generator, Tuple, Union

from axolotl.prompt_tokenizers import PromptTokenizingStrategy
from axolotl.prompters import AlpacaPrompter, PromptStyle


class InstructionWSystemPromptTokenizingStrategy(PromptTokenizingStrategy):
    """
    Tokenizing strategy for instruction-based prompts.
    """

    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str, str]:
        return (
            prompt["instruction"],
            prompt["input"] if "input" in prompt else "",
            prompt["output"],
            prompt["system"],
        )

    def tokenize_prompt(self, prompt):
        # pylint: disable=duplicate-code
        (
            instruction,
            input,  # pylint: disable=redefined-builtin
            response,
            system,
        ) = self.parse_instruction_fields(prompt)
        user_prompt = next(
            iter(
                self.prompter.build_prompt_w_system(
                    system,
                    instruction,
                    input,
                )
            )
        )
        tokenized_prompt = self._tokenize(user_prompt, add_eos_token=False)
        if not self.train_on_inputs:
            user_prompt_len = len(tokenized_prompt["input_ids"])
            # TODO this could be sped up using numpy array slicing
            tokenized_prompt["labels"] = [-100] * user_prompt_len
        tokenized_res_prompt = self._tokenize(
            response, strip_bos_token=True, add_eos_token=True
        )
        tokenized_prompt["input_ids"] += tokenized_res_prompt["input_ids"]
        tokenized_prompt["attention_mask"] += tokenized_res_prompt["attention_mask"]
        tokenized_prompt["labels"] += tokenized_res_prompt["input_ids"]

        return tokenized_prompt


class SystemDataPrompter(AlpacaPrompter):
    """
    Alpaca Style Prompter that uses system prompts from the dataset
    """

    def build_prompt_w_system(
        self,
        system: str,
        instruction: str,
        input: Union[None, str] = None,  # pylint: disable=redefined-builtin
        output: Union[None, str] = None,
    ) -> Generator[str, None, None]:
        # returns the full prompt from instruction and optional input
        # if a label (=response, =output) is provided, it's also appended.
        if input:
            res = system + self.turn_format.format(instruction=instruction, input=input)
        else:
            res = system + self.turn_no_input_format.format(instruction=instruction)
        if output:
            res = f"{res}{output}"
        yield res


class OpenOrcaPromptTokenizingStrategy(InstructionWSystemPromptTokenizingStrategy):
    """
    Tokenizing strategy for OpenOrca datasets
    """

    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str, str]:
        return (
            prompt["question"],
            "",
            prompt["response"],
            prompt["system_prompt"],
        )


def load(tokenizer, cfg):
    return load_chat(tokenizer, cfg)


def load_instruct(tokenizer, cfg):
    return InstructionWSystemPromptTokenizingStrategy(
        SystemDataPrompter(PromptStyle.INSTRUCT.value),
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
    )


def load_chat(tokenizer, cfg):
    return InstructionWSystemPromptTokenizingStrategy(
        SystemDataPrompter(PromptStyle.CHAT.value),
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
    )


def load_open_orca(tokenizer, cfg):
    return OpenOrcaPromptTokenizingStrategy(
        SystemDataPrompter(PromptStyle.INSTRUCT.value),
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
    )
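A small usage sketch of the new `SystemDataPrompter` based only on the file above; the system and instruction strings are made-up sample data.

```python
from axolotl.prompt_strategies.alpaca_w_system import SystemDataPrompter
from axolotl.prompters import PromptStyle

prompter = SystemDataPrompter(PromptStyle.CHAT.value)

# The system text comes straight from the dataset row and is prepended to the
# chat-style turn format; with no `input`, turn_no_input_format is used.
prompt = next(
    prompter.build_prompt_w_system("You are a helpful assistant.\n", "hello!")
)
print(prompt)
# You are a helpful assistant.
# USER: hello!
# ASSISTANT:
```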
src/axolotl/prompt_tokenizers.py:

@@ -87,7 +87,9 @@ class InstructionPromptTokenizingStrategy(PromptTokenizingStrategy):
     Tokenizing strategy for instruction-based prompts.
     """

-    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]:
+    def parse_instruction_fields(
+        self, prompt
+    ) -> Union[Tuple[str, str, str], Tuple[str, str, str, str]]:
         raise NotImplementedError

     def tokenize_prompt(self, prompt):
@@ -438,7 +440,7 @@ def parse_tokenized_to_result(
     result: Dict[str, List[int]],
     current_len: int,
     res: Dict[str, List[int]],
-    labels: list[int],
+    labels: List[int],
     pad_token_id: Union[int, None] = None,
 ) -> Tuple[Dict[str, List[int]], int]:
     """
src/axolotl/prompters.py:

@@ -24,6 +24,8 @@ class AlpacaPrompter:

     system_prompt = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n"
     system_no_input_prompt = "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n"
+    turn_format: str
+    turn_no_input_format: str
     prompt_style: Optional[PromptStyle] = None

     def __init__(self, prompt_style=PromptStyle.INSTRUCT.value):
@@ -32,23 +34,13 @@ class AlpacaPrompter:

     def match_prompt_style(self):
         if self.prompt_style == PromptStyle.INSTRUCT.value:
-            self.prompt_input = (
-                self.system_prompt
-                + "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"
+            self.turn_format = "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"
+            self.turn_no_input_format = (
+                "### Instruction:\n{instruction}\n\n### Response:\n"
             )
-            self.prompt_no_input = (
-                self.system_no_input_prompt
-                + "### Instruction:\n{instruction}\n\n### Response:\n"
-            )
-            self.response_split = "### Response:"
         if self.prompt_style == PromptStyle.CHAT.value:
-            self.prompt_input = (
-                self.system_prompt + "USER: {instruction}\n{input}\nASSISTANT:"
-            )
-            self.prompt_no_input = (
-                self.system_no_input_prompt + "USER: {instruction}\nASSISTANT:"
-            )
-            self.response_split = "ASSISTANT:"
+            self.turn_format = "USER: {instruction}\n{input}\nASSISTANT:"
+            self.turn_no_input_format = "USER: {instruction}\nASSISTANT:"

     def build_prompt(
         self,
@@ -59,16 +51,17 @@ class AlpacaPrompter:
         # returns the full prompt from instruction and optional input
         # if a label (=response, =output) is provided, it's also appended.
         if input:
-            res = self.prompt_input.format(instruction=instruction, input=input)
+            res = self.system_prompt + self.turn_format.format(
+                instruction=instruction, input=input
+            )
         else:
-            res = self.prompt_no_input.format(instruction=instruction)
+            res = self.system_no_input_prompt + self.turn_no_input_format.format(
+                instruction=instruction
+            )
         if output:
             res = f"{res}{output}"
         yield res

-    def get_response(self, output: str) -> str:
-        return output.split(self.response_split)[1].strip()
-

 class UnpromptedPrompter(AlpacaPrompter):
     """
@@ -93,7 +86,10 @@ class MultipleChoiceExplainPrompter(AlpacaPrompter):
     """

     system_prompt = (
-        "Choose the answer that best answers the question. Explain your reasoning."
+        "Choose the answer that best answers the question. Explain your reasoning.\n"
+    )
+    system_no_input_prompt = (
+        "Choose the answer that best answers the question. Explain your reasoning.\n"
     )

@@ -102,7 +98,12 @@ class MultipleChoiceConcisePrompter(AlpacaPrompter):
     Prompter for multiple choice concise
     """

-    prompt_input = "Choose the answer that best answers the question. Be concise in your response.\n\nUSER: {instruction}\n{input}\nASSISTANT:\n"
+    system_prompt = "Choose the answer that best answers the question. Be concise in your response.\n\n"
+    system_no_input_prompt = "Choose the answer that best answers the question. Be concise in your response.\n\n"
+
+    def match_prompt_style(self):
+        self.turn_format = "USER: {instruction}\n{input}\nASSISTANT:"
+        self.turn_no_input_format = "USER: {instruction}\nASSISTANT:"


 class SummarizeTLDRPrompter(AlpacaPrompter):
@@ -110,9 +111,12 @@ class SummarizeTLDRPrompter(AlpacaPrompter):
     Prompter for summarize TLDR
     """

-    prompt_no_input = (
-        "USER: Summarize the following article as a TL;DR.\n{instruction}\nASSISTANT:"
-    )
+    system_prompt = ""
+    system_no_input_prompt = ""
+
+    def match_prompt_style(self):
+        self.turn_format = "USER: Summarize the following article as a TL;DR.\n{instruction}\n{input}\nASSISTANT:"
+        self.turn_no_input_format = "USER: Summarize the following article as a TL;DR.\n{instruction}\nASSISTANT:"


 class CompletionPrompter:
@@ -128,9 +132,6 @@ class CompletionPrompter:
     ) -> Generator[str, None, None]:
         yield instruction

-    def get_response(self, output: str) -> str:
-        return output.strip()
-

 class GPTeacherPrompter(AlpacaPrompter):
     """
@@ -210,9 +211,6 @@ class ReflectAlpacaPrompter:
             res = f"{res}{label}"
         yield res

-    def get_response(self, output: str) -> str:
-        return output.split(self.response_split)[1].strip()
-

 class SeparatorStyle(Enum):
     """Different separator style."""
@@ -289,12 +287,6 @@ class ShareGPTPrompter:  # pylint: disable=too-few-public-methods
             sep2=" ",
         )

-    # def match_prompt_style(self):
-    #     if self.prompt_style == PromptStyle.chat.value:
-    #         self.prompt_input = self.system_prompt + "USER: {instruction}\n{input}\nASSISTANT:"
-    #         self.prompt_no_input = self.system_no_input_prompt + "USER: {instruction}\nASSISTANT:"
-    #         self.response_split = "ASSISTANT:"
-
     def build_prompt(self, source) -> Generator[str, None, None]:
         # ignore the system prompt if provided
         if source[0]["from"] == "system":
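To illustrate the refactor above (sample instruction text is made up), prompts are now assembled from `system_prompt` plus a `turn_format` at build time instead of being baked into `prompt_input`/`prompt_no_input`:

```python
from axolotl.prompters import AlpacaPrompter, PromptStyle

prompter = AlpacaPrompter(PromptStyle.INSTRUCT.value)
res = next(prompter.build_prompt("tell me a joke about alpacas"))
# The system text is prepended at build time:
# "Below is an instruction that describes a task. Write a response that appropriately
#  completes the request.\n\n### Instruction:\ntell me a joke about alpacas\n\n### Response:\n"
```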
@@ -102,13 +102,26 @@ def load_tokenized_prepared_datasets(
                 pass

             # prefer local dataset, even if hub exists
-            if Path(d.path).exists():
-                ds = load_dataset(
-                    "json",
-                    data_files=d.path,
-                    streaming=False,
-                    split=None,
-                )
+            local_path = Path(d.path)
+            if local_path.exists():
+                if local_path.is_dir():
+                    ds = load_dataset(
+                        d.path,
+                        data_files=d.data_files,
+                        streaming=False,
+                        split=None,
+                    )
+                elif local_path.is_file():
+                    ds = load_dataset(
+                        "json",
+                        data_files=d.path,
+                        streaming=False,
+                        split=None,
+                    )
+                else:
+                    raise ValueError(
+                        "unhandled dataset load: local path exists, but is neither a directory or a file"
+                    )
             elif ds_from_hub:
                 if d.data_files:
                     ds = load_dataset(
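A minimal sketch of the new local-dataset resolution, mirroring the branch in the hunk above; the dataset path is a made-up example and `DictDefault` stands in for a `datasets:` config entry.

```python
from pathlib import Path

from datasets import load_dataset

from axolotl.utils.dict import DictDefault

d = DictDefault({"path": "data/my_dataset.jsonl", "data_files": None})  # made-up path

local_path = Path(d.path)
if local_path.is_dir():
    # a local dataset directory is handed to load_dataset directly
    ds = load_dataset(d.path, data_files=d.data_files, streaming=False, split=None)
elif local_path.is_file():
    # a single JSON/JSONL file goes through the "json" builder
    ds = load_dataset("json", data_files=d.path, streaming=False, split=None)
```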
src/axolotl/utils/models.py:

@@ -34,15 +34,20 @@ def load_tokenizer(
     tokenizer_type,
     cfg,
 ):
+    use_fast = True  # this is the default
+    if cfg.tokenizer_use_fast is not None:
+        use_fast = cfg.tokenizer_use_fast
     if tokenizer_type:
         tokenizer = getattr(transformers, tokenizer_type).from_pretrained(
             tokenizer_config,
             trust_remote_code=cfg.trust_remote_code or False,
+            use_fast=use_fast,
         )
     else:
         tokenizer = AutoTokenizer.from_pretrained(
             tokenizer_config,
             trust_remote_code=cfg.trust_remote_code or False,
+            use_fast=use_fast,
         )

     logging.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}")
@@ -149,6 +154,8 @@ def load_model(
         )

     model_kwargs = {}
+    if cfg.model_revision:
+        model_kwargs["revision"] = cfg.model_revision
     if cfg.adapter == "qlora" and cfg.load_in_4bit:
         model_kwargs["quantization_config"] = BitsAndBytesConfig(
             load_in_4bit=True,
@@ -197,7 +204,7 @@ def load_model(
             else True,
         )
         load_in_8bit = False
-    elif cfg.is_llama_derived_model:
+    elif cfg.is_llama_derived_model and not cfg.trust_remote_code:
         from transformers import LlamaForCausalLM

         config = LlamaConfig.from_pretrained(base_model_config)
@@ -236,7 +243,7 @@ def load_model(
         #     device=cfg.device,
         # )
         # model.train() # sets to train instead of eval mode
-    elif model_type:
+    elif model_type and not cfg.trust_remote_code:
         model = getattr(transformers, model_type).from_pretrained(
             base_model,
             load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
src/axolotl/utils/sampler.py (new file, 173 lines):

@@ -0,0 +1,173 @@
# pylint: skip-file

from typing import Any, List, Optional

import numba
import numpy as np
import torch.distributed as dist
from torch.utils.data import Sampler


@numba.njit
def ffd_check(a: np.ndarray, c: int, n: int):
    # First-fit-decreasing bin packing
    # Check if a[] could fit in n bins with capacity c
    # https://en.wikipedia.org/wiki/First-fit-decreasing_bin_packing

    a = np.sort(a)[::-1]
    bins = np.full((n,), c, dtype=a.dtype)
    for size in a:
        not_found = True
        for idx in range(n):
            if bins[idx] >= size:
                bins[idx] -= size
                not_found = False
                break

        if not_found:
            return False

    return True


@numba.njit
def ffd_with_result(a: np.ndarray, c: int, start_index: int):
    # First-fit-decreasing bin packing (with result return)

    indices = np.argsort(a)[::-1]
    a = a[indices]

    bins: List[int] = []
    bins_result: List[Any] = []
    for a_id, size in enumerate(a):
        add_new = True
        for idx in range(len(bins)):
            if bins[idx] >= size:
                bins[idx] -= size
                bins_result[idx].append(indices[a_id] + start_index)
                add_new = False
                break

        if add_new:
            bins.append(c - size)
            bins_result.append([indices[a_id] + start_index])

    return bins_result


@numba.njit
def allocate(
    lengths: np.ndarray, lengths_cumsum: np.ndarray, rank: int, c: int, n: int
):
    # Dynamic batch allocator, similar to Multifit
    # https://en.wikipedia.org/wiki/Multifit_algorithm
    # ~99.5% efficiency on OpenChat training set (12 * 2048 ctx len)

    s = 0
    start_index = 0
    result = []

    while True:
        # binary search [l, r)
        left = 1
        right = 1 + np.searchsorted(lengths_cumsum[start_index:], s + c * n, "right")

        while right - left > 1:
            m = (left + right) // 2
            if ffd_check(lengths[start_index : start_index + m], c, n):
                left = m
            else:
                right = m

        # use length l
        batch = ffd_with_result(
            lengths[start_index : start_index + left], c, start_index
        )
        assert len(batch) <= n
        if len(batch) < n:
            break

        start_index += left
        s = lengths_cumsum[start_index - 1]

        # add local rank
        result.append(batch[rank])

    return result, s, len(result) * c * n


class MultipackDistributedBatchSampler(Sampler):
    """Unpadded length sampling using Multipack.
    Approximate (at most ~1.22x) the optimal solution of the identical-machines scheduling problem, which is NP-hard.
    """

    def __init__(
        self,
        batch_max_length: int,
        lengths: List[int],
        num_replicas: Optional[int] = None,
        rank: Optional[int] = None,
        seed: int = 0,
    ):
        # Get rank
        if num_replicas is None:
            if not dist.is_available():
                raise RuntimeError("Requires distributed package to be available")
            num_replicas = dist.get_world_size()
        if rank is None:
            if not dist.is_available():
                raise RuntimeError("Requires distributed package to be available")
            rank = dist.get_rank()

        self.num_replicas = num_replicas
        self.rank = rank
        self.seed = seed

        self.batch_max_length = batch_max_length
        self.lengths = lengths
        assert isinstance(self.lengths, np.ndarray)

        self.epoch = 0

        # statistics
        self.eff_total_used = 0
        self.eff_total_slots = 0

    def set_epoch(self, epoch: int):
        self.epoch = epoch

    def generate_batches(self, set_stats=False):
        indices = np.random.default_rng(seed=self.seed + self.epoch).permutation(
            len(self.lengths)
        )

        lengths = self.lengths[indices]
        lengths_cumsum = np.cumsum(lengths)

        batches, total_used, total_slots = allocate(
            lengths=lengths,
            lengths_cumsum=lengths_cumsum,
            rank=self.rank,
            c=self.batch_max_length,
            n=self.num_replicas,
        )

        batches = [indices[batch] for batch in batches]

        # statistics
        if set_stats:
            self.eff_total_used += total_used
            self.eff_total_slots += total_slots

        return batches

    def __iter__(self):
        batches = self.generate_batches(set_stats=True)
        return iter(batches)

    def num_batches(self):
        batches = self.generate_batches()
        return len(batches)

    def efficiency(self):
        return self.eff_total_used / self.eff_total_slots
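A single-process usage sketch of the new Multipack sampler; the token lengths are made-up sample data, and `num_replicas`/`rank` are passed explicitly since torch.distributed is not initialised here.

```python
import numpy as np

from axolotl.utils.sampler import MultipackDistributedBatchSampler

lengths = np.array([1800, 900, 600, 512, 400, 300, 120, 60])  # made-up token counts
sampler = MultipackDistributedBatchSampler(
    batch_max_length=2048,  # per_device_batch_size * max_seq_length in the trainer
    lengths=lengths,
    num_replicas=1,
    rank=0,
)

for batch_indices in sampler:
    # each batch is an array of dataset indices whose lengths pack into <= 2048 tokens
    print(batch_indices, lengths[batch_indices].sum())
print(f"packing efficiency: {sampler.efficiency():.2%}")
```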
src/axolotl/utils/schedulers.py:

@@ -1,6 +1,9 @@
 """Module for custom LRScheduler class"""
+import math
+from functools import partial

-from torch.optim.lr_scheduler import LRScheduler
+from torch.optim import Optimizer
+from torch.optim.lr_scheduler import LambdaLR, LRScheduler


 class InterpolatingLogScheduler(LRScheduler):
@@ -42,3 +45,58 @@ class InterpolatingLogScheduler(LRScheduler):
             lrs = [self.max_lr for base_lr in self.base_lrs]

         return lrs
+
+
+def _get_cosine_schedule_with_quadratic_warmup_lr_lambda(
+    current_step: int,
+    *,
+    num_warmup_steps: int,
+    num_training_steps: int,
+    num_cycles: float
+):
+    if current_step < num_warmup_steps:
+        return (float(current_step) / float(max(1, num_warmup_steps))) ** 2
+    progress = float(current_step - num_warmup_steps) / float(
+        max(1, num_training_steps - num_warmup_steps)
+    )
+    return max(
+        0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))
+    )
+
+
+def get_cosine_schedule_with_quadratic_warmup(
+    optimizer: Optimizer,
+    num_warmup_steps: int,
+    num_training_steps: int,
+    num_cycles: float = 0.5,
+    last_epoch: int = -1,
+):
+    """
+    Create a schedule with a learning rate that decreases following the values of the cosine function between the
+    initial lr set in the optimizer to 0, after a warmup period during which it increases linearly between 0 and the
+    initial lr set in the optimizer.
+
+    Args:
+        optimizer ([`~torch.optim.Optimizer`]):
+            The optimizer for which to schedule the learning rate.
+        num_warmup_steps (`int`):
+            The number of steps for the warmup phase.
+        num_training_steps (`int`):
+            The total number of training steps.
+        num_cycles (`float`, *optional*, defaults to 0.5):
+            The number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0
+            following a half-cosine).
+        last_epoch (`int`, *optional*, defaults to -1):
+            The index of the last epoch when resuming training.
+
+    Return:
+        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
+    """
+
+    lr_lambda = partial(
+        _get_cosine_schedule_with_quadratic_warmup_lr_lambda,
+        num_warmup_steps=num_warmup_steps,
+        num_training_steps=num_training_steps,
+        num_cycles=num_cycles,
+    )
+    return LambdaLR(optimizer, lr_lambda, last_epoch)
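A quick numeric check of the quadratic warmup factor added above (step counts are illustrative): 10% of the way through warmup, the multiplier is (0.1)**2 = 0.01 instead of the 0.1 a linear warmup would give, so the earliest steps use a much smaller learning rate.

```python
from axolotl.utils.schedulers import (
    _get_cosine_schedule_with_quadratic_warmup_lr_lambda as lr_lambda,
)

factor = lr_lambda(10, num_warmup_steps=100, num_training_steps=1000, num_cycles=0.5)
assert abs(factor - 0.01) < 1e-9
```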
@@ -34,3 +34,5 @@ def check_example_labels(example, tokenizer):
     logging.info(" ".join(colored_tokens))
     logging.info("\n\n\n")
+
+    return " ".join(colored_tokens)
@@ -5,25 +5,185 @@ import logging
 import math
 import os
 import sys
+from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Optional

 import bitsandbytes as bnb
+import numpy as np
 import torch.cuda
 import transformers
 from torch import nn
 from torch.optim.lr_scheduler import OneCycleLR
-from transformers import EarlyStoppingCallback, Trainer
+from torch.utils.data import Dataset
+from transformers import EarlyStoppingCallback, Trainer, TrainingArguments
 from transformers.trainer_pt_utils import get_parameter_names

 from axolotl.utils.callbacks import (
     SaveBetterTransformerModelCallback,
     SavePeftModelCallback,
 )
-from axolotl.utils.schedulers import InterpolatingLogScheduler
+from axolotl.utils.sampler import MultipackDistributedBatchSampler
+from axolotl.utils.schedulers import (
+    InterpolatingLogScheduler,
+    get_cosine_schedule_with_quadratic_warmup,
+)
+
+IGNORE_LABEL_ID = -100


-class OneCycleLRSchedulerTrainer(Trainer):
+def _find_multiple(val1, val2):
+    return (-(val1 // -val2)) * val2
+
+
+def batch_to_tensor(batch, pad_id=0, dtype=torch.long, loss_dtype=torch.bfloat16):
+    # Pad an unused item to reach multiple of 64, for faster GEMM
+    pad_cur_len = sum(list(batch["length"]))
+    pad_len = _find_multiple(pad_cur_len, 64) - pad_cur_len
+
+    if pad_len > 0:
+        assert pad_len < 64
+
+        batch["input_ids"].append([pad_id] * pad_len)
+        batch["labels"].append([pad_id] * pad_len)
+        batch["attention_mask"].append([0] * pad_len)
+        batch["length"].append(pad_len)
+
+    # seqlen
+    batch_lengths = torch.tensor(list(batch["length"]), dtype=torch.int32, device="cpu")
+
+    max_seqlen = torch.max(batch_lengths)
+    cu_seqlens = torch.nn.functional.pad(
+        batch_lengths.cumsum(-1, dtype=torch.int32), (1, 0)
+    )
+
+    # nz elements
+    nz_num = cu_seqlens[-1]
+    nz_input_ids = torch.zeros((nz_num,), dtype=dtype, pin_memory=True, device="cpu")
+    nz_position_ids = torch.zeros((nz_num,), dtype=dtype, pin_memory=True, device="cpu")
+    nz_shifted_label_ids = torch.zeros(
+        (nz_num,), dtype=dtype, pin_memory=True, device="cpu"
+    )
+    nz_shifted_loss_weights = torch.zeros(
+        (nz_num,), dtype=loss_dtype, pin_memory=True, device="cpu"
+    )
+
+    index = 0
+    for token_list, length, labels_list in zip(
+        batch["input_ids"], batch["length"], batch["labels"]
+    ):
+        tokens = torch.tensor(token_list, dtype=dtype, device="cpu")
+        position_ids = torch.arange(length, dtype=dtype, device="cpu")
+
+        # Input IDs & shifted labels
+        # shifted_label_ids = torch.where(masks, tokens, IGNORE_LABEL_ID)
+        shifted_label_ids = labels_list
+        shifted_label_ids = torch.nn.functional.pad(
+            shifted_label_ids[1:], (0, 1), "constant", IGNORE_LABEL_ID
+        )
+
+        nz_input_ids[index : index + length] = tokens
+        nz_position_ids[index : index + length] = position_ids
+        nz_shifted_label_ids[index : index + length] = shifted_label_ids
+
+        # Loss weights
+        mask_count = sum(1 for label in labels_list[1:] if label != IGNORE_LABEL_ID)
+        loss_weight = (
+            1 / mask_count if mask_count > 0 else 0
+        )  # Avoid division by zero for paddings
+
+        nz_shifted_loss_weights[index : index + length] = loss_weight
+
+        index += length
+
+    # inputs
+    return {
+        "max_seqlen": max_seqlen,
+        "cu_seqlens": cu_seqlens,
+        "nz_input_ids": nz_input_ids,
+        "nz_position_ids": nz_position_ids,
+        "nz_shifted_label_ids": nz_shifted_label_ids,
+        "nz_shifted_loss_weights": nz_shifted_loss_weights,
+    }
+
+
+@dataclass
+class AxolotlTrainingArguments(TrainingArguments):
+    """
+    Extend the base TrainingArguments for axolotl helpers
+    """
+
+    lr_quadratic_warmup: bool = field(
+        default=False,
+        metadata={"help": "Use quadratic warmup for cosine scheduling."},
+    )
+    sample_packing: bool = field(
+        default=True,
+        metadata={"help": "Use sample packing for efficient training."},
+    )
+    max_seq_length: int = field(
+        default=2048,
+        metadata={"help": "The maximum sequence length the model can handle"},
+    )
+
+
+class AxolotlTrainer(Trainer):
+    """
+    Extend the base Trainer for axolotl helpers
+    """
+
+    args = None  # type: AxolotlTrainingArguments
+
+    def create_scheduler(
+        self, num_training_steps: int, optimizer: torch.optim.Optimizer = None
+    ):
+        """
+        Setup the scheduler. The optimizer of the trainer must have been set up either before this method is called or
+        passed as an argument.
+
+        Args:
+            num_training_steps (int): The number of training steps to do.
+            optimizer (torch.optim.Optimizer): The training optimizer
+        """
+
+        # fmt: off
+        if self.lr_scheduler is None:  # type: ignore # pylint: disable=access-member-before-definition
+            # fmt: on
+            if (
+                self.args.lr_scheduler_type == "cosine"
+                and self.args.lr_quadratic_warmup is True
+            ):
+                self.lr_scheduler = get_cosine_schedule_with_quadratic_warmup(  # pylint: disable=attribute-defined-outside-init
+                    optimizer,
+                    num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
+                    num_training_steps=num_training_steps,
+                )
+            else:
+                return super().create_scheduler(num_training_steps, optimizer)
+        return self.lr_scheduler
+
+    def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]:
+        lengths = np.array([len(sample["input_ids"]) for sample in self.train_dataset])
+        return MultipackDistributedBatchSampler(
+            batch_max_length=self.args.per_device_train_batch_size
+            * self.args.max_seq_length,
+            lengths=lengths,
+            seed=self.args.seed,
+        )
+
+    def _get_eval_sampler(
+        self, eval_dataset: Dataset
+    ) -> Optional[torch.utils.data.Sampler]:
+        lengths = np.array([len(sample["input_ids"]) for sample in eval_dataset])
+        return MultipackDistributedBatchSampler(
+            batch_max_length=self.args.per_device_eval_batch_size
+            * self.args.max_seq_length,
+            lengths=lengths,
+            seed=self.args.seed,
+        )
+
+
+class OneCycleLRSchedulerTrainer(AxolotlTrainer):
     """
     Trainer subclass that uses the OneCycleLR scheduler
     """
@@ -103,6 +263,9 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
     if cfg.fsdp_config:
         training_arguments_kwargs["fsdp_config"] = dict(cfg.fsdp_config)

+    if cfg.lr_quadratic_warmup is not None:
+        training_arguments_kwargs["lr_quadratic_warmup"] = cfg.lr_quadratic_warmup
+
     # deepspeed
     if (
         os.environ.get("ACCELERATE_USE_DEEPSPEED") == "true"
@@ -124,7 +287,16 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
     if cfg.max_grad_norm:
         training_arguments_kwargs["max_grad_norm"] = cfg.max_grad_norm

-    training_args = transformers.TrainingArguments(
+    if cfg.hub_model_id:
+        training_arguments_kwargs["hub_model_id"] = cfg.hub_model_id
+        training_arguments_kwargs["push_to_hub"] = True
+        training_arguments_kwargs["hub_private_repo"] = True
+
+    if cfg.save_safetensors:
+        training_arguments_kwargs["save_safetensors"] = cfg.save_safetensors
+
+    training_args = AxolotlTrainingArguments(  # pylint: disable=unexpected-keyword-arg
+        max_steps=total_num_steps * cfg.num_epochs,
         per_device_train_batch_size=cfg.micro_batch_size,
         per_device_eval_batch_size=cfg.eval_batch_size
         if cfg.eval_batch_size is not None
@@ -274,7 +446,7 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
     trainer_cls = (
         OneCycleLRSchedulerTrainer
        if cfg.lr_scheduler == "one_cycle" and (cfg.fsdp or cfg.adapter == "qlora")
-        else transformers.Trainer
+        else AxolotlTrainer
     )
     trainer = trainer_cls(
         model=model,
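A worked example with illustrative numbers: the packed samplers above budget tokens, not examples, per step and per rank.

```python
# Illustrative values only; these mirror the expression used in _get_train_sampler.
per_device_train_batch_size = 4
max_seq_length = 2048

batch_max_length = per_device_train_batch_size * max_seq_length  # 8192 tokens per rank
```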
@@ -87,11 +87,16 @@ def validate_config(cfg):
             "You probably want to disable group_by_length as it will force a streamed dataset to download completely."
         )

-    if any([cfg.adamw_beta1, cfg.adamw_beta2, cfg.adamw_epsilon]) and (
+    if any([cfg.adam_beta1, cfg.adam_beta2, cfg.adam_epsilon]) and (
         not cfg.optimizer or "adamw" not in cfg.optimizer
     ):
         logging.warning("adamw hyperparameters found, but no adamw optimizer set")

+    if cfg.push_to_hub_model_id:
+        raise ValueError(
+            "push_to_hub_model_id is deprecated. Please use hub_model_id instead."
+        )
+
     # TODO
     # MPT 7b
     # https://github.com/facebookresearch/bitsandbytes/issues/25
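For reference, a sketch of a config using the renamed keys; the values are the ones exercised in the updated ValidationTest hunks at the end of this comparison, and `DictDefault` is the config wrapper used throughout.

```python
from axolotl.utils.dict import DictDefault

# adamw_* config keys are replaced by adam_* in this branch
cfg = DictDefault(
    {
        "optimizer": "adamw_bnb_8bit",
        "adam_beta1": 0.9,
        "adam_beta2": 0.99,
        "adam_epsilon": 0.0001,
    }
)
```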
@@ -7,11 +7,15 @@ from pathlib import Path
 from transformers import AutoTokenizer

 from axolotl.prompt_strategies.alpaca_chat import NoSystemPrompter
+from axolotl.prompt_strategies.alpaca_w_system import (
+    InstructionWSystemPromptTokenizingStrategy,
+    SystemDataPrompter,
+)
 from axolotl.prompt_tokenizers import (
     AlpacaPromptTokenizingStrategy,
     ShareGPTPromptTokenizingStrategy,
 )
-from axolotl.prompters import AlpacaPrompter, ShareGPTPrompter
+from axolotl.prompters import AlpacaPrompter, PromptStyle, ShareGPTPrompter

 logging.basicConfig(level="INFO")

@@ -96,5 +100,39 @@ class TestPromptTokenizationStrategies(unittest.TestCase):
         assert example["labels"][world_idx - 1] == -100


+class InstructionWSystemPromptTokenizingStrategyTest(unittest.TestCase):
+    """
+    Test class for prompt tokenization strategies with sys prompt from the dataset
+    """
+
+    def setUp(self) -> None:
+        # pylint: disable=duplicate-code
+        self.tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
+        self.tokenizer.add_special_tokens(
+            {
+                "bos_token": "<s>",
+                "eos_token": "</s>",
+                "unk_token": "<unk>",
+            }
+        )
+
+    def test_system_alpaca(self):
+        prompter = SystemDataPrompter(PromptStyle.CHAT.value)
+        strat = InstructionWSystemPromptTokenizingStrategy(
+            prompter,
+            self.tokenizer,
+            False,
+            2048,
+        )
+        sample = {
+            "system": "use cot",
+            "instruction": "hello!",
+            "output": "Hi! How can I help?",
+        }
+        example = strat.tokenize_prompt(sample)
+        assert example["input_ids"][0:3] == [1, 671, 20118]  # <s>use cot
+        assert example["input_ids"][3] == 11889  # USER
+
+
 if __name__ == "__main__":
     unittest.main()
@@ -2,7 +2,13 @@

 import unittest

-from axolotl.prompters import AlpacaPrompter, PromptStyle
+from axolotl.prompt_strategies.alpaca_w_system import SystemDataPrompter
+from axolotl.prompters import (
+    AlpacaPrompter,
+    MultipleChoiceExplainPrompter,
+    PromptStyle,
+    UnpromptedPrompter,
+)


 class AlpacaPrompterTest(unittest.TestCase):
@@ -55,3 +61,64 @@ class AlpacaPrompterTest(unittest.TestCase):
         assert "### Response:" not in res
         assert "USER:" in res
         assert "ASSISTANT:" in res
+
+    def test_system_prompt(self):
+        prompter = SystemDataPrompter(prompt_style=PromptStyle.CHAT.value)
+        res = next(
+            prompter.build_prompt_w_system(
+                "use cot", "tell me a joke about the following", "alpacas"
+            )
+        )
+        assert "use cot" in res
+        assert res.startswith("use cot")
+        assert "### Instruction:" not in res
+        assert "### Input:" not in res
+        assert "alpacas" in res
+        assert "### Response:" not in res
+        assert "USER:" in res
+        assert "ASSISTANT:" in res
+
+
+class UnpromptedPrompterTest(unittest.TestCase):
+    """
+    Test class for UnpromptedPrompter with no system prompts
+    """
+
+    def test_prompt_style_w_none(self):
+        prompter = UnpromptedPrompter(prompt_style=None)
+        res = next(prompter.build_prompt("tell me a joke"))
+        assert "### Instruction:" in res
+        assert "tell me a joke" in res
+        assert res.startswith("###")
+
+    def test_prompt_style_w_instruct(self):
+        prompter = UnpromptedPrompter(prompt_style=PromptStyle.INSTRUCT.value)
+        res = next(
+            prompter.build_prompt("tell me a joke about the following", "alpacas")
+        )
+        assert "### Instruction:" in res
+        assert "tell me a joke" in res
+        assert res.startswith("###")
+
+    def test_prompt_style_w_chat(self):
+        prompter = UnpromptedPrompter(prompt_style=PromptStyle.CHAT.value)
+        res = next(
+            prompter.build_prompt("tell me a joke about the following", "alpacas")
+        )
+        assert "USER:" in res
+        assert "tell me a joke" in res
+        assert res.startswith("USER:")
+
+
+class MultipleChoiceExplainPrompterTest(unittest.TestCase):
+    """
+    Test class for MultipleChoiceExplainPrompter
+    """
+
+    def test_prompt_style_w_chat(self):
+        prompter = MultipleChoiceExplainPrompter(prompt_style=PromptStyle.CHAT.value)
+        res = next(prompter.build_prompt("choose one", "- A\n- B\n- C", "C"))
+        assert "USER:" in res
+        assert "choose one" in res
+        assert "Choose the answer that best answers the question." in res
+        assert "- A\n- B\n- C" in res
tests/test_tokenizers.py (new file, 31 lines):

@@ -0,0 +1,31 @@
"""
Test cases for the tokenizer loading
"""
import unittest

from axolotl.utils.dict import DictDefault
from axolotl.utils.models import load_tokenizer


class TestTokenizers(unittest.TestCase):
    """
    test class for the load_tokenizer fn
    """

    def test_default_use_fast(self):
        cfg = DictDefault({})
        tokenizer = load_tokenizer("huggyllama/llama-7b", None, cfg)
        assert "Fast" in tokenizer.__class__.__name__

    def test_dont_use_fast(self):
        cfg = DictDefault(
            {
                "tokenizer_use_fast": False,
            }
        )
        tokenizer = load_tokenizer("huggyllama/llama-7b", None, cfg)
        assert "Fast" not in tokenizer.__class__.__name__


if __name__ == "__main__":
    unittest.main()
@@ -268,7 +268,7 @@ class ValidationTest(unittest.TestCase):
         cfg = DictDefault(
             {
                 "optimizer": None,
-                "adamw_epsilon": 0.0001,
+                "adam_epsilon": 0.0001,
             }
         )

@@ -283,7 +283,7 @@ class ValidationTest(unittest.TestCase):
         cfg = DictDefault(
             {
                 "optimizer": "adafactor",
-                "adamw_beta1": 0.0001,
+                "adam_beta1": 0.0001,
             }
         )

@@ -298,9 +298,9 @@ class ValidationTest(unittest.TestCase):
         cfg = DictDefault(
             {
                 "optimizer": "adamw_bnb_8bit",
-                "adamw_beta1": 0.0001,
-                "adamw_beta2": 0.0001,
-                "adamw_epsilon": 0.0001,
+                "adam_beta1": 0.9,
+                "adam_beta2": 0.99,
+                "adam_epsilon": 0.0001,
             }
         )
