Compare commits: v0.2.0...exp-expand (46 commits)
| SHA1 |
|---|
| 6fcb73faaa |
| a32cc1d021 |
| 86bd9fcff4 |
| ed7531abb8 |
| bdb547b830 |
| 8a37b43678 |
| 28acebac36 |
| adea682316 |
| a6f5e5eaec |
| 5a631b305b |
| f94dd626f0 |
| 5079753b7a |
| 0136f510f2 |
| 9b8585dc70 |
| 8eb5811d4e |
| e0011fdf55 |
| 6e9e98720e |
| c2a0792680 |
| b267d24a2b |
| 5c3f5db38b |
| e3d03745ba |
| fac46002d4 |
| 33d40179ba |
| dcb03d6da4 |
| 0e4be625ae |
| bdc4bd7d4e |
| 2d0ba3b818 |
| c7021e191f |
| c56818b119 |
| 2675fb756e |
| 1076bcbbca |
| 2daa6835f0 |
| e3c494ca7b |
| ad0ea6aaab |
| 876edd83d0 |
| 6cb2310592 |
| 6fa40bf8ad |
| 3aad5f3b3e |
| 39a208c2bc |
| 2520ecd6df |
| c5b0af1a7e |
| 988aeb9c34 |
| cf61f14bff |
| 0abcd71a85 |
| c43c5c84ff |
| 36ec6e1a0e |
.github/workflows/base.yml (12 lines changed, vendored)

```diff
@@ -16,13 +16,22 @@ jobs:
       include:
         - cuda: "118"
           cuda_version: 11.8.0
+          python_version: "3.9"
           pytorch: 2.0.0
           axolotl_extras:
+        - cuda: "118"
+          cuda_version: 11.8.0
+          python_version: "3.10"
+          pytorch: 2.0.0
+          axolotl_extras:
         - cuda: "117"
           cuda_version: 11.7.0
+          python_version: "3.9"
           pytorch: 1.13.1
           axolotl_extras:
         - cuda: "118"
           cuda_version: 11.8.0
+          python_version: "3.9"
           pytorch: 2.0.0
           axolotl_extras: gptq
       steps:
@@ -46,12 +55,13 @@ jobs:
           context: .
           file: ./docker/Dockerfile-base
           push: ${{ github.event_name != 'pull_request' }}
-          tags: ${{ steps.metadata.outputs.tags }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
+          tags: ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
           labels: ${{ steps.metadata.outputs.labels }}
           cache-from: type=gha
           cache-to: type=gha,mode=max
           build-args: |
             CUDA_VERSION=${{ matrix.cuda_version }}
             CUDA=${{ matrix.cuda }}
+            PYTHON_VERSION=${{ matrix.python_version }}
             PYTORCH_VERSION=${{ matrix.pytorch }}
             AXOLOTL_EXTRAS=${{ matrix.axolotl_extras }}
```
.github/workflows/main.yml (24 lines changed, vendored)

```diff
@@ -15,14 +15,22 @@ jobs:
       include:
         - cuda: cu118
           cuda_version: 11.8.0
+          python_version: "3.9"
           pytorch: 2.0.0
           axolotl_extras:
+        - cuda: cu118
+          cuda_version: 11.8.0
+          python_version: "3.10"
+          pytorch: 2.0.0
+          axolotl_extras:
         - cuda: cu118
           cuda_version: 11.8.0
+          python_version: "3.9"
           pytorch: 2.0.0
           axolotl_extras: gptq
         - cuda: cu117
           cuda_version: 11.7.0
+          python_version: "3.9"
           pytorch: 1.13.1
           axolotl_extras:
     runs-on: self-hosted
@@ -46,10 +54,10 @@ jobs:
       with:
         context: .
         build-args: |
-          BASE_TAG=${{ github.ref_name }}-base-${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
+          BASE_TAG=${{ github.ref_name }}-base-py${{ matrix.python_version }}-${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
         file: ./docker/Dockerfile
         push: ${{ github.event_name != 'pull_request' }}
-        tags: ${{ steps.metadata.outputs.tags }}-${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
+        tags: ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
         labels: ${{ steps.metadata.outputs.labels }}
         cache-from: type=gha
         cache-to: type=gha,mode=max
@@ -62,14 +70,22 @@ jobs:
       include:
         - cuda: cu118
           cuda_version: 11.8.0
+          python_version: "3.9"
           pytorch: 2.0.0
           axolotl_extras:
+        - cuda: cu118
+          cuda_version: 11.8.0
+          python_version: "3.10"
+          pytorch: 2.0.0
+          axolotl_extras:
         - cuda: cu118
           cuda_version: 11.8.0
+          python_version: "3.9"
           pytorch: 2.0.0
           axolotl_extras: gptq
         - cuda: cu117
           cuda_version: 11.7.0
+          python_version: "3.9"
           pytorch: 1.13.1
           axolotl_extras:
     runs-on: self-hosted
@@ -93,10 +109,10 @@ jobs:
       with:
         context: .
         build-args: |
-          BASE_TAG=${{ github.ref_name }}-${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
+          BASE_TAG=${{ github.ref_name }}-py${{ matrix.python_version }}-${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
         file: ./docker/Dockerfile-runpod
         push: ${{ github.event_name != 'pull_request' }}
-        tags: ${{ steps.metadata.outputs.tags }}-${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
+        tags: ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
         labels: ${{ steps.metadata.outputs.labels }}
         cache-from: type=gha
         cache-to: type=gha,mode=max
```
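Both workflows now thread the Python version through the Docker image tags. A minimal sketch of how the new tag template expands, in plain Python; the `ref_name` and matrix values here are illustrative, not taken from the workflows:

```python
# Sketch: how the new BASE_TAG/tags templates expand (illustrative values).
def base_tag(ref_name: str, python_version: str, cuda: str, pytorch: str, extras: str = "") -> str:
    # mirrors: {ref_name}-base-py{python_version}-{cuda}-{pytorch}[-{extras}]
    suffix = f"-{extras}" if extras else ""
    return f"{ref_name}-base-py{python_version}-{cuda}-{pytorch}{suffix}"

assert base_tag("main", "3.10", "cu118", "2.0.0") == "main-base-py3.10-cu118-2.0.0"
assert base_tag("main", "3.9", "cu118", "2.0.0", "gptq") == "main-base-py3.9-cu118-2.0.0-gptq"
```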
mypy configuration

```diff
@@ -5,6 +5,9 @@ exclude = venv

 [mypy-alpaca_lora_4bit.*]
 ignore_missing_imports = True

+[mypy-axolotl.monkeypatch.*]
+ignore_errors = True
+
 [mypy-flash_attn.*]
 ignore_missing_imports = True
@@ -31,3 +34,6 @@ ignore_missing_imports = True

 [mypy-addict]
 ignore_missing_imports = True
+
+[mypy-xformers.*]
+ignore_missing_imports = True
```
README.md (24 lines changed)

````diff
@@ -27,7 +27,7 @@

 ## Quickstart ⚡

-**Requirements**: Python 3.9.
+**Requirements**: Python 3.9 and Pytorch 2.0.

 ```bash
 git clone https://github.com/OpenAccess-AI-Collective/axolotl
@@ -58,7 +58,9 @@ accelerate launch scripts/finetune.py examples/lora-openllama-3b/config.yml \
 - Conda/Pip venv
   1. Install python **3.9**

-  2. Install python dependencies with ONE of the following:
+  2. Install pytorch stable https://pytorch.org/get-started/locally/
+
+  3. Install python dependencies with ONE of the following:
      - `pip3 install -e .` (recommended, supports QLoRA, no gptq/int4 support)
      - `pip3 install -e .[gptq]` (next best if you don't need QLoRA, but want to use gptq)
      - `pip3 install -e .[gptq_triton]`
@@ -171,6 +173,9 @@ base_model_ignore_patterns:
 # if the base_model repo on hf hub doesn't include configuration .json files,
 # you can set that here, or leave this empty to default to base_model
 base_model_config: ./llama-7b-hf
+# Optional tokenizer configuration override in case you want to use a different tokenizer
+# than the one defined in the base model
+tokenizer_config:
 # If you want to specify the type of model to load, AutoModelForCausalLM is a good choice too
 model_type: AutoModelForCausalLM
 # Corresponding tokenizer for the model AutoTokenizer is a good choice
@@ -260,7 +265,7 @@ wandb_log_model: # 'checkpoint'
 output_dir: ./completed-model

 # training hyperparameters
-batch_size: 8
+gradient_accumulation_steps: 1
 micro_batch_size: 2
 eval_batch_size: 2
 num_epochs: 3
@@ -300,6 +305,9 @@ weight_decay:
 xformers_attention:
 # whether to use flash attention patch https://github.com/HazyResearch/flash-attention:
 flash_attention: # require a100 for llama
+# whether to use scaled-dot-product attention
+# https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
+sdp_attention:

 # resume from a specific checkpoint dir
 resume_from_checkpoint:
@@ -403,6 +411,16 @@ Try to turn off xformers.

 Join our [Discord server](https://discord.gg/HhrNrHJPRb) where we can help you

+## Badge ❤🏷️
+
+Building something cool with Axolotl? Consider adding a badge to your model card.
+
+```markdown
+[<img src="https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/OpenAccess-AI-Collective/axolotl)
+```
+
+[<img src="https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/OpenAccess-AI-Collective/axolotl)
+
 ## Contributing 🤝

 Bugs? Please check for open issue else create a new [Issue](https://github.com/OpenAccess-AI-Collective/axolotl/issues/new).
````
Example config updates swapping batch_size for gradient_accumulation_steps:

```diff
@@ -26,7 +26,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-alpaca
-batch_size: 32
+gradient_accumulation_steps: 1
 micro_batch_size: 4
 num_epochs: 5
 learning_rate: 0.0003
```

```diff
@@ -23,7 +23,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-llama-alpaca
-batch_size: 32
+gradient_accumulation_steps: 1
 micro_batch_size: 16
 num_epochs: 3
 learning_rate: 0.00003
```
Removed example config (gpt4all-neox-20b):

```diff
@@ -1,39 +0,0 @@
-base_model: EleutherAI/gpt-neox-20b
-base_model_ignore_patterns: pytorch*  # prefer safetensors
-model_type: GPTNeoXForCausalLM
-tokenizer_type: AutoTokenizer
-load_in_8bit: true
-datasets:
-  - path: nomic-ai/gpt4all-j-prompt-generations
-    type: alpaca
-    shards: 4
-    shards_index: 0
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.05
-adapter: lora
-lora_model_dir:
-sequence_len: 2048
-max_packed_sequence_len: 2048
-lora_r: 8
-lora_alpha: 32
-lora_dropout: 0.05
-lora_target_modules:
-  - query_key_value
-lora_fan_in_fan_out: true  # pythia/GPTNeoX lora specific
-wandb_project: gpt4all-neox-20b
-wandb_watch:
-wandb_run_id:
-wandb_log_model:
-output_dir: ./gpt4all-neox-20b
-batch_size: 48
-micro_batch_size: 4
-num_epochs: 5
-learning_rate: 0.00003
-lr_scheduler: one_cycle
-train_on_inputs: false
-group_by_length: false
-bf16: True
-tf32: True
-early_stopping_patience:
-resume_from_checkpoint:
-local_rank:
```
```diff
@@ -23,7 +23,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./llama-13b-sharegpt
-batch_size: 64
+gradient_accumulation_steps: 1
 micro_batch_size: 2
 warmup_steps: 1000
 save_steps:
```

```diff
@@ -29,7 +29,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-llama-alpaca
-batch_size: 128
+gradient_accumulation_steps: 1
 micro_batch_size: 16
 warmup_steps: 1000
 save_steps:
```

```diff
@@ -26,7 +26,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-test
-batch_size: 8
+gradient_accumulation_steps: 1
 micro_batch_size: 2
 num_epochs: 3
 warmup_steps: 100
```

```diff
@@ -28,7 +28,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-llama-alpaca
-batch_size: 128
+gradient_accumulation_steps: 1
 micro_batch_size: 16
 num_epochs: 5
 learning_rate: 0.00003
```

```diff
@@ -24,7 +24,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./jeopardy-bot-7b
-batch_size: 4
+gradient_accumulation_steps: 2
 micro_batch_size: 1
 num_epochs: 2
 optimizer: adamw_bnb_8bit
```

```diff
@@ -28,7 +28,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-alpaca
-batch_size: 48
+gradient_accumulation_steps: 1
 micro_batch_size: 4
 num_epochs: 5
 learning_rate: 0.00001
```

```diff
@@ -26,7 +26,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-test
-batch_size: 4
+gradient_accumulation_steps: 1
 micro_batch_size: 1
 num_epochs: 3
 warmup_steps: 100
```
```diff
@@ -53,7 +53,8 @@ wandb_log_model:
 # where to save the finished model to
 output_dir: ./completed-model
 # training hyperparameters
-batch_size: 8
+gradient_accumulation_steps: 1
+batch_size:
 micro_batch_size: 2
 num_epochs: 3
 warmup_steps: 100
```
```diff
@@ -22,7 +22,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./stable-alpaca-3b
-batch_size: 2
+gradient_accumulation_steps: 1
 micro_batch_size: 1
 num_epochs: 1
 optimizer: adamw_bnb_8bit
```

```diff
@@ -30,7 +30,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-reflect
-batch_size: 8
+gradient_accumulation_steps: 1
 micro_batch_size: 2
 num_epochs: 3
 learning_rate: 0.00003
```
docker/Dockerfile-base

```diff
@@ -52,6 +52,8 @@ RUN git clone https://github.com/HazyResearch/flash-attention.git && \

 FROM base-builder AS deepspeed-builder

+ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
+
 WORKDIR /workspace

 RUN git clone https://github.com/microsoft/DeepSpeed.git && \
```
```diff
@@ -26,7 +26,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./llama-7b-lora-int4
-batch_size: 1
+gradient_accumulation_steps: 1
 micro_batch_size: 1
 num_epochs: 3
 optimizer: adamw_bnb_8bit
```

```diff
@@ -24,7 +24,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./mpt-alpaca-7b
-batch_size: 1
+gradient_accumulation_steps: 1
 micro_batch_size: 1
 num_epochs: 3
 optimizer: adamw_bnb_8bit
```
examples/pythia-12b/README.md (new file, 10 lines)

````markdown
# Pythia 12B

- Single-GPU A100 only (?)

```shell
python scripts/finetune.py examples/pythia-12b/config.yml
```

⚠️ Multiple-GPU A100 - Doesn't seem to work with multi-gpu without causing OOM! ⚠️
````
examples/pythia-12b/config.yml (new file, 49 lines)

```yaml
base_model: EleutherAI/pythia-12b-deduped
base_model_config: EleutherAI/pythia-12b-deduped
base_model_ignore_patterns: pytorch*  # prefer safetensors
model_type: GPTNeoXForCausalLM
tokenizer_type: AutoTokenizer
load_in_8bit: false
load_in_4bit: false
gptq: false
device_map: auto
datasets:
  - path: vicgalle/alpaca-gpt4
    type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.05
adapter:
lora_model_dir:
sequence_len: 2048
max_packed_sequence_len: 2048
lora_r: 64
lora_alpha: 32
lora_dropout: 0.0
lora_target_modules:
lora_target_linear: true
lora_fan_in_fan_out: true  # pythia/GPTNeoX lora specific
wandb_project: pythia-12b
wandb_watch:
wandb_run_id:
wandb_log_model:
output_dir: ./pythia-12b
gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 5
learning_rate: 0.00003
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
train_on_inputs: false
group_by_length: false
bf16: false
fp16: false
float16: true
tf32: true
flash_optimum: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
gradient_checkpointing: true
fsdp:
fsdp_transformer_layer_cls_to_wrap:
collator_pad_to_longest: true
```
image/axolotl-badge-web.png (new binary file, 11 KiB; not shown)
requirements.txt

```diff
@@ -1,6 +1,7 @@
 peft @ git+https://github.com/huggingface/peft.git
 transformers @ git+https://github.com/huggingface/transformers.git
 bitsandbytes>=0.39.0
 accelerate
+addict
 fire
 PyYAML==6.0
@@ -10,6 +11,7 @@ sentencepiece
 wandb
 einops
 xformers
+optimum
 # qlora things
 bert-score==0.3.13
 evaluate==0.4.0
```
scripts/finetune.py

```diff
@@ -13,11 +13,14 @@ import fire
 import torch
 import yaml

-from axolotl.utils.data import load_prepare_datasets
-# add src to the pythonpath so we don't need to pip install this
+from datasets import Dataset
+from optimum.bettertransformer import BetterTransformer
+from transformers import GenerationConfig
+
+from axolotl.utils.data import load_prepare_datasets, load_pretraining_dataset
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.models import load_model, load_tokenizer

+# add src to the pythonpath so we don't need to pip install this
 from axolotl.utils.tokenization import check_dataset_labels
 from axolotl.utils.trainer import setup_trainer
 from axolotl.utils.validation import validate_config
@@ -46,10 +49,11 @@ def choose_device(cfg):
             return "cpu"

     cfg.device = get_device()
-    if cfg.device == "cuda":
-        cfg.device_map = {"": cfg.local_rank}
-    else:
-        cfg.device_map = {"": cfg.device}
+    if cfg.device_map != "auto":
+        if cfg.device == "cuda":
+            cfg.device_map = {"": cfg.local_rank}
+        else:
+            cfg.device_map = {"": cfg.device}


 def get_multi_line_input() -> Optional[str]:
@@ -73,26 +77,33 @@ def do_inference(cfg, model, tokenizer, prompter="AlpacaPrompter"):
         instruction = get_multi_line_input()
         if not instruction:
             return
-        prompt: str = next(prompter_module().build_prompt(instruction=instruction))
+        prompt: str = next(
+            prompter_module().build_prompt(instruction=instruction.strip("\n"))
+        )
         batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)

         model.eval()
         with torch.no_grad():
-            # gc = GenerationConfig() # TODO swap out and use this
-            generated = model.generate(
-                inputs=batch["input_ids"].to(cfg.device),
-                do_sample=True,
-                use_cache=True,
+            generation_config = GenerationConfig(
                 repetition_penalty=1.1,
-                max_new_tokens=100,
+                max_new_tokens=1024,
                 temperature=0.9,
                 top_p=0.95,
                 top_k=40,
                 bos_token_id=tokenizer.bos_token_id,
                 eos_token_id=tokenizer.eos_token_id,
                 pad_token_id=tokenizer.pad_token_id,
+                do_sample=True,
+                use_cache=True,
+                return_dict_in_generate=True,
+                output_attentions=False,
+                output_hidden_states=False,
+                output_scores=False,
+            )
+            generated = model.generate(
+                inputs=batch["input_ids"].to(cfg.device),
+                generation_config=generation_config,
             )
         print(tokenizer.decode(generated["sequences"].cpu().tolist()[0]))
```
```diff
@@ -149,17 +160,23 @@ def train(
         else:
             cfg[k] = kwargs[k]

+    validate_config(cfg)
+
     # setup some derived config / hyperparams
-    cfg.gradient_accumulation_steps = cfg.batch_size // cfg.micro_batch_size
+    cfg.gradient_accumulation_steps = cfg.gradient_accumulation_steps or (
+        cfg.batch_size // cfg.micro_batch_size
+    )
+    cfg.batch_size = (
+        cfg.batch_size or cfg.micro_batch_size * cfg.gradient_accumulation_steps
+    )
     cfg.world_size = int(os.environ.get("WORLD_SIZE", 1))
     cfg.local_rank = int(os.environ.get("LOCAL_RANK", 0))
     choose_device(cfg)
     cfg.ddp = cfg.ddp if cfg.ddp is not None else cfg.world_size != 1
     if cfg.ddp:
         cfg.device_map = {"": int(os.environ.get("LOCAL_RANK", 0))}
+        cfg.gradient_accumulation_steps = (
+            cfg.gradient_accumulation_steps // cfg.world_size
+        )
         cfg.batch_size = cfg.batch_size * cfg.world_size

     setup_wandb_env_vars(cfg)
     if cfg.device == "mps":
         cfg.load_in_8bit = False
```
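With this change a config may specify either `batch_size` or `gradient_accumulation_steps` and the other is derived (then adjusted for DDP world size). A minimal sketch of the invariant, in plain Python with hypothetical values:

```python
# Sketch of the derivation above: per-device effective batch size is
# micro_batch_size * gradient_accumulation_steps, so given one of
# batch_size / gradient_accumulation_steps the other follows.
def derive(micro_batch_size, batch_size=None, gradient_accumulation_steps=None):
    gradient_accumulation_steps = gradient_accumulation_steps or (
        batch_size // micro_batch_size
    )
    batch_size = batch_size or micro_batch_size * gradient_accumulation_steps
    return batch_size, gradient_accumulation_steps

assert derive(2, batch_size=8) == (8, 4)
assert derive(2, gradient_accumulation_steps=4) == (8, 4)
```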
```diff
@@ -168,18 +185,28 @@ def train(
         cfg.fp16 = True
         cfg.bf16 = False

-    validate_config(cfg)
-
     # load the tokenizer first
-    logging.info("loading tokenizer...")
-    tokenizer = load_tokenizer(cfg.base_model_config, cfg.tokenizer_type, cfg)
+    tokenizer_config = cfg.tokenizer_config or cfg.base_model_config
+    logging.info(f"loading tokenizer... {tokenizer_config}")
+    tokenizer = load_tokenizer(tokenizer_config, cfg.tokenizer_type, cfg)

     if check_not_in(
         ["inference", "shard", "merge_lora"], kwargs
     ):  # don't need to load dataset for these
-        train_dataset, eval_dataset = load_prepare_datasets(
-            tokenizer, cfg, DEFAULT_DATASET_PREPARED_PATH
-        )
+        if not cfg.pretraining_dataset:
+            train_dataset, eval_dataset = load_prepare_datasets(
+                tokenizer, cfg, DEFAULT_DATASET_PREPARED_PATH
+            )
+        else:
+            if cfg.pretraining_dataset is True:
+                pretraining_dataset = "togethercomputer/RedPajama-Data-1T"
+            else:
+                pretraining_dataset = cfg.pretraining_dataset
+            train_dataset = load_pretraining_dataset(
+                pretraining_dataset, tokenizer, max_tokens=cfg.sequence_len
+            )
+            train_dataset = Dataset.from_list(list(train_dataset))
+            eval_dataset = None

     if cfg.debug or "debug" in kwargs:
         logging.info("check_dataset_labels...")
@@ -225,6 +252,21 @@ def train(
         model.save_pretrained(cfg.output_dir)
         return

+    if cfg.debug:
+        logging.info("check_dataset_labels...")
+        check_dataset_labels(
+            train_dataset.select(
+                [random.randrange(0, len(train_dataset) - 1) for i in range(5)]  # nosec
+            ),
+            tokenizer,
+        )
+
+    if prepare_ds_only:
+        logging.info("Finished preparing dataset. Exiting...")
+        return
+
+    model.train()
+
     trainer = setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer)

     model.config.use_cache = False
@@ -240,12 +282,15 @@ def train(

     # In case we want to stop early with ctrl+c, this is a nice to have to save the pretrained model
     if cfg.local_rank == 0:
+
+        def terminate_handler(_, __, model):
+            if cfg.flash_optimum:
+                model = BetterTransformer.reverse(model)
+            model.save_pretrained(cfg.output_dir)
+            sys.exit(0)
+
         signal.signal(
-            signal.SIGINT,
-            lambda signal, frame: (
-                model.save_pretrained(cfg.output_dir),
-                sys.exit(0),
-            ),
+            signal.SIGINT, lambda signum, frame: terminate_handler(signum, frame, model)
         )

     logging.info("Starting trainer...")
@@ -265,13 +310,22 @@ def train(
         logging.info(
             f"Using Auto-resume functionality to start with checkpoint at {resume_from_checkpoint}"
         )
-    trainer.train(resume_from_checkpoint=resume_from_checkpoint)
+
+    if cfg.flash_optimum:
+        with torch.backends.cuda.sdp_kernel(
+            enable_flash=True, enable_math=True, enable_mem_efficient=True
+        ):
+            trainer.train(resume_from_checkpoint=resume_from_checkpoint)
+    else:
+        trainer.train(resume_from_checkpoint=resume_from_checkpoint)

     logging.info(f"Training Completed!!! Saving pre-trained model to {cfg.output_dir}")

     # TODO do we need this fix? https://huggingface.co/docs/accelerate/usage_guides/fsdp#saving-and-loading
     # only save on rank 0, otherwise it corrupts output on multi-GPU when multiple processes attempt to write the same file
     if cfg.local_rank == 0:
+        if cfg.flash_optimum:
+            model = BetterTransformer.reverse(model)
         model.save_pretrained(cfg.output_dir)

     # trainer.save_model(cfg.output_dir)  # TODO this may be needed for deepspeed to work? need to review another time
```
src/axolotl/datasets.py

```diff
@@ -127,6 +127,11 @@ class ConstantLengthDataset(IterableDataset):
                     input_ids = example["input_ids"]
                     attention_mask = example["attention_mask"]
                     labels = example["labels"]
+                    if (
+                        buffer["input_ids"]
+                        and input_ids[0] == self.tokenizer.bos_token_id
+                    ):
+                        attention_mask[0] = 0

                     if add_concat_token:
                         input_ids.append(self.concat_token_id)
```
```diff
@@ -25,6 +25,7 @@ def forward(

     attention_mask: [bsz, q_len]
     """
+    # pylint: disable=duplicate-code
     bsz, q_len, _ = hidden_states.size()

     query_states = (
```
src/axolotl/monkeypatch/llama_attn_hijack_xformers.py (new file, 233 lines)

```python
"""
Directly copied the code from https://raw.githubusercontent.com/oobabooga/text-generation-webui/main/modules/llama_attn_hijack.py and made some adjustments
"""

import logging
import math
from typing import Optional, Tuple

import torch
import transformers.models.llama.modeling_llama
from torch import nn

try:
    import xformers.ops
except ImportError:
    logging.error("xformers not found! Please install it before trying to use it.")


def hijack_llama_attention():
    transformers.models.llama.modeling_llama.LlamaAttention.forward = xformers_forward


def hijack_llama_sdp_attention():
    transformers.models.llama.modeling_llama.LlamaAttention.forward = (
        sdp_attention_forward
    )


def xformers_forward(
    self,
    hidden_states: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_value: Optional[Tuple[torch.Tensor]] = None,
    output_attentions: bool = False,
    use_cache: bool = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
    # pylint: disable=duplicate-code
    bsz, q_len, _ = hidden_states.size()

    query_states = (
        self.q_proj(hidden_states)
        .view(bsz, q_len, self.num_heads, self.head_dim)
        .transpose(1, 2)
    )
    key_states = (
        self.k_proj(hidden_states)
        .view(bsz, q_len, self.num_heads, self.head_dim)
        .transpose(1, 2)
    )
    value_states = (
        self.v_proj(hidden_states)
        .view(bsz, q_len, self.num_heads, self.head_dim)
        .transpose(1, 2)
    )

    kv_seq_len = key_states.shape[-2]
    if past_key_value is not None:
        kv_seq_len += past_key_value[0].shape[-2]
    cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
    (
        query_states,
        key_states,
    ) = transformers.models.llama.modeling_llama.apply_rotary_pos_emb(
        query_states, key_states, cos, sin, position_ids
    )
    # [bsz, nh, t, hd]

    if past_key_value is not None:
        # reuse k, v, self_attention
        key_states = torch.cat([past_key_value[0], key_states], dim=2)
        value_states = torch.cat([past_key_value[1], value_states], dim=2)

    past_key_value = (key_states, value_states) if use_cache else None

    # We only apply xformers optimizations if we don't need to output the whole attention matrix
    if not output_attentions:
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        # This is a nasty hack. We know attention_mask in transformers is either LowerTriangular or all Zeros.
        # We therefore check if one element in the upper triangular portion is zero. If it is, then the mask is all zeros.
        if attention_mask is None or attention_mask[0, 0, 0, 1] == 0:
            # input and output should be of form (bsz, q_len, num_heads, head_dim)
            attn_output = xformers.ops.memory_efficient_attention(
                query_states, key_states, value_states, attn_bias=None
            )
        else:
            # input and output should be of form (bsz, q_len, num_heads, head_dim)
            attn_output = xformers.ops.memory_efficient_attention(
                query_states,
                key_states,
                value_states,
                attn_bias=xformers.ops.LowerTriangularMask(),
            )
        attn_weights = None
    else:
        attn_weights = torch.matmul(
            query_states, key_states.transpose(2, 3)
        ) / math.sqrt(self.head_dim)

        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, q_len, kv_seq_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights + attention_mask
            attn_weights = torch.max(
                attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min)
            )

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(
            attn_weights, dim=-1, dtype=torch.float32
        ).to(query_states.dtype)
        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.transpose(1, 2)

    attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
    attn_output = self.o_proj(attn_output)
    return attn_output, attn_weights, past_key_value


def sdp_attention_forward(
    self,
    hidden_states: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_value: Optional[Tuple[torch.Tensor]] = None,
    output_attentions: bool = False,
    use_cache: bool = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
    # pylint: disable=duplicate-code
    bsz, q_len, _ = hidden_states.size()

    query_states = (
        self.q_proj(hidden_states)
        .view(bsz, q_len, self.num_heads, self.head_dim)
        .transpose(1, 2)
    )
    key_states = (
        self.k_proj(hidden_states)
        .view(bsz, q_len, self.num_heads, self.head_dim)
        .transpose(1, 2)
    )
    value_states = (
        self.v_proj(hidden_states)
        .view(bsz, q_len, self.num_heads, self.head_dim)
        .transpose(1, 2)
    )

    kv_seq_len = key_states.shape[-2]
    if past_key_value is not None:
        kv_seq_len += past_key_value[0].shape[-2]
    cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
    (
        query_states,
        key_states,
    ) = transformers.models.llama.modeling_llama.apply_rotary_pos_emb(
        query_states, key_states, cos, sin, position_ids
    )
    # [bsz, nh, t, hd]

    if past_key_value is not None:
        # reuse k, v, self_attention
        key_states = torch.cat([past_key_value[0], key_states], dim=2)
        value_states = torch.cat([past_key_value[1], value_states], dim=2)

    past_key_value = (key_states, value_states) if use_cache else None

    # We only apply sdp attention if we don't need to output the whole attention matrix
    if not output_attentions:
        attn_output = torch.nn.functional.scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            attn_mask=attention_mask,
            is_causal=False,
        )
        attn_weights = None
    else:
        attn_weights = torch.matmul(
            query_states, key_states.transpose(2, 3)
        ) / math.sqrt(self.head_dim)

        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, q_len, kv_seq_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights + attention_mask
            attn_weights = torch.max(
                attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min)
            )

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(
            attn_weights, dim=-1, dtype=torch.float32
        ).to(query_states.dtype)
        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

    attn_output = attn_output.transpose(1, 2)
    attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)

    attn_output = self.o_proj(attn_output)

    return attn_output, attn_weights, past_key_value
```
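Both hijack functions replace `LlamaAttention.forward` module-wide, so they must run before the model is instantiated. A minimal usage sketch under that assumption; the checkpoint name is illustrative:

```python
# Sketch: apply the xformers attention patch before building a llama model.
# The patch swaps LlamaAttention.forward globally, so call it first.
from transformers import AutoModelForCausalLM

from axolotl.monkeypatch.llama_attn_hijack_xformers import hijack_llama_attention

hijack_llama_attention()  # or hijack_llama_sdp_attention()
model = AutoModelForCausalLM.from_pretrained("huggyllama/llama-7b")  # illustrative checkpoint
```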
src/axolotl/utils/callbacks.py

```diff
@@ -2,13 +2,14 @@

 import os

+from optimum.bettertransformer import BetterTransformer
 from transformers import (
     TrainerCallback,
     TrainerControl,
     TrainerState,
     TrainingArguments,
 )
-from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
+from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR, IntervalStrategy


 class SavePeftModelCallback(TrainerCallback):  # pylint: disable=too-few-public-methods
@@ -30,3 +31,39 @@ class SavePeftModelCallback(TrainerCallback):  # pylint: disable=too-few-public-
             kwargs["model"].save_pretrained(peft_model_path)

         return control
+
+
+class SaveBetterTransformerModelCallback(
+    TrainerCallback
+):  # pylint: disable=too-few-public-methods
+    """Callback to save the BetterTransformer wrapped model"""
+
+    def on_step_end(
+        self,
+        args: TrainingArguments,
+        state: TrainerState,
+        control: TrainerControl,
+        **kwargs,
+    ):
+        # Save
+        if (
+            args.save_strategy == IntervalStrategy.STEPS
+            and args.save_steps > 0
+            and state.global_step % args.save_steps == 0
+        ):
+            control.should_save = True
+
+        if control.should_save:
+            checkpoint_folder = os.path.join(
+                args.output_dir,
+                f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}",
+            )
+
+            model = BetterTransformer.reverse(kwargs["model"])
+            model.save_pretrained(checkpoint_folder)
+            # FIXME - need to cleanup old checkpoints
+
+            # since we're saving here, we don't need the trainer loop to attempt to save too b/c
+            # the trainer will raise an exception since it can't save a BetterTransformer wrapped model
+            control.should_save = False
+        return control
```
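On each save step the callback writes an unwrapped checkpoint and then clears `control.should_save`, so the trainer loop never tries to serialize the still-wrapped model itself. A minimal sketch of wiring it into a Hugging Face `Trainer`; the model and dataset are stand-ins created elsewhere and the argument values are illustrative:

```python
# Sketch: registering the callback; model/train_dataset are placeholders.
from transformers import Trainer, TrainingArguments

from axolotl.utils.callbacks import SaveBetterTransformerModelCallback

args = TrainingArguments(
    output_dir="./completed-model",  # illustrative
    save_strategy="steps",
    save_steps=100,
)
trainer = Trainer(
    model=model,  # assumed: a BetterTransformer-wrapped model built earlier
    args=args,
    train_dataset=train_dataset,  # assumed: a tokenized dataset built earlier
    callbacks=[SaveBetterTransformerModelCallback],
)
```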
src/axolotl/utils/data.py

```diff
@@ -5,7 +5,8 @@ from hashlib import md5
 from pathlib import Path
 from typing import List, Tuple, Union

-from datasets import Dataset, DatasetDict, load_dataset, load_from_disk
+import torch
+from datasets import Dataset, DatasetDict, IterableDataset, load_dataset, load_from_disk
 from huggingface_hub import hf_hub_download
 from transformers import PreTrainedTokenizerBase

@@ -233,6 +234,7 @@ def load_tokenized_prepared_datasets(
             datasets.append(ds_wrapper)
         else:
             logging.error(f"unhandled prompt tokenization strategy: {d.type}")
+            raise ValueError(f"unhandled prompt tokenization strategy: {d.type}")
     logging.info("tokenizing, merging, and shuffling master dataset")

     samples: List[int] = []
@@ -379,8 +381,43 @@ def load_prepare_datasets(
             index=cfg.dataset_shard_idx,
         )

-    dataset = dataset.train_test_split(test_size=cfg.val_set_size, shuffle=False)
-    train_dataset = dataset["train"]
-    eval_dataset = dataset["test"]
+    if cfg.val_set_size:
+        dataset = dataset.train_test_split(test_size=cfg.val_set_size, shuffle=False)
+        train_dataset = dataset["train"]
+        eval_dataset = dataset["test"]
+    else:
+        train_dataset = dataset
+        eval_dataset = None

     return train_dataset, eval_dataset
+
+
+class PretrainingDatasetWrapper(IterableDataset):
+    """
+    Wrapper for pretraining dataset that avoids loading the dataset into memory
+    """
+
+    def __init__(self, tokenizer, dataset_path, max_tokens=2048):
+        self.tokenizer = tokenizer
+        self.dataset_path = dataset_path
+        self.max_tokens = max_tokens
+
+    def __iter__(self):
+        buffer = []
+        for sample in load_dataset(
+            self.dataset_path,
+        )["train"].shuffle():
+            buffer += self.tokenizer(sample["text"])["input_ids"]
+            buffer += [self.tokenizer.eos_token_id]
+            while len(buffer) > self.max_tokens:
+                input_ids = torch.tensor(buffer[: self.max_tokens])
+                yield {
+                    "input_ids": input_ids,
+                    "attention_mask": torch.ones(input_ids.size()),
+                    "labels": input_ids,
+                }
+                buffer = buffer[self.max_tokens :]
+
+
+def load_pretraining_dataset(path, tokenizer, max_tokens=2048):
+    return PretrainingDatasetWrapper(tokenizer, path, max_tokens=max_tokens)
```
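The wrapper streams a pretraining corpus, packs tokens into fixed-length blocks, and never materializes the whole dataset. A minimal usage sketch; the tokenizer checkpoint is illustrative, while the dataset path comes from the finetune.py default above:

```python
# Sketch: stream fixed-length pretraining blocks without loading the corpus.
from transformers import AutoTokenizer

from axolotl.utils.data import load_pretraining_dataset

tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")  # illustrative
stream = load_pretraining_dataset(
    "togethercomputer/RedPajama-Data-1T", tokenizer, max_tokens=2048
)
first = next(iter(stream))
assert first["input_ids"].shape[0] == 2048  # every yielded block is exactly max_tokens long
```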
src/axolotl/utils/models.py

```diff
@@ -10,9 +10,15 @@ from typing import TYPE_CHECKING, Optional, Tuple  # noqa: F401
 import bitsandbytes as bnb
 import torch
 import transformers
-from transformers import AutoModelForCausalLM  # noqa: F401
+from optimum.bettertransformer import BetterTransformer
 from transformers import PreTrainedModel  # noqa: F401
-from transformers import AutoConfig, AutoTokenizer, BitsAndBytesConfig
+from transformers import (
+    AutoConfig,
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    BitsAndBytesConfig,
+    LlamaConfig,
+)

 try:
     from transformers import LlamaForCausalLM
@@ -25,24 +31,23 @@ from axolotl.prompt_tokenizers import LLAMA_DEFAULT_PAD_TOKEN

 if TYPE_CHECKING:
     from peft import PeftConfig  # noqa: F401
     from transformers import PreTrainedTokenizer  # noqa: F401

     from axolotl.utils.dict import DictDefault  # noqa: F401


 def load_tokenizer(
-    base_model_config,
+    tokenizer_config,
     tokenizer_type,
     cfg,
 ):
     if tokenizer_type:
         tokenizer = getattr(transformers, tokenizer_type).from_pretrained(
-            base_model_config,
+            tokenizer_config,
             trust_remote_code=cfg.trust_remote_code or False,
         )
     else:
         tokenizer = AutoTokenizer.from_pretrained(
-            base_model_config,
+            tokenizer_config,
             trust_remote_code=cfg.trust_remote_code or False,
         )
@@ -97,16 +102,23 @@ def load_model(
         logging.info("patching with flash attention")
         replace_llama_attn_with_flash_attn()
     elif is_llama_derived_model and cfg.xformers_attention:
-        from alpaca_lora_4bit.monkeypatch.llama_attn_hijack_xformers import (
+        from axolotl.monkeypatch.llama_attn_hijack_xformers import (
             hijack_llama_attention,
         )

         logging.info("patching with xformers attention")
         hijack_llama_attention()
+    elif is_llama_derived_model and cfg.sdp_attention:
+        from axolotl.monkeypatch.llama_attn_hijack_xformers import (
+            hijack_llama_sdp_attention,
+        )
+
+        logging.info("patching with sdp attention")
+        hijack_llama_sdp_attention()

-    if cfg.bf16:
+    if cfg.bf16 or cfg.bfloat16:
         torch_dtype = torch.bfloat16
-    elif cfg.load_in_8bit or cfg.fp16:
+    elif cfg.load_in_8bit or cfg.fp16 or cfg.float16:
         torch_dtype = torch.float16
     else:
         torch_dtype = torch.float32
@@ -172,8 +184,10 @@ def load_model(
         )
         load_in_8bit = False
     elif is_llama_derived_model and "LlamaForCausalLM" in globals():
+        config = LlamaConfig.from_pretrained(base_model_config)
         model = LlamaForCausalLM.from_pretrained(
             base_model,
+            config=config,
             load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
             load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None,
             torch_dtype=torch_dtype,
@@ -248,6 +262,12 @@ def load_model(
         embeddings_len = math.ceil(len(tokenizer) / 32) * 32
         model.resize_token_embeddings(embeddings_len)

+        if cfg.sequence_len >= model.config.max_position_embeddings:
+            logging.warning(
+                f"increasing model.config.max_position_embeddings to {cfg.sequence_len}"
+            )
+            model.config.max_position_embeddings = cfg.sequence_len
+
     if not cfg.gptq and (
         (cfg.adapter == "lora" and load_in_8bit)
         or (cfg.adapter == "qlora" and cfg.load_in_4bit)
@@ -291,6 +311,9 @@ def load_model(
         logging.warning("there are no parameters that require gradient updates")
     model.config.use_cache = False

+    if cfg.flash_optimum:
+        model = BetterTransformer.transform(model)
+
    # TODO resume_from_checkpoint handling
    return model, lora_config
```
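`BetterTransformer.transform` swaps in fused attention modules, and every save path above has to reverse it first, since a wrapped model cannot be serialized directly. A minimal round-trip sketch; the checkpoint name echoes the pythia-12b example and is illustrative:

```python
# Sketch: the BetterTransformer wrap/unwrap round trip used above.
from optimum.bettertransformer import BetterTransformer
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("EleutherAI/pythia-12b-deduped")  # illustrative
model = BetterTransformer.transform(model)  # swap in fused attention modules
# ... train or run inference ...
model = BetterTransformer.reverse(model)    # restore the vanilla modules
model.save_pretrained("./pythia-12b")       # now safe to serialize
```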
src/axolotl/utils/trainer.py

```diff
@@ -1,6 +1,7 @@
 """Module containing the Trainer class and related functions"""

+import importlib
 import logging
 import math
 import os
 import sys
@@ -15,7 +16,10 @@ from torch.optim.lr_scheduler import OneCycleLR
 from transformers import EarlyStoppingCallback, Trainer
 from transformers.trainer_pt_utils import get_parameter_names

-from axolotl.utils.callbacks import SavePeftModelCallback
+from axolotl.utils.callbacks import (
+    SaveBetterTransformerModelCallback,
+    SavePeftModelCallback,
+)
 from axolotl.utils.schedulers import InterpolatingLogScheduler


@@ -225,6 +229,10 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
     ]:  # only save in rank 0
         callbacks.append(SavePeftModelCallback)

+    if hasattr(model, "use_bettertransformer") and model.use_bettertransformer is True:
+        logging.info("Setting up SaveBetterTransformerModelCallback.")
+        callbacks.append(SaveBetterTransformerModelCallback)
+
     data_collator_kwargs = {
         "padding": True,
     }
```
src/axolotl/utils/validation.py

```diff
@@ -2,8 +2,14 @@

 import logging

+import torch
+

 def validate_config(cfg):
+    if cfg.gradient_accumulation_steps and cfg.batch_size:
+        raise ValueError(
+            "please set only one of gradient_accumulation_steps or batch_size"
+        )
     if cfg.load_4bit:
         raise ValueError(
             "cfg.load_4bit parameter has been deprecated and replaced by cfg.gptq"
@@ -44,7 +50,31 @@ def validate_config(cfg):
             "Require cfg.hf_use_auth_token to be True for push_dataset_to_hub"
         )

+    if cfg.flash_optimum is True:
+        if cfg.adapter:
+            logging.warning(
+                "BetterTransformers probably doesn't work with PEFT adapters"
+            )
+        if cfg.fp16 or cfg.bf16:
+            raise ValueError("AMP is not supported with BetterTransformer")
+        if cfg.float16 is not True and cfg.bfloat16 is not True:
+            logging.warning(
+                "You should probably set bfloat16 or float16 to true to "
+                "load the model in float16 for BetterTransformers"
+            )
+        if int(torch.__version__.split(".")[0]) < 2:
+            logging.warning("torch>=2.0.0 required")
+            raise ValueError(
+                f"flash_optimum for BetterTransformers may not be used with {torch.__version__}"
+            )
+
     # TODO
     # MPT 7b
     # https://github.com/facebookresearch/bitsandbytes/issues/25
-    # no 8bit adamw w bf16
+    # no 8bit adaAmw w bf16
+
+    # GPT-NeoX
+    # evals broken when extending context len
+    # File "/root/miniconda3/envs/py3.9/lib/python3.9/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py", line 162, in forward attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)
+    # File "/root/miniconda3/envs/py3.9/lib/python3.9/site-packages/optimum/bettertransformer/models/attention.py", line 74, in gpt2_wrapped_scaled_dot_product
+    #   attention_mask = causal_mask + attention_mask
+    # RuntimeError: The size of tensor a (2048) must match the size of tensor b (8132) at non-singleton dimension 3
```
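Under the new checks, a BetterTransformer run wants `float16: true` (or `bfloat16: true`) with the AMP flags off, and torch >= 2.0. A minimal sketch of config values that pass validation, mirroring the pythia-12b example above:

```python
# Sketch: config values that satisfy the flash_optimum validation above.
from axolotl.utils.dict import DictDefault
from axolotl.utils.validation import validate_config

cfg = DictDefault(
    {
        "flash_optimum": True,
        "float16": True,  # load weights in fp16 without AMP
        "fp16": False,    # AMP flags must stay off
        "bf16": False,
        "micro_batch_size": 1,
        "gradient_accumulation_steps": 1,  # only one of this or batch_size
    }
)
validate_config(cfg)  # raises on AMP flags or torch < 2.0
```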
tests/fixtures/alpaca/alpaca.json (new file, 12 lines, vendored)

```json
[
    {
        "instruction": "You will be given a series of words. Output these words in reverse order, with each word on its own line.",
        "input": "Words: ['Hello', 'world'].",
        "output": "['world', 'Hello']"
    },
    {
        "instruction": "In this task, you're given a short description of an event. Your job is to order the steps involved in the event from first to last. Note that there may be multiple correct answers for each event.",
        "input": "Description: A man walks into a bar and orders a drink. He pays for his drink and leaves the bar.",
        "output": "1. The man walks into the bar.\n2. He orders a drink.\n3. He pays for his drink.\n4. He leaves the bar."
    }
]
```
tests/test_packed_dataset.py (new file, 65 lines)

```python
"""Module for testing dataset sequence packing"""

import unittest
from pathlib import Path

from datasets import Dataset, load_dataset
from transformers import AutoTokenizer

from axolotl.datasets import ConstantLengthDataset, TokenizedPromptDataset
from axolotl.prompt_tokenizers import AlpacaPromptTokenizingStrategy
from axolotl.prompters import AlpacaPrompter


class TestPacking(unittest.TestCase):
    """
    Test class for packing dataset sequences
    """

    def setUp(self) -> None:
        # pylint: disable=duplicate-code
        self.tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
        self.tokenizer.add_special_tokens(
            {
                "bos_token": "<s>",
                "eos_token": "</s>",
                "unk_token": "<unk>",
            }
        )

    def test_resets_attention(self):
        prompter = AlpacaPrompter("chat")
        strat = AlpacaPromptTokenizingStrategy(
            prompter,
            self.tokenizer,
            False,
            2048,
        )
        dataset = load_dataset(
            "json",
            data_files=str(Path(__file__).parent / "fixtures/alpaca/alpaca.json"),
        )["train"]
        dataset = Dataset.from_list(list(TokenizedPromptDataset(strat, dataset)))

        constant_len_dataset = ConstantLengthDataset(
            self.tokenizer,
            [dataset],
            seq_length=2048,
        )
        packed_dataset = Dataset.from_list(list(constant_len_dataset))
        example = packed_dataset[0]
        next_bos_index = (
            example["input_ids"][1:].index(self.tokenizer.bos_token_id) + 1
        )  # add one since we sliced

        # first example doesn't have mask reset
        assert example["input_ids"][0] == self.tokenizer.bos_token_id
        assert example["attention_mask"][0] == 1

        # but subsequent one does
        assert example["input_ids"][next_bos_index] == self.tokenizer.bos_token_id
        assert example["attention_mask"][next_bos_index] == 0


if __name__ == "__main__":
    unittest.main()
```
tests/test_prompt_tokenizers.py

```diff
@@ -18,6 +18,7 @@ class TestPromptTokenizationStrategies(unittest.TestCase):
     """

     def setUp(self) -> None:
+        # pylint: disable=duplicate-code
         self.tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
         self.tokenizer.add_special_tokens(
             {
```
tests/test_validation.py

```diff
@@ -117,3 +117,32 @@ class ValidationTest(unittest.TestCase):
             }
         )
         validate_config(cfg)
+
+    def test_gradient_accumulations_or_batch_size(self):
+        cfg = DictDefault(
+            {
+                "gradient_accumulation_steps": 1,
+                "batch_size": 1,
+            }
+        )
+
+        with pytest.raises(
+            ValueError, match=r".*gradient_accumulation_steps or batch_size.*"
+        ):
+            validate_config(cfg)
+
+        cfg = DictDefault(
+            {
+                "batch_size": 1,
+            }
+        )
+
+        validate_config(cfg)
+
+        cfg = DictDefault(
+            {
+                "gradient_accumulation_steps": 1,
+            }
+        )
+
+        validate_config(cfg)
```