add save_only_model arg

Update SaveAxolotlConfigtoWandBCallback to use artifact instead of save (#1483)
* deprecated wandb.save * also use wandb.save for axolotl yaml * chore: lint --------- Co-authored-by: Wing Lian <wing.lian@gmail.com>
2024-04-10 16:09:08 -04:00 · 2024-04-09 18:58:38 -04:00 · 2024-04-09 17:28:43 -04:00 · 2024-04-09 17:28:27 -04:00 · 2024-04-09 16:40:26 -04:00 · 2024-04-09 11:05:15 -04:00
55 changed files with 2454 additions and 1182 deletions
--- a/.github/workflows/base.yml
+++ b/.github/workflows/base.yml
@@ -16,17 +16,22 @@ jobs:
            cuda_version: 11.8.0
            python_version: "3.10"
            pytorch: 2.1.2
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 9.0+PTX"
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
          - cuda: "121"
            cuda_version: 12.1.0
            python_version: "3.10"
            pytorch: 2.1.2
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 9.0+PTX"
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
          - cuda: "121"
            cuda_version: 12.1.0
            python_version: "3.11"
            pytorch: 2.1.2
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 9.0+PTX"
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
+          - cuda: "121"
+            cuda_version: 12.1.0
+            python_version: "3.11"
+            pytorch: 2.2.1
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
    steps:
      - name: Checkout
        uses: actions/checkout@v3
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -28,7 +28,7 @@ jobs:
          - cuda: 121
            cuda_version: 12.1.0
            python_version: "3.11"
-            pytorch: 2.1.2
+            pytorch: 2.2.1
            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
@@ -63,7 +63,7 @@ jobs:
            ${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }}
          labels: ${{ steps.metadata.outputs.labels }}

-  build-axolotl-runpod:
+  build-axolotl-cloud:
    needs: build-axolotl
    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'OpenAccess-AI-Collective' }}
    # this job needs to be run on self-hosted GPU runners...
@@ -84,7 +84,7 @@ jobs:
          - cuda: 121
            cuda_version: 12.1.0
            python_version: "3.11"
-            pytorch: 2.1.2
+            pytorch: 2.2.1
            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
@@ -113,7 +113,5 @@ jobs:
          push: ${{ github.event_name != 'pull_request' }}
          tags: |
             ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
-             winglian/axolotl-runpod:main-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
             ${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }}
-             ${{ (matrix.is_latest) && format('{0}-latest', 'winglian/axolotl-runpod:main') || '' }}
          labels: ${{ steps.metadata.outputs.labels }}
--- a/.github/workflows/nightlies.yml
+++ b/.github/workflows/nightlies.yml
@@ -0,0 +1,118 @@
+name: docker-nightlies
+
+on:
+  workflow_dispatch:
+  schedule:
+    - cron: '0 0 * * *'  # Runs at 00:00 UTC every day
+
+jobs:
+  build-axolotl:
+    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'OpenAccess-AI-Collective' }}
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - cuda: 118
+            cuda_version: 11.8.0
+            python_version: "3.10"
+            pytorch: 2.1.2
+            axolotl_extras:
+            axolotl_args: "--extra-index-url https://download.pytorch.org/whl/cu118"
+            is_latest: true
+          - cuda: 121
+            cuda_version: 12.1.0
+            python_version: "3.10"
+            pytorch: 2.1.2
+            axolotl_extras:
+          - cuda: 121
+            cuda_version: 12.1.0
+            python_version: "3.11"
+            pytorch: 2.2.1
+            axolotl_extras:
+    runs-on: axolotl-gpu-runner
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Docker metadata
+        id: metadata
+        uses: docker/metadata-action@v5
+        with:
+          images: winglian/axolotl
+          tags: |
+            type=raw,value={{ branch }}-{{ date 'YYYYMMDD' }}
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      - name: Login to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+      # guidance for testing before pushing: https://docs.docker.com/build/ci/github-actions/test-before-push/
+      - name: Build and export to Docker
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          build-args: |
+            BASE_TAG=${{ github.ref_name }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
+            CUDA=${{ matrix.cuda }}
+            PYTORCH_VERSION=${{ matrix.pytorch }}
+            AXOLOTL_ARGS=${{ matrix.axolotl_args }}
+          file: ./docker/Dockerfile
+          push: ${{ github.event_name != 'pull_request' }}
+          tags: |
+            ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
+          labels: ${{ steps.metadata.outputs.labels }}
+
+  build-axolotl-cloud:
+    needs: build-axolotl
+    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'OpenAccess-AI-Collective' }}
+    # this job needs to be run on self-hosted GPU runners...
+    strategy:
+      matrix:
+        include:
+          - cuda: 118
+            cuda_version: 11.8.0
+            python_version: "3.10"
+            pytorch: 2.1.2
+            axolotl_extras:
+            is_latest: true
+          - cuda: 121
+            cuda_version: 12.1.0
+            python_version: "3.10"
+            pytorch: 2.1.2
+            axolotl_extras:
+          - cuda: 121
+            cuda_version: 12.1.0
+            python_version: "3.11"
+            pytorch: 2.2.1
+            axolotl_extras:
+    runs-on: axolotl-gpu-runner
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Docker metadata
+        id: metadata
+        uses: docker/metadata-action@v5
+        with:
+          images: winglian/axolotl-cloud
+          tags: |
+            type=raw,value={{ branch }}-{{ date 'YYYYMMDD' }}
+      - name: Login to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v2
+      - name: Build
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          build-args: |
+            BASE_TAG=${{ github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
+            CUDA=${{ matrix.cuda }}
+          file: ./docker/Dockerfile-cloud
+          push: ${{ github.event_name != 'pull_request' }}
+          tags: |
+             ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
+          labels: ${{ steps.metadata.outputs.labels }}
--- a/.github/workflows/pypi.yml
+++ b/.github/workflows/pypi.yml
@@ -25,7 +25,7 @@ jobs:

      - name: Install dependencies
        run: |
-          pip3 install wheel
+          pip3 install wheel packaging
          pip3 install -e .
          pip3 install -r requirements-tests.txt

--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -48,6 +48,8 @@ jobs:

      - name: Install dependencies
        run: |
+          pip3 install --upgrade pip
+          pip3 install --upgrade packaging
          pip3 install -U -e .
          pip3 install -r requirements-tests.txt

@@ -77,6 +79,11 @@ jobs:
            python_version: "3.10"
            pytorch: 2.1.2
            num_gpus: 1
+          - cuda: 121
+            cuda_version: 12.1.0
+            python_version: "3.11"
+            pytorch: 2.2.1
+            num_gpus: 1
    steps:
      - name: Checkout
        uses: actions/checkout@v4
--- a/README.md
+++ b/README.md
@@ -35,13 +35,12 @@ Features:
  - [Google Colab](#google-colab)
  - [Launching on public clouds via SkyPilot](#launching-on-public-clouds-via-skypilot)
 - [Dataset](#dataset)
-  - [How to Add Custom Prompts](#how-to-add-custom-prompts)
-  - [How to Use Custom Pretokenized Dataset](#how-to-use-your-custom-pretokenized-dataset)
 - [Config](#config)
  - [Train](#train)
  - [Inference](#inference-playground)
  - [Merge LORA to Base](#merge-lora-to-base)
  - [Special Tokens](#special-tokens)
+  - [All Config Options](#all-config-options)
 - Advanced Topics
  - [Multipack](./docs/multipack.qmd)
  - [RLHF & DPO](./docs/rlhf.qmd)
@@ -108,7 +107,7 @@ Get started with Axolotl in just a few steps! This quickstart guide will walk yo
 git clone https://github.com/OpenAccess-AI-Collective/axolotl
 cd axolotl

-pip3 install packaging
+pip3 install packaging ninja
 pip3 install -e '.[flash-attn,deepspeed]'
 ```

@@ -222,23 +221,17 @@ For cloud GPU providers that support docker images, use [`winglian/axolotl-cloud
  python get-pip.py
  ```

-  3. Install torch
-  ```bash
-  pip3 install -U torch --index-url https://download.pytorch.org/whl/cu118
-  ```
+  3. Install PyTorch by following the official instructions: https://pytorch.org/get-started/locally/

-  4. Axolotl
-  ```bash
-  git clone https://github.com/OpenAccess-AI-Collective/axolotl
-  cd axolotl
+  4. Follow the installation instructions in the Quickstart section.

-  pip3 install packaging
-  pip3 install -e '.[flash-attn,deepspeed]'
+  5. Run
+  ```bash
  pip3 install protobuf==3.20.3
  pip3 install -U --ignore-installed requests Pillow psutil scipy
  ```

-  5. Set path
+  6. Set path
  ```bash
  export LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH
  ```
@@ -299,186 +292,9 @@ HF_TOKEN=xx BUCKET=<unique-name> sky spot launch axolotl-spot.yaml --env HF_TOKE

 ### Dataset

-Axolotl supports a variety of dataset formats. Below are some of the formats you can use.
-Have dataset(s) in one of the following format (JSONL recommended):
+Axolotl supports a variety of dataset formats. It is recommended to use JSONL. The schema of the JSONL file depends on the task and the prompt template you wish to use. Instead of JSONL, you can also use a Hugging Face dataset with columns for each JSONL field.

-#### Pretraining
-
- `completion`: raw corpus
-  ```json
-  {"text": "..."}
-  ```
-
-Note: Axolotl usually loads the entire dataset into memory. This will be challenging for large datasets. Use the following config to enable streaming:
-
-```yaml
-pretraining_dataset: # hf path only
-```
-
-#### Supervised finetuning
-
-##### Instruction
-
- `alpaca`: instruction; input(optional)
-  ```json
-  {"instruction": "...", "input": "...", "output": "..."}
-  ```
-
-<details>
-
-<summary>See other formats</summary>
-
- `jeopardy`: question and answer
-  ```json
-  {"question": "...", "category": "...", "answer": "..."}
-  ```
- `oasst`: instruction
-  ```json
-  {"INSTRUCTION": "...", "RESPONSE": "..."}
-  ```
- `gpteacher`: instruction; input(optional)
-  ```json
-  {"instruction": "...", "input": "...", "response": "..."}
-  ```
- `reflection`: instruction with reflect; input(optional)
-  ```json
-  {"instruction": "...", "input": "...", "output": "...", "reflection": "...", "corrected": "..."}
-  ```
- `explainchoice`: question, choices, (solution OR explanation)
-  ```json
-  {"question": "...", "choices": ["..."], "solution": "...", "explanation": "..."}
-  ```
- `concisechoice`: question, choices, (solution OR explanation)
-  ```json
-  {"question": "...", "choices": ["..."], "solution": "...", "explanation": "..."}
-  ```
- `summarizetldr`: article and summary
-  ```json
-  {"article": "...", "summary": "..."}
-  ```
- `alpaca_chat`: basic instruct for alpaca chat
-  ```json
-  {"instruction": "...", "input": "...", "response": "..."}
-  ```
- `alpaca_chat.load_qa`: question and answer for alpaca chat
-  ```json
-  {"question": "...", "answer": "..."}
-  ```
- `alpaca_chat.load_concise`: question and answer for alpaca chat, for concise answers
-  ```json
-  {"instruction": "...", "input": "...", "response": "..."}
-  ```
- `alpaca_chat.load_camel_ai`: question and answer for alpaca chat, for load_camel_ai
-  ```json
-  {"message_1": "...", "message_2": "..."}
-  ```
- `alpaca_w_system.load_open_orca`: support for open orca datasets with included system prompts, instruct
-  ```json
-  {"system_prompt": "...", "question": "...", "response": "..."}
-  ```
- `context_qa`: in context question answering from an article
-  ```json
-  {"article": "...", "question": "...", "answer": "..."}
-  ```
- `context_qa.load_v2`: in context question answering (alternate)
-  ```json
-  {"context": "...", "question": "...", "answer": "..."}
-  ```
- `context_qa.load_404`: in context question answering from an article, with default response for no answer from context
-  ```json
-  {"article": "...", "unanswerable_question": "..."}
-  ```
- `creative_acr.load_answer`: instruction and revision
-  ```json
-  {"instruction": "...", "revision": "..."}
-  ```
- `creative_acr.load_critique`: critique
-  ```json
-  {"scores": "...", "critiques": "...", "instruction": "...", "answer": "..."}
-  ```
- `creative_acr.load_revise`: critique and revise
-  ```json
-  {"scores": "...", "critiques": "...", "instruction": "...", "answer": "...", "revision": "..."}
-  ```
- `metharme`: instruction, adds additional eos tokens
-  ```json
-  {"prompt": "...", "generation": "..."}
-  ```
-
-</details>
-
-##### Template-Free
-
- `input_output`: template-free prompt construction
-  ```json
-   {"segments": [{"label": true|false, "text": "..."}]}
-  ```
-
-This is a special format that allows you to construct prompts without using templates. This is for advanced users who want more freedom with prompt construction.  See [these docs](docs/input_output.qmd) for more details.
-
-##### Conversation
-
- `sharegpt`: conversations where `from` is `human`/`gpt`. (optional: first row with role `system` to override default system prompt)
-  ```json
-  {"conversations": [{"from": "...", "value": "..."}]}
-  ```
-
-<details>
-
-<summary>See other formats</summary>
-
- `pygmalion`: pygmalion
-  ```json
-  {"conversations": [{"role": "...", "value": "..."}]}
-  ```
- `sharegpt.load_role`: conversations where `role` is used instead of `from`
-  ```json
-  {"conversations": [{"role": "...", "value": "..."}]}
-  ```
- `sharegpt.load_guanaco`: conversations where `from` is `prompter`/`assistant` instead of default sharegpt
-  ```json
-  {"conversations": [{"from": "...", "value": "..."}]}
-  ```
- `sharegpt_jokes`: creates a chat where bot is asked to tell a joke, then explain why the joke is funny
-  ```json
-  {"conversations": [{"title": "...", "text": "...", "explanation": "..."}]}
-  ```
-
-</details>
-
-Note: `type: sharegpt` opens a special config `conversation:` that enables conversions to many Conversation types. See dataset section under [all yaml options](#all-yaml-options).
-
-#### How to add custom prompts
-
-For a dataset that is preprocessed for instruction purposes:
-
-```json
-{"input": "...", "output": "..."}
-```
-
-You can use this example in your YAML config:
-
-```yaml
-datasets:
-  - path: repo
-    type:
-      system_prompt: ""
-      field_system: system
-      field_instruction: input
-      field_output: output
-      format: "[INST] {instruction} [/INST]"
-      no_input_format: "[INST] {instruction} [/INST]"
-```
-See full config options under [all yaml options](#all-yaml-options).
-
-#### How to use your custom pretokenized dataset
-
- Do not pass a `type:`
- Columns in Dataset must be exactly `input_ids`, `attention_mask`, `labels`
-
-```yaml
- path: ...
-```
+See [these docs](https://openaccess-ai-collective.github.io/axolotl/docs/dataset-formats/) for more information on how to use different dataset formats.

 ### Config

@@ -563,512 +379,9 @@ See [examples](examples) for quick start. It is recommended to duplicate and mod
    - v_proj
  ```

-<details id="all-yaml-options">
+#### All Config Options

-<summary>All yaml options (click to expand)</summary>
-
-```yaml
-# This is the huggingface model that contains *.pt, *.safetensors, or *.bin files
-# This can also be a relative path to a model on disk
-base_model: ./llama-7b-hf
-# You can specify an ignore pattern if the model repo contains more than 1 model type (*.pt, etc)
-base_model_ignore_patterns:
-# If the base_model repo on hf hub doesn't include configuration .json files,
-# You can set that here, or leave this empty to default to base_model
-base_model_config: ./llama-7b-hf
-# You can specify to choose a specific model revision from huggingface hub
-revision_of_model:
-# Optional tokenizer configuration path in case you want to use a different tokenizer
-# than the one defined in the base model
-tokenizer_config:
-# If you want to specify the type of model to load, AutoModelForCausalLM is a good choice too
-model_type: AutoModelForCausalLM
-# Corresponding tokenizer for the model AutoTokenizer is a good choice
-tokenizer_type: AutoTokenizer
-# Trust remote code for untrusted source
-trust_remote_code:
-# use_fast option for tokenizer loading from_pretrained, default to True
-tokenizer_use_fast:
-# Whether to use the legacy tokenizer setting, defaults to True
-tokenizer_legacy:
-# Resize the model embeddings when new tokens are added to multiples of 32
-# This is reported to improve training speed on some models
-resize_token_embeddings_to_32x:
-
-# (Internal use only)
-# Used to identify which the model is based on
-is_falcon_derived_model:
-is_llama_derived_model:
-is_qwen_derived_model:
-# Please note that if you set this to true, `padding_side` will be set to "left" by default
-is_mistral_derived_model:
-
-# optional overrides to the base model configuration
-overrides_of_model_config:
-  # RoPE Scaling https://github.com/huggingface/transformers/pull/24653
-  rope_scaling:
-    type: # linear | dynamic
-    factor: # float
-
-# optional overrides to the bnb 4bit quantization configuration
-# https://huggingface.co/docs/transformers/main/main_classes/quantization#transformers.BitsAndBytesConfig
-bnb_config_kwargs:
-  # These are default values
-  llm_int8_has_fp16_weight: false
-  bnb_4bit_quant_type: nf4
-  bnb_4bit_use_double_quant: true
-
-
-# Whether you are training a 4-bit GPTQ quantized model
-gptq: true
-
-# This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer
-load_in_8bit: true
-# Use bitsandbytes 4 bit
-load_in_4bit:
-
-# Use CUDA bf16
-bf16: true # bool or 'full' for `bf16_full_eval`. require >=ampere
-# Use CUDA fp16
-fp16: true
-# Use CUDA tf32
-tf32: true # require >=ampere
-
-# No AMP (automatic mixed precision)
-bfloat16: true # require >=ampere
-float16: true
-
-# Limit the memory for all available GPUs to this amount (if an integer, expressed in gigabytes); default: unset
-gpu_memory_limit: 20GiB
-# Do the LoRA/PEFT loading on CPU -- this is required if the base model is so large it takes up most or all of the available GPU VRAM, e.g. during a model and LoRA merge
-lora_on_cpu: true
-
-# A list of one or more datasets to finetune the model with
-datasets:
-  # HuggingFace dataset repo | s3://,gs:// path | "json" for local dataset, make sure to fill data_files
-  - path: vicgalle/alpaca-gpt4
-  # The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
-    type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
-    ds_type: # Optional[str] (json|arrow|parquet|text|csv) defines the datatype when path is a file
-    data_files: # Optional[str] path to source data files
-    shards: # Optional[int] number of shards to split data into
-    name: # Optional[str] name of dataset configuration to load
-    train_on_split: train # Optional[str] name of dataset split to load from
-
-    # Optional[str] fastchat conversation type, only used with type: sharegpt
-    conversation: # Options (see Conversation 'name'): https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
-    field_human: # Optional[str]. Human key to use for conversation.
-    field_model: # Optional[str]. Assistant key to use for conversation.
-    # Add additional keys from your dataset as input or output roles
-    roles:
-      input: # Optional[List[str]]. These will be masked based on train_on_input
-      output: # Optional[List[str]].
-
-  # Custom user instruction prompt
-  - path: repo
-    type:
-      # The below are defaults. only set what's needed if you use a different column name.
-      system_prompt: ""
-      system_format: "{system}"
-      field_system: system
-      field_instruction: instruction
-      field_input: input
-      field_output: output
-
-      # Customizable to be single line or multi-line
-      # Use {instruction}/{input} as key to be replaced
-      # 'format' can include {input}
-      format: |-
-        User: {instruction} {input}
-        Assistant:
-      # 'no_input_format' cannot include {input}
-      no_input_format: "{instruction} "
-
-      # For `completion` datsets only, uses the provided field instead of `text` column
-      field:
-
-# If false, the datasets will not be shuffled and will keep their original order in `datasets`.
-# The same applies to the `test_datasets` option and the `pretraining_dataset` option. Default is true.
-shuffle_merged_datasets: true
-
-# A list of one or more datasets to eval the model with.
-# You can use either test_datasets, or val_set_size, but not both.
-test_datasets:
-  - path: /workspace/data/eval.jsonl
-    ds_type: json
-    # You need to specify a split. For "json" datasets the default split is called "train".
-    split: train
-    type: completion
-    data_files:
-      - /workspace/data/eval.jsonl
-
-# use RL training: 'dpo', 'ipo', 'kto_pair'
-rl:
-
-# Saves the desired chat template to the tokenizer_config.json for easier inferencing
-# Currently supports chatml and inst (mistral/mixtral)
-chat_template: chatml
-# Changes the default system message
-default_system_message: You are a helpful assistant. Please give a long and detailed answer. # Currently only supports chatml.
-# Axolotl attempts to save the dataset as an arrow after packing the data together so
-# subsequent training attempts load faster, relative path
-dataset_prepared_path: data/last_run_prepared
-# Push prepared dataset to hub
-push_dataset_to_hub: # repo path
-# The maximum number of processes to use while preprocessing your input dataset. This defaults to `os.cpu_count()`
-# if not set.
-dataset_processes: # defaults to os.cpu_count() if not set
-# Keep dataset in memory while preprocessing
-# Only needed if cached dataset is taking too much storage
-dataset_keep_in_memory:
-# push checkpoints to hub
-hub_model_id: # private repo path to push finetuned model
-# how to push checkpoints to hub
-# https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/trainer#transformers.TrainingArguments.hub_strategy
-hub_strategy:
-# Whether to use hf `use_auth_token` for loading datasets. Useful for fetching private datasets
-# Required to be true when used in combination with `push_dataset_to_hub`
-hf_use_auth_token: # boolean
-# How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 0 for no eval.
-val_set_size: 0.04
-# Num shards for whole dataset
-dataset_shard_num:
-# Index of shard to use for whole dataset
-dataset_shard_idx:
-
-# The maximum length of an input to train with, this should typically be less than 2048
-# as most models have a token/context limit of 2048
-sequence_len: 2048
-# Pad inputs so each step uses constant sized buffers
-# This will reduce memory fragmentation and may prevent OOMs, by re-using memory more efficiently
-pad_to_sequence_len:
-# Use efficient multi-packing with block diagonal attention and per sequence position_ids. Recommend set to 'true'
-sample_packing:
-# Set to 'false' if getting errors during eval with sample_packing on.
-eval_sample_packing:
-# You can set these packing optimizations AFTER starting a training at least once.
-# The trainer will provide recommended values for these values.
-sample_packing_eff_est:
-total_num_tokens:
-
-# Passed through to transformers when loading the model when launched without accelerate
-# Use `sequential` when training w/ model parallelism to limit memory
-device_map:
-# Defines the max memory usage per gpu on the system. Passed through to transformers when loading the model.
-max_memory:
-
-# If you want to use 'lora' or 'qlora' or leave blank to train all parameters in original model
-adapter: lora
-# If you already have a lora model trained that you want to load, put that here.
-# This means after training, if you want to test the model, you should set this to the value of `output_dir`.
-# Note that if you merge an adapter to the base model, a new subdirectory `merged` will be created under the `output_dir`.
-lora_model_dir:
-
-# LoRA hyperparameters
-# For more details about the following options, see:
-# https://www.anyscale.com/blog/fine-tuning-llms-lora-or-full-parameter-an-in-depth-analysis-with-llama-2
-lora_r: 8
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_modules:
-  - q_proj
-  - v_proj
-#  - k_proj
-#  - o_proj
-#  - gate_proj
-#  - down_proj
-#  - up_proj
-lora_target_linear: # If true, will target all linear modules
-peft_layers_to_transform: # The layer indices to transform, otherwise, apply to all layers
-
-# If you added new tokens to the tokenizer, you may need to save some LoRA modules because they need to know the new tokens.
-# For LLaMA and Mistral, you need to save `embed_tokens` and `lm_head`. It may vary for other models.
-# `embed_tokens` converts tokens to embeddings, and `lm_head` converts embeddings to token probabilities.
-# https://github.com/huggingface/peft/issues/334#issuecomment-1561727994
-lora_modules_to_save:
-#  - embed_tokens
-#  - lm_head
-
-lora_fan_in_fan_out: false
-
-peft:
-  # Configuration options for loftq initialization for LoRA
-  # https://huggingface.co/docs/peft/developer_guides/quantization#loftq-initialization
-  loftq_config:
-    loftq_bits:  # typically 4 bits
-
-# ReLoRA configuration
-# Must use either 'lora' or 'qlora' adapter, and does not support fsdp or deepspeed
-relora_steps: # Number of steps per ReLoRA restart
-relora_warmup_steps: # Number of per-restart warmup steps
-relora_anneal_steps: # Number of anneal steps for each relora cycle
-relora_prune_ratio: # threshold for optimizer magnitude when pruning
-relora_cpu_offload: # True to perform lora weight merges on cpu during restarts, for modest gpu memory savings
-
-# wandb configuration if you're using it
-# Make sure your `WANDB_API_KEY` environment variable is set (recommended) or you login to wandb with `wandb login`.
-wandb_mode: # "offline" to save run metadata locally and not sync to the server, "disabled" to turn off wandb
-wandb_project: # Your wandb project name
-wandb_entity: # A wandb Team name if using a Team
-wandb_watch:
-wandb_name: # Set the name of your wandb run
-wandb_run_id: # Set the ID of your wandb run
-wandb_log_model: # "checkpoint" to log model to wandb Artifacts every `save_steps` or "end" to log only at the end of training
-
-# mlflow configuration if you're using it
-mlflow_tracking_uri: # URI to mlflow
-mlflow_experiment_name: # Your experiment name
-hf_mlflow_log_artifacts:  # set to true to copy each saved checkpoint on each save to mlflow artifact registry
-
-# Where to save the full-finetuned model to
-output_dir: ./completed-model
-
-# Whether to use torch.compile and which backend to use
-torch_compile:  # bool
-torch_compile_backend:  # Optional[str]
-
-# Training hyperparameters
-
-# If greater than 1, backpropagation will be skipped and the gradients will be accumulated for the given number of steps.
-gradient_accumulation_steps: 1
-# The number of samples to include in each batch. This is the number of samples sent to each GPU.
-micro_batch_size: 2
-eval_batch_size:
-num_epochs: 4
-warmup_steps: 100  # cannot use with warmup_ratio
-warmup_ratio: 0.05  # cannot use with warmup_steps
-learning_rate: 0.00003
-lr_quadratic_warmup:
-logging_steps:
-eval_steps: # Leave empty to eval at each epoch, integers for every N steps. decimal for fraction of total steps
-evals_per_epoch: # number of times per epoch to run evals, mutually exclusive with eval_steps
-save_strategy: # Set to `no` to skip checkpoint saves
-save_steps: # Leave empty to save at each epoch
-saves_per_epoch: # number of times per epoch to save a checkpoint, mutually exclusive with save_steps
-save_total_limit: # Checkpoints saved at a time
-# Maximum number of iterations to train for. It precedes num_epochs which means that
-# if both are set, num_epochs will not be guaranteed.
-# e.g., when 1 epoch is 1000 steps => `num_epochs: 2` and `max_steps: 100` will train for 100 steps
-max_steps:
-
-eval_table_size: # Approximate number of predictions sent to wandb depending on batch size. Enabled above 0. Default is 0
-eval_max_new_tokens: # Total number of tokens generated for predictions sent to wandb. Default is 128
-eval_causal_lm_metrics: # HF evaluate metrics used during evaluation. Default is ["sacrebleu", "comet", "ter", chrf]
-
-loss_watchdog_threshold: # High loss value, indicating the learning has broken down (a good estimate is ~2 times the loss at the start of training)
-loss_watchdog_patience: # Number of high-loss steps in a row before the trainer aborts (default: 3)
-
-# Save model as safetensors (require safetensors package)
-save_safetensors:
-
-# Whether to mask out or include the human's prompt from the training labels
-train_on_inputs: false
-# Group similarly sized data to minimize padding.
-# May be slower to start, as it must download and sort the entire dataset.
-# Note that training loss may have an oscillating pattern with this enabled.
-group_by_length: false
-
-# Whether to use gradient checkpointing https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing
-gradient_checkpointing: false
-# additional kwargs to pass to the trainer for gradient checkpointing
-# gradient_checkpointing_kwargs:
-#   use_reentrant: true
-
-# Stop training after this many evaluation losses have increased in a row
-# https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
-early_stopping_patience: 3
-
-# Specify a scheduler and kwargs to use with the optimizer
-lr_scheduler: # 'one_cycle' | 'log_sweep' | empty for cosine
-lr_scheduler_kwargs:
-cosine_min_lr_ratio: # decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of peak lr
-cosine_constant_lr_ratio: # freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means start cosine_min_lr at 80% of training step (https://arxiv.org/pdf/2308.04014.pdf)
-
-# For one_cycle optim
-lr_div_factor: # Learning rate div factor
-
-# Specify optimizer
-# Valid values are driven by the Transformers OptimizerNames class, see:
-# https://github.com/huggingface/transformers/blob/95b374952dc27d8511541d6f5a4e22c9ec11fb24/src/transformers/training_args.py#L134
-#
-# Note that not all optimizers may be available in your environment, ex: 'adamw_anyprecision' is part of
-# torchdistx, 'adamw_bnb_8bit' is part of bnb.optim.Adam8bit, etc. When in doubt, it is recommended to start with the optimizer used
-# in the examples/ for your model and fine-tuning use case.
-#
-# Valid values for 'optimizer' include:
-# - adamw_hf
-# - adamw_torch
-# - adamw_torch_fused
-# - adamw_torch_xla
-# - adamw_apex_fused
-# - adafactor
-# - adamw_anyprecision
-# - sgd
-# - adagrad
-# - adamw_bnb_8bit
-# - lion_8bit
-# - lion_32bit
-# - paged_adamw_32bit
-# - paged_adamw_8bit
-# - paged_lion_32bit
-# - paged_lion_8bit
-# - galore_adamw
-# - galore_adamw_8bit
-# - galore_adafactor
-# - galore_adamw_layerwise
-# - galore_adamw_8bit_layerwise
-# - galore_adafactor_layerwise
-optimizer:
-# Dictionary of arguments to pass to the optimizer
-optim_args:
-# For Galore Optimizers the following optim_args are available
-# rank:  # type: int
-# update_proj_gap  # type: int
-# scale  # type: float
-# proj_type:  # type: str, default = std
-
-# The target modules to optimize, i.e. the module names that you would like to train, right now this is used only for GaLore algorithm
-optim_target_modules:
-# - self_attn  # for llama
-# - mlp
-
-# Specify weight decay
-weight_decay:
-# adamw hyperparams
-adam_beta1:
-adam_beta2:
-adam_epsilon:
-# Gradient clipping max norm
-max_grad_norm:
-
-# Augmentation techniques
-# NEFT https://arxiv.org/abs/2310.05914, set this to a number (paper default is 5) to add noise to embeddings
-# currently only supported on Llama and Mistral
-neftune_noise_alpha:
-
-# Whether to bettertransformers
-flash_optimum:
-# Whether to use xformers attention patch https://github.com/facebookresearch/xformers:
-xformers_attention:
-# Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention:
-flash_attention:
-flash_attn_cross_entropy:  # Whether to use flash-attention cross entropy implementation - advanced use only
-flash_attn_rms_norm:  # Whether to use flash-attention rms norm implementation - advanced use only
-flash_attn_fuse_qkv: # Whether to fuse QKV into a single operation
-flash_attn_fuse_mlp: # Whether to fuse part of the MLP into a single operation
-# Whether to use scaled-dot-product attention
-# https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
-sdp_attention:
-# Shifted-sparse attention (only llama) - https://arxiv.org/pdf/2309.12307.pdf
-s2_attention:
-# Resume from a specific checkpoint dir
-resume_from_checkpoint:
-# If resume_from_checkpoint isn't set and you simply want it to start where it left off.
-# Be careful with this being turned on between different models.
-auto_resume_from_checkpoints: false
-
-# Don't mess with this, it's here for accelerate and torchrun
-local_rank:
-
-# Add or change special tokens.
-# If you add tokens here, you don't need to add them to the `tokens` list.
-special_tokens:
-  # bos_token: "<s>"
-  # eos_token: "</s>"
-  # unk_token: "<unk>"
-
-# Add extra tokens.
-tokens:
-
-# FSDP
-fsdp:
-fsdp_config:
-
-# Deepspeed config path. e.g., deepspeed_configs/zero3.json
-deepspeed:
-
-# Advanced DDP Arguments
-ddp_timeout:
-ddp_bucket_cap_mb:
-ddp_broadcast_buffers:
-
-# Path to torch distx for optim 'adamw_anyprecision'
-torchdistx_path:
-
-# Set to HF dataset for type: 'completion' for streaming instead of pre-tokenize
-pretraining_dataset:
-
-# Debug mode
-debug:
-
-# Seed
-seed:
-
-# Allow overwrite yml config using from cli
-strict:
-```
-
-</details>
-
-<details>
-<summary> Understanding of batch size and gradient accumulation steps </summary>
-<br/>
-Gradient accumulation means accumulating gradients over several mini-batches and updating the model weights afterward. When the samples in each batch are diverse, this technique doesn't significantly impact learning.
-
-This method allows for effective training with larger effective batch sizes without needing proportionally larger memory. Here's why:
-
-1. **Memory Consumption with Batch Size**: The primary reason increasing the batch size impacts memory is due to the storage requirements for intermediate activations. When you forward propagate a batch through a network, you have to store the activations at each layer for each sample in the batch, because these activations are used during backpropagation to compute gradients. Therefore, larger batches mean more activations, leading to greater GPU memory consumption.
-
-2. **Gradient Accumulation**: With gradient accumulation, you're effectively simulating a larger batch size by accumulating gradients over several smaller batches (or micro-batches). However, at any given time, you're only forward and backward propagating a micro-batch. This means you only store activations for the micro-batch, not the full accumulated batch. As a result, you can simulate the effect of a larger batch size without the memory cost of storing activations for a large batch.
-
-**Example 1:**
-Micro batch size: 3
-Gradient accumulation steps: 2
-Number of GPUs: 3
-Total batch size = 3 * 2 * 3 = 18
-
-```
-| GPU 1          | GPU 2          | GPU 3          |
-|----------------|----------------|----------------|
-| S1, S2, S3     | S4, S5, S6     | S7, S8, S9     |
-| e1, e2, e3     | e4, e5, e6     | e7, e8, e9     |
-|----------------|----------------|----------------|
-| → (accumulate) | → (accumulate) | → (accumulate) |
-|----------------|----------------|----------------|
-| S10, S11, S12  | S13, S14, S15  | S16, S17, S18  |
-| e10, e11, e12  | e13, e14, e15  | e16, e17, e18  |
-|----------------|----------------|----------------|
-| → (apply)      | → (apply)      | → (apply)      |
-
-Accumulated gradient for the weight w1 after the second iteration (considering all GPUs):
-Total gradient for w1 = e1 + e2 + e3 + e4 + e5 + e6 + e7 + e8 + e9 + e10 + e11 + e12 + e13 + e14 + e15 + e16 + e17 + e18
-
-Weight update for w1:
-w1_new = w1_old - learning rate x (Total gradient for w1 / 18)
-```
-
-**Example 2:**
-Micro batch size: 2
-Gradient accumulation steps: 1
-Number of GPUs: 3
-Total batch size = 2 * 1 * 3 = 6
-
-```
-| GPU 1     | GPU 2     | GPU 3     |
-|-----------|-----------|-----------|
-| S1, S2    | S3, S4    | S5, S6    |
-| e1, e2    | e3, e4    | e5, e6    |
-|-----------|-----------|-----------|
-| → (apply) | → (apply) | → (apply) |
-
-Accumulated gradient for the weight w1 (considering all GPUs):
-Total gradient for w1 = e1 + e2 + e3 + e4 + e5 + e6
-
-Weight update for w1:
-w1_new = w1_old - learning rate × (Total gradient for w1 / 6)
-```
-
-</details>
+See [these docs](docs/config.qmd) for all config options.

 ### Train

@@ -1299,14 +612,8 @@ Bugs? Please check the [open issues](https://github.com/OpenAccess-AI-Collective

 PRs are **greatly welcome**!

-Please run below to setup env
+Please follow the quickstart instructions, then run the commands below to set up your environment:
 ```bash
-git clone https://github.com/OpenAccess-AI-Collective/axolotl
-cd axolotl
-
-pip3 install packaging
-pip3 install -e '.[flash-attn,deepspeed]'
-
 pip3 install -r requirements-dev.txt -r requirements-tests.txt
 pre-commit install

--- a/_quarto.yml
+++ b/_quarto.yml
@@ -30,20 +30,20 @@ website:
          # TODO Edit folder structure after we have more docs.
            - docs/debugging.qmd
            - docs/multipack.qmd
-            - docs/fdsp_qlora.qmd
+            - docs/fsdp_qlora.qmd
            - docs/input_output.qmd
            - docs/rlhf.qmd
            - docs/nccl.qmd
            - docs/mac.qmd
            - docs/multi-node.qmd
+        - section: "Dataset Formats"
+          contents: docs/dataset-formats/*
        - section: "Reference"
          contents:
            - docs/config.qmd
        - docs/faq.qmd


-
-
 format:
  html:
    theme: materia
--- a/cicd/Dockerfile.jinja
+++ b/cicd/Dockerfile.jinja
@@ -22,6 +22,7 @@ RUN git fetch origin +$GITHUB_REF && \
    git checkout FETCH_HEAD

 # If AXOLOTL_EXTRAS is set, append it in brackets
+RUN pip install causal_conv1d
 RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
        pip install -e .[deepspeed,flash-attn,mamba-ssm,galore,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
    else \
--- a/deepspeed_configs/zero3_bf16_cpuoffload_all.json
+++ b/deepspeed_configs/zero3_bf16_cpuoffload_all.json
@@ -0,0 +1,39 @@
+{
+  "zero_optimization": {
+    "stage": 3,
+    "offload_optimizer": {
+      "device": "cpu",
+      "pin_memory": true
+    },
+    "offload_param": {
+      "device": "cpu",
+      "pin_memory": true
+    },
+    "overlap_comm": true,
+    "contiguous_gradients": true,
+    "sub_group_size": 0,
+    "reduce_bucket_size": "auto",
+    "stage3_prefetch_bucket_size": "auto",
+    "stage3_param_persistence_threshold": "auto",
+    "stage3_max_live_parameters": 0,
+    "stage3_max_reuse_distance": 0,
+    "stage3_gather_16bit_weights_on_model_save": true
+  },
+  "bf16": {
+    "enabled": true
+  },
+  "fp16": {
+    "enabled": "auto",
+    "auto_cast": false,
+    "loss_scale": 0,
+    "initial_scale_power": 32,
+    "loss_scale_window": 1000,
+    "hysteresis": 2,
+    "min_loss_scale": 1
+  },
+  "gradient_accumulation_steps": "auto",
+  "gradient_clipping": "auto",
+  "train_batch_size": "auto",
+  "train_micro_batch_size_per_gpu": "auto",
+  "wall_clock_breakdown": false
+}
--- a/deepspeed_configs/zero3_bf16_cpuoffload_params.json
+++ b/deepspeed_configs/zero3_bf16_cpuoffload_params.json
@@ -0,0 +1,35 @@
+{
+  "zero_optimization": {
+    "stage": 3,
+    "offload_param": {
+      "device": "cpu",
+      "pin_memory": true
+    },
+    "overlap_comm": true,
+    "contiguous_gradients": true,
+    "sub_group_size": 0,
+    "reduce_bucket_size": "auto",
+    "stage3_prefetch_bucket_size": "auto",
+    "stage3_param_persistence_threshold": "auto",
+    "stage3_max_live_parameters": 0,
+    "stage3_max_reuse_distance": 0,
+    "stage3_gather_16bit_weights_on_model_save": true
+  },
+  "bf16": {
+    "enabled": true
+  },
+  "fp16": {
+    "enabled": "auto",
+    "auto_cast": false,
+    "loss_scale": 0,
+    "initial_scale_power": 32,
+    "loss_scale_window": 1000,
+    "hysteresis": 2,
+    "min_loss_scale": 1
+  },
+  "gradient_accumulation_steps": "auto",
+  "gradient_clipping": "auto",
+  "train_batch_size": "auto",
+  "train_micro_batch_size_per_gpu": "auto",
+  "wall_clock_breakdown": false
+}
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -20,6 +20,7 @@ RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git
 WORKDIR /workspace/axolotl

 # If AXOLOTL_EXTRAS is set, append it in brackets
+RUN pip install causal_conv1d
 RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
        pip install -e .[deepspeed,flash-attn,mamba-ssm,galore,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
    else \
--- a/docs/batch_vs_grad.qmd
+++ b/docs/batch_vs_grad.qmd
@@ -0,0 +1,59 @@
+---
+title: Batch size vs Gradient accumulation
+description: Understanding of batch size and gradient accumulation steps
+---
+
+Gradient accumulation means accumulating gradients over several mini-batches and updating the model weights afterward. When the samples in each batch are diverse, this technique doesn't significantly impact learning.
+
+This method allows for effective training with larger effective batch sizes without needing proportionally larger memory. Here's why:
+
+1. **Memory Consumption with Batch Size**: The primary reason increasing the batch size impacts memory is due to the storage requirements for intermediate activations. When you forward propagate a batch through a network, you have to store the activations at each layer for each sample in the batch, because these activations are used during backpropagation to compute gradients. Therefore, larger batches mean more activations, leading to greater GPU memory consumption.
+
+2. **Gradient Accumulation**: With gradient accumulation, you're effectively simulating a larger batch size by accumulating gradients over several smaller batches (or micro-batches). However, at any given time, you're only forward and backward propagating a micro-batch. This means you only store activations for the micro-batch, not the full accumulated batch. As a result, you can simulate the effect of a larger batch size without the memory cost of storing activations for a large batch.
+
+**Example 1:**
+Micro batch size: 3
+Gradient accumulation steps: 2
+Number of GPUs: 3
+Total batch size = 3 * 2 * 3 = 18
+
+```
+| GPU 1          | GPU 2          | GPU 3          |
+|----------------|----------------|----------------|
+| S1, S2, S3     | S4, S5, S6     | S7, S8, S9     |
+| e1, e2, e3     | e4, e5, e6     | e7, e8, e9     |
+|----------------|----------------|----------------|
+| → (accumulate) | → (accumulate) | → (accumulate) |
+|----------------|----------------|----------------|
+| S10, S11, S12  | S13, S14, S15  | S16, S17, S18  |
+| e10, e11, e12  | e13, e14, e15  | e16, e17, e18  |
+|----------------|----------------|----------------|
+| → (apply)      | → (apply)      | → (apply)      |
+
+Accumulated gradient for the weight w1 after the second iteration (considering all GPUs):
+Total gradient for w1 = e1 + e2 + e3 + e4 + e5 + e6 + e7 + e8 + e9 + e10 + e11 + e12 + e13 + e14 + e15 + e16 + e17 + e18
+
+Weight update for w1:
+w1_new = w1_old - learning rate × (Total gradient for w1 / 18)
+```
+
+**Example 2:**
+Micro batch size: 2
+Gradient accumulation steps: 1
+Number of GPUs: 3
+Total batch size = 2 * 1 * 3 = 6
+
+```
+| GPU 1     | GPU 2     | GPU 3     |
+|-----------|-----------|-----------|
+| S1, S2    | S3, S4    | S5, S6    |
+| e1, e2    | e3, e4    | e5, e6    |
+|-----------|-----------|-----------|
+| → (apply) | → (apply) | → (apply) |
+
+Accumulated gradient for the weight w1 (considering all GPUs):
+Total gradient for w1 = e1 + e2 + e3 + e4 + e5 + e6
+
+Weight update for w1:
+w1_new = w1_old - learning rate × (Total gradient for w1 / 6)
+```
--- a/docs/config.qmd
+++ b/docs/config.qmd
@@ -3,15 +3,443 @@ title: Config options
 description: A complete list of all configuration options.
 ---

-```{python}
-#|echo: false
-#|output: asis
-import re
-# Regex pattern to match the YAML block including its code fence
-pattern = r'<details[^>]*id="all-yaml-options"[^>]*>.*?<summary>All yaml options.*?```yaml(.*?)```.*?</details>'
+```yaml
+# This is the huggingface model that contains *.pt, *.safetensors, or *.bin files
+# This can also be a relative path to a model on disk
+base_model: ./llama-7b-hf
+# You can specify an ignore pattern if the model repo contains more than 1 model type (*.pt, etc)
+base_model_ignore_patterns:
+# If the base_model repo on hf hub doesn't include configuration .json files,
+# You can set that here, or leave this empty to default to base_model
+base_model_config: ./llama-7b-hf
+# You can specify to choose a specific model revision from huggingface hub
+revision_of_model:
+# Optional tokenizer configuration path in case you want to use a different tokenizer
+# than the one defined in the base model
+tokenizer_config:
+# If you want to specify the type of model to load, AutoModelForCausalLM is a good choice too
+model_type: AutoModelForCausalLM
+# Corresponding tokenizer for the model AutoTokenizer is a good choice
+tokenizer_type: AutoTokenizer
+# Trust remote code for untrusted source
+trust_remote_code:
+# use_fast option for tokenizer loading from_pretrained, default to True
+tokenizer_use_fast:
+# Whether to use the legacy tokenizer setting, defaults to True
+tokenizer_legacy:
+# Resize the model embeddings when new tokens are added to multiples of 32
+# This is reported to improve training speed on some models
+resize_token_embeddings_to_32x:

-with open('../README.md', 'r') as f:
-    doc = f.read()
-match = re.search(pattern, doc, re.DOTALL)
-print("```yaml", match.group(1).strip(), "```", sep="\n")
+# (Internal use only)
+# Used to identify which architecture the model is based on
+is_falcon_derived_model:
+is_llama_derived_model:
+is_qwen_derived_model:
+# Please note that if you set this to true, `padding_side` will be set to "left" by default
+is_mistral_derived_model:
+
+# optional overrides to the base model configuration
+overrides_of_model_config:
+  # RoPE Scaling https://github.com/huggingface/transformers/pull/24653
+  rope_scaling:
+    type: # linear | dynamic
+    factor: # float
+
+# optional overrides to the bnb 4bit quantization configuration
+# https://huggingface.co/docs/transformers/main/main_classes/quantization#transformers.BitsAndBytesConfig
+bnb_config_kwargs:
+  # These are default values
+  llm_int8_has_fp16_weight: false
+  bnb_4bit_quant_type: nf4
+  bnb_4bit_use_double_quant: true
+
+
+# Whether you are training a 4-bit GPTQ quantized model
+gptq: true
+
+# This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer
+load_in_8bit: true
+# Use bitsandbytes 4 bit
+load_in_4bit:
+
+# Use CUDA bf16
+bf16: true # bool or 'full' for `bf16_full_eval`. require >=ampere
+# Use CUDA fp16
+fp16: true
+# Use CUDA tf32
+tf32: true # require >=ampere
+
+# No AMP (automatic mixed precision)
+bfloat16: true # require >=ampere
+float16: true
+
+# Limit the memory for all available GPUs to this amount (if an integer, expressed in gigabytes); default: unset
+gpu_memory_limit: 20GiB
+# Do the LoRA/PEFT loading on CPU -- this is required if the base model is so large it takes up most or all of the available GPU VRAM, e.g. during a model and LoRA merge
+lora_on_cpu: true
+
+# A list of one or more datasets to finetune the model with
+datasets:
+  # HuggingFace dataset repo | s3://,gs:// path | "json" for local dataset, make sure to fill data_files
+  - path: vicgalle/alpaca-gpt4
+  # The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
+    type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
+    ds_type: # Optional[str] (json|arrow|parquet|text|csv) defines the datatype when path is a file
+    data_files: # Optional[str] path to source data files
+    shards: # Optional[int] number of shards to split data into
+    name: # Optional[str] name of dataset configuration to load
+    train_on_split: train # Optional[str] name of dataset split to load from
+
+    # Optional[str] fastchat conversation type, only used with type: sharegpt
+    conversation: # Options (see Conversation 'name'): https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
+    field_human: # Optional[str]. Human key to use for conversation.
+    field_model: # Optional[str]. Assistant key to use for conversation.
+    # Add additional keys from your dataset as input or output roles
+    roles:
+      input: # Optional[List[str]]. These will be masked based on train_on_inputs
+      output: # Optional[List[str]].
+
+  # Custom user instruction prompt
+  - path: repo
+    type:
+      # The below are defaults. only set what's needed if you use a different column name.
+      system_prompt: ""
+      system_format: "{system}"
+      field_system: system
+      field_instruction: instruction
+      field_input: input
+      field_output: output
+
+      # Customizable to be single line or multi-line
+      # Use {instruction}/{input} as key to be replaced
+      # 'format' can include {input}
+      format: |-
+        User: {instruction} {input}
+        Assistant:
+      # 'no_input_format' cannot include {input}
+      no_input_format: "{instruction} "
+
+      # For `completion` datasets only, uses the provided field instead of the `text` column
+      field:
+
+# If false, the datasets will not be shuffled and will keep their original order in `datasets`.
+# The same applies to the `test_datasets` option and the `pretraining_dataset` option. Default is true.
+shuffle_merged_datasets: true
+
+# A list of one or more datasets to eval the model with.
+# You can use either test_datasets, or val_set_size, but not both.
+test_datasets:
+  - path: /workspace/data/eval.jsonl
+    ds_type: json
+    # You need to specify a split. For "json" datasets the default split is called "train".
+    split: train
+    type: completion
+    data_files:
+      - /workspace/data/eval.jsonl
+
+# use RL training: 'dpo', 'ipo', 'kto_pair'
+rl:
+
+# Saves the desired chat template to the tokenizer_config.json for easier inferencing
+# Currently supports chatml and inst (mistral/mixtral)
+chat_template: chatml
+# Changes the default system message
+default_system_message: You are a helpful assistant. Please give a long and detailed answer. # Currently only supports chatml.
+# Axolotl attempts to save the dataset as an arrow after packing the data together so
+# subsequent training attempts load faster, relative path
+dataset_prepared_path: data/last_run_prepared
+# Push prepared dataset to hub
+push_dataset_to_hub: # repo path
+# The maximum number of processes to use while preprocessing your input dataset. This defaults to `os.cpu_count()`
+# if not set.
+dataset_processes: # defaults to os.cpu_count() if not set
+# Keep dataset in memory while preprocessing
+# Only needed if cached dataset is taking too much storage
+dataset_keep_in_memory:
+# push checkpoints to hub
+hub_model_id: # private repo path to push finetuned model
+# how to push checkpoints to hub
+# https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/trainer#transformers.TrainingArguments.hub_strategy
+hub_strategy:
+# Whether to use hf `use_auth_token` for loading datasets. Useful for fetching private datasets
+# Required to be true when used in combination with `push_dataset_to_hub`
+hf_use_auth_token: # boolean
+# How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 0 for no eval.
+val_set_size: 0.04
+# Num shards for whole dataset
+dataset_shard_num:
+# Index of shard to use for whole dataset
+dataset_shard_idx:
+
+# The maximum length of an input to train with, this should typically be less than 2048
+# as most models have a token/context limit of 2048
+sequence_len: 2048
+# Pad inputs so each step uses constant sized buffers
+# This will reduce memory fragmentation and may prevent OOMs, by re-using memory more efficiently
+pad_to_sequence_len:
+# Use efficient multi-packing with block diagonal attention and per sequence position_ids. Recommend set to 'true'
+sample_packing:
+# Set to 'false' if getting errors during eval with sample_packing on.
+eval_sample_packing:
+# You can set these packing optimizations AFTER starting a training at least once.
+# The trainer will provide recommended values for these values.
+sample_packing_eff_est:
+total_num_tokens:
+
+# Passed through to transformers when loading the model when launched without accelerate
+# Use `sequential` when training w/ model parallelism to limit memory
+device_map:
+# Defines the max memory usage per gpu on the system. Passed through to transformers when loading the model.
+max_memory:
+
+# If you want to use 'lora' or 'qlora' or leave blank to train all parameters in original model
+adapter: lora
+# If you already have a lora model trained that you want to load, put that here.
+# This means after training, if you want to test the model, you should set this to the value of `output_dir`.
+# Note that if you merge an adapter to the base model, a new subdirectory `merged` will be created under the `output_dir`.
+lora_model_dir:
+
+# LoRA hyperparameters
+# For more details about the following options, see:
+# https://www.anyscale.com/blog/fine-tuning-llms-lora-or-full-parameter-an-in-depth-analysis-with-llama-2
+lora_r: 8
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_modules:
+  - q_proj
+  - v_proj
+#  - k_proj
+#  - o_proj
+#  - gate_proj
+#  - down_proj
+#  - up_proj
+lora_target_linear: # If true, will target all linear modules
+peft_layers_to_transform: # The layer indices to transform, otherwise, apply to all layers
+
+# If you added new tokens to the tokenizer, you may need to save some LoRA modules because they need to know the new tokens.
+# For LLaMA and Mistral, you need to save `embed_tokens` and `lm_head`. It may vary for other models.
+# `embed_tokens` converts tokens to embeddings, and `lm_head` converts embeddings to token probabilities.
+# https://github.com/huggingface/peft/issues/334#issuecomment-1561727994
+lora_modules_to_save:
+#  - embed_tokens
+#  - lm_head
+
+lora_fan_in_fan_out: false
+
+peft:
+  # Configuration options for loftq initialization for LoRA
+  # https://huggingface.co/docs/peft/developer_guides/quantization#loftq-initialization
+  loftq_config:
+    loftq_bits:  # typically 4 bits
+
+# ReLoRA configuration
+# Must use either 'lora' or 'qlora' adapter, and does not support fsdp or deepspeed
+relora_steps: # Number of steps per ReLoRA restart
+relora_warmup_steps: # Number of per-restart warmup steps
+relora_anneal_steps: # Number of anneal steps for each relora cycle
+relora_prune_ratio: # threshold for optimizer magnitude when pruning
+relora_cpu_offload: # True to perform lora weight merges on cpu during restarts, for modest gpu memory savings
+
+# wandb configuration if you're using it
+# Make sure your `WANDB_API_KEY` environment variable is set (recommended) or you login to wandb with `wandb login`.
+wandb_mode: # "offline" to save run metadata locally and not sync to the server, "disabled" to turn off wandb
+wandb_project: # Your wandb project name
+wandb_entity: # A wandb Team name if using a Team
+wandb_watch:
+wandb_name: # Set the name of your wandb run
+wandb_run_id: # Set the ID of your wandb run
+wandb_log_model: # "checkpoint" to log model to wandb Artifacts every `save_steps` or "end" to log only at the end of training
+
+# mlflow configuration if you're using it
+mlflow_tracking_uri: # URI to mlflow
+mlflow_experiment_name: # Your experiment name
+hf_mlflow_log_artifacts:  # set to true to copy each saved checkpoint on each save to mlflow artifact registry
+
+# Where to save the full-finetuned model to
+output_dir: ./completed-model
+
+# Whether to use torch.compile and which backend to use
+torch_compile:  # bool
+torch_compile_backend:  # Optional[str]
+
+# Training hyperparameters
+
+# If greater than 1, the weight update will be skipped and the gradients will be accumulated for the given number of steps.
+gradient_accumulation_steps: 1
+# The number of samples to include in each batch. This is the number of samples sent to each GPU.
+micro_batch_size: 2
+eval_batch_size:
+num_epochs: 4
+warmup_steps: 100  # cannot use with warmup_ratio
+warmup_ratio: 0.05  # cannot use with warmup_steps
+learning_rate: 0.00003
+lr_quadratic_warmup:
+logging_steps:
+eval_steps: # Leave empty to eval at each epoch, integers for every N steps. decimal for fraction of total steps
+evals_per_epoch: # number of times per epoch to run evals, mutually exclusive with eval_steps
+save_strategy: # Set to `no` to skip checkpoint saves
+save_steps: # Leave empty to save at each epoch
+saves_per_epoch: # number of times per epoch to save a checkpoint, mutually exclusive with save_steps
+save_total_limit: # Maximum number of checkpoints kept at a time
+# Maximum number of iterations to train for. It precedes num_epochs which means that
+# if both are set, num_epochs will not be guaranteed.
+# e.g., when 1 epoch is 1000 steps => `num_epochs: 2` and `max_steps: 100` will train for 100 steps
+max_steps:
+
+eval_table_size: # Approximate number of predictions sent to wandb depending on batch size. Enabled above 0. Default is 0
+eval_max_new_tokens: # Total number of tokens generated for predictions sent to wandb. Default is 128
+eval_causal_lm_metrics: # HF evaluate metrics used during evaluation. Default is ["sacrebleu", "comet", "ter", "chrf"]
+
+loss_watchdog_threshold: # High loss value, indicating the learning has broken down (a good estimate is ~2 times the loss at the start of training)
+loss_watchdog_patience: # Number of high-loss steps in a row before the trainer aborts (default: 3)
+
+# Save model as safetensors (require safetensors package)
+save_safetensors:
+
+# Whether to mask out or include the human's prompt from the training labels
+train_on_inputs: false
+# Group similarly sized data to minimize padding.
+# May be slower to start, as it must download and sort the entire dataset.
+# Note that training loss may have an oscillating pattern with this enabled.
+group_by_length: false
+
+# Whether to use gradient checkpointing https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing
+gradient_checkpointing: false
+# additional kwargs to pass to the trainer for gradient checkpointing
+# gradient_checkpointing_kwargs:
+#   use_reentrant: true
+
+# Stop training after this many evaluation losses have increased in a row
+# https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
+early_stopping_patience: 3
+
+# Specify a scheduler and kwargs to use with the optimizer
+lr_scheduler: # 'one_cycle' | 'log_sweep' | empty for cosine
+lr_scheduler_kwargs:
+cosine_min_lr_ratio: # decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of peak lr
+cosine_constant_lr_ratio: # freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means start cosine_min_lr at 80% of training step (https://arxiv.org/pdf/2308.04014.pdf)
+
+# For one_cycle optim
+lr_div_factor: # Learning rate div factor
+
+# Specify optimizer
+# Valid values are driven by the Transformers OptimizerNames class, see:
+# https://github.com/huggingface/transformers/blob/95b374952dc27d8511541d6f5a4e22c9ec11fb24/src/transformers/training_args.py#L134
+#
+# Note that not all optimizers may be available in your environment, ex: 'adamw_anyprecision' is part of
+# torchdistx, 'adamw_bnb_8bit' is part of bnb.optim.Adam8bit, etc. When in doubt, it is recommended to start with the optimizer used
+# in the examples/ for your model and fine-tuning use case.
+#
+# Valid values for 'optimizer' include:
+# - adamw_hf
+# - adamw_torch
+# - adamw_torch_fused
+# - adamw_torch_xla
+# - adamw_apex_fused
+# - adafactor
+# - adamw_anyprecision
+# - sgd
+# - adagrad
+# - adamw_bnb_8bit
+# - lion_8bit
+# - lion_32bit
+# - paged_adamw_32bit
+# - paged_adamw_8bit
+# - paged_lion_32bit
+# - paged_lion_8bit
+# - galore_adamw
+# - galore_adamw_8bit
+# - galore_adafactor
+# - galore_adamw_layerwise
+# - galore_adamw_8bit_layerwise
+# - galore_adafactor_layerwise
+optimizer:
+# Dictionary of arguments to pass to the optimizer
+optim_args:
+# For Galore Optimizers the following optim_args are available
+# rank:  # type: int
+# update_proj_gap:  # type: int
+# scale:  # type: float
+# proj_type:  # type: str, default = std
+
+# The target modules to optimize, i.e. the module names that you would like to train, right now this is used only for GaLore algorithm
+optim_target_modules:
+# - self_attn  # for llama
+# - mlp
+
+# Specify weight decay
+weight_decay:
+# adamw hyperparams
+adam_beta1:
+adam_beta2:
+adam_epsilon:
+# Gradient clipping max norm
+max_grad_norm:
+
+# Augmentation techniques
+# NEFT https://arxiv.org/abs/2310.05914, set this to a number (paper default is 5) to add noise to embeddings
+# currently only supported on Llama and Mistral
+neftune_noise_alpha:
+
+# Whether to use bettertransformers
+flash_optimum:
+# Whether to use xformers attention patch https://github.com/facebookresearch/xformers:
+xformers_attention:
+# Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention:
+flash_attention:
+flash_attn_cross_entropy:  # Whether to use flash-attention cross entropy implementation - advanced use only
+flash_attn_rms_norm:  # Whether to use flash-attention rms norm implementation - advanced use only
+flash_attn_fuse_qkv: # Whether to fuse QKV into a single operation
+flash_attn_fuse_mlp: # Whether to fuse part of the MLP into a single operation
+# Whether to use scaled-dot-product attention
+# https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
+sdp_attention:
+# Shifted-sparse attention (only llama) - https://arxiv.org/pdf/2309.12307.pdf
+s2_attention:
+# Resume from a specific checkpoint dir
+resume_from_checkpoint:
+# If resume_from_checkpoint isn't set and you simply want it to start where it left off.
+# Be careful with this being turned on between different models.
+auto_resume_from_checkpoints: false
+
+# Don't mess with this, it's here for accelerate and torchrun
+local_rank:
+
+# Add or change special tokens.
+# If you add tokens here, you don't need to add them to the `tokens` list.
+special_tokens:
+  # bos_token: "<s>"
+  # eos_token: "</s>"
+  # unk_token: "<unk>"
+
+# Add extra tokens.
+tokens:
+
+# FSDP
+fsdp:
+fsdp_config:
+
+# Deepspeed config path. e.g., deepspeed_configs/zero3.json
+deepspeed:
+
+# Advanced DDP Arguments
+ddp_timeout:
+ddp_bucket_cap_mb:
+ddp_broadcast_buffers:
+
+# Path to torch distx for optim 'adamw_anyprecision'
+torchdistx_path:
+
+# Set to HF dataset for type: 'completion' for streaming instead of pre-tokenize
+pretraining_dataset:
+
+# Debug mode
+debug:
+
+# Seed
+seed:
+
+# Allow overwriting the yml config from the CLI
+strict:
 ```
--- a/docs/dataset-formats/conversation.qmd
+++ b/docs/dataset-formats/conversation.qmd
@@ -0,0 +1,63 @@
+---
+title: Conversation
+description: Conversation format for supervised fine-tuning.
+order: 3
+---
+
+## sharegpt
+
+conversations where `from` is `human`/`gpt`. (optional: first row with role `system` to override default system prompt)
+
+```{.json filename="data.jsonl"}
+{"conversations": [{"from": "...", "value": "..."}]}
+```
+
+Note: `type: sharegpt` opens special configs:
+- `conversation`: enables conversions to many Conversation types. Refer to the 'name' [here](https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py) for options.
+- `roles`: allows you to specify the roles for input and output. This is useful for datasets with custom roles such as `tool` etc to support masking.
+- `field_human`: specify the key to use instead of `human` in the conversation.
+- `field_model`: specify the key to use instead of `gpt` in the conversation.
+
+```yaml
+datasets:
+  - path: ...
+    type: sharegpt
+
+    conversation: # Options (see Conversation 'name'): https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
+    field_human: # Optional[str]. Human key to use for conversation.
+    field_model: # Optional[str]. Assistant key to use for conversation.
+    # Add additional keys from your dataset as input or output roles
+    roles:
+      input: # Optional[List[str]]. These will be masked based on train_on_input
+      output: # Optional[List[str]].
+```
+
+## pygmalion
+
+```{.json filename="data.jsonl"}
+{"conversations": [{"role": "...", "value": "..."}]}
+```
+
+## sharegpt.load_role
+
+conversations where `role` is used instead of `from`
+
+```{.json filename="data.jsonl"}
+{"conversations": [{"role": "...", "value": "..."}]}
+```
+
+## sharegpt.load_guanaco
+
+conversations where `from` is `prompter`/`assistant` instead of the default sharegpt roles
+
+```{.json filename="data.jsonl"}
+{"conversations": [{"from": "...", "value": "..."}]}
+```
+
+## sharegpt_jokes
+
+creates a chat where bot is asked to tell a joke, then explain why the joke is funny
+
+```{.json filename="data.jsonl"}
+{"conversations": [{"title": "...", "text": "...", "explanation": "..."}]}
+```
--- a/docs/dataset-formats/index.qmd
+++ b/docs/dataset-formats/index.qmd
@@ -0,0 +1,14 @@
+---
+title: Dataset Formats
+description: Supported dataset formats.
+listing:
+  fields: [title, description]
+  type: table
+  sort-ui: false
+  filter-ui: false
+  max-description-length: 250
+---
+
+Axolotl supports a variety of dataset formats.  It is recommended to use a JSONL format.  The schema of the JSONL depends upon the task and the prompt template you wish to use. Instead of a JSONL, you can also use a HuggingFace dataset with columns for each JSONL field.
+
+Below are these various formats organized by task:
--- a/docs/dataset-formats/inst_tune.qmd
+++ b/docs/dataset-formats/inst_tune.qmd
@@ -0,0 +1,189 @@
+---
+title: Instruction Tuning
+description: Instruction tuning formats for supervised fine-tuning.
+order: 2
+---
+
+## alpaca
+
+instruction; input(optional)
+
+```{.json filename="data.jsonl"}
+{"instruction": "...", "input": "...", "output": "..."}
+```
+
+## jeopardy
+
+question and answer
+
+```{.json filename="data.jsonl"}
+{"question": "...", "category": "...", "answer": "..."}
+```
+
+## oasst
+
+instruction
+
+```{.json filename="data.jsonl"}
+{"INSTRUCTION": "...", "RESPONSE": "..."}
+```
+
+## gpteacher
+
+instruction; input(optional)
+
+```{.json filename="data.jsonl"}
+{"instruction": "...", "input": "...", "response": "..."}
+```
+
+## reflection
+
+instruction with reflect; input(optional)
+
+```{.json filename="data.jsonl"}
+{"instruction": "...", "input": "...", "output": "...", "reflection": "...", "corrected": "..."}
+```
+
+## explainchoice
+
+question, choices, (solution OR explanation)
+
+```{.json filename="data.jsonl"}
+{"question": "...", "choices": ["..."], "solution": "...", "explanation": "..."}
+```
+
+## concisechoice
+
+question, choices, (solution OR explanation)
+
+```{.json filename="data.jsonl"}
+{"question": "...", "choices": ["..."], "solution": "...", "explanation": "..."}
+```
+
+## summarizetldr
+
+article and summary
+
+```{.json filename="data.jsonl"}
+{"article": "...", "summary": "..."}
+```
+
+## alpaca_chat
+
+basic instruct for alpaca chat
+
+```{.json filename="data.jsonl"}
+{"instruction": "...", "input": "...", "response": "..."}
+```
+
+## alpaca_chat.load_qa
+
+question and answer for alpaca chat
+
+```{.json filename="data.jsonl"}
+{"question": "...", "answer": "..."}
+```
+
+## alpaca_chat.load_concise
+
+question and answer for alpaca chat, for concise answers
+
+```{.json filename="data.jsonl"}
+{"instruction": "...", "input": "...", "response": "..."}
+```
+
+## alpaca_chat.load_camel_ai
+
+question and answer for alpaca chat, for load_camel_ai
+
+```{.json filename="data.jsonl"}
+{"message_1": "...", "message_2": "..."}
+```
+
+## alpaca_w_system.load_open_orca
+
+support for open orca datasets with included system prompts, instruct
+
+```{.json filename="data.jsonl"}
+{"system_prompt": "...", "question": "...", "response": "..."}
+```
+
+## context_qa
+
+in context question answering from an article
+
+```{.json filename="data.jsonl"}
+{"article": "...", "question": "...", "answer": "..."}
+```
+
+## context_qa.load_v2
+
+in context question answering (alternate)
+
+```{.json filename="data.jsonl"}
+{"context": "...", "question": "...", "answer": "..."}
+```
+
+## context_qa.load_404
+
+in context question answering from an article, with default response for no answer from context
+
+```{.json filename="data.jsonl"}
+{"article": "...", "unanswerable_question": "..."}
+```
+
+## creative_acr.load_answer
+
+instruction and revision
+
+```{.json filename="data.jsonl"}
+{"instruction": "...", "revision": "..."}
+```
+
+## creative_acr.load_critique
+
+critique
+
+```{.json filename="data.jsonl"}
+{"scores": "...", "critiques": "...", "instruction": "...", "answer": "..."}
+```
+
+## creative_acr.load_revise
+
+critique and revise
+
+```{.json filename="data.jsonl"}
+{"scores": "...", "critiques": "...", "instruction": "...", "answer": "...", "revision": "..."}
+```
+
+## metharme
+
+instruction, adds additional eos tokens
+
+```{.json filename="data.jsonl"}
+{"prompt": "...", "generation": "..."}
+```
+
+## How to add custom prompt format
+
+For a dataset that is preprocessed for instruction purposes:
+
+```{.json filename="data.jsonl"}
+{"input": "...", "output": "..."}
+```
+
+You can use this example in your YAML config:
+
+```{.yaml filename="config.yaml"}
+datasets:
+  - path: repo
+    type:
+      system_prompt: ""
+      field_system: system
+      field_instruction: input
+      field_output: output
+      format: "[INST] {instruction} [/INST]"
+      no_input_format: "[INST] {instruction} [/INST]"
+```
+
+See the full list of config options [here](../config.qmd).
--- a/docs/dataset-formats/pretraining.qmd
+++ b/docs/dataset-formats/pretraining.qmd
@@ -0,0 +1,26 @@
+---
+title: Pre-training
+description: Data format for a pre-training completion task.
+order: 1
+---
+
+For pretraining, there is no prompt template or roles.  The only required field is `text`:
+
+```{.json filename="data.jsonl"}
+{"text": "first row"}
+{"text": "second row"}
+...
+```
+
+:::{.callout-note}
+
+### Streaming is recommended for large datasets
+
+Axolotl usually loads the entire dataset into memory. This will be challenging for large datasets. Use the following config to enable streaming:
+
+```{.yaml filename="config.yaml"}
+pretraining_dataset: # hf path only
+...
+```
+
+:::
--- a/docs/dataset-formats/template_free.qmd
+++ b/docs/dataset-formats/template_free.qmd
@@ -0,0 +1,7 @@
+---
+title: Template-Free
+description: Construct prompts without a template.
+order: 4
+---
+
+See [these docs](../input_output.qmd).
--- a/docs/dataset-formats/tokenized.qmd
+++ b/docs/dataset-formats/tokenized.qmd
@@ -0,0 +1,12 @@
+---
+title: Custom Pre-Tokenized Dataset
+description: How to use a custom pre-tokenized dataset.
+order: 5
+---
+
+- Do not pass a `type:` in your axolotl config.
+- Columns in Dataset must be exactly `input_ids`, `attention_mask`, `labels`
+
+```{.yaml filename="config.yml"}
+- path: ...
+```
--- a/docs/fsdp_qlora.qmd
+++ b/docs/fsdp_qlora.qmd
@@ -1,5 +1,5 @@
 ---
-title: FDSP + QLoRA
+title: "FSDP + QLoRA"
 description: Use FSDP with QLoRA to fine-tune large LLMs on consumer GPUs.
 format:
  html:
--- a/docs/input_output.qmd
+++ b/docs/input_output.qmd
@@ -43,7 +43,7 @@ labels so that your model can focus on predicting the outputs only.
 ### You may not want prompt templates

 However, there are many situations where you don't want to use one of
-these formats or templates (I usually don't!). This is because they can:
+these formats or templates. This is because they can:

 -   Add unnecessary boilerplate to your prompts.
 -   Create artifacts like special delimiters `<|im_start|>` that can
@@ -91,8 +91,9 @@ format into a jsonl file (below is the first row from the file

 ```bash
 $ head -n1 output.jsonl | python -m json.tool
+```

-{.cell-output .cell-output-stdout}
+:::{.cell-output .cell-output-stdout}
    {
        "segments": [
            {
@@ -113,7 +114,7 @@ $ head -n1 output.jsonl | python -m json.tool
            }
        ]
    }
-```
+:::

 Set `label:false` when you want to mask a segment of text so that the
 model isn't trained on it. Some things to keep in mind:
@@ -238,8 +239,9 @@ version is repeated below for reference):

 ```bash
 $ head -n1 output.jsonl | python -m json.tool
+```

-{.cell-output .cell-output-stdout}
+:::{.cell-output .cell-output-stdout}
    {
        "segments": [
            {
@@ -260,4 +262,4 @@ $ head -n1 output.jsonl | python -m json.tool
            }
        ]
    }
-```
+:::
--- a/examples/gemma/qlora.yml
+++ b/examples/gemma/qlora.yml
@@ -21,7 +21,8 @@ lora_dropout: 0.05
 lora_target_linear: true

 sequence_len: 4096
-sample_packing: false
+sample_packing: true
+eval_sample_packing: false
 pad_to_sequence_len: true

 wandb_project:
--- a/examples/jamba/README.md
+++ b/examples/jamba/README.md
@@ -0,0 +1,10 @@
+# Jamba
+
+- ✅ qlora w/ deepspeed Zero-2 needs at least 2x GPUs and
+  - 35GiB VRAM per GPU w minimal context length
+  - 56GiB VRAM per GPU (w multipack enabled)
+- ✅ qlora w/ deepspeed Zero-3 needs at least 2x GPUs and 67GiB VRAM (wtf?)
+- ✅ qlora single-gpu, ~51GiB VRAM
+- ✅ multipack
+- ❓ FSDP
+- ❓ 8-bit LoRA
--- a/examples/jamba/qlora.yaml
+++ b/examples/jamba/qlora.yaml
@@ -0,0 +1,62 @@
+base_model: ai21labs/Jamba-v0.1
+trust_remote_code: true
+
+load_in_8bit: false
+load_in_4bit: true
+strict: false
+
+datasets:
+  - path: mhenrichsen/alpaca_2k_test
+    type: alpaca
+dataset_prepared_path:
+val_set_size: 0.0
+output_dir: ./out
+
+sequence_len: 4096
+sample_packing: false
+pad_to_sequence_len: false
+eval_sample_packing: false
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+adapter: qlora
+lora_r: 8
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_linear: true
+
+low_cpu_mem_usage: true
+gradient_accumulation_steps: 4
+micro_batch_size: 1
+num_epochs: 2
+optimizer: paged_adamw_8bit
+lr_scheduler: cosine
+learning_rate: 0.00001
+
+train_on_inputs: false
+group_by_length: false
+bf16: auto
+fp16:
+tf32: false
+
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+
+warmup_steps: 10
+evals_per_epoch:
+saves_per_epoch: 1
+debug:
+deepspeed:
+weight_decay: 0.0
+special_tokens:
--- a/examples/jamba/qlora_deepspeed.yaml
+++ b/examples/jamba/qlora_deepspeed.yaml
@@ -0,0 +1,62 @@
+base_model: ai21labs/Jamba-v0.1
+trust_remote_code: true
+
+load_in_8bit: false
+load_in_4bit: true
+strict: false
+
+datasets:
+  - path: mhenrichsen/alpaca_2k_test
+    type: alpaca
+dataset_prepared_path:
+val_set_size: 0.0
+output_dir: ./out
+
+sequence_len: 4096
+sample_packing: false
+pad_to_sequence_len: false
+eval_sample_packing: false
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+adapter: qlora
+lora_r: 8
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_linear: true
+
+low_cpu_mem_usage: true
+gradient_accumulation_steps: 4
+micro_batch_size: 1
+num_epochs: 2
+optimizer: paged_adamw_8bit
+lr_scheduler: cosine
+learning_rate: 0.00001
+
+train_on_inputs: false
+group_by_length: false
+bf16: auto
+fp16:
+tf32: false
+
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+
+warmup_steps: 10
+evals_per_epoch:
+saves_per_epoch: 1
+debug:
+deepspeed: deepspeed_configs/zero2.json
+weight_decay: 0.0
+special_tokens:
--- a/examples/llama-2/lisa.yml
+++ b/examples/llama-2/lisa.yml
@@ -0,0 +1,75 @@
+base_model: NousResearch/Llama-2-7b-hf
+model_type: LlamaForCausalLM
+tokenizer_type: LlamaTokenizer
+
+load_in_8bit: false
+load_in_4bit: false
+strict: false
+
+datasets:
+  - path: teknium/GPT4-LLM-Cleaned
+    type: alpaca
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.05
+output_dir: ./lisa-out
+
+sequence_len: 4096
+sample_packing: true
+pad_to_sequence_len: true
+
+adapter:
+lora_model_dir:
+lora_r:
+lora_alpha:
+lora_dropout:
+lora_target_linear:
+lora_fan_in_fan_out:
+
+lisa_n_layers: 4
+lisa_step_interval: 20
+lisa_layers_attribute: model.layers
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+gradient_accumulation_steps: 2
+micro_batch_size: 1
+num_epochs: 1
+optimizer: adamw_bnb_8bit
+lr_scheduler: cosine
+learning_rate: 0.00005  # recommendation from lisa paper for 7b (plain decimal: PyYAML reads bare `5e-5` as a string)
+
+train_on_inputs: false
+group_by_length: false
+bf16: auto
+fp16:
+tf32: false
+
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+flash_attn_cross_entropy: false
+flash_attn_rms_norm: true
+flash_attn_fuse_qkv: false
+flash_attn_fuse_mlp: true
+
+warmup_steps: 100
+evals_per_epoch: 4
+eval_table_size:
+saves_per_epoch: 1
+debug:
+deepspeed:
+weight_decay: 0.1
+fsdp:
+fsdp_config:
+special_tokens:
+  bos_token: "<s>"
+  eos_token: "</s>"
+  unk_token: "<unk>"
--- a/examples/qwen/README.md
+++ b/examples/qwen/README.md
@@ -0,0 +1,10 @@
+# Qwen
+
+TODO
+
+# Qwen2 MoE
+
+- ✅ multipack
+- ✅ qwen2_moe 4-bit QLoRA
+- ✅ qwen2_moe 16-bit LoRA
+- ❓ qwen2_moe 8-bit LoRA
--- a/examples/qwen/qwen2-moe-lora.yaml
+++ b/examples/qwen/qwen2-moe-lora.yaml
@@ -0,0 +1,64 @@
+base_model: Qwen/Qwen1.5-MoE-A2.7B
+trust_remote_code: true
+
+load_in_8bit: false
+load_in_4bit: false
+strict: false
+
+datasets:
+  - path: mhenrichsen/alpaca_2k_test
+    type: alpaca
+dataset_prepared_path:
+val_set_size: 0.05
+output_dir: ./out
+
+sequence_len: 1024  # supports up to 32k
+sample_packing: false
+pad_to_sequence_len: false
+
+adapter: lora
+lora_model_dir:
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_linear: true
+lora_fan_in_fan_out:
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+gradient_accumulation_steps: 4
+micro_batch_size: 1
+num_epochs: 4
+optimizer: paged_adamw_8bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+train_on_inputs: false
+group_by_length: false
+bf16: auto
+fp16:
+tf32: true
+
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+
+warmup_steps: 10
+evals_per_epoch: 4
+saves_per_epoch: 1
+debug:
+deepspeed:
+weight_decay: 0.0
+fsdp:
+fsdp_config:
+special_tokens:
--- a/examples/qwen/qwen2-moe-qlora.yaml
+++ b/examples/qwen/qwen2-moe-qlora.yaml
@@ -0,0 +1,64 @@
+base_model: Qwen/Qwen1.5-MoE-A2.7B
+trust_remote_code: true
+
+load_in_8bit: false
+load_in_4bit: true
+strict: false
+
+datasets:
+  - path: mhenrichsen/alpaca_2k_test
+    type: alpaca
+dataset_prepared_path:
+val_set_size: 0.05
+output_dir: ./out
+
+sequence_len: 1024  # supports up to 32k
+sample_packing: false
+pad_to_sequence_len: false
+
+adapter: qlora
+lora_model_dir:
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_linear: true
+lora_fan_in_fan_out:
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+gradient_accumulation_steps: 4
+micro_batch_size: 1
+num_epochs: 4
+optimizer: paged_adamw_8bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+train_on_inputs: false
+group_by_length: false
+bf16: auto
+fp16:
+tf32: true
+
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+
+warmup_steps: 10
+evals_per_epoch: 4
+saves_per_epoch: 1
+debug:
+deepspeed:
+weight_decay: 0.0
+fsdp:
+fsdp_config:
+special_tokens:
--- a/index.qmd
+++ b/index.qmd
@@ -1,4 +1,8 @@
-
+---
+toc-location: right-body
+toc-title: Table Of Contents
+toc-expand: 2
+---

 ```{python}
 #|output: asis
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
 packaging==23.2
-peft==0.9.0
-transformers @ git+https://github.com/huggingface/transformers.git@73a73b415e36f41481369f6129cb4b62bb127a78
+peft==0.10.0
+transformers @ git+https://github.com/huggingface/transformers.git@43d17c18360ac9c3d3491389328e2fe55fe8f9ce
 tokenizers==0.15.0
 bitsandbytes==0.43.0
 accelerate==0.28.0
@@ -32,11 +32,12 @@ fschat==0.2.36
 gradio==3.50.2
 tensorboard

-mamba-ssm==1.1.1
+mamba-ssm==1.2.0.post1

 # remote filesystems
 s3fs
 gcsfs
 # adlfs

-trl @ git+https://github.com/huggingface/trl.git@304e208f778a5442c30cdda500348226cdc97d90
+trl @ git+https://github.com/huggingface/trl.git@0ee349dcd43b0f4b3169449f16751c38ac4a609f
+zstandard==0.22.0
--- a/setup.py
+++ b/setup.py
@@ -78,7 +78,7 @@ setup(
            "deepspeed-kernels",
        ],
        "mamba-ssm": [
-            "mamba-ssm==1.0.1",
+            "mamba-ssm==1.2.0.post1",
        ],
        "auto-gptq": [
            "auto-gptq==0.5.1",
--- a/src/axolotl/cli/init.py
+++ b/src/axolotl/cli/init.py
@@ -24,6 +24,7 @@ from huggingface_hub import HfApi
 from huggingface_hub.utils import LocalTokenNotFoundError
 from transformers import GenerationConfig, TextIteratorStreamer, TextStreamer
 from transformers.utils import is_torch_bf16_gpu_available
+from transformers.utils.import_utils import _is_package_available

 from axolotl.common.cli import TrainerCliArgs, load_model_and_tokenizer
 from axolotl.logging_config import configure_logging
@@ -62,6 +63,20 @@ def print_axolotl_text_art(suffix=None):
    if is_main_process():
        print(ascii_art)

+    print_dep_versions()
+
+
+def print_dep_versions():
+    """Print a banner with the versions of key dependencies (main process only)."""
+    packages = ["accelerate", "peft", "transformers", "trl", "torch", "bitsandbytes"]
+    # width of the longest package name, used to right-align the name column
+    max_len = max(map(len, packages))
+    if not is_main_process():
+        return
+    banner = "*" * 40
+    print(banner)
+    print("**** Axolotl Dependency Versions *****")
+    for pkg in packages:
+        # index 1 of the returned value is the version string that gets printed
+        pkg_version = _is_package_available(pkg, return_version=True)[1]
+        print(f"{pkg: >{max_len}}: {pkg_version: <15}")
+    print(banner)
+

 def check_remote_config(config: Union[str, Path]):
    # Check if the config is a valid HTTPS URL to a .yml or .yaml file
--- a/src/axolotl/cli/merge_lora.py
+++ b/src/axolotl/cli/merge_lora.py
@@ -38,6 +38,8 @@ def do_cli(config: Path = Path("examples/"), **kwargs):
    parsed_cfg.load_in_4bit = False
    parsed_cfg.load_in_8bit = False
    parsed_cfg.flash_attention = False
+    parsed_cfg.deepspeed = None
+    parsed_cfg.fsdp = None

    do_merge_lora(cfg=parsed_cfg, cli_args=parsed_cli_args)

--- a/src/axolotl/core/trainer_builder.py
+++ b/src/axolotl/core/trainer_builder.py
@@ -23,6 +23,7 @@ from torch.optim.lr_scheduler import OneCycleLR
 from torch.utils.data import BatchSampler, DataLoader, RandomSampler, SequentialSampler
 from transformers import (
    EarlyStoppingCallback,
+    PreTrainedModel,
    Trainer,
    TrainerCallback,
    TrainingArguments,
@@ -35,6 +36,7 @@ from trl.trainer.utils import pad_to_length
 from axolotl.loraplus import create_loraplus_optimizer
 from axolotl.monkeypatch.multipack import SUPPORTED_MULTIPACK_MODEL_TYPES
 from axolotl.monkeypatch.relora import ReLoRACallback, ReLoRAScheduler
+from axolotl.utils import is_mlflow_available
 from axolotl.utils.callbacks import (
    EvalFirstStepCallback,
    GPUStatsCallback,
@@ -45,6 +47,7 @@ from axolotl.utils.callbacks import (
    causal_lm_bench_eval_callback_factory,
    log_prediction_callback_factory,
 )
+from axolotl.utils.callbacks.lisa import lisa_callback_factory
 from axolotl.utils.collators import (
    BatchSamplerDataCollatorForSeq2Seq,
    DataCollatorForSeq2Seq,
@@ -69,10 +72,6 @@ except ImportError:
 LOG = logging.getLogger("axolotl.core.trainer_builder")


-def is_mlflow_available():
-    return importlib.util.find_spec("mlflow") is not None
-
-
 def _sanitize_kwargs_for_tagging(tag_names, kwargs=None):
    if isinstance(tag_names, str):
        tag_names = [tag_names]
@@ -200,6 +199,18 @@ class AxolotlTrainingArguments(TrainingArguments):
    orpo_alpha: Optional[float] = field(
        default=None,
    )
+    # LISA setting; defaults to None (unset).
+    lisa_n_layers: Optional[int] = field(
+        default=None,
+        metadata={"help": "the number of active layers in LISA"},
+    )
+    lisa_step_interval: Optional[int] = field(
+        default=None,
+        metadata={"help": "how often to switch layers in LISA"},
+    )
+    lisa_layers_attribute: Optional[str] = field(
+        default=None,
+        metadata={"help": "path under the model to access the layers"},
+    )


 class AxolotlTrainer(Trainer):
@@ -789,6 +800,15 @@ class AxolotlDPOTrainer(DPOTrainer):

        return super().push_to_hub(*args, **kwargs)

+    def tokenize_row(
+        self, feature, model: Optional[Union[PreTrainedModel, torch.nn.Module]] = None
+    ) -> Dict:
+        """
+        Tokenize one row via the parent trainer, then drop a leading `None` token.
+
+        When the tokenizer has no BOS token id and the first prompt input id is
+        `None`, the first element of every field in the result is removed.
+        """
+        tokenized = super().tokenize_row(feature, model=model)
+        has_bos = self.tokenizer.bos_token_id is not None
+        if has_bos or tokenized["prompt_input_ids"][0] is not None:
+            return tokenized
+        # NOTE(review): presumably the parent prepended the (missing) BOS id as
+        # None to every field - confirm against the trl implementation.
+        for name in list(tokenized):
+            tokenized[name] = tokenized[name][1:]
+        return tokenized
+

 class TrainerBuilderBase(abc.ABC):
    """
@@ -920,7 +940,16 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
        callbacks = []
        if self.cfg.use_wandb and self.cfg.eval_table_size > 0:
            LogPredictionCallback = log_prediction_callback_factory(
-                trainer, self.tokenizer
+                trainer, self.tokenizer, "wandb"
+            )
+            callbacks.append(LogPredictionCallback(self.cfg))
+        if (
+            self.cfg.use_mlflow
+            and is_mlflow_available()
+            and self.cfg.eval_table_size > 0
+        ):
+            LogPredictionCallback = log_prediction_callback_factory(
+                trainer, self.tokenizer, "mlflow"
            )
            callbacks.append(LogPredictionCallback(self.cfg))

@@ -938,6 +967,8 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
            )
            callbacks.append(early_stop_cb)

+        if self.cfg.lisa_step_interval and self.cfg.lisa_n_layers:
+            callbacks.append(lisa_callback_factory(trainer))
        return callbacks

    def _get_trainer_cls(self):
@@ -1027,6 +1058,9 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
        if self.cfg.save_safetensors is not None:
            training_arguments_kwargs["save_safetensors"] = self.cfg.save_safetensors

+        if self.cfg.save_only_model is not None:
+            training_arguments_kwargs["save_only_model"] = self.cfg.save_only_model
+
        if self.cfg.sample_packing_eff_est:
            training_arguments_kwargs[
                "sample_packing_efficiency"
@@ -1229,6 +1263,15 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
                    "relora_prune_ratio"
                ] = self.cfg.relora_prune_ratio

+        if self.cfg.lisa_step_interval and self.cfg.lisa_n_layers:
+            training_arguments_kwargs["lisa_n_layers"] = self.cfg.lisa_n_layers
+            training_arguments_kwargs[
+                "lisa_step_interval"
+            ] = self.cfg.lisa_step_interval
+            training_arguments_kwargs[
+                "lisa_layers_attribute"
+            ] = self.cfg.lisa_layers_attribute
+
        training_arguments_kwargs = self.hook_pre_create_training_args(
            training_arguments_kwargs
        )
--- a/src/axolotl/monkeypatch/llama_attn_hijack_flash.py
+++ b/src/axolotl/monkeypatch/llama_attn_hijack_flash.py
@@ -284,12 +284,7 @@ def flashattn_forward_with_s2attn(
    # [bsz, nh, q_len, hd]
    # pylint: disable=duplicate-code

-    kv_seq_len = key_states.shape[-2]
-    if past_key_value is not None:
-        kv_seq_len += past_key_value[0].shape[-2]
-    cos, sin = self.rotary_emb(
-        value_states, seq_len=kv_seq_len, position_ids=position_ids
-    )
+    cos, sin = self.rotary_emb(value_states, position_ids=position_ids)
    query_states, key_states = apply_rotary_pos_emb(
        query_states, key_states, cos, sin, position_ids
    )
@@ -435,13 +430,7 @@ def flashattn_forward(
    # [bsz, q_len, nh, hd]
    # [bsz, nh, q_len, hd]

-    kv_seq_len = key_states.shape[-2]
-    if past_key_value is not None:
-        kv_seq_len += past_key_value[0].shape[-2]
-
-    cos, sin = self.rotary_emb(
-        value_states, seq_len=kv_seq_len, position_ids=position_ids
-    )
+    cos, sin = self.rotary_emb(value_states, position_ids=position_ids)
    query_states, key_states = apply_rotary_pos_emb(
        query_states, key_states, cos, sin, position_ids
    )
--- a/src/axolotl/monkeypatch/llama_attn_hijack_xformers.py
+++ b/src/axolotl/monkeypatch/llama_attn_hijack_xformers.py
@@ -80,11 +80,7 @@ def xformers_forward(
    # [bsz, q_len, nh, hd]
    # [bsz, nh, q_len, hd]

-    kv_seq_len = key_states.shape[-2]
-    if past_key_value is not None:
-        kv_seq_len += past_key_value[0].shape[-2]
-
-    cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+    cos, sin = self.rotary_emb(value_states)
    query_states, key_states = apply_rotary_pos_emb(
        query_states, key_states, cos, sin, position_ids
    )
--- a/src/axolotl/monkeypatch/multipack.py
+++ b/src/axolotl/monkeypatch/multipack.py
@@ -12,6 +12,7 @@ from axolotl.monkeypatch.utils import get_unpad_data
 SUPPORTED_MULTIPACK_MODEL_TYPES = [
    "mixtral",
    "qwen2",
+    "qwen2_moe",
    "falcon",
    "phi",
    "gemma",
@@ -31,6 +32,10 @@ def patch_for_multipack(model_type, model_name=None):
        transformers.models.qwen2.modeling_qwen2._get_unpad_data = (  # pylint: disable=protected-access
            get_unpad_data
        )
+    elif model_type == "qwen2_moe":
+        transformers.models.qwen2_moe.modeling_qwen2_moe._get_unpad_data = (  # pylint: disable=protected-access
+            get_unpad_data
+        )
    elif model_type == "falcon":
        transformers.models.falcon.modeling_falcon._get_unpad_data = (  # pylint: disable=protected-access
            get_unpad_data
@@ -48,14 +53,16 @@ def patch_for_multipack(model_type, model_name=None):
            get_unpad_data
        )
    elif model_type == "gemmoe":
-        model_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
-        # we need to load the model here in order for modeling_gemmoe to be available
-        with init_empty_weights():
-            AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
-        module_name = model_config.__class__.__module__.replace(
-            ".configuration_gemmoe", ".modeling_gemmoe"
-        )
-        modeling_gemmoe = importlib.import_module(module_name)
-        modeling_gemmoe._get_unpad_data = (  # pylint: disable=protected-access
-            get_unpad_data
-        )
+        patch_remote(model_name, ".configuration_gemmoe", ".modeling_gemmoe")
+    elif model_type == "jamba":
+        patch_remote(model_name, ".configuration_jamba", ".modeling_jamba")
+
+
+def patch_remote(model_name, config_name, modeling_name):
+    """
+    Replace `_get_unpad_data` in the modeling module of a remote-code model.
+
+    The modeling module path is derived from the module of the loaded config
+    class by substituting `config_name` with `modeling_name`.
+    """
+    remote_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
+    # the model has to be loaded here so that the modeling_* module is available
+    with init_empty_weights():
+        AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
+    config_module_path = remote_config.__class__.__module__
+    modeling_module_path = config_module_path.replace(config_name, modeling_name)
+    remote_modeling = importlib.import_module(modeling_module_path)
+    remote_modeling._get_unpad_data = (  # pylint: disable=protected-access
+        get_unpad_data
+    )
--- a/src/axolotl/prompt_strategies/pretrain.py
+++ b/src/axolotl/prompt_strategies/pretrain.py
@@ -20,10 +20,11 @@ class PretrainTokenizationStrategy(PromptTokenizingStrategy):
    def supports_batched(self):
        return True

-    def __init__(self, *args, max_length=None, **kwargs):
+    def __init__(self, *args, max_length=None, text_column="text", **kwargs):
        super().__init__(*args, **kwargs)
        if max_length:
            self.max_length = max_length
+        self.text_column = text_column

    def _tokenize(
        self, prompt: str, add_eos_token: bool = True, strip_bos_token: bool = False
@@ -44,7 +45,7 @@ class PretrainTokenizationStrategy(PromptTokenizingStrategy):
        return res

    def tokenize_prompt(self, prompt):
-        return self._tokenize(prompt["text"])
+        return self._tokenize(prompt[self.text_column])


 def load(tokenizer, cfg):
@@ -53,6 +54,7 @@ def load(tokenizer, cfg):
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
+        text_column=cfg.pretraining_dataset[0]["text_column"] or "text",
        max_length=cfg.sequence_len * 64,
    )
    return strat
--- a/src/axolotl/utils/init.py
+++ b/src/axolotl/utils/init.py
@@ -0,0 +1,8 @@
+"""
+Basic utils for Axolotl
+"""
+import importlib
+
+
+def is_mlflow_available():
+    """Return True when the `mlflow` package can be found, without importing it."""
+    # `import importlib` alone does not guarantee the `util` submodule is
+    # loaded; import it explicitly so `find_spec` is always reachable.
+    import importlib.util  # pylint: disable=import-outside-toplevel
+
+    return importlib.util.find_spec("mlflow") is not None
--- a/src/axolotl/utils/callbacks/init.py
+++ b/src/axolotl/utils/callbacks/init.py
@@ -6,7 +6,7 @@ import logging
 import os
 from shutil import copyfile
 from tempfile import NamedTemporaryFile
-from typing import TYPE_CHECKING, Dict, List
+from typing import TYPE_CHECKING, Any, Dict, List

 import evaluate
 import numpy as np
@@ -27,7 +27,9 @@ from transformers import (
 )
 from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR, IntervalStrategy

+from axolotl.utils import is_mlflow_available
 from axolotl.utils.bench import log_gpu_memory_usage
+from axolotl.utils.config.models.input.v0_4_1 import AxolotlInputConfig
 from axolotl.utils.distributed import (
    barrier,
    broadcast_dict,
@@ -540,7 +542,7 @@ def causal_lm_bench_eval_callback_factory(trainer: Trainer, tokenizer):
    return CausalLMBenchEvalCallback


-def log_prediction_callback_factory(trainer: Trainer, tokenizer):
+def log_prediction_callback_factory(trainer: Trainer, tokenizer, logger: str):
    class LogPredictionCallback(TrainerCallback):
        """Callback to log prediction values during each evaluation"""

@@ -597,15 +599,13 @@ def log_prediction_callback_factory(trainer: Trainer, tokenizer):
                return ranges

            def log_table_from_dataloader(name: str, table_dataloader):
-                table = wandb.Table(  # type: ignore[attr-defined]
-                    columns=[
-                        "id",
-                        "Prompt",
-                        "Correct Completion",
-                        "Predicted Completion (model.generate)",
-                        "Predicted Completion (trainer.prediction_step)",
-                    ]
-                )
+                table_data: Dict[str, List[Any]] = {
+                    "id": [],
+                    "Prompt": [],
+                    "Correct Completion": [],
+                    "Predicted Completion (model.generate)": [],
+                    "Predicted Completion (trainer.prediction_step)": [],
+                }
                row_index = 0

                for batch in tqdm(table_dataloader):
@@ -709,16 +709,29 @@ def log_prediction_callback_factory(trainer: Trainer, tokenizer):
                    ) in zip(
                        prompt_texts, completion_texts, predicted_texts, pred_step_texts
                    ):
-                        table.add_data(
-                            row_index,
-                            prompt_text,
-                            completion_text,
-                            prediction_text,
-                            pred_step_text,
+                        table_data["id"].append(row_index)
+                        table_data["Prompt"].append(prompt_text)
+                        table_data["Correct Completion"].append(completion_text)
+                        table_data["Predicted Completion (model.generate)"].append(
+                            prediction_text
                        )
+                        table_data[
+                            "Predicted Completion (trainer.prediction_step)"
+                        ].append(pred_step_text)
                        row_index += 1
+                if logger == "wandb":
+                    wandb.run.log({f"{name} - Predictions vs Ground Truth": pd.DataFrame(table_data)})  # type: ignore[attr-defined]
+                elif logger == "mlflow" and is_mlflow_available():
+                    import mlflow

-                wandb.run.log({f"{name} - Predictions vs Ground Truth": table})  # type: ignore[attr-defined]
+                    tracking_uri = AxolotlInputConfig(
+                        **self.cfg.to_dict()
+                    ).mlflow_tracking_uri
+                    mlflow.log_table(
+                        data=table_data,
+                        artifact_file="PredictionsVsGroundTruth.json",
+                        tracking_uri=tracking_uri,
+                    )

            if is_main_process():
                log_table_from_dataloader("Eval", eval_dataloader)
@@ -748,6 +761,11 @@ class SaveAxolotlConfigtoWandBCallback(TrainerCallback):
                    mode="w", delete=False, suffix=".yml", prefix="axolotl_config_"
                ) as temp_file:
                    copyfile(self.axolotl_config_path, temp_file.name)
+                    artifact = wandb.Artifact(
+                        f"config-{wandb.run.id}", type="axolotl-config"
+                    )
+                    artifact.add_file(temp_file.name)
+                    wandb.log_artifact(artifact)
                    wandb.save(temp_file.name)
                LOG.info(
                    "The Axolotl config has been saved to the WandB run under files."
--- a/src/axolotl/utils/callbacks/lisa.py
+++ b/src/axolotl/utils/callbacks/lisa.py
@@ -0,0 +1,91 @@
+"""
+module for LISA
+
+Adapted from https://github.com/OptimalScale/LMFlow/pull/701 for HF transformers & Axolotl
+Arxiv: https://arxiv.org/abs/2403.17919
+License: Apache 2.0
+"""
+
+import logging
+from functools import reduce
+from typing import TYPE_CHECKING
+
+import numpy as np
+from transformers import TrainerCallback
+
+if TYPE_CHECKING:
+    from axolotl.core.trainer_builder import AxolotlTrainer
+
+LOG = logging.getLogger("axolotl.callbacks.lisa")
+
+
+def lisa_callback_factory(trainer: "AxolotlTrainer"):
+    """
+    Build a LISA layer-switching callback configured from `trainer.args`.
+
+    Reads `lisa_n_layers`, `lisa_step_interval` and `lisa_layers_attribute`
+    from `trainer.args` and returns an instantiated `LISACallback` bound to
+    `trainer`.
+    """
+
+    class LISACallback(TrainerCallback):
+        """trainer callback for lisa layer switching"""
+
+        def __init__(
+            self, n_layers, step_interval, trainer, layers_attribute="model.layers"
+        ):
+            super().__init__()
+            self.n_layers = n_layers
+            self.step_interval = step_interval
+            self.layers_attribute = layers_attribute
+            self.trainer = trainer
+
+            # Resolve the layer container up front so that an invalid
+            # `layers_attribute` raises AttributeError at construction time.
+            layers = self._get_layers()
+            self.total_layers = len(layers)
+            self.active_layers_indices = []
+
+            LOG.info(
+                f"LISA will activate {self.n_layers}/{len(layers)} layers ({self.n_layers*100/len(layers)}%) every {self.step_interval} steps"
+            )
+
+        def _get_layers(self):
+            """Follow the dotted `layers_attribute` path starting at the trainer's model."""
+            return reduce(
+                getattr, self.layers_attribute.split("."), self.trainer.model
+            )
+
+        def freeze_all_layers(self):
+            """Set `requires_grad = False` on every parameter of every layer."""
+            for layer in self._get_layers():
+                for param in layer.parameters():
+                    param.requires_grad = False
+
+        def on_step_begin(
+            self, args, state, control, **kwargs
+        ):  # pylint: disable=unused-argument
+            # Switch active layers whenever global_step is a multiple of the
+            # interval (which includes step 0) and additionally at global_step 1.
+            if state.global_step % self.step_interval == 0 or state.global_step == 1:
+                self.switch_active_layers()
+
+        def switch_active_layers(self):
+            """Freeze all layers, then re-enable gradients on a random subset."""
+            # First, disable gradients for all layers
+            self.freeze_all_layers()
+
+            # Randomly select n_layers distinct layer indices to activate
+            layers = self._get_layers()
+            self.active_layers_indices = np.random.choice(
+                range(self.total_layers), self.n_layers, replace=False
+            )
+            LOG.info(
+                f"Activating layers at indices: {self.active_layers_indices} for the next steps."
+            )
+
+            # Enable gradients only for the selected layers
+            for idx in self.active_layers_indices:
+                for param in layers[idx].parameters():
+                    param.requires_grad = True
+
+    lisa_callback = LISACallback(
+        n_layers=trainer.args.lisa_n_layers,
+        step_interval=trainer.args.lisa_step_interval,
+        trainer=trainer,
+        layers_attribute=trainer.args.lisa_layers_attribute,
+    )
+
+    return lisa_callback
--- a/src/axolotl/utils/chat_templates.py
+++ b/src/axolotl/utils/chat_templates.py
@@ -23,6 +23,7 @@ def chat_templates(user_choice: str):
        "inst": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",  # I don't know what this one is called. Used by Mistral/Mixtral.
        "chatml": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
        "gemma": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\n' + message['content'] | trim + '<end_of_turn>\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\n'}}{% endif %}",
+        "cohere": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true %}{% set loop_messages = messages %}{% set system_message = 'You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + system_message + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% elif message['role'] == 'assistant' %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>'  + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}{% endif %}",
    }

    if user_choice in templates:
--- a/src/axolotl/utils/collators.py
+++ b/src/axolotl/utils/collators.py
@@ -217,13 +217,24 @@ class PretrainingBatchSamplerDataCollatorForSeq2Seq(DataCollatorForSeq2Seq):
    Collator for multipack specific to the using the BatchSampler
    """

+    def __init__(self, *args, multipack_attn=True, **kwargs):
+        """
+        Forward all positional and keyword arguments to the parent collator and
+        store the `multipack_attn` flag, which `__call__` reads to decide how
+        the `attention_mask` feature is assembled.
+        """
+        super().__init__(*args, **kwargs)
+        self.multipack_attn = multipack_attn
+
    def __call__(self, features, return_tensors=None):
        chunked_data = {}
        for feature in features.keys():
            if feature == "length":
                continue
            if feature == "attention_mask":
-                arrays = [(1) * np.array(item) for item in features[feature]]
+                if self.multipack_attn:
+                    arrays = [
+                        (i + 1) * np.array(item[feature])
+                        for i, item in enumerate(features[feature])
+                        if feature in item
+                    ]
+                else:
+                    arrays = [(1) * np.array(item) for item in features[feature]]
                chunked_data[feature] = np.concatenate(arrays)
            else:
                arrays = [np.array(item) for item in features[feature]]
--- a/src/axolotl/utils/config/init.py
+++ b/src/axolotl/utils/config/init.py
@@ -208,11 +208,11 @@ def validate_config(cfg: DictDefault, capabilities: Optional[dict] = None):
            dict(
                AxolotlConfigWCapabilities(
                    **cfg.to_dict(), capabilities=capabilities
-                ).model_dump(exclude_unset=True)
+                ).model_dump(exclude_none=True)
            )
        )
    return DictDefault(
-        dict(AxolotlInputConfig(**cfg.to_dict()).model_dump(exclude_unset=True))
+        dict(AxolotlInputConfig(**cfg.to_dict()).model_dump(exclude_none=True))
    )


--- a/src/axolotl/utils/config/models/input/v0_4_1/init.py
+++ b/src/axolotl/utils/config/models/input/v0_4_1/init.py
@@ -1,12 +1,13 @@
 """
 Module for pydantic models for configuration
 """
+
 # pylint: disable=too-many-lines

 import logging
 import os
 from enum import Enum
-from typing import Any, Dict, List, Literal, Optional, Union
+from typing import Any, Dict, List, Literal, Optional, Tuple, Union

 from pydantic import BaseModel, Field, conlist, field_validator, model_validator
 from transformers import SchedulerType
@@ -61,7 +62,11 @@ class RemappedParameters(BaseModel):
 class PretrainingDataset(BaseModel):
     """pretraining dataset configuration subset"""
 
+    # NOTE(review): field meanings below are inferred from names and defaults
+    # only; confirm against the pretraining dataset loader.
+    # presumably an optional dataset/config name
+    name: Optional[str] = None
     path: Optional[str] = None
+    # defaults to the "train" split
+    split: Optional[str] = "train"
+    # defaults to a column named "text"
+    text_column: Optional[str] = "text"
+    # defaults to "pretrain"
+    type: Optional[str] = "pretrain"


 class UserDefinedPrompterType(BaseModel):
@@ -93,6 +98,7 @@ class SFTDataset(BaseModel):
    ds_type: Optional[str] = None
    train_on_split: Optional[str] = None

+    field: Optional[str] = None
    field_human: Optional[str] = None
    field_model: Optional[str] = None

@@ -136,6 +142,7 @@ class ChatTemplate(str, Enum):
    chatml = "chatml"  # pylint: disable=invalid-name
    inst = "inst"  # pylint: disable=invalid-name
    gemma = "gemma"  # pylint: disable=invalid-name
+    cohere = "cohere"  # pylint: disable=invalid-name


 class LoftQConfig(BaseModel):
@@ -151,12 +158,6 @@ class PeftConfig(BaseModel):
    loftq_config: Optional[LoftQConfig] = None


-class AutoType(str, Enum):
-    """auto type string configuration subset - used for bf16"""
-
-    AUTO = "auto"
-
-
 class SpecialTokensConfig(BaseModel):
    """Special tokens configuration subset"""

@@ -185,7 +186,8 @@ class LoraConfig(BaseModel):
    peft_layers_to_transform: Optional[List[int]] = None
    peft: Optional[PeftConfig] = None
    peft_use_dora: Optional[bool] = None
-    peft_use_relora: Optional[bool] = None
+    peft_use_rslora: Optional[bool] = None
+    peft_layer_replication: Optional[List[Tuple[int, int]]] = None

    lora_on_cpu: Optional[bool] = None
    gptq: Optional[bool] = None
@@ -241,17 +243,6 @@ class LoraConfig(BaseModel):
                    raise ValueError("Require cfg.load_in_4bit to be True for qlora")
        return self

-    @model_validator(mode="before")
-    @classmethod
-    def validate_quantized_dora(cls, data):
-        if data.get("peft_use_dora") and (
-            data.get("load_in_8bit") or data.get("load_in_4bit")
-        ):
-            raise ValueError(
-                "`peft_use_dora` is not currently compatible with quantized weights."
-            )
-        return data
-

 class ReLoRAConfig(BaseModel):
    """ReLoRA configuration subset"""
@@ -307,12 +298,14 @@ class HyperparametersConfig(BaseModel):
        },
    )

-    train_on_inputs: Optional[bool] = None
+    train_on_inputs: Optional[bool] = False
    group_by_length: Optional[bool] = None

    learning_rate: Union[str, float]
-    weight_decay: Optional[float] = None
-    optimizer: Optional[Union[OptimizerNames, Literal["lion_pytorch"]]] = None
+    weight_decay: Optional[float] = 0.0
+    optimizer: Optional[
+        Union[OptimizerNames, Literal["lion_pytorch"]]
+    ] = OptimizerNames.ADAMW_HF.value
    optim_args: Optional[Union[str, Dict[str, Any]]] = Field(
        default=None, metadata={"help": "Optional arguments to supply to optimizer."}
    )
@@ -323,7 +316,7 @@ class HyperparametersConfig(BaseModel):
        },
    )
    torchdistx_path: Optional[str] = None
-    lr_scheduler: Optional[SchedulerType] = None
+    lr_scheduler: Optional[SchedulerType] = "cosine"
    lr_scheduler_kwargs: Optional[Dict[str, Any]] = None
    lr_quadratic_warmup: Optional[bool] = None
    cosine_min_lr_ratio: Optional[float] = None
@@ -362,6 +355,7 @@ class ModelOutputConfig(BaseModel):
    hub_model_id: Optional[str] = None
    hub_strategy: Optional[str] = None
    save_safetensors: Optional[bool] = None
+    save_only_model: Optional[bool] = None


 class MLFlowConfig(BaseModel):
@@ -373,6 +367,23 @@ class MLFlowConfig(BaseModel):
    hf_mlflow_log_artifacts: Optional[bool] = None


+class LISAConfig(BaseModel):
+    """LISA options"""
+
+    # All three fields are optional; the first two have no default value.
+    lisa_n_layers: Optional[int] = Field(
+        default=None,
+        metadata={"help": "the number of activate layers in LISA"},
+    )
+    lisa_step_interval: Optional[int] = Field(
+        default=None,
+        metadata={"help": "how often to switch layers in LISA"},
+    )
+    # Dotted attribute path, resolved relative to the model object.
+    lisa_layers_attribute: Optional[str] = Field(
+        default="model.layers",
+        metadata={"help": "path under the model to access the layers"},
+    )
+
+
 class WandbConfig(BaseModel):
    """wandb configuration subset"""

@@ -407,6 +418,7 @@ class AxolotlInputConfig(
    HyperparametersConfig,
    WandbConfig,
    MLFlowConfig,
+    LISAConfig,
    RemappedParameters,
    DeprecatedParameters,
    BaseModel,
@@ -433,7 +445,7 @@ class AxolotlInputConfig(
    dataset_shard_idx: Optional[int] = None

    pretraining_dataset: Optional[  # type: ignore
-        conlist(Union[SFTDataset, PretrainingDataset], min_length=1)
+        conlist(Union[PretrainingDataset, SFTDataset], min_length=1)
    ] = Field(
        default=None, metadata={"help": {"streaming dataset to use for pretraining"}}
    )
@@ -473,7 +485,7 @@ class AxolotlInputConfig(
    loss_watchdog_threshold: Optional[float] = None
    loss_watchdog_patience: Optional[int] = None

-    bf16: Optional[Union[AutoType, bool]] = AutoType.AUTO
+    bf16: Optional[Union[Literal["auto"], bool]] = "auto"
    fp16: Optional[bool] = None
    bfloat16: Optional[bool] = None  # for non-AMP cases
    float16: Optional[bool] = None  # for non-AMP cases
@@ -487,11 +499,19 @@ class AxolotlInputConfig(

    unfrozen_parameters: Optional[List[str]] = None

-    sequence_len: int = Field(default=1024)
+    sequence_len: int = Field(default=512)
    sample_packing: Optional[bool] = None
    eval_sample_packing: Optional[bool] = None
    pad_to_sequence_len: Optional[bool] = None

+    pretrain_multipack_buffer_size: Optional[int] = 10_000
+    pretrain_multipack_attn: Optional[bool] = Field(
+        default=True,
+        metadata={
+            "help": "whether to prevent cross attention for packed sequences during pretraining",
+        },
+    )
+
    xformers_attention: Optional[bool] = None
    sdp_attention: Optional[bool] = None
    s2_attention: Optional[bool] = None
@@ -536,6 +556,7 @@ class AxolotlInputConfig(
        Dict[Union[int, Literal["cpu", "disk"]], Union[int, str]]
    ] = None
    gpu_memory_limit: Optional[Union[int, str]] = None
+    low_cpu_mem_usage: Optional[bool] = None

    chat_template: Optional[ChatTemplate] = None
    default_system_message: Optional[str] = None
@@ -548,10 +569,10 @@ class AxolotlInputConfig(
    sample_packing_eff_est: Optional[float] = None
    axolotl_config_path: Optional[str] = None

-    is_falcon_derived_model: Optional[bool] = Field(default=False)
-    is_llama_derived_model: Optional[bool] = Field(default=False)
-    is_mistral_derived_model: Optional[bool] = Field(default=False)
-    is_qwen_derived_model: Optional[bool] = Field(default=False)
+    is_falcon_derived_model: Optional[bool] = Field(default=None)
+    is_llama_derived_model: Optional[bool] = Field(default=None)
+    is_mistral_derived_model: Optional[bool] = Field(default=None)
+    is_qwen_derived_model: Optional[bool] = Field(default=None)

    @field_validator("datasets", mode="before")
    @classmethod
@@ -626,6 +647,20 @@ class AxolotlInputConfig(

        return data

+    @model_validator(mode="before")
+    @classmethod
+    def check_sample_packing_wo_flash(cls, data):
+        """Warn, without failing validation, when sample packing is enabled but neither flash nor SDP attention is."""
+        if not data.get("sample_packing"):
+            return data
+
+        attention_enabled = data.get("flash_attention") or data.get("sdp_attention")
+        if not attention_enabled:
+            LOG.warning(
+                "sample_packing without flash_attention or sdp_attention does not handle cross-attention."
+            )
+        return data
+
    @model_validator(mode="before")
    @classmethod
    def check_sample_packing_w_rl(cls, data):
--- a/src/axolotl/utils/data/init.py
+++ b/src/axolotl/utils/data/init.py
@@ -0,0 +1,15 @@
+"""
+Data processing modules
+"""
+from axolotl.utils.data.dpo import load_prepare_dpo_datasets  # noqa: F401
+from axolotl.utils.data.pretraining import (  # noqa: F401
+    encode_pretraining,
+    wrap_pretraining_dataset,
+)
+from axolotl.utils.data.sft import (  # noqa: F401
+    get_dataset_wrapper,
+    load_prepare_datasets,
+    load_tokenized_prepared_datasets,
+    prepare_dataset,
+)
+from axolotl.utils.data.utils import md5  # noqa: F401
--- a/src/axolotl/utils/data/dpo.py
+++ b/src/axolotl/utils/data/dpo.py
@@ -0,0 +1,114 @@
+"""data handling specific to DPO"""
+
+import logging
+from pathlib import Path
+from typing import Any, List
+
+import yaml
+from datasets import concatenate_datasets, load_dataset, load_from_disk
+
+from axolotl.common.const import DEFAULT_DATASET_PREPARED_PATH
+from axolotl.prompt_strategies.dpo import load as load_dpo
+from axolotl.utils.data.utils import md5
+from axolotl.utils.dict import DictDefault
+from axolotl.utils.distributed import is_main_process, zero_first
+
+LOG = logging.getLogger("axolotl")
+
+
+def _get_path(ds_hash, cfg):
+    """
+    Return the directory for the prepared dataset identified by `ds_hash`.
+
+    Uses `cfg.dataset_prepared_path` when it is set (truthy), otherwise falls
+    back to `DEFAULT_DATASET_PREPARED_PATH`.
+    """
+    base_dir = cfg.dataset_prepared_path or DEFAULT_DATASET_PREPARED_PATH
+    return Path(base_dir) / ds_hash
+
+
+def _load_preprocessed_ds(cfg, sub_cfg):
+    """
+    Load a previously prepared dataset from disk, if one can be used.
+
+    The on-disk location is derived from an md5 of the YAML dump of `sub_cfg`.
+    Returns the loaded dataset, or None when `cfg.dataset_prepared_path` is
+    unset, the directory has no entries, or `cfg.is_preprocess` is set.
+    """
+    prepared_ds_path = _get_path(md5(yaml.dump(sub_cfg, Dumper=yaml.Dumper)), cfg)
+
+    # pylint: disable=duplicate-code
+    cache_usable = (
+        cfg.dataset_prepared_path
+        and any(prepared_ds_path.glob("*"))
+        and not cfg.is_preprocess
+    )
+    if not cache_usable:
+        return None
+
+    LOG.info(f"Loading prepared dataset from disk at {prepared_ds_path}...")
+    return load_from_disk(str(prepared_ds_path))
+
+
+def _save_preprocessed_ds(cfg, sub_cfg, dataset):
+    """
+    Persist `dataset` to the prepared-dataset directory derived from `sub_cfg`.
+
+    The target path is keyed by an md5 of the YAML dump of `sub_cfg`. Nothing
+    is written unless `cfg.is_preprocess` is set and this is the main process.
+    """
+    ds_hash = md5(yaml.dump(sub_cfg, Dumper=yaml.Dumper))
+    prepared_ds_path = _get_path(ds_hash, cfg)
+
+    if cfg.is_preprocess and is_main_process():
+        # this branch writes the dataset, so the message must say "saving"
+        LOG.info(f"Saving prepared dataset to disk at {prepared_ds_path}...")
+        dataset.save_to_disk(str(prepared_ds_path))
+
+
+def load_prepare_dpo_datasets(cfg):
+    """
+    Load the DPO train dataset and, when `cfg.test_datasets` is set, the eval
+    dataset.
+
+    For each of the two, a previously prepared copy is loaded from disk when
+    available; otherwise every configured dataset is loaded, optionally mapped
+    through the transform selected by its `type`, and concatenated. Freshly
+    built datasets are handed to `_save_preprocessed_ds`.
+
+    Returns a `(train_dataset, eval_dataset)` tuple; `eval_dataset` is None when
+    no test datasets are configured or the result is empty.
+    """
+
+    def load_split(dataset_cfgs, _cfg):
+        # Keep each loaded dataset paired with the index of the config entry
+        # it came from. A json entry can yield several datasets (one per data
+        # file), so positions in the loaded list do not map 1:1 onto
+        # `dataset_cfgs`.
+        loaded: List[Any] = []
+        for cfg_idx, ds_cfg in enumerate(dataset_cfgs):
+            if ds_cfg["ds_type"] == "json":
+                for data_file in ds_cfg["data_files"]:
+                    data_files = {ds_cfg["split"]: data_file}
+                    ds = load_dataset(  # pylint: disable=invalid-name
+                        "json",
+                        data_files=data_files,
+                        split=ds_cfg["split"],
+                    )
+                    loaded.append((cfg_idx, ds))
+            else:
+                ds = load_dataset(  # pylint: disable=invalid-name
+                    ds_cfg["path"],
+                    split=ds_cfg["split"],
+                )
+                loaded.append((cfg_idx, ds))
+
+        split_datasets: List[Any] = []
+        for cfg_idx, data_set in loaded:
+            _type = dataset_cfgs[cfg_idx]["type"]
+            if _type:
+                if isinstance(_type, DictDefault):
+                    _type = "user_defined.default"
+                # NOTE(review): `dataset_idx` is assumed to index into the
+                # dataset config list; confirm against `load_dpo`.
+                ds_transform_fn = load_dpo(_type, _cfg, dataset_idx=cfg_idx)
+                data_set = data_set.map(
+                    ds_transform_fn,
+                    desc="Mapping RL Dataset",
+                )
+            # If no `type` is provided, assume the dataset is already in the expected format with
+            # "prompt", "chosen" and "rejected" already preprocessed
+            split_datasets.append(data_set)
+
+        return concatenate_datasets(split_datasets)
+
+    with zero_first(is_main_process()):
+        train_is_preprocessed = False
+        eval_is_preprocessed = False
+        if train_dataset := _load_preprocessed_ds(cfg, cfg.datasets):
+            train_is_preprocessed = True
+        else:
+            train_dataset = load_split(cfg.datasets, cfg)
+
+        eval_dataset = None
+        if cfg.test_datasets:
+            if eval_dataset := _load_preprocessed_ds(cfg, cfg.test_datasets):
+                eval_is_preprocessed = True
+            else:
+                eval_dataset = load_split(cfg.test_datasets, cfg)
+        if not eval_dataset:
+            eval_dataset = None
+
+        if not train_is_preprocessed:
+            _save_preprocessed_ds(cfg, cfg.datasets, train_dataset)
+        if eval_dataset and not eval_is_preprocessed:
+            _save_preprocessed_ds(cfg, cfg.test_datasets, eval_dataset)
+
+    return train_dataset, eval_dataset
--- a/src/axolotl/utils/data/pretraining.py
+++ b/src/axolotl/utils/data/pretraining.py
@@ -0,0 +1,232 @@
+"""data handling specific to pretraining"""
+
+import functools
+import logging
+from collections import defaultdict
+from typing import Callable, Dict, List, Optional
+
+import torch
+from datasets import Dataset
+from torch.utils.data import RandomSampler
+from transformers import PreTrainedTokenizerBase
+
+from axolotl.utils.collators import PretrainingBatchSamplerDataCollatorForSeq2Seq
+from axolotl.utils.samplers import MultipackBatchSampler, get_dataset_lengths
+from axolotl.utils.trainer import process_pretraining_datasets_for_packing
+
+LOG = logging.getLogger("axolotl")
+
+
+def encode_pretraining(
+    tokenizer: PreTrainedTokenizerBase, max_tokens: int, examples: List[str]
+) -> Dict[str, List]:
+    """
+    Tokenize `examples` and greedily concatenate them into rows of `max_tokens`.
+
+    Each example is tokenized with truncation to `max_tokens - 2`, then has an
+    EOS token (mask 1) and a PAD token (mask 0) appended. Examples are packed
+    in order; when the next example does not fit, the current row is
+    right-padded with the pad token (mask 0) and emitted. A trailing partial
+    row is padded the same way.
+
+    Returns a dict with `input_ids`, `labels` (same values as `input_ids`) and
+    `attention_mask`, each a list of token lists.
+    """
+    res = tokenizer(
+        examples,
+        truncation=True,
+        max_length=max_tokens - 2,
+        add_special_tokens=True,
+    )
+
+    def _new_buffer():
+        # empty long tensor used as the start of every packed row
+        return torch.tensor([], dtype=torch.long)
+
+    def _pad_to_max_tokens(buffer, fill_value):
+        # right-pad `buffer` with `fill_value` up to `max_tokens` elements
+        padding = torch.full(
+            (max_tokens - buffer.numel(),), fill_value, dtype=torch.long
+        )
+        return torch.cat((buffer, padding), dim=0)
+
+    # Append EOS and PAD tokens to input_ids, and the matching [1, 0] to the mask
+    input_ids = [
+        torch.cat(
+            (
+                torch.tensor(seq),
+                torch.tensor([tokenizer.eos_token_id, tokenizer.pad_token_id]),
+            ),
+            dim=0,
+        )
+        for seq in res["input_ids"]
+    ]
+    attention_mask = [
+        torch.cat((torch.tensor(seq), torch.tensor([1, 0])), dim=0)
+        for seq in res["attention_mask"]
+    ]
+
+    new_input_ids = []
+    new_attention_mask = []
+    buffer_input_ids = _new_buffer()
+    buffer_attention_mask = _new_buffer()
+
+    for ids, mask in zip(input_ids, attention_mask):
+        row_is_full = buffer_input_ids.numel() == max_tokens
+        if not row_is_full and buffer_input_ids.numel() + ids.numel() <= max_tokens:
+            # still room in the current row: keep accumulating
+            buffer_input_ids = torch.cat((buffer_input_ids, ids), dim=0)
+            buffer_attention_mask = torch.cat((buffer_attention_mask, mask), dim=0)
+            continue
+
+        if not row_is_full:
+            # the next example does not fit: pad the row out before emitting it
+            buffer_input_ids = _pad_to_max_tokens(
+                buffer_input_ids, tokenizer.pad_token_id
+            )
+            buffer_attention_mask = _pad_to_max_tokens(buffer_attention_mask, 0)
+
+        new_input_ids.append(buffer_input_ids)
+        new_attention_mask.append(buffer_attention_mask)
+
+        # start the next row with the example that did not fit
+        buffer_input_ids = torch.cat((_new_buffer(), ids), dim=0)
+        buffer_attention_mask = torch.cat((_new_buffer(), mask), dim=0)
+
+    if buffer_input_ids.numel() > 0:  # for any leftover tokens
+        if buffer_input_ids.numel() < max_tokens:  # make all sequences equal in size
+            buffer_input_ids = _pad_to_max_tokens(
+                buffer_input_ids, tokenizer.pad_token_id
+            )
+            buffer_attention_mask = _pad_to_max_tokens(buffer_attention_mask, 0)
+        new_input_ids.append(buffer_input_ids)
+        new_attention_mask.append(buffer_attention_mask)
+
+    ret = {
+        "input_ids": [seq.tolist() for seq in new_input_ids],
+        "labels": [seq.tolist() for seq in new_input_ids],
+        "attention_mask": [seq.tolist() for seq in new_attention_mask],
+    }
+
+    LOG.debug(len(ret["input_ids"]))
+    return ret
+
+
+def wrap_pretraining_dataset(
+    dataset,
+    tokenizer,
+    cfg,
+    ds_wrapper_fn,
+    max_tokens=2048,
+    batch_size=1,
+    seed=42,
+    buffer_size=10_000,
+):
+    """
+    Map `dataset` through the pretraining encoder selected by `cfg`.
+
+    With `cfg.sample_packing` set, batches are encoded by
+    `encode_packed_pretraining` and `cfg.micro_batch_size` is overwritten to 1;
+    otherwise `encode_pretraining` is used. The dataset is shuffled first when
+    `cfg.shuffle_merged_datasets` is set. All pre-existing columns are removed
+    by the map call.
+    """
+    if not cfg.sample_packing:
+        encode = functools.partial(encode_pretraining, tokenizer, max_tokens)
+    else:
+        packing_collator = PretrainingBatchSamplerDataCollatorForSeq2Seq(
+            tokenizer,
+            return_tensors="pt",
+            padding=True,
+            pad_to_multiple_of=max_tokens * batch_size,
+            multipack_attn=cfg.pretrain_multipack_attn,
+        )
+        encode = functools.partial(
+            encode_packed_pretraining,
+            packing_collator,
+            ds_wrapper_fn,
+            max_seq_length=max_tokens,
+            batch_size=batch_size,
+            multipack_attn=cfg.pretrain_multipack_attn,
+        )
+        # set this to 1 so downstream data_loader doesn't try to increase the batch again
+        cfg.micro_batch_size = 1
+
+    if not cfg.shuffle_merged_datasets:
+        LOG.debug("NOT shuffling merged pretraining datasets")
+    else:
+        dataset = dataset.shuffle(seed=seed, buffer_size=buffer_size)
+
+    # remove all the existing columns after mapping since they end up having
+    # a different length than the encoded/tokenized column
+    # this is empty during streaming/pretraining
+    if dataset.features is not None:
+        remove_columns = dataset.features.keys()
+    else:
+        # no declared features: take the column names from the first row, if any
+        remove_columns = []
+        for first_row in dataset:
+            remove_columns = first_row.keys()
+            break
+
+    return dataset.map(
+        encode,
+        batched=True,
+        batch_size=buffer_size,
+        # input_columns="text",
+        remove_columns=remove_columns,
+    )
+
+
+def encode_packed_pretraining(
+    collate_fn,
+    ds_wrapper: Callable,
+    examples: Dict[str, List],
+    max_seq_length: int = 2048,
+    batch_size: int = 4,
+    multipack_attn: Optional[bool] = False,
+) -> Dict[str, List]:
+    """
+    Tokenize a batch of raw examples and pack them into multipack rows.
+
+    `examples` is wrapped into a `Dataset`, tokenized via `ds_wrapper`, and run
+    through `process_pretraining_datasets_for_packing` (position ids are
+    skipped when `multipack_attn` is falsy). A `MultipackBatchSampler` with a
+    budget of `batch_size * max_seq_length` tokens groups the rows; each group
+    is collated with `collate_fn`.
+
+    Returns a mapping from feature name to the list of collated values, one
+    entry per packed group. The `length` feature is never included.
+    """
+    # pylint: disable=duplicate-code
+    # tokenize all the examples
+    # rows get split with stride (overlap)
+    train_dataset = ds_wrapper(Dataset.from_dict(examples))[0]
+
+    train_dataset = process_pretraining_datasets_for_packing(
+        train_dataset,
+        max_seq_length,
+        skip_position_ids=not multipack_attn,
+    )
+
+    sampler = MultipackBatchSampler(
+        RandomSampler(train_dataset),
+        batch_size=1,
+        drop_last=True,
+        batch_max_len=batch_size * max_seq_length,
+        lengths=get_dataset_lengths(train_dataset),
+    )
+
+    chunked_data = defaultdict(list)
+
+    for batch in sampler:
+        for data in batch:
+            features = train_dataset[data]
+            # drop tokenizer bookkeeping columns before collating
+            for unwanted in ("num_truncated_tokens", "overflow_to_sample_mapping"):
+                if unwanted in features:
+                    del features[unwanted]
+            if "labels" not in features:
+                features["labels"] = features["input_ids"].copy()
+            collated_features = collate_fn(features)
+
+            for feature in features.keys():
+                if feature == "length":
+                    continue
+                chunked_data[feature].append(collated_features[feature].squeeze(0))
+
+    return chunked_data
--- a/src/axolotl/utils/data/sft.py
+++ b/src/axolotl/utils/data/sft.py
@@ -1,13 +1,10 @@
-"""Module containing data utilities"""
-import functools
-import hashlib
-import logging
-from collections import defaultdict
-from pathlib import Path
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+"""data handling specific to SFT"""
+
+import functools
+import logging
+from pathlib import Path
+from typing import List, Optional, Tuple, Union

-import torch
-import yaml
 from datasets import (
    Dataset,
    DatasetDict,
@@ -17,13 +14,11 @@ from datasets import (
 )
 from huggingface_hub import hf_hub_download
 from huggingface_hub.utils import HFValidationError
-from torch.utils.data import RandomSampler
 from transformers import PreTrainedTokenizerBase

 from axolotl.common.const import DEFAULT_DATASET_PREPARED_PATH
 from axolotl.datasets import TokenizedPromptDataset
 from axolotl.prompt_strategies import load
-from axolotl.prompt_strategies.dpo import load as load_dpo
 from axolotl.prompt_tokenizers import (
    AlpacaMultipleChoicePromptTokenizingStrategy,
    AlpacaPromptTokenizingStrategy,
@@ -44,26 +39,18 @@ from axolotl.prompters import (
    SummarizeTLDRPrompter,
    UnsupportedPrompter,
 )
-from axolotl.utils.collators import PretrainingBatchSamplerDataCollatorForSeq2Seq
+from axolotl.utils.data.pretraining import wrap_pretraining_dataset
+from axolotl.utils.data.utils import md5
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.distributed import is_main_process, zero_first
-from axolotl.utils.samplers import MultipackBatchSampler, get_dataset_lengths
 from axolotl.utils.trainer import (
    calculate_total_num_steps,
    process_datasets_for_packing,
-    process_pretraining_datasets_for_packing,
 )

 LOG = logging.getLogger("axolotl")


-def md5(to_hash: str, encoding: str = "utf-8") -> str:
-    try:
-        return hashlib.md5(to_hash.encode(encoding), usedforsecurity=False).hexdigest()
-    except TypeError:
-        return hashlib.md5(to_hash.encode(encoding)).hexdigest()  # nosec
-
-
 def prepare_dataset(cfg, tokenizer):
    prompters = []
    if not cfg.pretraining_dataset:
@@ -81,12 +68,15 @@ def prepare_dataset(cfg, tokenizer):
                )
    else:
        path = cfg.pretraining_dataset
+        split = "train"
        name = None
        if isinstance(cfg.pretraining_dataset, list) and isinstance(
            cfg.pretraining_dataset[0], dict
        ):
            path = cfg.pretraining_dataset[0]["path"]
            name = cfg.pretraining_dataset[0]["name"]
+            if "split" in cfg.pretraining_dataset[0]:
+                split = cfg.pretraining_dataset[0]["split"]

        ds_wrapper_partial = functools.partial(
            get_dataset_wrapper,
@@ -97,13 +87,14 @@ def prepare_dataset(cfg, tokenizer):
        )

        train_dataset = wrap_pretraining_dataset(
-            load_dataset(path, streaming=True, split="train", name=name),
+            load_dataset(path, streaming=True, split=split, name=name),
            tokenizer,
            cfg,
            ds_wrapper_partial,
            max_tokens=cfg.sequence_len,
            batch_size=cfg.micro_batch_size,
            seed=cfg.seed or 42,
+            buffer_size=cfg.pretrain_multipack_buffer_size or 10_000,
        )
        # https://discuss.huggingface.co/t/how-to-use-huggingface-trainer-streaming-datasets-without-wrapping-it-with-torchdatas-iterablewrapper/25230
        train_dataset = train_dataset.with_format("torch")
@@ -177,6 +168,7 @@ def load_tokenized_prepared_datasets(
    except Exception:  # pylint: disable=broad-except # nosec
        pass

+    # pylint: disable=duplicate-code
    if dataset:
        ...
    elif (
@@ -223,7 +215,7 @@ def load_tokenized_prepared_datasets(
                    token=use_auth_token,
                )
                ds_from_hub = True
-            except (FileNotFoundError, ConnectionError, HFValidationError):
+            except (FileNotFoundError, ConnectionError, HFValidationError, ValueError):
                pass

            ds_from_cloud = False
@@ -290,14 +282,17 @@ def load_tokenized_prepared_datasets(
            local_path = Path(config_dataset.path)
            if local_path.exists():
                if local_path.is_dir():
-                    # TODO dirs with arrow or parquet files could be loaded with `load_from_disk`
-                    ds = load_dataset(
-                        config_dataset.path,
-                        name=config_dataset.name,
-                        data_files=config_dataset.data_files,
-                        streaming=False,
-                        split=None,
-                    )
+                    if config_dataset.data_files:
+                        ds_type = get_ds_type(config_dataset)
+                        ds = load_dataset(
+                            ds_type,
+                            name=config_dataset.name,
+                            data_files=config_dataset.data_files,
+                            streaming=False,
+                            split=None,
+                        )
+                    else:
+                        ds = load_from_disk(config_dataset.path)
                elif local_path.is_file():
                    ds_type = get_ds_type(config_dataset)

@@ -384,14 +379,15 @@ def load_tokenized_prepared_datasets(
                d_base_type = d_type_split[0]
                d_prompt_style = d_type_split[1] if len(d_type_split) > 1 else None

-            if config_dataset.split and config_dataset.split in ds:
-                ds = ds[config_dataset.split]
-            elif split in ds:
-                ds = ds[split]
-            elif isinstance(ds, DatasetDict):
-                raise ValueError(
-                    f"no {split} split found for dataset {config_dataset.path}, you may specify a split with 'split: `"
-                )
+            if isinstance(ds, DatasetDict):
+                if config_dataset.split and config_dataset.split in ds:
+                    ds = ds[config_dataset.split]
+                elif split in ds:
+                    ds = ds[split]
+                else:
+                    raise ValueError(
+                        f"no {split} split found for dataset {config_dataset.path}, you may specify a split with 'split: `"
+                    )

            # support for using a subset of the data
            if config_dataset.shards:
@@ -683,301 +679,3 @@ def get_dataset_wrapper(
        )

    return dataset_wrapper, dataset_prompter
-
-
-def encode_pretraining(
-    tokenizer: PreTrainedTokenizerBase, max_tokens: int, examples: List[str]
-) -> Dict[str, List]:
-    res = tokenizer(
-        examples,
-        truncation=True,
-        max_length=max_tokens - 2,
-        add_special_tokens=True,
-    )
-    # Convert to PyTorch tensors
-    input_ids = [torch.tensor(seq) for seq in res["input_ids"]]
-    attention_mask = [torch.tensor(seq) for seq in res["attention_mask"]]
-    new_input_ids = []
-    new_attention_mask = []
-    # Append EOS and PAD tokens to input_ids, and correct attention_mask
-    for i, _ in enumerate(input_ids):
-        input_ids[i] = torch.cat(
-            (
-                input_ids[i],
-                torch.tensor([tokenizer.eos_token_id, tokenizer.pad_token_id]),
-            ),
-            dim=0,
-        )
-        attention_mask[i] = torch.cat((attention_mask[i], torch.tensor([1, 0])), dim=0)
-
-    # Concatenate tokens so that their lengths are less than max_tokens
-    buffer_input_ids = torch.tensor([], dtype=torch.long)
-    buffer_attention_mask = torch.tensor([], dtype=torch.long)
-
-    for ids, mask in zip(input_ids, attention_mask):
-        if buffer_input_ids.numel() == max_tokens:
-            new_input_ids.append(buffer_input_ids)
-            new_attention_mask.append(buffer_attention_mask)
-            buffer_input_ids = torch.tensor([], dtype=torch.long)
-            buffer_attention_mask = torch.tensor([], dtype=torch.long)
-            buffer_input_ids = torch.cat((buffer_input_ids, ids), dim=0)
-            buffer_attention_mask = torch.cat((buffer_attention_mask, mask), dim=0)
-        elif buffer_input_ids.numel() + ids.numel() <= max_tokens:
-            buffer_input_ids = torch.cat((buffer_input_ids, ids), dim=0)
-            buffer_attention_mask = torch.cat((buffer_attention_mask, mask), dim=0)
-        else:
-            buffer_input_ids = torch.cat(
-                (
-                    buffer_input_ids,
-                    torch.full(
-                        (max_tokens - buffer_input_ids.numel(),),
-                        tokenizer.pad_token_id,
-                        dtype=torch.long,
-                    ),
-                ),
-                dim=0,
-            )
-            buffer_attention_mask = torch.cat(
-                (
-                    buffer_attention_mask,
-                    torch.full(
-                        (max_tokens - buffer_attention_mask.numel(),),
-                        0,
-                        dtype=torch.long,
-                    ),
-                ),
-                dim=0,
-            )
-            new_input_ids.append(buffer_input_ids)
-            new_attention_mask.append(buffer_attention_mask)
-            buffer_input_ids = torch.tensor([], dtype=torch.long)
-            buffer_attention_mask = torch.tensor([], dtype=torch.long)
-
-            buffer_input_ids = torch.cat((buffer_input_ids, ids), dim=0)
-            buffer_attention_mask = torch.cat((buffer_attention_mask, mask), dim=0)
-
-    if buffer_input_ids.numel() > 0:  # for any leftover tokens
-        while buffer_input_ids.numel() < max_tokens:  # make all sequences equal in size
-            buffer_input_ids = torch.cat(
-                (
-                    buffer_input_ids,
-                    torch.full(
-                        (max_tokens - buffer_input_ids.numel(),),
-                        tokenizer.pad_token_id,
-                        dtype=torch.long,
-                    ),
-                ),
-                dim=0,
-            )
-            buffer_attention_mask = torch.cat(
-                (
-                    buffer_attention_mask,
-                    torch.full(
-                        (max_tokens - buffer_attention_mask.numel(),),
-                        0,
-                        dtype=torch.long,
-                    ),
-                ),
-                dim=0,
-            )
-        new_input_ids.append(buffer_input_ids)
-        new_attention_mask.append(buffer_attention_mask)
-
-    ret = {
-        "input_ids": [seq.tolist() for seq in new_input_ids],
-        "labels": [seq.tolist() for seq in new_input_ids],
-        "attention_mask": [seq.tolist() for seq in new_attention_mask],
-    }
-
-    LOG.debug(len(ret["input_ids"]))
-    return ret
-
-
-def wrap_pretraining_dataset(
-    dataset,
-    tokenizer,
-    cfg,
-    ds_wrapper_fn,
-    max_tokens=2048,
-    batch_size=1,
-    seed=42,
-    buffer_size=10_000,
-):
-    if cfg.sample_packing:
-        collate_fn = PretrainingBatchSamplerDataCollatorForSeq2Seq(
-            tokenizer,
-            return_tensors="pt",
-            padding=True,
-            pad_to_multiple_of=max_tokens * batch_size,
-        )
-        encode = functools.partial(
-            encode_packed_pretraining,
-            collate_fn,
-            ds_wrapper_fn,
-            max_seq_length=max_tokens,
-            batch_size=batch_size,
-        )
-        # set this to 1 so downstream data_loader doesn't try to increase the batch again
-        cfg.micro_batch_size = 1
-    else:
-        encode = functools.partial(encode_pretraining, tokenizer, max_tokens)
-
-    if cfg.shuffle_merged_datasets:
-        dataset = dataset.shuffle(seed=seed, buffer_size=buffer_size)
-    else:
-        LOG.debug("NOT shuffling merged pretraining datasets")
-
-    dataset = dataset.map(
-        encode,
-        batched=True,
-        batch_size=buffer_size,
-        # input_columns="text",
-        # remove all the existing columns after mapping since they end up having
-        # a different length than the encoded/tokenized column
-        remove_columns=dataset.features.keys(),
-    )
-    return dataset
-
-
-def encode_packed_pretraining(
-    collate_fn,
-    ds_wrapper: Callable,
-    examples: Dict[str, List],
-    max_seq_length: int = 2048,
-    batch_size: int = 4,
-) -> Dict[str, List]:
-    # pylint: disable=duplicate-code
-    # tokenize all the examples
-    # rows get split with stride (overlap)
-    train_dataset = ds_wrapper(Dataset.from_dict(examples))[0]
-
-    train_dataset = process_pretraining_datasets_for_packing(
-        train_dataset, max_seq_length
-    )
-
-    sampler = MultipackBatchSampler(
-        RandomSampler(train_dataset),
-        batch_size=1,
-        drop_last=True,
-        batch_max_len=batch_size * max_seq_length,
-        lengths=get_dataset_lengths(train_dataset),
-    )
-
-    chunked_data = defaultdict(list)
-
-    for batch in sampler:
-        for data in batch:
-            features = train_dataset[data]
-            if "num_truncated_tokens" in features:
-                del features["num_truncated_tokens"]
-            if "num_truncated_tokens" in features:
-                del features["num_truncated_tokens"]
-            if "overflow_to_sample_mapping" in features:
-                del features["overflow_to_sample_mapping"]
-            if "labels" not in features:
-                features["labels"] = features["input_ids"].copy()
-            collated_features = collate_fn(features)
-
-            for feature in features.keys():
-                if feature == "length":
-                    continue
-                chunked_data[feature].append(collated_features[feature].squeeze(0))
-
-    return chunked_data
-
-
-def _get_path(ds_hash, cfg):
-    prepared_ds_path = (
-        Path(cfg.dataset_prepared_path) / ds_hash
-        if cfg.dataset_prepared_path
-        else Path(DEFAULT_DATASET_PREPARED_PATH) / ds_hash
-    )
-
-    return prepared_ds_path
-
-
-def _load_preprocessed_ds(cfg, sub_cfg):
-    ds_hash = md5(yaml.dump(sub_cfg, Dumper=yaml.Dumper))
-    prepared_ds_path = _get_path(ds_hash, cfg)
-    dataset = None
-
-    if (
-        cfg.dataset_prepared_path
-        and any(prepared_ds_path.glob("*"))
-        and not cfg.is_preprocess
-    ):
-        LOG.info(f"Loading prepared dataset from disk at {prepared_ds_path}...")
-        dataset = load_from_disk(str(prepared_ds_path))
-
-    return dataset
-
-
-def _save_preprocessed_ds(cfg, sub_cfg, dataset):
-    ds_hash = md5(yaml.dump(sub_cfg, Dumper=yaml.Dumper))
-    prepared_ds_path = _get_path(ds_hash, cfg)
-
-    if cfg.is_preprocess and is_main_process():
-        LOG.info(f"Loading prepared dataset from disk at {prepared_ds_path}...")
-        dataset.save_to_disk(str(prepared_ds_path))
-
-
-def load_prepare_dpo_datasets(cfg):
-    def load_split(dataset_cfgs, _cfg):
-        split_datasets: List[Any] = []
-        for i, ds_cfg in enumerate(dataset_cfgs):
-            if ds_cfg["ds_type"] == "json":
-                for data_file in ds_cfg["data_files"]:
-                    data_files = {ds_cfg["split"]: data_file}
-                    ds = load_dataset(  # pylint: disable=invalid-name
-                        "json",
-                        data_files=data_files,
-                        split=ds_cfg["split"],
-                    )
-                    split_datasets.insert(i, ds)
-            else:
-                ds = load_dataset(  # pylint: disable=invalid-name
-                    ds_cfg["path"],
-                    split=ds_cfg["split"],
-                )
-                split_datasets.insert(i, ds)
-
-        for i, data_set in enumerate(split_datasets):
-            _type = dataset_cfgs[i]["type"]
-            if _type:
-                if isinstance(_type, DictDefault):
-                    _type = "user_defined.default"
-                ds_transform_fn = load_dpo(_type, _cfg, dataset_idx=i)
-                split_datasets[i] = data_set.map(
-                    ds_transform_fn,
-                    desc="Mapping RL Dataset",
-                )
-            else:
-                # If no `type` is provided, assume the dataset is already in the expected format with
-                # "prompt", "chosen" and "rejected" already preprocessed
-                split_datasets[i] = data_set
-
-        return concatenate_datasets(split_datasets)
-
-    with zero_first(is_main_process()):
-        train_is_preprocessed = False
-        eval_is_preprocessed = False
-        if train_dataset := _load_preprocessed_ds(cfg, cfg.datasets):
-            train_is_preprocessed = True
-        else:
-            train_dataset = load_split(cfg.datasets, cfg)
-
-        eval_dataset = None
-        if cfg.test_datasets:
-            if eval_dataset := _load_preprocessed_ds(cfg, cfg.test_datasets):
-                eval_is_preprocessed = True
-            else:
-                eval_dataset = load_split(cfg.test_datasets, cfg)
-        if not eval_dataset:
-            eval_dataset = None
-
-        if not train_is_preprocessed:
-            _save_preprocessed_ds(cfg, cfg.datasets, train_dataset)
-        if eval_dataset and not eval_is_preprocessed:
-            _save_preprocessed_ds(cfg, cfg.test_datasets, eval_dataset)
-
-    return train_dataset, eval_dataset
--- a/src/axolotl/utils/data/utils.py
+++ b/src/axolotl/utils/data/utils.py
@@ -0,0 +1,15 @@
+"""data handling helpers"""
+
+import hashlib
+
+
+def md5(to_hash: str, encoding: str = "utf-8") -> str:
+    """Return the hex MD5 digest of ``to_hash`` encoded with ``encoding``."""
+    payload = to_hash.encode(encoding)
+    try:
+        # flag the digest as non-security use where the runtime supports it
+        hasher = hashlib.md5(payload, usedforsecurity=False)
+    except TypeError:
+        # this hashlib.md5 does not accept the usedforsecurity keyword
+        hasher = hashlib.md5(payload)  # nosec
+    return hasher.hexdigest()
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -43,6 +43,7 @@ from axolotl.prompt_tokenizers import LLAMA_DEFAULT_EOS_TOKEN
 from axolotl.utils.bench import log_gpu_memory_usage
 from axolotl.utils.chat_templates import chat_templates
 from axolotl.utils.dict import DictDefault
+from axolotl.utils.distributed import zero_only
 from axolotl.utils.lora_embeddings import get_linear_embedding_layers

 LOG = logging.getLogger("axolotl")
@@ -247,10 +248,11 @@ def load_tokenizer(cfg):
            {"additional_special_tokens": additional_special_tokens}
        )

-    LOG.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}")
-    LOG.debug(f"BOS: {tokenizer.bos_token_id} / {tokenizer.bos_token}")
-    LOG.debug(f"PAD: {tokenizer.pad_token_id} / {tokenizer.pad_token}")
-    LOG.debug(f"UNK: {tokenizer.unk_token_id} / {tokenizer.unk_token}")
+    with zero_only():
+        LOG.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}")
+        LOG.debug(f"BOS: {tokenizer.bos_token_id} / {tokenizer.bos_token}")
+        LOG.debug(f"PAD: {tokenizer.pad_token_id} / {tokenizer.pad_token}")
+        LOG.debug(f"UNK: {tokenizer.unk_token_id} / {tokenizer.unk_token}")

    if cfg.chat_template:
        chat_template_string = chat_templates(cfg.chat_template)
@@ -402,7 +404,9 @@ def load_model(
        from accelerate import infer_auto_device_map

        with init_empty_weights():
-            model_canvas = AutoModelForCausalLM.from_config(model_config)
+            model_canvas = AutoModelForCausalLM.from_config(
+                model_config, trust_remote_code=cfg.trust_remote_code or False
+            )
        model_canvas.tie_weights()
        device_map = infer_auto_device_map(
            model_canvas,
@@ -433,6 +437,7 @@ def load_model(

    if cfg.revision_of_model:
        model_kwargs["revision"] = cfg.revision_of_model
+
    if cfg.gptq:
        if not hasattr(model_config, "quantization_config"):
            LOG.warning("model config does not contain quantization_config information")
@@ -454,6 +459,10 @@ def load_model(
            "bnb_4bit_quant_type": "nf4",
            "bnb_4bit_quant_storage": torch.bfloat16,
        }
+        if not cfg.deepspeed:
+            # for some reason, this causes the loss to be off by an order of magnitude
+            # but deepspeed needs this still in bfloat16
+            bnb_config["bnb_4bit_quant_storage"] = torch.float32

        if cfg.bnb_config_kwargs:
            bnb_config.update(cfg.bnb_config_kwargs)
@@ -502,6 +511,9 @@ def load_model(
        model_kwargs["attn_implementation"] = "eager"
        model_config._attn_implementation = "eager"  # pylint: disable=protected-access

+    if cfg.low_cpu_mem_usage:
+        model_kwargs["low_cpu_mem_usage"] = True
+
    qlora_fsdp = cfg.fsdp and cfg.adapter == "qlora"

    try:
@@ -849,7 +861,9 @@ def load_lora(model, cfg, inference=False, config_only=False):
    if cfg.peft_use_dora:
        lora_config_kwargs["use_dora"] = cfg.peft_use_dora
    if cfg.peft_use_rslora:
-        lora_config_kwargs["use_rslora"] = cfg.use_rslora
+        lora_config_kwargs["use_rslora"] = cfg.peft_use_rslora
+    if cfg.peft_layer_replication:
+        lora_config_kwargs["layer_replication"] = cfg.peft_layer_replication

    lora_config = LoraConfig(
        r=cfg.lora_r,
@@ -888,7 +902,12 @@ def load_lora(model, cfg, inference=False, config_only=False):
        model = get_peft_model(model, lora_config)

    if rank == 0:
-        model.print_trainable_parameters()
+        try:
+            model.print_trainable_parameters()
+        except AttributeError as exc:
+            LOG.warning(
+                "Exception caught during model.print_trainable_parameters(): %s", exc
+            )
    elif cfg.fsdp and cfg.adapter == "qlora":
        setup_quantized_peft_meta_for_training(model)

--- a/src/axolotl/utils/trainer.py
+++ b/src/axolotl/utils/trainer.py
@@ -11,6 +11,7 @@ import torch.cuda
 from accelerate.logging import get_logger
 from datasets import set_caching_enabled
 from torch.utils.data import DataLoader, RandomSampler
+from transformers.utils import is_torch_bf16_gpu_available

 from axolotl.core.trainer_builder import HFCausalTrainerBuilder, HFDPOTrainerBuilder
 from axolotl.utils.distributed import is_main_process, reduce_and_broadcast, zero_first
@@ -124,9 +125,10 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
                eval_dataset = eval_dataset.remove_columns("attention_mask")

        if cfg.model_config_type == "falcon":
-            LOG.info("dropping token_type_ids column")
-            train_dataset = train_dataset.remove_columns("token_type_ids")
-            if eval_dataset:
+            LOG.info("dropping token_type_ids column if it exists")
+            if "token_type_ids" in train_dataset.column_names:
+                train_dataset = train_dataset.remove_columns("token_type_ids")
+            if eval_dataset and "token_type_ids" in eval_dataset.column_names:
                eval_dataset = eval_dataset.remove_columns("token_type_ids")

        train_dataset = train_dataset.filter(
@@ -170,17 +172,21 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
    return train_dataset, eval_dataset


-def process_pretraining_datasets_for_packing(train_dataset, sequence_len):
+def process_pretraining_datasets_for_packing(
+    train_dataset, sequence_len, skip_position_ids=True
+):
+    """Drop over-length rows and, unless skipped, add a position_ids column."""
     drop_long = partial(drop_long_seq, sequence_len=sequence_len)
 
     train_dataset = train_dataset.filter(
         drop_long,
         desc="Dropping Long Sequences",
     )
-    train_dataset = train_dataset.map(
-        add_position_ids,
-        desc="Add position_id column (Pretraining Sample Packing)",
-    )
+    if not skip_position_ids:
+        train_dataset = train_dataset.map(
+            add_position_ids,
+            desc="Add position_id column (Pretraining Sample Packing)",
+        )
     return train_dataset


@@ -192,7 +198,7 @@ def calculate_total_num_steps(cfg, train_dataset, update=True):
            .apply(lambda x: len(x))  # pylint: disable=unnecessary-lambda
            .values
        )
-        LOG.debug(f"total_num_tokens: {total_num_tokens}", main_process_only=True)
+        LOG.debug(f"total_num_tokens: {total_num_tokens:_}", main_process_only=True)
        if update:
            cfg.total_num_tokens = total_num_tokens

@@ -206,7 +212,7 @@ def calculate_total_num_steps(cfg, train_dataset, update=True):
            .sum()
        )
        LOG.debug(
-            f"`total_supervised_tokens: {total_supervised_tokens}`",
+            f"`total_supervised_tokens: {total_supervised_tokens:_}`",
            main_process_only=True,
        )
        if update:
@@ -233,7 +239,7 @@ def calculate_total_num_steps(cfg, train_dataset, update=True):
                * cfg.num_epochs
            )
            LOG.debug(
-                f"total_num_tokens: {cfg.total_num_tokens}, total_num_steps: {total_num_steps}",
+                f"total_num_tokens: {cfg.total_num_tokens:_}, total_num_steps: {total_num_steps:_}",
                main_process_only=True,
            )
        else:
@@ -310,6 +316,8 @@ def setup_fsdp_envs(cfg):
        os.environ["FSDP_USE_ORIG_PARAMS"] = "true"
    if cfg.fsdp_config.fsdp_state_dict_type:
        os.environ["FSDP_STATE_DICT_TYPE"] = cfg.fsdp_config.fsdp_state_dict_type
+    if cfg.fsdp_config.fsdp_auto_wrap_policy:
+        os.environ["FSDP_AUTO_WRAP_POLICY"] = cfg.fsdp_config.fsdp_auto_wrap_policy
    if cfg.fsdp_config.fsdp_transformer_layer_cls_to_wrap:
        os.environ[
            "FSDP_TRANSFORMER_CLS_TO_WRAP"
@@ -323,6 +331,11 @@ def prepare_optim_env(cfg):
        os.environ["ACCELERATE_USE_DEEPSPEED"] = "true"
        os.environ["ACCELERATE_DEEPSPEED_CONFIG_FILE"] = cfg.deepspeed

+    if (cfg.bf16 == "auto" and is_torch_bf16_gpu_available()) or cfg.bf16 is True:
+        os.environ["ACCELERATE_MIXED_PRECISION"] = "bf16"
+    elif cfg.fp16:
+        os.environ["ACCELERATE_MIXED_PRECISION"] = "fp16"
+

 def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_steps):
    if cfg.rl in ["dpo", "ipo", "kto_pair"]:
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -0,0 +1,272 @@
+"""
+Test dataset loading under various conditions.
+"""
+
+import shutil
+import tempfile
+import unittest
+from pathlib import Path
+
+from datasets import Dataset
+from huggingface_hub import snapshot_download
+from transformers import AutoTokenizer
+
+from axolotl.utils.data import load_tokenized_prepared_datasets
+from axolotl.utils.dict import DictDefault
+
+
+class TestDatasetPreparation(unittest.TestCase):
+    """Test a configured dataloader."""
+
+    def setUp(self) -> None:
+        """Create the fixtures shared by the tests.
+
+        Loads the llama tokenizer with explicit bos/eos/unk tokens and builds
+        a one-row dataset with instruction/input/output columns.
+        """
+        tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
+        tokenizer.add_special_tokens(
+            {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>"}
+        )
+        self.tokenizer = tokenizer
+
+        # Alpaca dataset.
+        alpaca_row = {
+            "instruction": "Evaluate this sentence for spelling and grammar mistakes",
+            "input": "He finnished his meal and left the resturant",
+            "output": "He finished his meal and left the restaurant.",
+        }
+        self.dataset = Dataset.from_list([alpaca_row])
+
+    def test_load_hub(self):
+        """Core use case: data pulled straight from the hub is processed.
+
+        Checks the row count and that the tokenized columns are present.
+        """
+        hub_entry = {"path": "mhenrichsen/alpaca_2k_test", "type": "alpaca"}
+        cfg = DictDefault(
+            {
+                "tokenizer_config": "huggyllama/llama-7b",
+                "sequence_len": 1024,
+                "datasets": [hub_entry],
+            }
+        )
+
+        with tempfile.TemporaryDirectory() as scratch_dir:
+            # the second return value is not needed here
+            tokenized, _ = load_tokenized_prepared_datasets(
+                self.tokenizer, cfg, Path(scratch_dir) / "prepared"
+            )
+
+            # the hub dataset is expected to hold 2000 rows
+            assert len(tokenized) == 2000
+            # every tokenized column the test cares about must exist
+            for column in ("input_ids", "attention_mask", "labels"):
+                assert column in tokenized.features
+
+    def test_load_local_hub(self):
+        """Niche use case.  Verify that a local copy of a hub dataset can be loaded"""
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            tmp_ds_path = Path("mhenrichsen/alpaca_2k_test")
+            tmp_ds_path.mkdir(parents=True, exist_ok=True)
+            self.addCleanup(shutil.rmtree, tmp_ds_path)  # runs even on failure
+            snapshot_download(
+                repo_id="mhenrichsen/alpaca_2k_test",
+                repo_type="dataset",
+                local_dir=tmp_ds_path,
+            )
+
+            prepared_path = Path(tmp_dir) / "prepared"
+            # Right now a local copy that doesn't fully conform to a dataset
+            # must list data_files and ds_type otherwise the loader won't know
+            # how to load it.
+            cfg = DictDefault(
+                {
+                    "tokenizer_config": "huggyllama/llama-7b",
+                    "sequence_len": 1024,
+                    "datasets": [
+                        {
+                            "path": "mhenrichsen/alpaca_2k_test",
+                            "ds_type": "parquet",
+                            "type": "alpaca",
+                            "data_files": [
+                                "mhenrichsen/alpaca_2k_test/alpaca_2000.parquet",
+                            ],
+                        },
+                    ],
+                }
+            )
+
+            dataset, _ = load_tokenized_prepared_datasets(
+                self.tokenizer, cfg, prepared_path
+            )
+
+            assert len(dataset) == 2000
+            assert "input_ids" in dataset.features
+            assert "attention_mask" in dataset.features
+            assert "labels" in dataset.features
+
+    def test_load_from_save_to_disk(self):
+        """Usual use case: load a dataset saved via `save_to_disk`.
+
+        The config names only the saved directory, without any data_files.
+        """
+        with tempfile.TemporaryDirectory() as scratch_dir:
+            scratch = Path(scratch_dir)
+            saved_ds_dir = scratch / "tmp_dataset"
+            self.dataset.save_to_disk(saved_ds_dir)
+
+            # dataset entry carries just `path` and `type`
+            saved_entry = {"path": str(saved_ds_dir), "type": "alpaca"}
+            cfg = DictDefault(
+                {
+                    "tokenizer_config": "huggyllama/llama-7b",
+                    "sequence_len": 256,
+                    "datasets": [saved_entry],
+                }
+            )
+
+            tokenized, _ = load_tokenized_prepared_datasets(
+                self.tokenizer, cfg, scratch / "prepared"
+            )
+
+            # the fixture dataset holds exactly one row
+            assert len(tokenized) == 1
+            for column in ("input_ids", "attention_mask", "labels"):
+                assert column in tokenized.features
+
+    def test_load_from_dir_of_parquet(self):
+        """Usual use case: a directory holding a parquet shard can be loaded.
+
+        The shard is listed explicitly through data_files with ds_type parquet.
+        """
+        with tempfile.TemporaryDirectory() as scratch_dir:
+            scratch = Path(scratch_dir)
+            shard_dir = scratch / "tmp_dataset"
+            shard_dir.mkdir()
+            shard_file = shard_dir / "shard1.parquet"
+            self.dataset.to_parquet(shard_file)
+
+            parquet_entry = {
+                "path": str(shard_dir),
+                "ds_type": "parquet",
+                "name": "test_data",
+                "data_files": [str(shard_file)],
+                "type": "alpaca",
+            }
+            cfg = DictDefault(
+                {
+                    "tokenizer_config": "huggyllama/llama-7b",
+                    "sequence_len": 256,
+                    "datasets": [parquet_entry],
+                }
+            )
+
+            tokenized, _ = load_tokenized_prepared_datasets(
+                self.tokenizer, cfg, scratch / "prepared"
+            )
+
+            # the fixture dataset holds exactly one row
+            assert len(tokenized) == 1
+            for column in ("input_ids", "attention_mask", "labels"):
+                assert column in tokenized.features
+
+    def test_load_from_dir_of_json(self):
+        """Standard use case.  Verify a directory of json files can be loaded."""
+        with tempfile.TemporaryDirectory() as tmp_dir:  # cleaned up automatically on exit
+            tmp_ds_dir = Path(tmp_dir) / "tmp_dataset"
+            tmp_ds_dir.mkdir()
+            tmp_ds_path = tmp_ds_dir / "shard1.json"
+            self.dataset.to_json(tmp_ds_path)  # self.dataset is exported as the lone json shard
+
+            prepared_path: Path = Path(tmp_dir) / "prepared"  # passed as the loader's third argument
+            cfg = DictDefault(
+                {
+                    "tokenizer_config": "huggyllama/llama-7b",
+                    "sequence_len": 256,
+                    "datasets": [
+                        {
+                            "path": str(tmp_ds_dir),  # directory that contains the shard
+                            "ds_type": "json",
+                            "name": "test_data",
+                            "data_files": [
+                                str(tmp_ds_path),  # explicit path to the shard inside that directory
+                            ],
+                            "type": "alpaca",
+                        },
+                    ],
+                }
+            )
+
+            dataset, _ = load_tokenized_prepared_datasets(
+                self.tokenizer, cfg, prepared_path
+            )
+
+            assert len(dataset) == 1  # assumes self.dataset has exactly one row; TODO confirm
+            assert "input_ids" in dataset.features  # the three asserts check the result's feature names
+            assert "attention_mask" in dataset.features
+            assert "labels" in dataset.features
+
+    def test_load_from_single_parquet(self):
+        """Standard use case.  Verify a single parquet file can be loaded."""
+        with tempfile.TemporaryDirectory() as tmp_dir:  # temporary directory is deleted on exit
+            tmp_ds_path = Path(tmp_dir) / "tmp_dataset.parquet"
+            self.dataset.to_parquet(tmp_ds_path)  # file sits directly in tmp_dir, no sub-directory
+
+            prepared_path: Path = Path(tmp_dir) / "prepared"  # given to the loader as its third argument
+            cfg = DictDefault(
+                {
+                    "tokenizer_config": "huggyllama/llama-7b",
+                    "sequence_len": 256,
+                    "datasets": [
+                        {
+                            "path": str(tmp_ds_path),  # the file itself; no ds_type or data_files here
+                            "name": "test_data",
+                            "type": "alpaca",
+                        },
+                    ],
+                }
+            )
+
+            dataset, _ = load_tokenized_prepared_datasets(
+                self.tokenizer, cfg, prepared_path
+            )
+
+            assert len(dataset) == 1  # presumably one row in self.dataset; verify against setUp
+            assert "input_ids" in dataset.features  # feature names checked by this and the next two lines
+            assert "attention_mask" in dataset.features
+            assert "labels" in dataset.features
+
+    def test_load_from_single_json(self):
+        """Standard use case.  Verify a single json file can be loaded."""
+        with tempfile.TemporaryDirectory() as tmp_dir:  # nothing is left on disk after the block
+            tmp_ds_path = Path(tmp_dir) / "tmp_dataset.json"
+            self.dataset.to_json(tmp_ds_path)  # one json file written straight into tmp_dir
+
+            prepared_path: Path = Path(tmp_dir) / "prepared"  # forwarded to the loader call below
+            cfg = DictDefault(
+                {
+                    "tokenizer_config": "huggyllama/llama-7b",
+                    "sequence_len": 256,
+                    "datasets": [
+                        {
+                            "path": str(tmp_ds_path),  # file path only; ds_type and data_files are omitted
+                            "name": "test_data",
+                            "type": "alpaca",
+                        },
+                    ],
+                }
+            )
+
+            dataset, _ = load_tokenized_prepared_datasets(
+                self.tokenizer, cfg, prepared_path
+            )
+
+            assert len(dataset) == 1  # assumes a single-row self.dataset; TODO confirm
+            assert "input_ids" in dataset.features  # remaining asserts look for named features on the result
+            assert "attention_mask" in dataset.features
+            assert "labels" in dataset.features
+
+
+if __name__ == "__main__":  # true only when this file is executed as a script
+    unittest.main()
--- a/tests/test_validation.py
+++ b/tests/test_validation.py
@@ -54,6 +54,18 @@ class TestValidation(BaseValidation):
    Test the validation module
    """

+    def test_defaults(self, minimal_cfg):
+        test_cfg = DictDefault(
+            {
+                "weight_decay": None,  # explicitly set to None before validation
+            }
+            | minimal_cfg  # union with minimal_cfg; for dict-style `|` the right operand wins on clashes
+        )
+        cfg = validate_config(test_cfg)
+
+        assert cfg.train_on_inputs is False  # not set in the literal above; NOTE(review): presumably a validation default
+        assert cfg.weight_decay is None  # the explicit None must still be None after validation
+
    def test_datasets_min_length(self):
        cfg = DictDefault(
            {
@@ -588,6 +600,7 @@ class TestValidation(BaseValidation):
                {
                    "sample_packing": True,
                    "pad_to_sequence_len": None,
+                    "flash_attention": True,
                }
            )
            | minimal_cfg
@@ -889,6 +902,7 @@ class TestValidation(BaseValidation):
                {
                    "sample_packing": True,
                    "eval_table_size": 100,
+                    "flash_attention": True,
                }
            )
            | minimal_cfg
@@ -904,6 +918,7 @@ class TestValidation(BaseValidation):
                {
                    "sample_packing": True,
                    "eval_sample_packing": False,
+                    "flash_attention": True,
                }
            )
            | minimal_cfg
@@ -916,6 +931,7 @@ class TestValidation(BaseValidation):
                {
                    "sample_packing": False,
                    "eval_table_size": 100,
+                    "flash_attention": True,
                }
            )
            | minimal_cfg
@@ -929,6 +945,7 @@ class TestValidation(BaseValidation):
                    "sample_packing": True,
                    "eval_table_size": 100,
                    "eval_sample_packing": False,
+                    "flash_attention": True,
                }
            )
            | minimal_cfg
Author	SHA1	Message	Date
Wing Lian	3202f19f52	add save_only_model arg	2024-04-10 16:09:08 -04:00
Thomas Capelle	5ed29393e3	Update SaveAxolotlConfigtoWandBCallback to use artifact instead of save (#1483 ) * deprecated wandb.save * also use wandb.save for axolotl yaml * chore: lint --------- Co-authored-by: Wing Lian <wing.lian@gmail.com>	2024-04-09 18:58:38 -04:00
Wing Lian	da9b1a3196	use locale agnostic seperator to make large nums easier to read (#1503 )	2024-04-09 17:28:43 -04:00
DavidFarago	057fa44191	WIP: Support table logging for mlflow, too (#1506 ) * WIP: Support table logging for mlflow, too Create a `LogPredictionCallback` for both "wandb" and "mlflow" if specified. In `log_prediction_callback_factory`, create a generic table and make it specific only if the newly added `logger` argument is set to "wandb" resp. "mlflow". See https://github.com/OpenAccess-AI-Collective/axolotl/issues/1505 * chore: lint * add additional clause for mlflow as it's optional * Fix circular imports --------- Co-authored-by: Dave Farago <dfarago@innoopract.com> Co-authored-by: Wing Lian <wing.lian@gmail.com>	2024-04-09 17:28:27 -04:00
Scott Fleming	8fa0785f74	Correctly handle splits for datasets.arrow_dataset.Dataset objects (#1504 ) * Correctly handle splits for datasets.arrow_dataset.Dataset objects The `load_tokenized_prepared_datasets` function currently has logic for loading a dataset from local path that always checks if a split is in the dataset. The problem is, if the dataset is loaded using `load_from_disk` and it is an Arrow-based dataset, there is no split information. Instead what happens is, by calling `split in ds`, it presumably searches through all the rows and columns of the arrow dataset object to find e.g., 'train' assuming `split == 'train'`. This causes the program to hang. See https://chat.openai.com/share/0d567dbd-d60b-4079-9040-e1de58a4dff3 for context. * chore: lint --------- Co-authored-by: Wing Lian <wing.lian@gmail.com>	2024-04-09 16:40:26 -04:00
Wing Lian	4313b1a6a0	Print versions (#1496 ) * print out dependency versions for easier debugging * improve readability	2024-04-09 11:05:15 -04:00
Maziyar Panahi	7f17eff81a	Fix the wrong adapter in qwen2-moe-qlora example (#1501 ) [skip ci] It should be `qlora` instead of `lora`	2024-04-09 10:57:24 -04:00
Wing Lian	ff01c45127	add field to sft dataset pydantic for completion support (#1497 )	2024-04-08 21:37:54 -04:00
Wing Lian	2fa65b9599	ignore issues with calculating # params when printing (#1493 )	2024-04-08 11:04:22 -04:00
xzuyn	9430b6e868	Remove `validate_quantized_dora` (#1485 ) DoRA with quantized layers is supported with PEFT 0.10.0	2024-04-08 01:25:23 -04:00
Wing Lian	934fc851da	drop empty token from beginning if tokenizer has no bos_token (in the case of qwen) (#1490 )	2024-04-06 19:55:19 -07:00
NanoCode012	bda48f0150	fix: reduce sample_packing warning (#1484 )	2024-04-06 21:04:07 +09:00
NanoCode012	bf4cd67252	feat: validate sample packing requires flash_attention (#1465 ) * feat: validate sample packing requires flash_attention * fix: check for sdp_attn per suggestion * feat: add FA to tests	2024-04-05 12:47:32 +09:00
Wing Lian	05b0b7e8ca	add support for cohere chat template (#1478 )	2024-04-04 18:20:50 -07:00
Wing Lian	87ca3f98c6	don't use deepspeed or fsdp when merging loras (#1479 )	2024-04-04 18:20:32 -07:00
Wing Lian	e0fcef403f	refactor utils.data module for line count linter (#1476 )	2024-04-04 16:33:42 -07:00
NanoCode012	c2b64e4dcf	Feat: update doc (#1475 ) [skip ci] * feat: update doc contents * chore: move batch vs ga docs * feat: update lambdalabs instructions * fix: refactor dev instructions	2024-04-04 13:43:40 +09:00
Hamel Husain	5760099bd4	fix toc	2024-04-03 12:05:49 -07:00
Wing Lian	5aa50974ce	Pretrain multipack v2 (#1470 )	2024-04-02 05:42:16 -07:00
James Melvin Ebenezer	cae608f587	Added pip install ninja to accelerate installation of flash-attn (#1461 ) * Added pip install ninja to accelerate installation of flash-attn * doc: cleanup	2024-04-02 17:36:41 +09:00
Nick Doiron	586bd8d221	fix pretraining_ on odd datasets (#1463 ) * can configure name of split of pretraining dataset * streaming data and dataset map * text column customized * allow text_column to be set in pretrain * pretrain type * load a bit of the dataset * fix dataset where splits have separate configs * ok name param here is the config * whitespace	2024-04-01 20:48:59 -07:00
Hamel Husain	86b7d22f35	Reorganize Docs (#1468 )	2024-04-01 08:00:52 -07:00
Wing Lian	0b103775ad	reduce verbosity of the special tokens (#1472 )	2024-04-01 21:47:27 +09:00
NanoCode012	946b497c3f	feat: add deepspeed 3 with cpuoffload (#1466 ) * feat: add deepspeed 3 with cpuoffload * make bf16 explicit, add param only offload variant --------- Co-authored-by: Wing Lian <wing.lian@gmail.com>	2024-04-01 21:42:52 +09:00
Wing Lian	0ddfb24fcf	LISA (#1469 ) * add lisa support * fix default and fix attribute traversal for layers * improve lisa callback logging * fix LISA by ensuring params are not frozen during __init__ * example config for lisa --------- Co-authored-by: Aman Karmani <aman@tmm1.net>	2024-04-01 04:54:53 -07:00
Wing Lian	89134f2143	make sure to install causal_conv1d in docker (#1459 )	2024-03-29 16:43:25 -04:00
Wing Lian	6086be85f7	qwen2_moe support w multipack (#1455 )	2024-03-29 11:04:53 -04:00
Wing Lian	4a92a3b9ee	Nightlies fix v4 (#1458 ) [skip ci] * another attempt at github actions * try again	2024-03-29 11:04:34 -04:00
Wing Lian	46a73e3d1a	fix yaml parsing for workflow (#1457 ) [skip ci]	2024-03-29 10:21:08 -04:00
Wing Lian	da3415bb5a	fix how nightly tag is generated (#1456 ) [skip ci]	2024-03-29 09:29:17 -04:00
Wing Lian	8cb127abeb	configure nightly docker builds (#1454 ) [skip ci] * configure nightly docker builds * also test update pytorch in modal ci	2024-03-29 08:25:45 -04:00
Wing Lian	05b398a072	fix some of the edge cases for Jamba (#1452 ) * fix some of the edge cases for Jamba * update requirements for jamba	2024-03-29 02:38:02 -04:00
Keith Stevens	e634118f90	Support loading datasets saved via save_to_disk (#1432 ) * Support loading datasetes saved via save_to_disk * Adding comprehensive unittests * Fix dataset tests due to new hash changes	2024-03-29 00:19:36 -04:00
Wing Lian	02af0820f7	Jamba (#1451 ) * fixes for larger models * add qlora example for deepspeed * add readme for jamba	2024-03-28 21:03:22 -04:00
Wing Lian	4155e9988f	fix layer_replication arg to peft (#1446 )	2024-03-27 10:18:56 -04:00
Wing Lian	25afd35842	support layer replication for peft and fix rslora integration (#1445 )	2024-03-27 10:16:47 -04:00
Wing Lian	da265dd796	fix for accelerate env var for auto bf16, add new base image and expand torch_cuda_arch_list support (#1413 )	2024-03-26 16:46:19 -04:00
WenboPan	e07347b188	Remove seq_len arg in rotary_emb (#1443 ) * remove seq_len in llama rotary_emb * chore: lint --------- Co-authored-by: Wing Lian <wing.lian@gmail.com>	2024-03-26 15:19:44 -04:00
Far El	bcdc9b1601	Fix falcon tokenization step (#1441 ) [skip ci] * Fix falcon tokenization step * chore: lint --------- Co-authored-by: Wing Lian <wing.lian@gmail.com>	2024-03-26 15:19:34 -04:00
Satpal Singh Rathore	c19d060a74	turn sample_packing on for training (#1438 ) [skip ci]	2024-03-26 15:19:04 -04:00
Wing Lian	601b77bc9d	make sure to capture non-null defaults from config validation (#1415 )	2024-03-26 15:18:47 -04:00